In [180]:
import numpy as np
import mir_eval
import collections
import json
from os.path import join as joinpath
import csv
from statistics import mean
import pandas as pd
import os, os.path, copy

In [181]:
# Root folder of the analysis; each song has its own sub-folder directly below it.
# NOTE(review): this is an absolute, machine-specific path - adjust it before running elsewhere.
path = "/Users/polinap/Yandex.Disk.localized/RESEARCH_RU/VOCAL_NOTES_PROJECT/RUSSIAN/ANALYSIS/"
# Name of the song folder below `path`; it is also the first part of every notes filename.
song = "Da_po_zoriushke_4"
# Separator placed between the parts of input and output filenames.
divider = "__"

# The notes file of each transcriber is expected at:
#   <path>/<song>/<transcriber>/<song><divider><transcriber><divider>notes.csv

outfile_prefix = "howmanynotes" # first part of every output filename
# Output files are written to the song folder (<path>/<song>/) and are named:
#   <outfile_prefix><divider><transcriber1><divider><transcriber2>.csv

In [182]:
def read_from_csv(notespath):
    """Load a notes CSV as a DataFrame with columns TIME, VALUE and DURATION.

    The file is read without treating any row as a header. Only the first
    three columns are kept, and a first row whose first cell is "TIME" is
    dropped as a header row. VALUE is converted to float, and further to int
    when every value is a whole number; TIME and DURATION are returned as read.
    """
    table = pd.read_csv(notespath, header=None)

    # anything past the third column is ignored
    if table.shape[1] > 3:
        table = table.iloc[:, :3]

    # a header row, if present, was read as data: drop it and renumber the rows
    if table.iloc[0, 0] == "TIME":
        table = table.iloc[1:, :].reset_index(drop=True)

    table.columns = ['TIME', 'VALUE', 'DURATION']

    # keep whole-number values as integers, everything else as floats
    values = table['VALUE'].astype(float)
    table['VALUE'] = values.astype(int) if all(values % 1 == 0) else values

    return table

# alignment cost function
def getOverlap(segment1, segment2):
    """Return the length shared by two (start, end) segments, or 0 if they do not overlap."""
    latest_start = max(segment1[0], segment2[0])
    earliest_end = min(segment1[1], segment2[1])
    shared = earliest_end - latest_start
    # a negative difference means there is a gap between the segments
    return max(0, shared)
def getOverlapCost(segment1, segment2):
    """Return the alignment cost of two (start, end) segments, between 0 and 1.

    The cost is 1 when the segments do not overlap. Otherwise it is 1 minus
    the larger of the two fractions "overlap / own length", so it reaches 0
    as soon as one segment lies completely inside the other.
    """
    shared = getOverlap(segment1, segment2)
    if shared == 0:
        return 1

    spans = (segment1[1] - segment1[0], segment2[1] - segment2[0])
    best_coverage = max(shared / span for span in spans)
    return 1 - best_coverage

# match segment sequences with dtw
def mydtw(series_1, series_2):
    """Align two sequences of (start, end) segments with dynamic time warping.

    Adapted from simpledtw; the local cost of pairing two segments is
    getOverlapCost instead of a vector distance.

    Parameters
    ----------
    series_1, series_2 : sequences of (start, end) pairs

    Returns
    -------
    matches : list of (i, j) index pairs on the warping path, from (0, 0) to
        (len(series_1) - 1, len(series_2) - 1)
    mappings_series_1 : for every index i of series_1, the ascending list of
        indices j of series_2 matched to it
    mappings_series_2 : the same, seen from series_2

    If either series is empty there is nothing to align: `matches` is empty
    and every mapping list is empty (previously this case raised IndexError).
    """
    n1, n2 = len(series_1), len(series_2)
    if n1 == 0 or n2 == 0:
        return [], [[] for _ in range(n1)], [[] for _ in range(n2)]

    # accumulated-cost matrix with one extra leading row and column; the
    # border is infinite except for the origin, so every path starts at (0, 0)
    matrix = np.zeros((n1 + 1, n2 + 1))
    matrix[0, :] = np.inf
    matrix[:, 0] = np.inf
    matrix[0, 0] = 0
    for i, vec1 in enumerate(series_1):
        for j, vec2 in enumerate(series_2):
            cost = getOverlapCost(vec1, vec2)
            matrix[i + 1, j + 1] = cost + min(matrix[i, j + 1], matrix[i + 1, j], matrix[i, j])
    # drop the border so that matrix[i, j] belongs to series_1[i] / series_2[j]
    matrix = matrix[1:, 1:]

    # walk back from the last cell to the first, always stepping to the
    # cheapest predecessor (ties prefer diagonal, then up, then left)
    i = n1 - 1
    j = n2 - 1
    matches = []
    mappings_series_1 = [[] for _ in range(n1)]
    mappings_series_2 = [[] for _ in range(n2)]
    while i > 0 or j > 0:
        matches.append((i, j))
        mappings_series_1[i].append(j)
        mappings_series_2[j].append(i)
        option_diag = matrix[i - 1, j - 1] if i > 0 and j > 0 else np.inf
        option_up = matrix[i - 1, j] if i > 0 else np.inf
        option_left = matrix[i, j - 1] if j > 0 else np.inf
        move = np.argmin([option_diag, option_up, option_left])
        if move == 0:
            i -= 1
            j -= 1
        elif move == 1:
            i -= 1
        else:
            j -= 1
    matches.append((0, 0))
    mappings_series_1[0].append(0)
    mappings_series_2[0].append(0)

    # everything was collected end-to-start; put it in ascending order
    matches.reverse()
    for mp in mappings_series_1:
        mp.reverse()
    for mp in mappings_series_2:
        mp.reverse()

    return matches, mappings_series_1, mappings_series_2

In [183]:
# for a notes file, collect segments
def collect_segments(notespath):
    """Read a notes file and return its notes as (onset, offset) segments.

    Parameters
    ----------
    notespath : path of a notes CSV, read with read_from_csv

    Returns
    -------
    numpy array of shape (number of notes, 2): column 0 holds the onsets
    (TIME), column 1 the offsets (TIME + DURATION). A file without notes
    gives an array with zero rows.

    Adjustments applied to the raw values:
    - every zero duration is replaced by 0.00001;
    - a negative first onset is set to 0;
    - a first offset that is not positive is set to 0.000001.
    """
    notes = read_from_csv(notespath)

    # copy=True: the arrays are edited in place below, so they must be
    # independent, writable arrays rather than views into the DataFrame
    onsets = notes['TIME'].astype(float).to_numpy(copy=True)
    durations = notes['DURATION'].astype(float).to_numpy(copy=True)

    # give zero-length notes a tiny positive length
    durations[durations == 0] = 0.00001

    offsets = onsets + durations

    # only the first note is corrected, and only if there is one
    if len(onsets) > 0:
        if onsets[0] < 0:
            onsets[0] = 0
        if offsets[0] <= 0:
            offsets[0] = 0.000001

    return np.column_stack((onsets, offsets))


# collect note clusters (where number of segments/notes differs between transcribers) from an alignment
def get_note_clusters(segments1, segments2, mapping_1, mapping_2):
    """Return [start, stop] spans where one note was aligned to several notes.

    For every segment that is mapped to more than one segment of the other
    sequence, the span runs from the earliest start to the latest stop among
    the segment itself, the start of its first partner and the stop of its
    last partner. Spans found through mapping_1 come first, followed by those
    found through mapping_2.
    """
    def spans_for(own_segments, other_segments, mapping):
        found = []
        for position, partners in enumerate(mapping):
            # a single partner means both transcribers have one note here
            if len(partners) <= 1:
                continue
            own_start, own_stop = own_segments[position]
            partner_start = other_segments[partners[0]][0]
            partner_stop = other_segments[partners[-1]][1]
            found.append([min(own_start, partner_start), max(own_stop, partner_stop)])
        return found

    from_first = spans_for(segments1, segments2, mapping_1)
    from_second = spans_for(segments2, segments1, mapping_2)
    return from_first + from_second

In [190]:
def write_clusters_2csv(cluster_segments, transcriber1, transcriber2):
    """Write the note clusters of one pair of transcribers to a CSV file.

    Parameters
    ----------
    cluster_segments : sequence of [onset, offset] pairs; may be empty, in
        which case a file containing only the header line is written
    transcriber1, transcriber2 : transcriber names used in the output filename

    The file has the columns ONSET and DURATION (offset minus onset), no index
    column, and is written to
    <path>/<song>/<outfile_prefix><divider><transcriber1><divider><transcriber2>.csv,
    where path, song, outfile_prefix and divider are the module-level settings.
    """
    # Naming the columns at construction also works for an empty list;
    # assigning df.columns afterwards fails there because the frame has no columns.
    df = pd.DataFrame(cluster_segments, columns=["ONSET", "OFFSET"])
    df["DURATION"] = df["OFFSET"] - df["ONSET"]
    df = df[["ONSET", "DURATION"]]

    outfile = outfile_prefix + divider + transcriber1 + divider + transcriber2 + '.csv'
    outpath = joinpath(path, song, outfile)
    df.to_csv(outpath, index=False)

In [191]:
# automatically detect transcribers and collect note files
# every sub-folder of the song folder is a candidate transcriber
songpath = joinpath(path, song)
tlist = next(os.walk(songpath))[1]

# keep only the candidates whose notes file actually exists
transcriber_list = []
notespath_list = []
for transcriber in tlist:
    filename = divider.join([song, transcriber, 'notes.csv'])
    notespath = joinpath(songpath, transcriber, filename)
    if not os.path.isfile(notespath):
        continue
    transcriber_list.append(transcriber)
    notespath_list.append(notespath)

print("transcribers: ", transcriber_list)
print("note files: ", notespath_list)

transcribers:  ['PP', 'OV']
note files:  ['/Users/polinap/Yandex.Disk.localized/RESEARCH_RU/VOCAL_NOTES_PROJECT/RUSSIAN/ANALYSIS/Da_po_zoriushke_4/PP/Da_po_zoriushke_4__PP__notes.csv', '/Users/polinap/Yandex.Disk.localized/RESEARCH_RU/VOCAL_NOTES_PROJECT/RUSSIAN/ANALYSIS/Da_po_zoriushke_4/OV/Da_po_zoriushke_4__OV__notes.csv']


In [193]:
# Compare every unordered pair of transcribers and write one cluster file per pair.
for count, transcriber1 in enumerate(transcriber_list[:-1]):
    # the first transcriber's notes are the same for all partners: read them once
    filename1 = song + divider + transcriber1 + divider + 'notes.csv'
    notespath1 = joinpath(path,song,transcriber1,filename1)
    segments1 = collect_segments(notespath1)

    for transcriber2 in transcriber_list[count+1:]:
        filename2 = song + divider + transcriber2 + divider + 'notes.csv'
        notespath2 = joinpath(path,song,transcriber2,filename2)
        segments2 = collect_segments(notespath2)

        # align the two note sequences, then keep the places where one note
        # of one transcriber corresponds to several notes of the other
        matches, mapping_1, mapping_2 = mydtw(segments1, segments2)
        cluster_segments = get_note_clusters(segments1, segments2, mapping_1, mapping_2)

        write_clusters_2csv(cluster_segments, transcriber1, transcriber2)