In [16]:
import numpy as np
import mir_eval
import collections
import json
from os.path import join as joinpath
import csv
from statistics import mean
import pandas as pd

In [108]:
# Root folder that contains one sub-folder per song.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
path = "/Users/polinap/Yandex.Disk.localized/RESEARCH_RU/VOCAL_NOTES_PROJECT/RUSSIAN/ANALYSIS/"
# Song to analyse; used as a sub-folder name and as the notes-file name prefix.
song = "Da_po_zoriushke_4"
# Transcriber identifiers; each is a sub-folder of the song folder and part of the file name.
transcriber_list = ["PP", "OV"]
# Separator placed between the parts of a notes-file name.
divider = "__"
# Name of the CSV written into the song folder with the cluster onsets/durations.
outfile = "howmanynotes.csv"

In [109]:
# One notes file per transcriber:
#   <path>/<song>/<transcriber>/<song><divider><transcriber><divider>notes.csv
notespath_list = [
    joinpath(path, song, who, divider.join([song, who, 'notes.csv']))
    for who in transcriber_list
]
notespath_list

['/Users/polinap/Yandex.Disk.localized/RESEARCH_RU/VOCAL_NOTES_PROJECT/RUSSIAN/ANALYSIS/Da_po_zoriushke_4/PP/Da_po_zoriushke_4__PP__notes.csv',
 '/Users/polinap/Yandex.Disk.localized/RESEARCH_RU/VOCAL_NOTES_PROJECT/RUSSIAN/ANALYSIS/Da_po_zoriushke_4/OV/Da_po_zoriushke_4__OV__notes.csv']

In [110]:
def read_from_csv(notespath):
    """Read a note-layer CSV into a DataFrame with columns TIME, VALUE, DURATION.

    The file may or may not start with a header row; a first cell reading
    "TIME" (ignoring case and surrounding whitespace) is treated as a header
    and dropped. Columns beyond the first three are discarded.

    Parameters
    ----------
    notespath : str, path object or file-like object
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Columns ``TIME``, ``VALUE``, ``DURATION`` with a fresh 0-based index.
        ``VALUE`` is converted to float, and further to int when every value
        is a whole number. ``TIME`` and ``DURATION`` are returned as parsed by
        pandas (strings when a header row was present), so callers should cast
        them before doing arithmetic.

    Raises
    ------
    ValueError
        If the file has fewer than three columns, or a VALUE entry is not numeric.
    """
    # header=None: a header row, if present, is read as data and removed below.
    raw = pd.read_csv(notespath, header=None)

    if len(raw.columns) < 3:
        raise ValueError(
            "expected at least 3 columns (TIME, VALUE, DURATION), found %d"
            % len(raw.columns)
        )

    # Keep the first three columns. The explicit copy makes the column
    # assignments below act on an independent frame rather than on a slice.
    notes = raw.iloc[:, :3].copy()
    notes.columns = ['TIME', 'VALUE', 'DURATION']

    # Drop the header row, tolerating case and padding differences.
    if str(notes.iloc[0, 0]).strip().upper() == "TIME":
        notes = notes.iloc[1:].reset_index(drop=True)

    notes['VALUE'] = notes['VALUE'].astype(float)
    if (notes['VALUE'] % 1 == 0).all():        # all integers
        notes['VALUE'] = notes['VALUE'].astype(int)

    return notes

In [111]:
def getOverlap(segment1, segment2):
    """Return the length shared by two (start, stop) segments, or 0 if they do not overlap."""
    latest_start = max(segment1[0], segment2[0])
    earliest_stop = min(segment1[1], segment2[1])
    shared = earliest_stop - latest_start
    return max(0, shared)

def getOverlapCost(segment1, segment2):
    """Return a dissimilarity for two (start, stop) segments.

    The cost is 1 when the segments do not overlap; otherwise it is 1 minus
    the larger of the two ratios overlap / segment length.
    """
    shared = getOverlap(segment1, segment2)
    if shared == 0:
        return 1
    # Fraction of each segment that is covered by the overlap.
    coverage = [shared / (seg[1] - seg[0]) for seg in (segment1, segment2)]
    return 1 - max(coverage)

def mydtw(series_1, series_2, cost_fn=None):
    """Align two sequences with dynamic time warping.

    Parameters
    ----------
    series_1, series_2 : sequence
        Items to align. With the default cost these are (start, stop) segments.
    cost_fn : callable, optional
        ``cost_fn(item_1, item_2)`` giving the local cost of pairing two items.
        Defaults to ``getOverlapCost``.

    Returns
    -------
    matches : list of (int, int)
        The warping path as (index in series_1, index in series_2) pairs,
        from (0, 0) to the last pair.
    mappings_series_1 : list of list of int
        For each item of series_1, the series_2 indices it is paired with,
        in ascending path order.
    mappings_series_2 : list of list of int
        The same, seen from series_2.

    If either series is empty there is nothing to align: ``matches`` is empty
    and every mapping list is empty.
    """
    if cost_fn is None:
        cost_fn = getOverlapCost

    n_1, n_2 = len(series_1), len(series_2)
    if n_1 == 0 or n_2 == 0:
        return [], [[] for _ in range(n_1)], [[] for _ in range(n_2)]

    # Accumulated-cost matrix with an extra border row/column of +inf so that
    # only the (0, 0) corner can start a path.
    matrix = np.full((n_1 + 1, n_2 + 1), np.inf)
    matrix[0, 0] = 0
    for i, vec1 in enumerate(series_1):
        for j, vec2 in enumerate(series_2):
            cost = cost_fn(vec1, vec2)
            matrix[i + 1, j + 1] = cost + min(matrix[i, j + 1], matrix[i + 1, j], matrix[i, j])
    matrix = matrix[1:, 1:]  # drop the border

    # Trace the path back from the last cell to (0, 0).
    i = n_1 - 1
    j = n_2 - 1
    matches = []
    mappings_series_1 = [[] for _ in range(n_1)]
    mappings_series_2 = [[] for _ in range(n_2)]
    while i > 0 or j > 0:
        matches.append((i, j))
        mappings_series_1[i].append(j)
        mappings_series_2[j].append(i)
        option_diag = matrix[i - 1, j - 1] if i > 0 and j > 0 else np.inf
        option_up = matrix[i - 1, j] if i > 0 else np.inf
        option_left = matrix[i, j - 1] if j > 0 else np.inf
        # argmin returns the first minimum, so ties prefer diagonal, then up, then left.
        move = np.argmin([option_diag, option_up, option_left])
        if move == 0:
            i -= 1
            j -= 1
        elif move == 1:
            i -= 1
        else:
            j -= 1
    matches.append((0, 0))
    mappings_series_1[0].append(0)
    mappings_series_2[0].append(0)

    # Everything was collected end-to-start; flip to start-to-end order.
    matches.reverse()
    for mp in mappings_series_1:
        mp.reverse()
    for mp in mappings_series_2:
        mp.reverse()

    return matches, mappings_series_1, mappings_series_2

In [113]:
# Replacement values for degenerate timings (same values as used before, now named).
MIN_DURATION = 0.00001       # substituted for zero-length notes
MIN_FIRST_OFFSET = 0.000001  # substituted when the first note ends at or before t = 0

notes_list = []
onsets_list = []
pitches_list = []
segments_list = []
for notespath in notespath_list:
    notes = read_from_csv(notespath)
    if len(notes) == 0:
        raise ValueError("no notes found in %s" % notespath)
    notes_list.append(notes)

    # copy=True guarantees writable arrays that do not alias the DataFrame,
    # because onsets and durations are patched in place below.
    onsets = notes['TIME'].astype(float).to_numpy(copy=True)
    pitches = notes['VALUE'].astype(float).to_numpy()
    durations = notes['DURATION'].astype(float).to_numpy(copy=True)

    # Give zero-length notes a tiny positive duration.
    # (The previous element-wise loop referenced an undefined name `duration`
    # and raised NameError whenever a zero duration was present.)
    durations[durations == 0] = MIN_DURATION

    offsets = onsets + durations
    # Clamp the first note so it starts at t >= 0 and ends at t > 0.
    # NOTE(review): presumably required by the interval validation done later — confirm.
    if onsets[0] < 0:
        onsets[0] = 0
    if offsets[0] <= 0:
        offsets[0] = MIN_FIRST_OFFSET

    # One (onset, offset) row per note.
    segments = np.column_stack((onsets, offsets))

    onsets_list.append(onsets)
    pitches_list.append(pitches)
    segments_list.append(segments)

In [114]:
# Align the two transcribers' note segments. mapping_1[i] lists the indices of
# second-transcriber notes paired with note i of the first transcriber;
# mapping_2 is the same seen from the second transcriber.
matches, mapping_1, mapping_2 = mydtw(segments_list[0], segments_list[1])

In [115]:
def _one_to_many_spans(own_segments, other_segments, mapping):
    """Return [start, stop] for every note that is paired with more than one note.

    The span runs from the earlier of the note's start and its first partner's
    start to the later of the note's stop and its last partner's stop.
    """
    spans = []
    for own_idx, partners in enumerate(mapping):
        if len(partners) <= 1:
            continue
        own_start, own_stop = own_segments[own_idx]
        first_start = other_segments[partners[0]][0]
        last_stop = other_segments[partners[-1]][1]
        spans.append([min(own_start, first_start), max(own_stop, last_stop)])
    return spans


segments1 = segments_list[0]
segments2 = segments_list[1]
# Clusters seen from the first transcriber come first, then those seen from the second.
cluster_segments = (
    _one_to_many_spans(segments1, segments2, mapping_1)
    + _one_to_many_spans(segments2, segments1, mapping_2)
)

In [116]:
# Name the columns at construction: with no clusters, a bare
# pd.DataFrame([]) has zero columns and a later two-name column assignment
# raises a length-mismatch ValueError.
df = pd.DataFrame(cluster_segments, columns=["ONSET", "OFFSET"])

In [117]:
# Label the span columns, derive each cluster's length, and keep only the
# onset and duration for export.
df = df.set_axis(["ONSET", "OFFSET"], axis=1)
df = df.assign(DURATION=df["OFFSET"] - df["ONSET"])
df = df.loc[:, ["ONSET", "DURATION"]]

In [118]:
# Write the cluster onsets and durations to <path>/<song>/<outfile>,
# leaving out the DataFrame index column.
outpath = joinpath(path,song,outfile)
df.to_csv(outpath, index = False)

In [44]:
# First transcriber is passed as the reference, second as the estimate.
ref_intervals, est_intervals = segments_list[0], segments_list[1]
ref_pitches, est_pitches = pitches_list[0], pitches_list[1]

mir_eval.transcription.validate(ref_intervals, ref_pitches, est_intervals, est_pitches)

# Pair notes of the two transcriptions by onset time only.
matched_onsets = mir_eval.transcription.match_note_onsets(
    ref_intervals,
    est_intervals,
    onset_tolerance=0.1,
    strict=False,
)
# Alternative kept for reference:
#matched_notes = mir_eval.transcription.match_notes(segments_list[0], pitches_list[0], segments_list[1], pitches_list[1])
print(matched_onsets)

[(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5), (6, 6), (7, 7), (8, 9), (9, 10), (10, 14), (11, 18), (12, 19), (13, 21), (14, 22), (15, 23), (16, 24), (17, 25), (18, 26), (19, 27), (20, 30), (21, 31), (22, 32), (23, 33), (24, 34), (25, 35), (26, 36), (27, 37), (28, 38), (29, 39), (30, 40), (31, 41), (32, 42), (33, 44), (35, 46), (36, 47), (37, 48), (38, 49), (39, 50), (40, 51), (41, 52), (42, 53), (43, 54), (44, 55), (45, 56), (46, 61), (47, 65), (48, 66), (49, 68), (50, 69), (51, 70), (52, 71), (53, 72), (54, 73), (55, 74), (56, 77), (57, 79), (58, 80), (59, 81), (60, 82), (61, 83), (62, 84), (63, 85), (64, 86), (65, 87), (66, 88), (67, 93), (68, 94), (69, 95), (70, 96), (71, 97)]


In [45]:
# Walk through the onset matches and fill the gaps between consecutive matches.
# The first two notes are taken as matched one-to-one; matches with a
# first-transcriber index below 2 are skipped.
alignment = [(0, 0), (1, 1)]
clusters = []
prev_onset1, prev_onset2 = 1, 1
for onset1, onset2 in matched_onsets:
    if onset1 < 2:
        continue

    lag = onset2 - onset1
    prev_lag = prev_onset2 - prev_onset1
    if lag > prev_lag:
        # Second transcriber has extra notes: attach them to the previous
        # first-transcriber note.
        extra = [(prev_onset1, skipped) for skipped in range(prev_onset2 + 1, onset2)]
        clusters.append((prev_onset1, prev_onset2))
        clusters.extend(extra)
        alignment.extend(extra)
    elif lag < prev_lag:
        # First transcriber has extra notes: attach them to the previous
        # second-transcriber note.
        extra = [(skipped, prev_onset2) for skipped in range(prev_onset1 + 1, onset1)]
        clusters.append((prev_onset1, prev_onset2))
        clusters.extend(extra)
        alignment.extend(extra)

    alignment.append((onset1, onset2))
    prev_onset1, prev_onset2 = onset1, onset2
clusters

[(7, 7),
 (7, 8),
 (9, 10),
 (9, 11),
 (9, 12),
 (9, 13),
 (10, 14),
 (10, 15),
 (10, 16),
 (10, 17),
 (12, 19),
 (12, 20),
 (19, 27),
 (19, 28),
 (19, 29),
 (32, 42),
 (32, 43),
 (45, 56),
 (45, 57),
 (45, 58),
 (45, 59),
 (45, 60),
 (46, 61),
 (46, 62),
 (46, 63),
 (46, 64),
 (48, 66),
 (48, 67),
 (55, 74),
 (55, 75),
 (55, 76),
 (56, 77),
 (56, 78),
 (66, 88),
 (66, 89),
 (66, 90),
 (66, 91),
 (66, 92)]

In [46]:
# Time span covered by each (first-transcriber note, second-transcriber note) pair.
cluster_segments = []
segments1 = segments_list[0]
segments2 = segments_list[1]
for ind1, ind2 in clusters:
    start = min(segments1[ind1][0], segments2[ind2][0])
    stop = max(segments1[ind1][1], segments2[ind2][1])
    cluster_segments.append([start, stop])
print(cluster_segments)

# Merge spans that touch or overlap, in one forward pass.
# The merged end is the max of both ends, so a span nested inside a longer
# one can no longer shorten the result, and a long span keeps absorbing every
# later span it reaches.
# Assumes the spans are ordered by start time, as produced by the loop above.
merged_segments = []
for start, stop in cluster_segments:
    if merged_segments and merged_segments[-1][1] >= start:
        merged_segments[-1][1] = max(merged_segments[-1][1], stop)
    else:
        merged_segments.append([start, stop])
cluster_segments = merged_segments
cluster_segments

[[2.016507937, 2.367346939], [2.047346939, 2.402539682], [2.693877551, 3.303424036], [2.693877551, 3.303424036], [2.693877551, 3.303424036], [2.693877551, 3.303424036], [3.213061224, 3.881337868], [3.308843537, 3.881337868], [3.308843537, 3.881337868], [3.308843537, 3.882448979], [4.37877551, 4.623673469], [4.37877551, 4.683900227], [6.533877551, 7.190204082], [6.533877551, 7.190204082], [6.533877551, 7.190204082], [13.319183673, 13.606530612], [13.319183673, 13.606530612], [19.121632653, 19.751111111], [19.121632653, 19.751111111], [19.121632653, 19.751111111], [19.121632653, 19.751111111], [19.121632653, 19.762358276], [19.751111111, 20.290612245], [19.751111111, 20.290612245], [19.751111111, 20.290612245], [19.751111111, 20.292063492], [20.790204082, 21.025306123], [20.790204082, 21.039455782], [22.968163265, 23.255510204], [22.968163265, 23.255510204], [22.968163265, 23.255510204], [23.238095238, 23.542857143], [23.25877551, 23.542857143], [29.082993197, 29.726258503], [29.08299319

[[2.016507937, 2.402539682],
 [2.693877551, 3.882448979],
 [4.37877551, 4.683900227],
 [6.533877551, 7.190204082],
 [13.319183673, 13.606530612],
 [19.121632653, 20.292063492],
 [20.790204082, 21.039455782],
 [22.968163265, 23.542857143],
 [29.082993197, 29.768707483]]