In [1]:
# imports

import numpy as np
import pandas as pd
import pickle
from itertools import islice
from scipy import sparse


In [2]:
# Read the pickled pattern frequency table from disk.
freq_dir = '/Users/dannydiamond/NUIG/Polifonia/thesession/revised_feat_seq_corpus_no_pickups/acc_pattern_corpus'
freq_filename = '456gram_freq_table.pkl'
freq_in_path = f"{freq_dir}/{freq_filename}"
freq_table = pd.read_pickle(freq_in_path)
# Report how many entries were loaded.
print(len(freq_table))


137374


In [3]:
# filter frequency datatable, retaining patterns which occur twice or more
# NOTE: this cell mutates 'freq_table' in place and is not idempotent -- on a second run the
# existing 'freq' column is itself included in the row sum below. Re-load the table before re-running.
# Move the index into a regular column named 'index' and set it aside while the values are converted.
freq_table.reset_index(inplace=True)
indices = freq_table.pop('index')
# Store the remaining columns as sparse int16 with 0 as the fill value.
# NOTE(review): the fillna() call comes after the integer cast; presumably the table holds no NaN
# at this point -- confirm, or move the fillna before the cast.
freq_table = freq_table.astype('Sparse[int16, 0]')
freq_table.fillna(value=0, inplace=True)
# 'freq' is the row-wise total over all columns (each column appears to be one tune -- see output).
freq_table['freq'] = freq_table.sum(axis=1)
# Restore the original index values.
freq_table.set_index(indices, inplace=True, drop=True)
# Keep rows whose total is at least 2.
freq_table = freq_table[freq_table['freq'] >= 2]
print(freq_table.head())
# check size
print(len(freq_table))



                    BoysOfPatstaiThe4772  MouldyPintThe7496  \
index                                                         
[1, 1, 1, 1]                           0                  4   
[1, 1, 1, 1, 1]                        0                  0   
[1, 1, 1, 1, 1, 1]                     0                  0   
[1, 1, 1, 1, 1, 2]                     0                  0   
[1, 1, 1, 1, 1, 3]                     0                  0   

                    LongfordCollectorThe24745  BraesOfTulliemetThe23904  \
index                                                                     
[1, 1, 1, 1]                                0                         0   
[1, 1, 1, 1, 1]                             0                         0   
[1, 1, 1, 1, 1, 1]                          0                         0   
[1, 1, 1, 1, 1, 2]                          0                         0   
[1, 1, 1, 1, 1, 3]                          0                         0   

                    CamowenThe11

In [4]:
# Pull the patterns out of the filtered table's index, converting each index value
# to a tuple so that it is hashable.
indices_reformatted = list(map(tuple, freq_table.index))
print(len(indices_reformatted))
test_subset = indices_reformatted[:3]
print(test_subset)
print(type(test_subset[0][0]))
# Set of retained patterns, for fast membership tests.
filtered_patterns = set(indices_reformatted)


127991
[(1, 1, 1, 1), (1, 1, 1, 1, 1), (1, 1, 1, 1, 1, 1)]
<class 'numpy.int16'>


In [5]:
# Load the pickled locations dict from disk.
locations_dir = '/Users/dannydiamond/NUIG/Polifonia/thesession/revised_feat_seq_corpus_no_pickups/locations'
locations_filename = '456gram_locations.pkl'
locations_in_path = f"{locations_dir}/{locations_filename}"
with open(locations_in_path, 'rb') as locations_raw:
    locations = pickle.load(locations_raw)
# Report the number of top-level entries.
print(len(locations))

40152


In [6]:
from string import digits
# Tune titles: each key of 'locations' with its trailing digits stripped.
titles = [name.rstrip(digits) for name in locations]
print(titles[:10])
# Tune id numbers: every digit character found in the key, joined into one string.
# NOTE(review): digits inside the title itself also end up in the id, and two keys that
# yield the same id string collapse to a single dict entry below -- verify ids are unique.
variant_id_nums = [''.join(filter(str.isdigit, name)) for name in locations]
print(variant_id_nums[:10])

# Lookup: id string -> title.
id_dict = dict(zip(variant_id_nums, titles))


['BoysOfPatstaiThe', 'MouldyPintThe', 'LongfordCollectorThe', 'BraesOfTulliemetThe', 'CamowenThe', 'QuimperThe', 'WellAllLieTogether', 'LInconnuDeLimoise', 'MerryReapersThe', 'GanAinm']
['4772', '7496', '24745', '23904', '11581', '20663', '19130', '17950', '35815', '7377']


In [7]:
# Lookup: id string -> full key as used in 'locations'.
id_dict_filenames = dict(zip(variant_id_nums, locations))

# Show the first ten entries as a sanity check.
for id_num, full_name in islice(id_dict_filenames.items(), 10):
    print(id_num, full_name)

4772 BoysOfPatstaiThe4772
7496 MouldyPintThe7496
24745 LongfordCollectorThe24745
23904 BraesOfTulliemetThe23904
11581 CamowenThe11581
20663 QuimperThe20663
19130 WellAllLieTogether19130
17950 LInconnuDeLimoise17950
35815 MerryReapersThe35815
7377 GanAinm7377


In [8]:
# filter locations dict: for every tune keep only the patterns that are present in
# 'filtered_patterns' (the patterns retained by the frequency filter).
filtered_locations = {
    tune: {
        pattern: positions
        for pattern, positions in pattern_locations.items()
        if pattern in filtered_patterns
    }
    for tune, pattern_locations in locations.items()
}

# Re-key the filtered locations by tune id number.
# 'variant_id_nums' holds exactly one id per entry of 'locations', in the same order, so zipping
# it with the filtered values keeps every id attached to its own tune. Zipping against
# list(id_dict) instead is unsafe: id_dict keeps a single key per id string, so as soon as two
# tunes share an id the key list is shorter than the value list and every later value is
# assigned to the wrong id.
res = dict(zip(variant_id_nums, filtered_locations.values()))

# check output
for k, v in list(res.items())[:10]:
    print(f"{k}:\n{v}")


4772:
{(3, 4, 3, 2): [0, 8], (4, 3, 2, 3): [1, 9], (3, 2, 3, 4): [2, 10], (2, 3, 4, 1): [3, 11], (3, 4, 1, 2): [4, 12], (4, 1, 2, 3): [5], (1, 2, 3, 4): [6], (2, 3, 4, 3): [7], (4, 1, 2, 1): [13], (1, 2, 1, 6): [14, 22], (2, 1, 6, 1): [15, 23], (1, 6, 1, 6): [16, 18, 24, 26], (6, 1, 6, 1): [17, 19, 25, 27], (1, 6, 1, 2): [20, 28], (6, 1, 2, 1): [21], (3, 4, 3, 2, 3): [29, 37], (4, 3, 2, 3, 4): [30, 38], (3, 2, 3, 4, 1): [31, 39], (2, 3, 4, 1, 2): [32, 40], (3, 4, 1, 2, 3): [33], (4, 1, 2, 3, 4): [34], (1, 2, 3, 4, 3): [35], (2, 3, 4, 3, 2): [36], (3, 4, 1, 2, 1): [41], (4, 1, 2, 1, 6): [42], (1, 2, 1, 6, 1): [43, 51], (2, 1, 6, 1, 6): [44, 52], (1, 6, 1, 6, 1): [45, 47, 53, 55], (6, 1, 6, 1, 6): [46, 54], (6, 1, 6, 1, 2): [48, 56], (1, 6, 1, 2, 1): [49], (6, 1, 2, 1, 6): [50], (3, 4, 3, 2, 3, 4): [57, 65], (4, 3, 2, 3, 4, 1): [58, 66], (3, 2, 3, 4, 1, 2): [59, 67], (2, 3, 4, 1, 2, 3): [60], (3, 4, 1, 2, 3, 4): [61], (4, 1, 2, 3, 4, 3): [62], (1, 2, 3, 4, 3, 2): [63], (2, 3, 4, 3, 2, 3)

In [9]:
# calculate tune lengths (beats and 1/8 notes):

import os

def calculate_tune_lengths(target_dir):
    """Count the lines of every csv file in 'target_dir', keyed by tune id number.

    Args:
        target_dir: path to a directory of per-tune csv files. File names are expected to
            carry the tune id number as digits (e.g. 'BoysOfPatstaiThe4772.csv').

    Returns:
        dict mapping the id string of each csv file (all digit characters of its name,
        the same rule used to build 'variant_id_nums') to the file's line count plus one.
        Non-csv directory entries are ignored.
    """
    tune_durations = {}
    for file_name in os.listdir(target_dir):
        if not file_name.endswith('.csv'):
            continue
        tune_title = file_name[:-4]
        # Take the key from the file itself. os.listdir() returns entries in arbitrary order and
        # a separately built id list can differ in length, so pairing the counts with such a
        # list by position can attach a duration to the wrong tune.
        tune_id = ''.join(ch for ch in tune_title if ch.isdigit())
        with open(f"{target_dir}/{file_name}") as content:
            # NOTE(review): the '+ 1' is preserved from the original implementation; for a file
            # with a header row this gives (data rows + 2) -- confirm this is the intended count.
            tune_durations[tune_id] = sum(1 for _ in content) + 1

    return tune_durations

# counting beats:
# Directory of per-tune csv files whose line counts are used as beat-level durations.
beat_path = '/Users/dannydiamond/NUIG/Polifonia/thesession/revised_feat_seq_corpus_no_pickups/feat_seq_accents'
# Result of calculate_tune_lengths(): dict of tune id string -> (line count of the csv file + 1).
duration_beats = calculate_tune_lengths(beat_path)
# The eighth-note variant below is currently disabled.
# # counting (1/8) notes
# notes_path = '/Users/dannydiamond/NUIG/Polifonia/thesession/revised_feat_seq_corpus_no_pickups/duration_weighted'
# duration_eighth_notes = calculate_tune_lengths(notes_path)

In [10]:
# Combine dicts generated in above cell: title (key): dict (val -- 'duration_beats': num, 'duration_eighth_notes': num)

# durations = {}
# for tune in duration_beats:
#     durations[tune] = {'duration_beats': duration_beats[tune], 'duration_eighth_notes': duration_eighth_notes[tune]}
#
# for tune in duration_beats:
#     durations[tune] = duration_beats[tune]

In [10]:
import csv
# Extract the sequence of the target feature (one csv column) from the csv file of every tune:
feature = 'diatonic_scale_degree'
in_dir = '/Users/dannydiamond/NUIG/Polifonia/thesession/revised_feat_seq_corpus_no_pickups/feat_seq_accents'
# List the directory once and keep csv files only, so that 'filenames' and 'inpaths' always have
# the same length and order. Previously only 'inpaths' was filtered, so any stray non-csv entry
# (e.g. '.DS_Store') ended up in 'filenames' and broke the correspondence between the two lists.
csv_filenames = [filename for filename in os.listdir(in_dir) if filename.endswith('.csv')]
filenames = [filename[:-4] for filename in csv_filenames]
inpaths = [f"{in_dir}/{filename}" for filename in csv_filenames]
# identify csv col names from the header row of the first file
with open(inpaths[0]) as testfile:
    csv_reader = csv.reader(testfile, delimiter=',')
    cols = next(csv_reader)
colsmap = {col_name: i for i, col_name in enumerate(cols)}
assert feature in colsmap
# identify index of target column
target_col_idx = colsmap[feature]
# extract target column from all tunes to list (one array of strings per tune).
# np.atleast_1d: genfromtxt returns a 0-d array when a file holds a single data row, and a 0-d
# array cannot be iterated by the conversion step that follows.
raw_data = [
    np.atleast_1d(
        np.genfromtxt(
            path,
            dtype=str,
            delimiter=',',
            usecols=target_col_idx,
            skip_header=1,
        )
    )
    for path in inpaths
]


In [11]:


# Convert the raw string values of every tune to ints; empty strings become 0.
numeric_sequences = [
    [int(float(value)) if value != '' else 0 for value in tune_values]
    for tune_values in raw_data
]

# Key every sequence by the id number parsed from its own source file name (all digit characters
# of the file name, the same rule used for 'variant_id_nums'). 'inpaths' and 'raw_data' are
# aligned by construction, so this no longer depends on the directory listing happening to be
# in the same order as 'locations'.
feat_seq_data = {
    ''.join(ch for ch in os.path.basename(path) if ch.isdigit()): sequence
    for path, sequence in zip(inpaths, numeric_sequences)
}

# confirm ordering matches 'locations' variable above
print(filenames == list(locations))

# check output
for k, v in list(feat_seq_data.items())[:2]:
    print(k, v)

True
4772 [3, 5, 5, 3, 4, 6, 6, 4, 3, 5, 5, 3, 2, 3, 2, 1, 3, 5, 5, 3, 4, 6, 6, 6, 6, 1, 2, 3, 2, 1, 2, 1, 3, 5, 5, 3, 4, 6, 6, 4, 3, 5, 5, 3, 2, 3, 2, 1, 3, 5, 5, 3, 4, 6, 6, 6, 6, 1, 2, 3, 2, 1, 2, 1, 1, 2, 3, 2, 1, 6, 2, 2, 2, 2, 1, 2, 3, 2, 1, 6, 5, 1, 2, 3, 2, 1, 6, 2, 2, 2, 2, 1, 6, 5, 3, 2, 1, 1, 2, 3, 2, 1, 6, 2, 2, 2, 2, 1, 2, 3, 2, 1, 6, 5, 1, 2, 3, 2, 1, 6, 2, 2, 2, 2, 1, 6, 5, 3, 2, 1]
7496 [3, 1, 2, 7, 1, 1, 1, 7, 5, 6, 7, 5, 3, 4, 3, 4, 5, 3, 4, 2, 3, 1, 1, 2, 3, 1, 2, 7, 1, 5, 4, 3, 1, 2, 7, 1, 1, 1, 7, 5, 6, 7, 5, 3, 7, 5, 7, 1, 7, 5, 7, 5, 4, 3, 5, 4, 3, 1, 7, 1, 5, 7, 3, 1, 2, 7, 1, 1, 1, 7, 5, 6, 7, 5, 3, 4, 3, 4, 5, 3, 4, 2, 3, 1, 1, 2, 3, 1, 2, 7, 1, 5, 4, 3, 1, 2, 7, 1, 1, 1, 7, 5, 6, 7, 5, 3, 7, 5, 7, 1, 7, 5, 7, 5, 4, 3, 5, 4, 3, 1, 7, 1, 5, 7, 1, 1, 1, 7, 5, 1, 1, 7, 4, 5, 5, 7, 1, 1, 1, 7, 5, 3, 4, 2, 3, 7, 1, 5, 7, 1, 1, 1, 7, 5, 1, 1, 7, 4, 5, 5, 7, 3, 1, 2, 7, 1, 5, 5, 4, 3, 1, 2, 7, 1, 5, 7, 1, 1, 1, 7, 5, 1, 1, 7, 4, 5, 5, 7, 1, 1, 1, 7, 5, 3, 4, 2, 3, 7,

In [12]:
# add tune family annotation -- step 1: read and format tune family names:
tune_fams_dir = '/Users/dannydiamond/NUIG/Polifonia/thesession/test_corpus/tune_fams_ground_truth'

# Explicit mapping: formatted tune family name -> ground truth csv file name.
# The pairs are written out rather than produced by zipping a hand-ordered list of names with
# os.listdir(), because listdir returns entries in arbitrary order and a different order would
# silently attach family names to the wrong files.
fam_names = {
    "Jenny's Welcome to Charlie": 'jennys_welcome_to_charlie.csv',
    "Drowsy Maggie": 'drowsy_maggie.csv',
    "Gilderoy": 'gilderoy.csv',
    "Greig's Pipes": 'greigs_pipes.csv',
    "Blackbird": 'blackbird.csv',
    "Hob or Nob": 'hob_or_nob.csv',
    "O'Sullivan's March": 'osullivans_march.csv',
    "Road to Lisdoonvarna": 'road_to_lisdoonvarna.csv',
    "Lord McDonald's": 'lord_mcdonalds.csv',
    "Johnny Cope": 'johnny_cope.csv',
}

# Kept for backward compatibility: the two parallel lists the mapping used to be built from.
tune_family_names_formatted = list(fam_names)
tune_family_names = list(fam_names.values())

print(fam_names)


{"Jenny's Welcome to Charlie": 'jennys_welcome_to_charlie.csv', 'Drowsy Maggie': 'drowsy_maggie.csv', 'Gilderoy': 'gilderoy.csv', "Greig's Pipes": 'greigs_pipes.csv', 'Blackbird': 'blackbird.csv', 'Hob or Nob': 'hob_or_nob.csv', "O'Sullivan's March": 'osullivans_march.csv', 'Road to Lisdoonvarna': 'road_to_lisdoonvarna.csv', "Lord McDonald's": 'lord_mcdonalds.csv', 'Johnny Cope': 'johnny_cope.csv'}


In [13]:
# add tune family annotation -- step 2: compile all tune variants in each family

# Directory holding The Session ground truth csv files (note the trailing slash):
grd_truth_basepath = '/Users/dannydiamond/NUIG/Polifonia/thesession/test_corpus/tune_fams_ground_truth/'
grd_truth_paths = {}
# 'fam_names' (formatted family name -> csv file name) is defined in the cell above.
for family_label, csv_name in fam_names.items():
    candidate_path = f"{grd_truth_basepath}{csv_name}"
    # Stop immediately if an expected ground truth file is missing.
    assert os.path.isfile(candidate_path)
    grd_truth_paths[family_label] = candidate_path

print(grd_truth_paths)


def format_input_data(in_path):
    """Return the list of tune titles stored in the tune family csv file at 'in_path'.

    Args:
        in_path: path to a csv file containing a 'title' column.

    Returns:
        list holding the values of the 'title' column. Paths containing 'precision' are
        skipped rather than read, and yield an empty list.
    """
    if 'precision' in in_path:
        # Always hand back a list: previously this case fell off the end of the function and
        # returned None, which makes len() / iteration in the calling code raise TypeError.
        return []
    data = pd.read_csv(in_path)
    return data['title'].tolist()

# Run format_input_data() for every family; print the titles found, the count per family,
# and finally the overall total:
grd_truth = {}
tot = 0
for fam, fam_path in grd_truth_paths.items():
    fam_titles = format_input_data(fam_path)
    grd_truth[fam] = fam_titles
    check = len(fam_titles)
    print(f"Ground truth data for {fam}:")
    for title in fam_titles:
        print(title)
    print(check)
    tot += check
    print('\n')
print(tot)


{"Jenny's Welcome to Charlie": '/Users/dannydiamond/NUIG/Polifonia/thesession/test_corpus/tune_fams_ground_truth/jennys_welcome_to_charlie.csv', 'Drowsy Maggie': '/Users/dannydiamond/NUIG/Polifonia/thesession/test_corpus/tune_fams_ground_truth/drowsy_maggie.csv', 'Gilderoy': '/Users/dannydiamond/NUIG/Polifonia/thesession/test_corpus/tune_fams_ground_truth/gilderoy.csv', "Greig's Pipes": '/Users/dannydiamond/NUIG/Polifonia/thesession/test_corpus/tune_fams_ground_truth/greigs_pipes.csv', 'Blackbird': '/Users/dannydiamond/NUIG/Polifonia/thesession/test_corpus/tune_fams_ground_truth/blackbird.csv', 'Hob or Nob': '/Users/dannydiamond/NUIG/Polifonia/thesession/test_corpus/tune_fams_ground_truth/hob_or_nob.csv', "O'Sullivan's March": '/Users/dannydiamond/NUIG/Polifonia/thesession/test_corpus/tune_fams_ground_truth/osullivans_march.csv', 'Road to Lisdoonvarna': '/Users/dannydiamond/NUIG/Polifonia/thesession/test_corpus/tune_fams_ground_truth/road_to_lisdoonvarna.csv', "Lord McDonald's": '/User

In [14]:
# add tune family annotation -- step 3: add annotation for all tunes in corpus

# Invert the ground truth once (tune name -> family name) so that each tune needs a single dict
# lookup instead of a scan through every family's title list. If a tune is listed in several
# families the last family wins, exactly as in the nested-loop version.
family_by_tune = {
    member: fam
    for fam, members in grd_truth.items()
    for member in members
}

# Key: id string of the tune (all digit characters of its name); value: family name, or ''
# for tunes that belong to no annotated family.
tune_fam_annotation_numeric = {
    ''.join(ch for ch in tune if ch.isdigit()): family_by_tune.get(tune, '')
    for tune in locations
}

print({i for i in tune_fam_annotation_numeric if tune_fam_annotation_numeric[i]=="Jenny's Welcome to Charlie"})
print(len(tune_fam_annotation_numeric))


{'31925', '29604', '29417', '12863', '25776', '12864', '26779', '12865', '28852', '9850', '41051', '24754', '5194', '34293', '1272', '27336', '40380', '1424', '1682', '13180', '370', '31218', '13179', '22961', '34110', '14710', '37509', '202', '22924', '33011', '12862', '29543', '24195', '24747', '14585', '14709', '33110', '11942', '14586', '41560', '37501', '1357', '40434'}
40149


In [15]:
# Setup namedtuple:
from typing import NamedTuple
class KG_Data(NamedTuple):
    """Immutable record bundling the data held for a single tune."""

    # tune id number, stored as a string of digits
    id_number: str
    # tune title without the id number
    tune_title: str
    # tune family name; '' when the tune has no family annotation
    tune_family: str
    # name of the musical feature the sequence data describes
    feature: str
    # level at which the feature sequence was extracted
    level: str
    # pattern lengths (n-gram sizes) covered by 'locations'
    n_vals: tuple
    # tune duration value
    duration_beats: int
    # pattern -> list of positions within the tune
    locations: dict
    # feature sequence of the tune
    feature_sequence_data: list

# Spot check for the first ten tune ids: print each per-tune value that goes into a record.
for tune_id_num in islice(id_dict, 10):
    print(tune_id_num)
    print(id_dict[tune_id_num])
    print(tune_fam_annotation_numeric[tune_id_num])
    print(feat_seq_data[tune_id_num])
    print(duration_beats[tune_id_num])
    print(res[tune_id_num])
    print('\n')







4772
BoysOfPatstaiThe

[3, 5, 5, 3, 4, 6, 6, 4, 3, 5, 5, 3, 2, 3, 2, 1, 3, 5, 5, 3, 4, 6, 6, 6, 6, 1, 2, 3, 2, 1, 2, 1, 3, 5, 5, 3, 4, 6, 6, 4, 3, 5, 5, 3, 2, 3, 2, 1, 3, 5, 5, 3, 4, 6, 6, 6, 6, 1, 2, 3, 2, 1, 2, 1, 1, 2, 3, 2, 1, 6, 2, 2, 2, 2, 1, 2, 3, 2, 1, 6, 5, 1, 2, 3, 2, 1, 6, 2, 2, 2, 2, 1, 6, 5, 3, 2, 1, 1, 2, 3, 2, 1, 6, 2, 2, 2, 2, 1, 2, 3, 2, 1, 6, 5, 1, 2, 3, 2, 1, 6, 2, 2, 2, 2, 1, 6, 5, 3, 2, 1]
132
{(3, 4, 3, 2): [0, 8], (4, 3, 2, 3): [1, 9], (3, 2, 3, 4): [2, 10], (2, 3, 4, 1): [3, 11], (3, 4, 1, 2): [4, 12], (4, 1, 2, 3): [5], (1, 2, 3, 4): [6], (2, 3, 4, 3): [7], (4, 1, 2, 1): [13], (1, 2, 1, 6): [14, 22], (2, 1, 6, 1): [15, 23], (1, 6, 1, 6): [16, 18, 24, 26], (6, 1, 6, 1): [17, 19, 25, 27], (1, 6, 1, 2): [20, 28], (6, 1, 2, 1): [21], (3, 4, 3, 2, 3): [29, 37], (4, 3, 2, 3, 4): [30, 38], (3, 2, 3, 4, 1): [31, 39], (2, 3, 4, 1, 2): [32, 40], (3, 4, 1, 2, 3): [33], (4, 1, 2, 3, 4): [34], (1, 2, 3, 4, 3): [35], (2, 3, 4, 3, 2): [36], (3, 4, 1, 2, 1): [41], (4, 1, 2, 1,

In [16]:
# Build one KG_Data record per tune id.

output = []
for tune_id_num in id_dict:
    output.append(
        KG_Data(
            id_number=tune_id_num,
            tune_title=id_dict[tune_id_num],
            tune_family=tune_fam_annotation_numeric[tune_id_num],
            feature='diatonic scale degree',
            level='accent',
            feature_sequence_data=feat_seq_data[tune_id_num],
            n_vals=(4, 5, 6),
            duration_beats=duration_beats[tune_id_num],
            locations=res[tune_id_num],
        )
    )

# Show the first record and the total number of records.
print(output[0])
print(len(output))

KG_Data(id_number='4772', tune_title='BoysOfPatstaiThe', tune_family='', feature='diatonic scale degree', level='accent', n_vals=(4, 5, 6), duration_beats=132, locations={(3, 4, 3, 2): [0, 8], (4, 3, 2, 3): [1, 9], (3, 2, 3, 4): [2, 10], (2, 3, 4, 1): [3, 11], (3, 4, 1, 2): [4, 12], (4, 1, 2, 3): [5], (1, 2, 3, 4): [6], (2, 3, 4, 3): [7], (4, 1, 2, 1): [13], (1, 2, 1, 6): [14, 22], (2, 1, 6, 1): [15, 23], (1, 6, 1, 6): [16, 18, 24, 26], (6, 1, 6, 1): [17, 19, 25, 27], (1, 6, 1, 2): [20, 28], (6, 1, 2, 1): [21], (3, 4, 3, 2, 3): [29, 37], (4, 3, 2, 3, 4): [30, 38], (3, 2, 3, 4, 1): [31, 39], (2, 3, 4, 1, 2): [32, 40], (3, 4, 1, 2, 3): [33], (4, 1, 2, 3, 4): [34], (1, 2, 3, 4, 3): [35], (2, 3, 4, 3, 2): [36], (3, 4, 1, 2, 1): [41], (4, 1, 2, 1, 6): [42], (1, 2, 1, 6, 1): [43, 51], (2, 1, 6, 1, 6): [44, 52], (1, 6, 1, 6, 1): [45, 47, 53, 55], (6, 1, 6, 1, 6): [46, 54], (6, 1, 6, 1, 2): [48, 56], (1, 6, 1, 2, 1): [49], (6, 1, 2, 1, 6): [50], (3, 4, 3, 2, 3, 4): [57, 65], (4, 3, 2, 3, 4, 1)

In [18]:
# Take the first 200 records as a sample and pickle them to disk.

sample_output = output[:200]
print(len(sample_output))
print(sample_output[0])

# Destination of the sample file.
sample_output_filename = 'thesession_kg_data.pkl'
sample_output_path = os.path.join('/Users/dannydiamond/Desktop/', sample_output_filename)
with open(sample_output_path, 'wb') as f_out:
    pickle.dump(sample_output, f_out)

200
KG_Data(id_number='4772', tune_title='BoysOfPatstaiThe', tune_family='', feature='diatonic scale degree', level='accent', n_vals=(4, 5, 6), duration_beats=132, locations={(3, 4, 3, 2): [0, 8], (4, 3, 2, 3): [1, 9], (3, 2, 3, 4): [2, 10], (2, 3, 4, 1): [3, 11], (3, 4, 1, 2): [4, 12], (4, 1, 2, 3): [5], (1, 2, 3, 4): [6], (2, 3, 4, 3): [7], (4, 1, 2, 1): [13], (1, 2, 1, 6): [14, 22], (2, 1, 6, 1): [15, 23], (1, 6, 1, 6): [16, 18, 24, 26], (6, 1, 6, 1): [17, 19, 25, 27], (1, 6, 1, 2): [20, 28], (6, 1, 2, 1): [21], (3, 4, 3, 2, 3): [29, 37], (4, 3, 2, 3, 4): [30, 38], (3, 2, 3, 4, 1): [31, 39], (2, 3, 4, 1, 2): [32, 40], (3, 4, 1, 2, 3): [33], (4, 1, 2, 3, 4): [34], (1, 2, 3, 4, 3): [35], (2, 3, 4, 3, 2): [36], (3, 4, 1, 2, 1): [41], (4, 1, 2, 1, 6): [42], (1, 2, 1, 6, 1): [43, 51], (2, 1, 6, 1, 6): [44, 52], (1, 6, 1, 6, 1): [45, 47, 53, 55], (6, 1, 6, 1, 6): [46, 54], (6, 1, 6, 1, 2): [48, 56], (1, 6, 1, 2, 1): [49], (6, 1, 2, 1, 6): [50], (3, 4, 3, 2, 3, 4): [57, 65], (4, 3, 2, 3, 4

In [34]:
# write output
# Pickles the full 'output' list into the locations directory.
# NOTE(review): the file name ('..._locations_filtered') and target directory suggest the filtered
# locations dict, but the object written here is 'output' -- confirm which one was intended.
output_filename = '456_gram_locations_filtered.pkl'
locations_output_path = locations_dir + '/' + output_filename
with open(locations_output_path, 'wb') as f_out:
    pickle.dump(output, f_out)
