## This notebook reads the csv metadata file for the annotated subset of The Session corpus (created by thesession_metadata_extraction.py) and adds tune family membership ground-truth annotation. This supplementary information is taken from a separate csv file holding manually-verified tune family membership data, which was obtained via a review of the musicological literature on Irish traditional music.

In [1]:
# imports

import numpy as np
import pandas as pd

In [2]:
# TODO: Change local file paths throughout this notebook to Patterns Knowledge Graph repo paths.

In [3]:
# Load the corpus metadata table and key it by the 'identifiers' column.
# NOTE(review): the recorded head() output shows a leftover 'Unnamed: 0' column
# and titles with mis-encoded accented characters; both look like artifacts of
# how the CSV was produced -- confirm upstream before relying on them.
corpus_metadata_path = "../metadata/thesession_subset_metadata.csv"
metadata = (
    pd.read_csv(corpus_metadata_path, index_col=0)
    # the positional index read from the first column is replaced here
    .set_index('identifiers', drop=True)
)
print(metadata.head())

             Unnamed: 0                title    M  \
identifiers                                         
1029                  0  An Seanduine DÃ³ite  6/8   
14252                 1  An Seanduine DÃ³ite  6/8   
25163                 2  An Seanduine DÃ³ite  6/8   
27315                 3  An Seanduine DÃ³ite  6/8   
27316                 4  An Seanduine DÃ³ite  6/8   

                                            N    L       K    R  \
identifiers                                                       
1029          tune id: 1029; setting id: 1029  NaN  Dmajor  jig   
14252        tune id: 1029; setting id: 14252  NaN  Dmajor  jig   
25163        tune id: 1029; setting id: 25163  NaN  Dmajor  jig   
27315        tune id: 1029; setting id: 27315  NaN  Gmajor  jig   
27316        tune id: 1029; setting id: 27316  NaN  Gmajor  jig   

                                                     abc_score  
identifiers                                                     
1029          X:1029 T:An Se

In [4]:
# Read the tune family annotation file, using its first column as the index.
tune_fams_path = "../metadata/thesession_tune_family_annotation.csv"
tune_fams_raw = pd.read_csv(tune_fams_path, index_col=0)
# Keep only the first row for each index value so that joining this table onto
# the metadata cannot multiply metadata rows. DataFrame.drop_duplicates compares
# column values rather than the index, hence the index-level mask.
is_repeat = tune_fams_raw.index.duplicated(keep="first")
# Rows are returned in ascending index order.
tune_fams = tune_fams_raw.loc[~is_repeat].sort_index()
# Guard the invariant the downstream join relies on.
assert tune_fams.index.is_unique, "duplicate identifiers remain in tune_fams"
print(tune_fams.head())

                            tune_family
identifiers                            
27                        Drowsy Maggie
75                           Hob or Nob
84                             Gilderoy
202          Jenny's Welcome to Charlie
249                Road to Lisdoonvarna


In [5]:
# Attach tune family membership to the metadata table with an index-on-index
# left join, so every metadata row is kept whether or not it has an annotation.
out_path = "../metadata/thesession_subset_metadata_with_tune_family_annotation.csv"
res = metadata.join(tune_fams, how="left")
# Preview the joined table and its row count, then write it to disk.
print(res.head())
print(res.shape[0])
res.to_csv(out_path)

             Unnamed: 0                title    M  \
identifiers                                         
1029                  0  An Seanduine DÃ³ite  6/8   
14252                 1  An Seanduine DÃ³ite  6/8   
25163                 2  An Seanduine DÃ³ite  6/8   
27315                 3  An Seanduine DÃ³ite  6/8   
27316                 4  An Seanduine DÃ³ite  6/8   

                                            N    L       K    R  \
identifiers                                                       
1029          tune id: 1029; setting id: 1029  NaN  Dmajor  jig   
14252        tune id: 1029; setting id: 14252  NaN  Dmajor  jig   
25163        tune id: 1029; setting id: 25163  NaN  Dmajor  jig   
27315        tune id: 1029; setting id: 27315  NaN  Gmajor  jig   
27316        tune id: 1029; setting id: 27316  NaN  Gmajor  jig   

                                                     abc_score tune_family  
identifiers                                                                 
1029