In [1]:
run -i 'ismir2020_make_datasets.py'

In [2]:
from MTCFeatures import MTCFeatureLoader
import music21 as m21
import pickle
from collections import Counter

# Construct the Melody sequences

In MTCFeatures 1.1, if a melody ends with a melisma, the melismastate of the last note is 'in' instead of 'end'.
The following function repairs this:

In [3]:
def fixFinalMelismastate(seq_iter):
    """Force the melismastate of the last note of every sequence to 'end'.

    Works around the MTCFeatures 1.1 issue where a melody that ends with a
    melisma has 'in' instead of 'end' as the final melismastate.

    Args:
        seq_iter: iterable of sequence dicts; each must provide
            seq['features']['melismastate'] as a list.

    Yields:
        The same sequence objects, modified in place. A sequence whose
        melismastate list is empty is passed through unchanged instead of
        raising an IndexError.
    """
    for seq in seq_iter:
        states = seq['features']['melismastate']
        # Nothing to repair for a melody without notes.
        if len(states) > 0:
            states[-1] = 'end'
        yield seq

In MTCFeatures 1.1, 234 songs have type 'vocal' but no lyrics, so we need an additional filter for vocal melodies:

In [4]:
def vocalFilter(seq_iter):
    """Yield only the sequences that carry a 'melismastate' feature.

    Sequences whose 'features' dict has no 'melismastate' key are dropped;
    all others are passed through untouched and in their original order.
    """
    for candidate in seq_iter:
        if 'melismastate' not in candidate['features']:
            continue
        yield candidate

Functions that add grouper predictions and IDyOM information content to the sequences as features.

In [5]:
def addGrouper(seq_iter, grouperfilename):
    """Attach the 'grouper' feature from a second sequence file, matched on id.

    The sequences in `grouperfilename` are loaded with MTCFeatureLoader and
    indexed by their 'id'. For every input sequence with a matching id, the
    'grouper' feature list is stored under seq['features']['grouper'], and a
    message is printed when its length differs from the 'scaledegree' feature.
    Every input sequence is yielded, whether or not a match was found.
    """
    grouper_by_id = {}
    for gseq in MTCFeatureLoader(grouperfilename).sequences():
        grouper_by_id[gseq['id']] = gseq

    for seq in seq_iter:
        songid = seq['id']
        if songid in grouper_by_id:
            labels = grouper_by_id[songid]['features']['grouper']
            seq['features']['grouper'] = labels
            # Sanity check: one grouper value is expected per note.
            if len(labels) != len(seq['features']['scaledegree']):
                print ("Different length in grouper: ", songid)
        yield seq
        
def addInformationContent(seq_iter, icfilename):
    """Attach the 'informationcontent' feature from a second sequence file.

    The sequences in `icfilename` are loaded with MTCFeatureLoader and indexed
    by their 'id'. For every input sequence with a matching id, the
    'informationcontent' feature list is stored under
    seq['features']['informationcontent'].

    Args:
        seq_iter: iterable of sequence dicts with 'id' and 'features' keys.
        icfilename: file passed to MTCFeatureLoader holding the
            'informationcontent' feature per sequence.

    Yields:
        Every input sequence (modified in place when a match exists). A
        warning is printed when the information-content list and the
        'scaledegree' feature differ in length.
    """
    ic_seqs_dict = {
        ic_seq['id']: ic_seq
        for ic_seq in MTCFeatureLoader(icfilename).sequences()
    }
    for seq in seq_iter:
        ic_seq = ic_seqs_dict.get(seq['id'])
        if ic_seq is not None:
            ic_values = ic_seq['features']['informationcontent']
            seq['features']['informationcontent'] = ic_values
            if len(ic_values) != len(seq['features']['scaledegree']):
                # Previously reported as a "grouper" mismatch (copy-paste slip).
                print("Different length in informationcontent: ", seq['id'])
        yield seq

In [6]:
# Toggle: True loads the previously written sequence file; False rebuilds the
# sequences from the corpus and rewrites that file.
readFSFromDisk = True

if not readFSFromDisk:
    fsinst_dl = MTCFeatureLoader('MTC-FS-INST-2.0')
    # Corpus-level filters (presumably: drop free-meter melodies, keep those
    # dated after 1850 -- confirm against the MTCFeatures filter docs), then
    # keep only sequences with lyrics features and repair the final melismastate.
    vocal_seqs = fixFinalMelismastate(vocalFilter(fsinst_dl.applyFilters([
        {'mfilter':'freemeter', 'invert':True},
        {'mfilter':("afteryear",1850)},
    ])))
    # Feature extraction and grouper predictions; materialise the pipeline once.
    vocal_seqs = list(addGrouper(
        extractFeatures(vocal_seqs, vocalfeatures=True),
        'ismir2020_melisma_IDyOM_sel/mtcfsinst_vocal_meter_after1850_1pertf_grouper.jsonl.gz',
    ))
    MTCFeatureLoader.writeJSON('ismir2020_seqs_mtc.jsonl.gz', vocal_seqs)
else:
    vocal_seqs = list(MTCFeatureLoader('ismir2020_seqs_mtc.jsonl.gz').sequences())

100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 2100 2200 2300 2400 2500 2600 2700 2800 2900 3000 3100 3200 3300 3400 3500 3600 3700 3800 3900 4000 4100 4200 4300 4400 4500 4600 4700 4800 4900 5000 5100 5200 5300 5400 5500 5600 5700 5800 5900 6000 6100 6200 6300 6400 6500 6600 6700 6800 6900 7000 7100 7200 7300 7400 

In [7]:
# Attach the information content that was trained on the target set.
# Two variants exist (full set / 1pertf selection); the 1pertf one is used.
# Full-set alternative:
#   'ismir2020_melisma_IDyOM_sel/mtcfsinst_vocal_meter_after1850_informationcontent.jsonl.gz'
vocal_seqs = list(
    addInformationContent(
        vocal_seqs,
        'ismir2020_melisma_IDyOM_sel/mtcfsinst_vocal_meter_after1850_1pertf_informationcontent.jsonl.gz',
    )
)

In [8]:
# Toggle: True loads the previously written sequence file; False rebuilds the
# sequences from the 'ESSEN' corpus and rewrites that file.
readESSENFromDisk = True

if not readESSENFromDisk:
    essen_dl = MTCFeatureLoader('ESSEN')
    # Explicit `== False` is kept on purpose: only sequences whose flag
    # compares equal to False pass (a None flag would not).
    essen_seqs = [seq for seq in essen_dl.sequences() if seq['freemeter']==False]
    # Feature extraction and grouper predictions; materialise the pipeline once.
    essen_seqs = list(addGrouper(
        extractFeatures(essen_seqs, vocalfeatures=False),
        'ismir2020_melisma_IDyOM_sel/essen_erk_meter_grouper.jsonl.gz',
    ))
    MTCFeatureLoader.writeJSON('ismir2020_seqs_essen.jsonl.gz', essen_seqs)
else:
    essen_seqs = list(MTCFeatureLoader('ismir2020_seqs_essen.jsonl.gz').sequences())

100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 2100 2200 2300 2400 2500 2600 2700 2800 2900 3000 3100 3200 3300 3400 3500 3600 3700 3800 3900 4000 4100 4200 4300 4400 4500 4600 4700 4800 4900 5000 5100 5200 5300 5400 5500 5600 5700 5800 5900 6000 6100 6200 6300 6400 6500 6600 6700 6800 6900 7000 7100 7200 7300 7400 

In [9]:
# Attach the information content that was trained on the target set.
# Two variants exist (full set / Erk); the Erk one is used.
# Full-set alternative:
#   'ismir2020_melisma_IDyOM_sel/essen_meter_informationcontent.jsonl.gz'
essen_seqs = list(
    addInformationContent(
        essen_seqs,
        'ismir2020_melisma_IDyOM_sel/essen_erk_meter_informationcontent.jsonl.gz',
    )
)

In [10]:
# Toggle: True loads the previously written sequence file; False rebuilds the
# chorale sequences and rewrites that file.
readCHORFromDisk = True

if not readCHORFromDisk:
    chor_dl = MTCFeatureLoader('chorale_sequences.jsonl.gz')
    # Explicit `== False` is kept on purpose: only sequences whose flag
    # compares equal to False pass (a None flag would not).
    chor_seqs = [seq for seq in chor_dl.sequences() if seq['freemeter']==False]
    chor_seqs = addGrouper(
        extractFeatures(chor_seqs, vocalfeatures=False),
        'ismir2020_melisma_IDyOM_sel/chor_meter_grouper.jsonl.gz',
    )
    # Here the information content is added before the file is written, so the
    # stored file already contains it.
    chor_seqs = list(addInformationContent(
        chor_seqs,
        'ismir2020_melisma_IDyOM_sel/chor_meter_informationcontent.jsonl.gz',
    ))
    MTCFeatureLoader.writeJSON('ismir2020_seqs_chor.jsonl.gz', chor_seqs)
else:
    chor_seqs = list(MTCFeatureLoader('ismir2020_seqs_chor.jsonl.gz').sequences())

100 200 300 

# Selection of subsets

We select the songs here to avoid storing 5-grams for the full data sets, which would result in very large files.

In [11]:
#MTC
# Read the ids of the selected songs, one id per line.
with open('ismir2020_songids_mtc.txt', 'r') as f:
    songids_mtc = [line.rstrip() for line in f]

# Membership is tested once per sequence; a set makes each test O(1) instead
# of scanning the whole id list. songids_mtc itself stays a list.
_songids_mtc_lookup = set(songids_mtc)
vocal_seqs = [seq for seq in vocal_seqs if seq['id'] in _songids_mtc_lookup]

# Store the selection so later steps can work from the reduced set.
MTCFeatureLoader.writeJSON('ismir2020_seqs_mtc_sel.jsonl.gz', vocal_seqs)

In [12]:
#ESSEN
# Read the ids of the selected songs, one id per line.
with open('ismir2020_songids_essen.txt', 'r') as f:
    songids_essen = [line.rstrip() for line in f]

# Membership is tested once per sequence; a set makes each test O(1) instead
# of scanning the whole id list. songids_essen itself stays a list.
_songids_essen_lookup = set(songids_essen)
essen_seqs = [seq for seq in essen_seqs if seq['id'] in _songids_essen_lookup]

# Store the selection so later steps can work from the reduced set.
MTCFeatureLoader.writeJSON('ismir2020_seqs_essen_sel.jsonl.gz', essen_seqs)

In [13]:
#CHOR
# Read the ids of the selected songs, one id per line.
with open('ismir2020_songids_chor.txt', 'r') as f:
    songids_chor = [line.rstrip() for line in f]

# Membership is tested once per sequence; a set makes each test O(1) instead
# of scanning the whole id list. songids_chor itself stays a list.
_songids_chor_lookup = set(songids_chor)
chor_seqs = [seq for seq in chor_seqs if seq['id'] in _songids_chor_lookup]

# Store the selection so later steps can work from the reduced set.
MTCFeatureLoader.writeJSON('ismir2020_seqs_chor_sel.jsonl.gz', chor_seqs)

# Extract the 5-grams

In [14]:
# Toggle: True loads the pickled pgrams and arff type info; False extracts
# them from vocal_seqs and rewrites both pickle files.
readPGramsFromDisk = True

if readPGramsFromDisk:
    pgrams = pd.read_pickle("ismir2020_pgrams_mtc_sel.pkl")
    # NOTE: pickle must only be used on trusted, locally produced files.
    # `with` guarantees the file handle is closed after loading.
    with open("ismir2020_arfftype_mtc_sel.pkl", "rb") as f:
        arfftype = pickle.load(f)
else:
    pgrams, arfftype = extractPgramsFromCorpus(vocal_seqs, pgram_type='note')
    pgrams.to_pickle("ismir2020_pgrams_mtc_sel.pkl")
    # `with` flushes and closes the file so the pickle is complete on disk.
    with open('ismir2020_arfftype_mtc_sel.pkl', 'wb') as f:
        pickle.dump(arfftype, f)

0 100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 

In [15]:
# Toggle: True loads the pickled pgrams and arff type info; False extracts
# them from essen_seqs and rewrites both pickle files.
readESSENPGramsFromDisk = True

if readESSENPGramsFromDisk:
    essen_pgrams = pd.read_pickle("ismir2020_pgrams_essen_sel.pkl")
    # NOTE: pickle must only be used on trusted, locally produced files.
    # `with` guarantees the file handle is closed after loading.
    with open("ismir2020_arfftype_essen_sel.pkl", "rb") as f:
        essen_arfftype = pickle.load(f)
else:
    essen_pgrams, essen_arfftype = extractPgramsFromCorpus(essen_seqs, pgram_type='note')
    essen_pgrams.to_pickle("ismir2020_pgrams_essen_sel.pkl")
    # `with` flushes and closes the file so the pickle is complete on disk.
    with open("ismir2020_arfftype_essen_sel.pkl", 'wb') as f:
        pickle.dump(essen_arfftype, f)

0 100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 

In [16]:
# Toggle: True loads the pickled pgrams and arff type info; False extracts
# them from chor_seqs and rewrites both pickle files.
readCHORPGramsFromDisk = True

if readCHORPGramsFromDisk:
    chor_pgrams = pd.read_pickle("ismir2020_pgrams_chor_sel.pkl")
    # Read the same "_sel" file that the else-branch writes; the previous read
    # path ("ismir2020_arfftype_chor.pkl") did not match the written file.
    # NOTE: pickle must only be used on trusted, locally produced files.
    with open("ismir2020_arfftype_chor_sel.pkl", "rb") as f:
        chor_arfftype = pickle.load(f)
else:
    chor_pgrams, chor_arfftype = extractPgramsFromCorpus(chor_seqs, pgram_type='note')
    chor_pgrams.to_pickle("ismir2020_pgrams_chor_sel.pkl")
    # `with` flushes and closes the file so the pickle is complete on disk.
    with open("ismir2020_arfftype_chor_sel.pkl", 'wb') as f:
        pickle.dump(chor_arfftype, f)

0 100 200 300 