In [None]:
# ingest processing of JKUPDD corpus through FoNN pipeline

In [1]:
# 1. iterate through all files in subfolders and extract MIDI files

from pathlib import Path

import music21

# NOTE(review): machine-specific absolute path -- point this at the local copy
# of the JKUPDD ground-truth folder before running.
basepath = Path('/Users/dannydiamond/NUIG/Polifonia/jkupdd2/JKUPDD-Aug2013/groundTruth')
# Return a list of kern and midi files only, not directories.
# - is_file() enforces the "not directories" part (a folder named e.g. "x.mid"
#   would otherwise be handed to the parser);
# - the suffix is lower-cased so that ".MID" / ".KRN" files are not missed;
# - sorting makes the processing order independent of the filesystem.
types = ('.krn', '.mid', '.midi')
file_list = sorted(
    f for f in basepath.resolve().glob('**/*')
    if f.is_file() and f.suffix.lower() in types
)

# open them in music21: maps each source path to its parsed score
scores = {f: music21.converter.parse(f) for f in file_list}




In [2]:
# 2. flatten every score into its individual parts.
# Keys have the form "<source file path>_part<N>" (N counts from 1),
# values are the corresponding music21 part objects.

inputs = {
    f"{str(source_path)}_part{str(part_number)}": part
    for source_path, parsed_score in scores.items()
    for part_number, part in enumerate(parsed_score.parts, start=1)
}



In [3]:
import numpy as np
import pandas as pd


In [4]:
# 3. apply FoNN ingest processing, treating each part as an individual input.
# Builds one DataFrame of primary features per part, keyed exactly like `inputs`.
# (Derived features and the csv export are handled in the following cell.)

primary_feat_seq = {}

for k, v in inputs.items():
    data = []
    for note in v.recurse().notes:
        if note.isNote:
            pitch = note.pitch
        elif note.isChord:
            # if chords are encountered, take their root: (this is rare in our corpora)
            pitch = note.root()
        else:
            # Neither a note nor a chord, so there is no pitch to record.
            # Skipping is deliberate: falling through here would reuse the
            # previous row's pitch values (or fail on the very first element).
            continue

        data.append([
            float(pitch.ps),  # MIDI (chromatic) note number
            float(pitch.diatonicNoteNum),  # Diatonic note number
            float(pitch.pitchClass),  # (absolute) chromatic pitch class
            # NOTE(review): `offset` is taken as-is from the element; in the
            # recorded output it restarts within the piece, so it looks
            # measure-relative rather than a global onset -- confirm intent.
            round(float(note.offset), 2),  # onset
            round(float(note.duration.quarterLength), 2),  # duration
        ])

    output = pd.DataFrame(
        data,
        columns=["midi_note_num", "diatonic_note_num", "chromatic_pitch_class", "offset", "duration"],
    )

    # force types (to save memory):
    output = output.astype({
        "midi_note_num": 'int8',
        "diatonic_note_num": 'int8',
        "chromatic_pitch_class": 'int8',
        "offset": 'float16',
        "duration": 'float16',
    })

    # NOTE(review): drops rows whose MIDI number is exactly 1; the reason is not
    # visible here (possibly a placeholder value) -- confirm.
    # .copy() gives an independent frame so that later cells can add columns
    # without triggering pandas' SettingWithCopyWarning. The original row index
    # is kept, since it is written out with the csv.
    output = output[output.midi_note_num != 1].copy()

    primary_feat_seq[k] = output


In [5]:
# Add derived features to every part's feature table and write it to
# "<key>.csv" (i.e. next to the source file the part came from).
for k, v in primary_feat_seq.items():
    # Build a new frame instead of writing columns into `v` in place: the
    # stored frames are filtered selections, and in-place column assignment on
    # those is what pandas flags as chained assignment.
    v = v.assign(
        # diatonic note number folded into 0-6
        diatonic_pitch_class=(v["diatonic_note_num"] % 7).astype('int8'),
        # step from the previous row; the first row has no predecessor -> 0
        chromatic_interval=(v['midi_note_num'] - v['midi_note_num'].shift(1)).fillna(0).astype('int8'),
        diatonic_interval=(v['diatonic_note_num'] - v['diatonic_note_num'].shift(1)).fillna(0).astype('int8'),
    )
    # Replacing the value of an existing key is safe while iterating the dict.
    primary_feat_seq[k] = v
    print(k, v.head())
    v.to_csv(f"{k}.csv")

/Users/dannydiamond/NUIG/Polifonia/jkupdd2/JKUPDD-Aug2013/groundTruth/gibbonsSilverSwan1612/monophonic/kern/silverswan.krn_part1    midi_note_num  diatonic_note_num  chromatic_pitch_class  offset  duration  \
0             72                 36                      0     1.0       1.0   
1             72                 36                      0     2.0       1.0   
2             74                 37                      2     3.0       1.0   
3             76                 38                      4     0.0       3.0   
4             77                 39                      5     3.0       1.0   

   diatonic_pitch_class  chromatic_interval  diatonic_interval  
0                     1                   0                  0  
1                     1                   0                  0  
2                     2                   2                  1  
3                     3                   2                  1  
4                     4                   1                  1  


In [27]:
import os
import re

# Clean-up: delete the per-part files this notebook wrote into the corpus tree.
#
# Only the *file name* is tested, and only names ending in "_part<N>.csv" are
# removed (plus the extension-less "_part<N>" form, which also appears in the
# recorded output of this cell). A plain substring test for "_p" on the whole
# path would also delete corpus source files that merely live under, or are
# named with, something containing "_p".
generated_name = re.compile(r"_part\d+(\.csv)?$")

for path, subdirs, files in os.walk(basepath):
    for name in files:
        # skip hidden files such as .DS_Store
        if name.startswith('.'):
            continue
        if generated_name.search(name):
            file_path = os.path.join(path, name)
            print(file_path)
            os.remove(file_path)

/Users/dannydiamond/NUIG/Polifonia/jkupdd2/JKUPDD-Aug2013/groundTruth/gibbonsSilverSwan1612/monophonic/kern/silverswan.krn_part5.csv
/Users/dannydiamond/NUIG/Polifonia/jkupdd2/JKUPDD-Aug2013/groundTruth/gibbonsSilverSwan1612/monophonic/kern/silverswan.krn_part4.csv
/Users/dannydiamond/NUIG/Polifonia/jkupdd2/JKUPDD-Aug2013/groundTruth/gibbonsSilverSwan1612/monophonic/kern/silverswan.krn_part1.csv
/Users/dannydiamond/NUIG/Polifonia/jkupdd2/JKUPDD-Aug2013/groundTruth/gibbonsSilverSwan1612/monophonic/kern/silverswan.krn_part3.csv
/Users/dannydiamond/NUIG/Polifonia/jkupdd2/JKUPDD-Aug2013/groundTruth/gibbonsSilverSwan1612/monophonic/kern/silverswan.krn_part2.csv
/Users/dannydiamond/NUIG/Polifonia/jkupdd2/JKUPDD-Aug2013/groundTruth/gibbonsSilverSwan1612/monophonic/repeatedPatterns/tomCollins/G/occurrences/midi/occ2.midi_part1.csv
/Users/dannydiamond/NUIG/Polifonia/jkupdd2/JKUPDD-Aug2013/groundTruth/gibbonsSilverSwan1612/monophonic/repeatedPatterns/tomCollins/G/occurrences/midi/occ1.midi_part1