## Polifonia [Patterns Knowledge Graph](https://github.com/polifonia-project/patterns-knowledge-graph) (KG) ingest pipeline. Step 2: Data processing.

In [137]:
# imports

import numpy as np
import pandas as pd
import pickle

This is step 2 of a 2-step data preprocessing pipeline. Step 1 can be found in the ```./patterns_kg_data_extraction.ipynb``` notebook and must be applied to any input corpus before running this notebook.

After running ```patterns_kg_data_extraction.ipynb```, point the 'kg_data_in_dir' path in the cell below to the ```/[corpus name]/kg_pipeline_input_data``` dir (which contains the files outputted by the first preprocessing step).

NOTE: This notebook must be run in full for each individual pattern length. Pattern length is set via the 'n' variable in the cell below. E.g.: In our sample data below, ```../FoNN/mtc_ann_corpus/kg_pipeline_input_data``` contains patterns of 4, 5, and 6 elements in length outputted by ```patterns_kg_data_extraction.ipynb```. Accordingly, users must run this notebook in three passes, for n=4, n=5 and n=6.

Output of this notebook is written to ```../FoNN/mtc_ann_corpus/kg_pipeline_output_data```.
To create a KG from this output data, please see Polifonia [Patterns Knowledge Graph](https://github.com/polifonia-project/patterns-knowledge-graph) repo for further information.


In [138]:
# Configuration for this pass: the input directory and the pattern length to process.
kg_data_in_dir = '../mtc_ann_corpus/kg_pipeline_input_data'
n = 5  # set pattern length under investigation

# Read the pattern occurrences matrix for patterns of length n.
in_file = f'{n}gram_patterns.pkl'
in_path = f'{kg_data_in_dir}/{in_file}'
pattern_occurrences = pd.read_pickle(in_path)

# Report the number of patterns (rows) in the input matrix.
pattern_count = len(pattern_occurrences)
print(f"{pattern_count} patterns extracted of length {n}")


2580 patterns extracted of length 5


In [139]:
# Retain only the patterns whose total occurrence count (row sum) is 2 or more.
# The matrix is cast to sparse int16 and any missing values are replaced with 0
# before the per-pattern totals are stored in a 'freq' column.
occurrence_counts = pattern_occurrences.astype('Sparse[int16, 0]').fillna(value=0)
occurrence_counts['freq'] = occurrence_counts.sum(axis=1)
occurs_twice_or_more = occurrence_counts['freq'] >= 2
filtered_pattern_occurrences = occurrence_counts[occurs_twice_or_more]
# Report the number of patterns retained after filtration.
print(f"{len(filtered_pattern_occurrences)} patterns retained after filtering.")


1780 patterns retained after filtering.


In [140]:
# Convert every index entry of the filtered matrix into a tuple.
patterns_reformatted = list(map(tuple, filtered_pattern_occurrences.index))
# Preview the first three converted patterns.
sample_output = patterns_reformatted[:3]
print("Sample output:")
print(sample_output)
# Keep the retained patterns as a set for membership tests.
filtered_patterns = set(patterns_reformatted)


Sample output:
[(1, 1, 1, 1, 1), (1, 1, 1, 1, 2), (1, 1, 1, 1, 3)]


In [141]:
# Load the locations dict outputted by patterns_kg_data_extraction.ipynb.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
locations_filename = f'{n}gram_locations.pkl'
locations_in_path = f'{kg_data_in_dir}/{locations_filename}'
with open(locations_in_path, mode='rb') as locations_raw:
    locations = pickle.load(locations_raw)


In [142]:
# extract tune titles and id numbers from locations data:

# tune titles are the keys of the locations dict, in their stored order
titles = list(locations)
# tune id numbers: each title with every alphabetic character removed
# (digits, underscores and any other non-letter characters are kept)
tune_id_nums = [''.join(ch for ch in tune if not ch.isalpha()) for tune in titles]
# Ids are used as dict keys and are paired with per-tune data by position in
# later cells. Two titles reducing to the same id would silently drop a tune
# and misalign those pairings, so fail loudly instead.
if len(set(tune_id_nums)) != len(tune_id_nums):
    duplicate_ids = sorted({id_num for id_num in tune_id_nums if tune_id_nums.count(id_num) > 1})
    raise ValueError(f"Tune id numbers are not unique: {duplicate_ids}")
# create lookup dict of tune id numbers (keys): tune titles (vals)
id_dict = dict(zip(tune_id_nums, titles))
# create second dict of tune id numbers (keys): keys of the locations dict (vals).
# The values are the tune names used as keys in 'locations', not the pattern
# locations data itself, so this holds the same mapping as id_dict.
id_dict_filenames = dict(zip(tune_id_nums, titles))

In [143]:
# Filter the locations dict: for every tune, keep only the patterns that are
# members of filtered_patterns, together with their locations.
filtered_locations = {
    tune: {
        pattern: occurrences
        for pattern, occurrences in pattern_locations.items()
        if pattern in filtered_patterns
    }
    for tune, pattern_locations in locations.items()
}

# Re-key the filtered data by tune id number. The pairing is positional, so it
# relies on id_dict and filtered_locations being in the same order.
locations_out = dict(zip(id_dict, filtered_locations.values()))

# check output
print('Sample output for single tune:')
first_entry = list(locations_out.items())[:1]
for tune_id, tune_patterns in first_entry:
    print(f"identifiers: {tune_id}:\npattern locations: {tune_patterns}")


Sample output for single tune:
identifiers: 072355_01:
pattern locations: {(4, 4, 4, 5, 5): [0], (4, 4, 5, 5, 5): [1], (4, 5, 5, 5, 4): [2], (5, 5, 5, 4, 3): [3], (5, 5, 4, 3, 3): [4], (5, 4, 3, 3, 1): [5], (4, 3, 3, 1, 1): [6], (3, 3, 1, 1, 1): [7], (3, 1, 1, 1, 1): [8], (1, 1, 1, 1, 3): [9], (1, 1, 1, 3, 3): [10], (1, 1, 3, 3, 2): [11], (1, 3, 3, 2, 2): [12], (3, 3, 2, 2, 2): [13, 37], (3, 2, 2, 2, 2): [14, 38], (2, 2, 2, 2, 3): [15], (2, 2, 2, 3, 3): [16], (2, 2, 3, 3, 2): [17, 35], (2, 3, 3, 2, 1): [18], (3, 3, 2, 1, 1): [19], (3, 2, 1, 1, 1): [20], (2, 1, 1, 1, 2): [21], (1, 1, 1, 2, 2): [22], (1, 1, 2, 2, 2): [23], (1, 2, 2, 2, 2): [24], (2, 2, 2, 2, 5): [25], (2, 2, 2, 5, 5): [26], (2, 2, 5, 5, 5): [27], (2, 5, 5, 5, 3): [28], (5, 5, 5, 3, 2): [29], (5, 5, 3, 2, 1): [30], (5, 3, 2, 1, 2): [31], (3, 2, 1, 2, 2): [32], (2, 1, 2, 2, 3): [33], (1, 2, 2, 3, 3): [34], (2, 3, 3, 2, 2): [36], (2, 2, 2, 2, 1): [39], (2, 2, 2, 1, 1): [40], (2, 2, 1, 1, 1): [41], (2, 1, 1, 1, 1): [42], (1,

In [144]:
# calculate tune lengths (length of feature sequences):

import os

def calculate_tune_lengths(target_dir):
    """Count the lines of every feature sequence csv file in ``target_dir``.

    Args:
        target_dir: path of the directory holding the csv feature sequence files.
            Entries that do not end in '.csv' are ignored.

    Returns:
        dict of tune id numbers (keys): tune durations (vals). The tune id number
        is the csv file name without its '.csv' extension and with all alphabetic
        characters removed; the duration is the file's line count plus one.
    """
    tune_durations = {}
    for file_name in os.listdir(target_dir):
        if not file_name.endswith('.csv'):
            continue
        # Derive the id from the file's own name instead of pairing the results
        # by position with a separately built id list: os.listdir returns entries
        # in arbitrary order, so positional pairing can attach a duration to the
        # wrong tune (and zip silently truncates if the counts differ).
        tune_id_num = ''.join(ch for ch in file_name[:-len('.csv')] if not ch.isalpha())
        file_path = f"{target_dir}/{file_name}"
        with open(file_path) as content:
            # NOTE(review): this counts every line in the file and adds one, so for
            # a file with a header row the result is two more than the number of
            # data rows. Kept unchanged so existing output values do not shift;
            # confirm whether this offset is intended.
            tune_durations[tune_id_num] = sum(1 for _ in content) + 1

    return tune_durations

# Compute the length of every tune in the duration-weighted feature sequence directory.
feat_seq_path = '../mtc_ann_corpus/feature_sequence_data/duration_weighted'
tune_lengths = calculate_tune_lengths(target_dir=feat_seq_path)


In [145]:
import csv
# Extract diatonic scale degree sequences from csv feature sequence files for all tunes
# NOTE: If investigating a different musical feature, please specify below via 'feature' variable.
# A full list of FoNN features is available in ../README.md
feature = 'diatonic_scale_degree'
# list the directory once so that 'filenames' and 'inpaths' are guaranteed to line up
csv_files = [filename for filename in os.listdir(feat_seq_path) if filename.endswith('.csv')]
if not csv_files:
    raise FileNotFoundError(f"No .csv feature sequence files found in {feat_seq_path}")
filenames = [filename[:-4] for filename in csv_files]
inpaths = [f"{feat_seq_path}/{filename}" for filename in csv_files]
# identify csv col names from the header row of the first file
with open(inpaths[0]) as testfile:
    csv_reader = csv.reader(testfile, delimiter=',')
    cols = next(csv_reader)
    colsmap = {col_name: i for i, col_name in enumerate(cols)}
    assert feature in colsmap, f"'{feature}' is not a column of {inpaths[0]}"
# identify index of target column
target_col_idx = colsmap[feature]
# extract target column from all tunes to list.
# np.atleast_1d keeps a file with a single data row iterable: genfromtxt
# returns a 0-d array in that case, which cannot be looped over.
numeric_data = [
    np.atleast_1d(
        np.genfromtxt(
            path,
            dtype=str,
            delimiter=',',
            usecols=target_col_idx,
            skip_header=1)
    )
    for path in inpaths
]
# convert feature sequence values from strings to ints; empty cells become 0
formatted_feat_seq_data = [
    [int(float(value)) if value != '' else 0 for value in column]
    for column in numeric_data
]
# store in dict per tune id numbers (keys): feature sequence data (vals).
# Each id is derived from the sequence's own file name (alphabetic characters
# removed) rather than paired by position with the ids built from 'locations',
# because os.listdir order is arbitrary and need not match 'locations' order.
feat_seq_ids = [''.join(ch for ch in name if not ch.isalpha()) for name in filenames]
feat_seq_data = dict(zip(feat_seq_ids, formatted_feat_seq_data))

# confirm order of dict items matches 'locations' variable above
print(filenames == list(locations))
# check output
for k, v in list(feat_seq_data.items())[:1]:
    print(f"identifiers: {k}: \n data ({' '.join(feature.split('_'))} feature sequence values): {v}")


True
identifiers: 072355_01: 
 data (diatonic scale degree feature sequence values): [4, 4, 4, 5, 5, 5, 4, 3, 3, 1, 1, 1, 1, 3, 3, 2, 2, 2, 2, 3, 3, 2, 1, 1, 1, 2, 2, 2, 2, 5, 5, 5, 3, 2, 1, 2, 2, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1]


In [146]:
# Setup namedtuple to store output for KG setup:
from typing import NamedTuple
class KG_Data(NamedTuple):
    """Per-tune record of the fields stored as output for KG setup."""

    # tune id number
    identifiers: str
    # tune title
    title: str
    # name of musical feature under investigation
    feature: str
    # level of granularity of input data. Must be manually defined in the KG_Data instance.
    level: str
    # range of n-values (i.e.: pattern lengths) under investigation. Must be manually defined in the KG_Data instance.
    n_vals: tuple
    # length of tune feature sequence
    duration: int
    # dict of patterns and their locations within the feature sequence
    pattern_locations: dict
    # feature sequence data content
    data: list

# Illustrate and test the output mappings: the tune id number is the common key
# used to look up each data field in the dicts created in earlier cells.
print("Test output mappings for single tune:")
for tune_id_num in list(id_dict)[:1]:
    print(f"identifiers: {tune_id_num}")
    # each field is printed as "<label>: <value looked up by tune id number>"
    for label, lookup in (
        ("title", id_dict),
        ("data", feat_seq_data),
        ("duration", tune_lengths),
        ("pattern_locations", locations_out),
    ):
        print(f"{label}: {lookup[tune_id_num]}")
    print('\n')

Test output mappings for single tune:
identifiers: 072355_01
title: NLB072355_01
data: [4, 4, 4, 5, 5, 5, 4, 3, 3, 1, 1, 1, 1, 3, 3, 2, 2, 2, 2, 3, 3, 2, 1, 1, 1, 2, 2, 2, 2, 5, 5, 5, 3, 2, 1, 2, 2, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1]
duration: 50
pattern_locations: {(4, 4, 4, 5, 5): [0], (4, 4, 5, 5, 5): [1], (4, 5, 5, 5, 4): [2], (5, 5, 5, 4, 3): [3], (5, 5, 4, 3, 3): [4], (5, 4, 3, 3, 1): [5], (4, 3, 3, 1, 1): [6], (3, 3, 1, 1, 1): [7], (3, 1, 1, 1, 1): [8], (1, 1, 1, 1, 3): [9], (1, 1, 1, 3, 3): [10], (1, 1, 3, 3, 2): [11], (1, 3, 3, 2, 2): [12], (3, 3, 2, 2, 2): [13, 37], (3, 2, 2, 2, 2): [14, 38], (2, 2, 2, 2, 3): [15], (2, 2, 2, 3, 3): [16], (2, 2, 3, 3, 2): [17, 35], (2, 3, 3, 2, 1): [18], (3, 3, 2, 1, 1): [19], (3, 2, 1, 1, 1): [20], (2, 1, 1, 1, 2): [21], (1, 1, 1, 2, 2): [22], (1, 1, 2, 2, 2): [23], (1, 2, 2, 2, 2): [24], (2, 2, 2, 2, 5): [25], (2, 2, 2, 5, 5): [26], (2, 2, 5, 5, 5): [27], (2, 5, 5, 5, 3): [28], (5, 5, 5, 3, 2): [29], (5, 5, 3, 2, 1): [30], (5, 3, 2, 1, 2): [31

In [147]:
# Build one KG_Data instance per tune, in id_dict order.
# 'level' and 'n_vals' are set manually here.
output = [
    KG_Data(
        identifiers=tune_id_num,
        title=tune_title,
        feature=feature,
        level='duration-weighted note-level',
        data=feat_seq_data[tune_id_num],
        n_vals=(4, 5, 6),
        duration=tune_lengths[tune_id_num],
        pattern_locations=locations_out[tune_id_num],
    )
    for tune_id_num, tune_title in id_dict.items()
]

# Show the first record and the total number of records.
print("Test output:")
print(output[0])
print(len(output))

Test output:
KG_Data(identifiers='072355_01', title='NLB072355_01', feature='diatonic_scale_degree', level='duration-weighted note-level', n_vals=(4, 5, 6), duration=50, pattern_locations={(4, 4, 4, 5, 5): [0], (4, 4, 5, 5, 5): [1], (4, 5, 5, 5, 4): [2], (5, 5, 5, 4, 3): [3], (5, 5, 4, 3, 3): [4], (5, 4, 3, 3, 1): [5], (4, 3, 3, 1, 1): [6], (3, 3, 1, 1, 1): [7], (3, 1, 1, 1, 1): [8], (1, 1, 1, 1, 3): [9], (1, 1, 1, 3, 3): [10], (1, 1, 3, 3, 2): [11], (1, 3, 3, 2, 2): [12], (3, 3, 2, 2, 2): [13, 37], (3, 2, 2, 2, 2): [14, 38], (2, 2, 2, 2, 3): [15], (2, 2, 2, 3, 3): [16], (2, 2, 3, 3, 2): [17, 35], (2, 3, 3, 2, 1): [18], (3, 3, 2, 1, 1): [19], (3, 2, 1, 1, 1): [20], (2, 1, 1, 1, 2): [21], (1, 1, 1, 2, 2): [22], (1, 1, 2, 2, 2): [23], (1, 2, 2, 2, 2): [24], (2, 2, 2, 2, 5): [25], (2, 2, 2, 5, 5): [26], (2, 2, 5, 5, 5): [27], (2, 5, 5, 5, 3): [28], (5, 5, 5, 3, 2): [29], (5, 5, 3, 2, 1): [30], (5, 3, 2, 1, 2): [31], (3, 2, 1, 2, 2): [32], (2, 1, 2, 2, 3): [33], (1, 2, 2, 3, 3): [34], (2, 

In [148]:
# write output: pickle the list of KG_Data records for the current pattern length
output_filename = f'{n}gram_kg_data.pkl'
out_dir = '../mtc_ann_corpus/kg_pipeline_output_data'
# exist_ok=True creates the directory only when it is missing, avoiding the gap
# between a separate existence check and the creation call
os.makedirs(out_dir, exist_ok=True)
out_path = out_dir + '/' + output_filename
with open(out_path, 'wb') as f_out:
    pickle.dump(output, f_out)
