# Label Data Processing
In this notebook we take the labeled data that's been output from Prodigy and process it into a format usable for machine learning.

Steps:
1. Load in `.jsonl` file, keeping only the fields we need.
2. Create features from the audio clips based on the [Mel scale](https://en.wikipedia.org/wiki/Mel_scale).
3. Save the features and labels to disk.

In [1]:
import jsonlines
import librosa
import librosa.display
import numpy as np
import pandas as pd
import pylab

%matplotlib inline

In [2]:
def _text_and_accept(record):
    """Return the ``text`` and ``accept`` fields of one record as a tuple.

    Either element is ``None`` when the record lacks that key.
    """
    return record.get("text"), record.get("accept")


# Read the labeled output line by line, keeping only the two fields the rest
# of the notebook needs.
with jsonlines.open("output/syntax-label.jsonl", "r") as reader:
    labels = [_text_and_accept(record) for record in reader]

In [3]:
# Integer class id for each label name; 'other' is also the id given to
# clips that have no accepted label.
label_map = dict(scott=0, wes=1, other=2)

In [6]:
# Flatten each (text, accepted-labels) pair into one record per clip.
# `text` is split on ":" into exactly two parts; the first character of each
# part is dropped, leaving the episode and the second within the episode.
data = []
for text, label in labels:
    episode_part, second_part = text.split(":")
    episode = episode_part[1:]
    second = second_part[1:]
    # Only the first accepted label is used; clips with none fall back to "other".
    label_name = label[0] if label else "other"
    label_id = label_map[label_name]
    data.append({
        "episode": episode,
        "second": second.zfill(4),  # left-pad with zeros to four characters
        "label_name": label_name,
        "label_id": label_id,
    })

In [7]:
# Show the first parsed record as a quick sanity check of the fields built above.
data[0]

{'episode': '044', 'second': '0000', 'label_name': 'other', 'label_id': 2}

In [8]:
%%time
# Load every labeled clip and compute its mel spectrogram, flattened to 1-D.
# The first clip defines the reference shapes; any later clip whose waveform
# or flattened spectrogram has a different shape is left out of the matching
# *_data list and its index in `data` is recorded in the *_trash list instead.
raw_data, raw_trash = [], []
mel_data, mel_trash = [], []
for idx, clip in enumerate(data):
    episode, second = clip['episode'], clip['second']
    y, sr = librosa.load(f"syntax-clips/syntax{episode}-{second}.mp3")
    mel_spec = librosa.feature.melspectrogram(y=y, sr=sr)
    mel_ravel = mel_spec.ravel()
    if idx == 0:
        # Reference shapes that all other clips are compared against.
        y_shape, mel_shape = y.shape, mel_ravel.shape
    if y.shape != y_shape:
        raw_trash.append(idx)
    else:
        raw_data.append(y)
    if mel_ravel.shape != mel_shape:
        mel_trash.append(idx)
    else:
        mel_data.append(mel_ravel)

CPU times: user 24min 5s, sys: 17.5 s, total: 24min 22s
Wall time: 4min 6s


In [9]:
# Stack the per-clip lists into arrays: one row per kept clip.
mel_array = np.asarray(mel_data)
raw_array = np.asarray(raw_data)

In [10]:
# (number of kept clips, number of flattened mel-spectrogram values per clip)
mel_array.shape

(5260, 5888)

In [11]:
# Name the feature columns f0, f1, ... (one per column of mel_array) and
# write the feature table to disk.
feature_names = [f"f{col}" for col in range(mel_array.shape[1])]
X = pd.DataFrame(mel_array, columns=feature_names)
X.to_csv("./training-data/X.csv")

In [12]:
# Build the label table from every record whose features were kept, i.e.
# whose index was not recorded in mel_trash, so that rows line up with X.
dropped_rows = set(mel_trash)  # set membership keeps the filter linear
label_df = pd.DataFrame(
    [d for i, d in enumerate(data) if i not in dropped_rows])
# Refuse to write labels that no longer match the feature table row-for-row
# (for example after cells were re-run out of order).
assert len(label_df) == len(X), (
    f"{len(label_df)} label rows vs {len(X)} feature rows")
label_df.to_csv("./training-data/labels.csv")