# Make Predictions
In this notebook, we run the saved speaker-prediction model (`syntax-speaker-predictor.pkl`) over every clip in `syntax-clips/` and save the predictions to `all_predictions.csv` for future use.

In [1]:
import warnings
from pathlib import Path

import librosa
import pandas as pd
from tqdm import tqdm
from sklearn.externals import joblib

In [2]:
# Ignore every DeprecationWarning from here on (presumably to keep library
# deprecation notices out of the cell outputs below).
warnings.filterwarnings(action="ignore", category=DeprecationWarning)

In [3]:
# Load the previously saved predictor; `make_prediction` below calls its
# `.predict` method.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code --
# only load model files from a trusted source.
model_file = "syntax-speaker-predictor.pkl"
full_model = joblib.load(model_file)

### Create Pipeline Functions


In [4]:
def get_features(mp3_path):
    """Turn one audio clip into a flat mel-spectrogram feature vector.

    Parameters
    ----------
    mp3_path : str
        Path of the audio file; passed unchanged to ``librosa.load``.

    Returns
    -------
    The clip's mel spectrogram, flattened to one dimension with ``ravel()``.
    """
    # librosa.load (called with its defaults) returns (samples, sample_rate).
    samples, sample_rate = librosa.load(mp3_path)
    return librosa.feature.melspectrogram(y=samples, sr=sample_rate).ravel()


def make_prediction(mp3_path, *, expected_shape=(5888,), fallback_label=2):
    """Predict the class label for a single clip.

    Parameters
    ----------
    mp3_path : str
        Path of the clip; forwarded to ``get_features``.
    expected_shape : tuple of int, optional
        Exact feature-vector shape a clip must have to be passed to the
        model. Defaults to ``(5888,)``.
        NOTE(review): presumably 128 mel bands x 46 frames for a full-length
        clip -- confirm against the notebook that trained the model.
    fallback_label : optional
        Label returned without consulting the model. Defaults to ``2``,
        the "other" class.

    Returns
    -------
    ``fallback_label`` if ``get_features`` raises ``EOFError`` or the feature
    vector does not have ``expected_shape``; otherwise the first element of
    ``full_model.predict([features])``.
    """
    try:
        features = get_features(mp3_path)
    except EOFError:
        # End-of-file while reading the clip (presumably an empty or
        # truncated file): treat it as "other" instead of aborting the run.
        return fallback_label

    # Coerce so that a list such as [5888] compares equal to ndarray.shape.
    if features.shape != tuple(expected_shape):
        return fallback_label  # other (under 1s)

    # predict() takes a batch, so wrap the single vector and unwrap the result.
    return full_model.predict([features])[0]

### Full Data Prediction

In [None]:
# Gather every matching clip in sorted order so the output rows are in a
# stable, reproducible order. sorted() already returns a list.
all_clips = sorted(Path("./syntax-clips/").glob("syntax*.mp3"))

# Deliberately an explicit append loop rather than a comprehension: the
# recorded run took over four hours, and with a loop the results gathered so
# far stay in `prediction_data_all` if the cell is interrupted.
prediction_data_all = []
for clip in tqdm(all_clips):  # tqdm takes the total from len(all_clips)
    prediction_data_all.append((clip.stem, make_prediction(str(clip))))

100%|██████████| 316599/316599 [4:13:50<00:00, 20.79it/s]   


In [8]:
# One row per clip: the file stem and the label predicted for it.
prediction_columns = ["segment", "prediction"]
prediction_df_all = pd.DataFrame(prediction_data_all, columns=prediction_columns)

In [9]:
# Write the predictions to disk; index=False leaves the DataFrame index out
# of the file.
output_csv = "all_predictions.csv"
prediction_df_all.to_csv(output_csv, index=False)