In [36]:
# Imports

import pandas as pd
import pandas_profiling
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path

pd.options.display.max_rows = 70

# Creating the perfect dataset

The objective goal of this model is to classify my favorites and liked playlists into song and not songs. Both these playlists are already mainly songs (I estimate 99% music for my favorites and 95% music for the liked videos)

Should my training dataset also be ~95% music and only 5% other videos? I think so as this is similar to the distribution of the target I truly care about.

Metric of choice: F1 score

### Type of music videos in my playlists:
* Individual songs (with and without music videos)
* Mixes

### Types of 'other' videos found in my playlists:
* Gaming videos
* Podcasts
* Video essays


In [39]:
songsPath = '../data/music_data/'
nonSongsPath = '../data/non_music_data'

# Populate a dataframe each for all songs and non-songs

# Songs first
songsCSVList = Path(songsPath).rglob('*.csv')
music_df = pd.concat([pd.read_csv(str(i), index_col = 0) for i in songsCSVList])

# Now non-songs
nonSongsCSVList = Path(nonSongsPath).rglob('*.csv')
nonMusic_df = pd.concat([pd.read_csv(str(i), index_col = 0) for i in nonSongsCSVList])


In [46]:
# Adding a target variable column where music = 1, not music = 0

music_df['is_song'] = 1
nonMusic_df['is_song'] = 0

# Joining them into one dataset
# Got to balance the dataset first to ensure no more than 5% of the video are non-songs

num_songs = len(music_df)
num_nonSongs = len(nonMusic_df)
percent_songs = 0.95 # desired class balance in terms of percentage of songs
desired_num = round((num_songs / percent_songs) - num_songs)
# Shuffle non_music df
balancedNonMusic_df = nonMusic_df.sample(frac=1).reset_index(drop=True)
balancedNonMusic_df = balancedNonMusic_df[:desired_num]

print(f"Desired number of non_songs = {desired_num} relative to {num_songs} songs")


df = pd.concat([music_df, balancedNonMusic_df], axis=0).reset_index().drop(labels='index', axis=1)
is_song_pct, is_not_song_pct = df['is_song'].value_counts(normalize=True)
print(f'The shape of the full dataframe is {df.shape} where {round(is_song_pct, 2)*100}% of the observations are songs and {round(is_not_song_pct, 2)*100}% are not')

Desired number of non_songs = 44 relative to 828 songs
The shape of the full dataframe is (872, 5) where 95.0% of the observations are songs and 5.0% are not


# MVP:
## Random forest model baseline

In [47]:
# Having a look at the NaN rows and detemining I can probably drop them
df.loc[[i for i in df.index if df.loc[i, :].isna().sum() != 0], :]

Unnamed: 0,Title,URL,Index,Duration,is_song


In [48]:
# Dropping NaN rows
old_n = df.shape[0]
df = df.dropna(axis=0)
print(f'Dropped {old_n - df.shape[0]} observations (old shape = {(old_n, df.shape[1])}), new shape = {df.shape}')

Dropped 0 observations (old shape = (872, 5)), new shape = (872, 5)


In [49]:
df.columns

Index(['Title', 'URL', 'Index', 'Duration', 'is_song'], dtype='object')

In [66]:
# Baseline RF model
# Imports 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score as roc_auc
from sklearn.metrics import accuracy_score as acc

# Splitting the data

features = ['Title', 'Duration']

target = 'is_song'
X = df[features]
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.15)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.2)
indexes = {'train': X_train.index.tolist(), 'val': X_val.index.tolist(), 'test': X_test.index.tolist()}
X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape

((592, 2), (592,), (149, 2), (149,), (131, 2), (131,))

In [67]:
from sklearn.preprocessing import OrdinalEncoder

# Encoding features
oe = OrdinalEncoder()
X_train_matrix = oe.fit_transform(X_train)
X_test_matrix = oe.fit_transform(X_test)
X_val_matrix = oe.fit_transform(X_val)
X_train = pd.DataFrame(X_train_matrix, columns=X_train.columns)
X_test = pd.DataFrame(X_test_matrix, columns=X_test.columns)
X_val = pd.DataFrame(X_val_matrix, columns=X_val.columns)

In [68]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

simple_pipeline = Pipeline(steps=[
    ('simple_imputer', SimpleImputer(strategy='mean')),
    ('classifier', RandomForestClassifier(n_jobs = -1))
    ]
)

simple_pipeline.fit(X_train, y_train)

Pipeline(steps=[('simple_imputer', SimpleImputer()),
                ('classifier', RandomForestClassifier(n_jobs=-1))])

In [70]:
# Results: 

y_pred = simple_pipeline.predict(X_val)
y_pred_train = simple_pipeline.predict(X_train)

roc_auc_val = roc_auc(y_val, y_pred)
roc_auc_train = roc_auc(y_train, y_pred_train)

print(f'The ROC AUC score of my baseline model is: {roc_auc_train}')
print(f'The validation ROC AUC score of my baseline model is: {roc_auc_val}')

# Calculating accuracy
acc_val = acc(y_val, y_pred)
acc_train = acc(y_train, y_pred_train)

print(f'\nThe accuracy score of my baseline model is: {acc_train}')
print(f'The validation accuracy score of my baseline model is: {acc_val}')

The ROC AUC score of my baseline model is: 1.0
The validation ROC AUC score of my baseline model is: 0.5284172661870504

The accuracy score of my baseline model is: 1.0
The validation accuracy score of my baseline model is: 0.8993288590604027


In [63]:
# Let's have a look at which validation observations the model got incorrect
X_val_check = X_val.set_index([indexes['val']])
X_val_check['is_song_pred'] = y_pred
X_val_check['is_song'] = y_val
X_val_check = X_val_check[X_val_check['is_song_pred'] != X_val_check['is_song']]
print(f'The number of observations that were misclassified: {X_val_check.shape[0]}\n'
    f'{(1-round(X_val_check.shape[0] /  X_val.shape[0], 2))*100}% accuracy')

titles = df[['Title']]
to_inspect = X_val_check.merge(titles, how='inner', left_index=True, right_index=True)
to_inspect

The number of observations that were misclassified: 11
93.0% accuracy


Unnamed: 0,Title_x,Duration,is_song_pred,is_song,Title_y
844,130.0,12.0,1,0,The Social Media Version Of Your Ex-Girlfriend
442,105.0,2.0,0,1,Rays of Hope
840,141.0,101.0,1,0,Venom - About as Bad as You Expect
831,124.0,97.0,1,0,The (Only) Problem With Thor Ragnarok
870,117.0,86.0,1,0,Star Wars Cantina Band Auditions
90,14.0,27.0,0,1,BROODS - Bridges
466,3.0,0.0,0,1,2FeetBino- Naked [Official Audio]
855,142.0,30.0,1,0,What It’s Like To Be Super High (POV)
862,50.0,22.0,1,0,Font Conference
349,122.0,3.0,0,1,TRAVELING


In [92]:
# Data leakage? Let's check out the permutation scores
# using eli5
import eli5
from eli5.sklearn import PermutationImportance


# Defining our main preprocessing function:
def preprocess(X):
    X = X.copy()
    
    # Changing all the NaN values to zeros
    X = X.fillna(0)

    oe = OrdinalEncoder()
    X = oe.fit_transform(X)


    return X

X_train_temp = preprocess(X_train)
X_val_temp = preprocess(X_val)

model = RandomForestClassifier(n_jobs=-1)

model.fit(X_train_temp, y_train)

permuter = PermutationImportance(
    model,
    n_iter = 5
)
permuter.fit(X_val_temp, y_val)

# Viewing the top permutation importance features
feature_names = X_train.columns.to_list()

eli5.show_weights(
    permuter,
    top=10,
    feature_names = feature_names
)

ModuleNotFoundError: No module named 'eli5'

In [93]:
### Me trying to check permutation importances using sklearn's library

from sklearn.inspection import permutation_importance

r = permutation_importance(simple_pipeline, X_val, y_val, n_repeats=40)
for i in r.importances_mean.argsort()[::-1]:
     if r.importances_mean[i] - 2 * r.importances_std[i] > 0:
         print(f"{X.columns[i]} "
               f"{r.importances_mean[i]:.3f}"
               f" +/- {r.importances_std[i]:.3f}")

categories 0.255 +/- 0.013
duration 0.067 +/- 0.007
upload_date 0.025 +/- 0.005


# Results of my baseline model:
it looks like I'm achieving a perfect score on these data. Could it be that this data is easily distinguishable between
songs and non-songs, do I have data leakage going on, or is my data unrealistically simplified?

Reasons why this might be:
* My non-music videos consists of tiny channels (Mia Maria, Payo, the sweedish investor) which might be giving my model a simple way to determine which videos are not music by this metric
* this is evident in the permutation importance scores as all three 'view_count', 'like_count', 'dislike_count' for in the top 3

## Things to try a bit differently:
* Remove mia maria's videos and get some non-music videos in there that have similar view counts to the music videos

## Results after axing 'view_count', 'like_count', 'dislike_count':
* F1 score for the validation set has dropped to 0.996
* Permutation score is now showing the most important feature is average rating of 0.0717, followed by categories at about half that (0.0409)

Maybe this shows the model is actually surprisingly great for this (relatively) simple task?


# Things I could try next:
* Oversampling the non-songs in my dataset to make it more balanced, and then measuring accuracy

In [103]:
simple_pipeline

RandomForestClassifier(n_jobs=-1)

In [124]:
# Hyperparameter tuning

from scipy.stats import randint, uniform
from sklearn.model_selection import RandomizedSearchCV

param_distributions = { 
    'classifier__n_estimators': randint(50, 500), 
    'classifier__max_depth': [5, 10, 15, 20, None], 
    'classifier__max_features': uniform(0, 1), 
}

search = RandomizedSearchCV(
    simple_pipeline, 
    param_distributions=param_distributions, 
    n_iter=10, 
    cv=2, 
    scoring='f1', 
    verbose=10, 
    return_train_score=True, 
    n_jobs=-1
)

search.fit(X_train, y_train);

Fitting 2 folds for each of 10 candidates, totalling 20 fits


In [125]:
print('Best hyperparameters', search.best_params_)
print('Cross-validation F1 score', search.best_score_)
tuned_model = search.best_estimator_

Best hyperparameters {'classifier__max_depth': 15, 'classifier__max_features': 0.7708555135882567, 'classifier__n_estimators': 163}
Cross-validation F1 score 0.9910689543811397


In [126]:
# Double checking my F1 scores
# Results: 

y_pred = tuned_model.predict(X_val)
y_pred_train = tuned_model.predict(X_train)

f1_val = f1(y_val, y_pred)
f1_train = f1(y_train, y_pred_train)

print(f'The F1 score of my tuned model is: {f1_train}')
print(f'The validation F1 score of my tuned model is: {f1_val}')

The F1 score of my tuned model is: 1.0
The validation F1 score of my tuned model is: 0.9511201629327903
