In [67]:
# Imports

import pandas as pd
import pandas_profiling
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
import re
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer

pd.options.display.max_rows = 70

# Creating the perfect dataset

The objective goal of this model is to classify my favorites and liked playlists into song and not songs. Both these playlists are already mainly songs (I estimate 99% music for my favorites and 95% music for the liked videos)

Should my training dataset also be ~95% music and only 5% other videos? I think so as this is similar to the distribution of the target I truly care about.

Metric of choice: ROC-AUC score

### Type of music videos in my playlists:
* Individual songs (with and without music videos)
* Mixes

### Types of 'other' videos found in my playlists:
* Gaming videos
* Podcasts
* Video essays


In [68]:
songsPath = '../data/music_data/'
nonSongsPath = '../data/non_music_data'

# Populate a dataframe each for all songs and non-songs

# Songs first
songsCSVList = Path(songsPath).rglob('*.csv')
music_df = pd.concat([pd.read_csv(str(i), index_col = 0) for i in songsCSVList])

# Now non-songs
nonSongsCSVList = Path(nonSongsPath).rglob('*.csv')
nonMusic_df = pd.concat([pd.read_csv(str(i), index_col = 0) for i in nonSongsCSVList])


In [69]:
# Adding a target variable column where music = 1, not music = 0

music_df['is_song'] = 1
nonMusic_df['is_song'] = 0

# Joining them into one dataset
# Got to balance the dataset first to ensure no more than 5% of the video are non-songs

num_songs = len(music_df)
num_nonSongs = len(nonMusic_df)
percent_songs = 0.95 # desired class balance in terms of percentage of songs
desired_num = round((num_songs / percent_songs) - num_songs)
# Shuffle non_music df
balancedNonMusic_df = nonMusic_df.sample(frac=1).reset_index(drop=True)
balancedNonMusic_df = balancedNonMusic_df[:desired_num]

print(f"Desired number of non_songs = {desired_num} relative to {num_songs} songs")


df = pd.concat([music_df, balancedNonMusic_df], axis=0).reset_index().drop(labels='index', axis=1)
is_song_pct, is_not_song_pct = df['is_song'].value_counts(normalize=True)
print(f'The shape of the full dataframe is {df.shape} where {round(is_song_pct, 2)*100}% of the observations are songs and {round(is_not_song_pct, 2)*100}% are not')

Desired number of non_songs = 46 relative to 868 songs
The shape of the full dataframe is (914, 5) where 95.0% of the observations are songs and 5.0% are not


# Cleaning the song Titles and tokenizing

In [70]:
def clean_data(data):
    # Remove non-alphanumeric characters
    data = data.apply(lambda x: re.sub('[^a-zA-Z\-]', ' ', x))

    # Remove extra whitespace and lowercase text 
    data = data.apply(lambda x: ' '.join(x.lower().split()))

    # Remove short words
    data = data.apply(lambda x: ' '.join(x for x in x.split() if len(x) > 2 or x == '-'))
    
    # Stop words will be removed in vectorizer
    return data

# Tokenizer function
def tokenizer(title):
    # Create a list of tokens
    tokens = []
    # Split title into words
    words = title.split()
    # Iterate through the words in the title
    for word in words:
        tokens.append(word)
          
    return tokens

In [71]:
df['cleanTitles'] = clean_data(df['Title'])
df['Title Tokens'] = df['cleanTitles'].apply(tokenizer)
df.head()

Unnamed: 0,Title,URL,Index,Duration,is_song,cleanTitles,Title Tokens
0,Illenium - Crawl Outta Love (feat. Annika Wells),https://www.youtube.com/watch?v=5CMuZrTy6jw,1,242,1,illenium - crawl outta love feat annika wells,"[illenium, -, crawl, outta, love, feat, annika..."
1,Bronze Whale - Patterns,https://www.youtube.com/watch?v=ifHi5TV1Uzk,2,193,1,bronze whale - patterns,"[bronze, whale, -, patterns]"
2,LissA x MEMBA - Sun's Up,https://www.youtube.com/watch?v=opwluvYuiyI,3,200,1,lissa memba - sun,"[lissa, memba, -, sun]"
3,Two Feet - Love Is A Bitch,https://www.youtube.com/watch?v=_DjE4gbIVZk,4,178,1,two feet - love bitch,"[two, feet, -, love, bitch]"
4,Flux Pavilion - Somebody Else (feat. GLNNA),https://www.youtube.com/watch?v=eH-55GN9Dos,5,240,1,flux pavilion - somebody else feat glnna,"[flux, pavilion, -, somebody, else, feat, glnna]"


In [72]:
def count(tokens):
    """
    Calculates some basic statistics about tokens in our corpus (i.e. corpus means collections text data)
    """
    # stores the count of each token
    word_counts = Counter()
    
    # stores the number of docs that each token appears in 
    appears_in = Counter()

    total_docs = len(tokens)

    for token in tokens:
        # stores count of every appearance of a token 
        word_counts.update(token)
        # use set() in order to not count duplicates, thereby count the num of docs that each token appears in
        appears_in.update(set(token))

    # build word count dataframe
    temp = zip(word_counts.keys(), word_counts.values())
    wc = pd.DataFrame(temp, columns = ['word', 'count'])

    # rank the the word counts
    wc['rank'] = wc['count'].rank(method='first', ascending=False)
    total = wc['count'].sum()

    # calculate the percent total of each token
    wc['pct_total'] = wc['count'].apply(lambda token_count: token_count / total)

    # calculate the cumulative percent total of word counts 
    wc = wc.sort_values(by='rank')
    wc['cul_pct_total'] = wc['pct_total'].cumsum()

    # create dataframe for document stats
    t2 = zip(appears_in.keys(), appears_in.values())
    ac = pd.DataFrame(t2, columns=['word', 'appears_in'])
    
    # merge word count stats with doc stats
    wc = ac.merge(wc, on='word')

    wc['appears_in_pct'] = wc['appears_in'].apply(lambda x: x / total_docs)

    return wc.sort_values(by='rank')

In [73]:
wc  = count(df['Title Tokens'])
wc.head()

Unnamed: 0,word,appears_in,count,rank,pct_total,cul_pct_total,appears_in_pct
7,-,738,758,1.0,0.131346,0.131346,0.80744
535,official,244,244,2.0,0.04228,0.173627,0.266958
538,video,228,228,3.0,0.039508,0.213135,0.249453
102,the,131,149,4.0,0.025819,0.238953,0.143326
1,feat,109,109,5.0,0.018888,0.257841,0.119256


# Splitting our data in validation and test sets

In [74]:
from sklearn.model_selection import train_test_split

# Splitting the data
features = ['cleanTitles', 'Duration']

target = 'is_song'
X = df[features]
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.15)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.2)
indexes = {'train': X_train.index.tolist(), 'val': X_val.index.tolist(), 'test': X_test.index.tolist()}
X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape

((620, 2), (620,), (156, 2), (156,), (138, 2), (138,))

In [75]:
X.isna().sum().sum()

0

# Extracting features from song titles using TF-IDF

In [92]:
# TFIDF vectorizer
tfidf = TfidfVectorizer(
    stop_words='english', ngram_range=(1,2),
    min_df=5,
    max_features=1000,
    tokenizer=tokenizer)

# Create a vocabulary and get word counts per document
# First create our TF-IDF from our training dataset
X_train_dtm = tfidf.fit_transform(X_train['cleanTitles'])
X_train_dtm = pd.DataFrame(data=X_train_dtm.toarray(), columns=tfidf.get_feature_names(), index=X_train.index)
X_train_dtm['Duration'] = X_train['Duration']
print(f"X_train_dtm dataframe shape: {X_train_dtm.shape}")

# Transoforming X_val
X_val_dtm = tfidf.transform(X_val['cleanTitles'])
X_val_dtm = pd.DataFrame(data=X_val_dtm.toarray(), columns=tfidf.get_feature_names(), index=X_val.index)
X_val_dtm['Duration'] = X_val['Duration']
print(f"X_val_dtm dataframe shape: {X_val_dtm.shape}")

# And X_test
X_test_dtm = tfidf.transform(X_test['cleanTitles'])
X_test_dtm = pd.DataFrame(data=X_test_dtm.toarray(), columns=tfidf.get_feature_names(), index=X_test.index)
X_test_dtm['Duration'] = X_test['Duration']
print(f"X_test_dtm dataframe shape: {X_test_dtm.shape}")

X_train_dtm dataframe shape: (620, 104)
X_val_dtm dataframe shape: (156, 104)
X_test_dtm dataframe shape: (138, 104)


# MVP:
## Random forest model baseline

In [94]:
# Baseline RF model
# Imports 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score as roc_auc
from sklearn.metrics import accuracy_score as acc


In [96]:
clf = RandomForestClassifier(n_jobs = -1)


clf.fit(X_train_dtm, y_train)

RandomForestClassifier(n_jobs=-1)

In [100]:
# Results: 

y_pred = clf.predict(X_val_dtm)
y_pred_train = clf.predict(X_train_dtm)

roc_auc_val = roc_auc(y_val, y_pred)
roc_auc_train = roc_auc(y_train, y_pred_train)

print(f'The training ROC AUC score of my baseline model is: {roc_auc_train}')
print(f'The validation ROC AUC score of my baseline model is: {roc_auc_val}')

# Calculating accuracy
acc_val = acc(y_val, y_pred)
acc_train = acc(y_train, y_pred_train)

print(f'\nThe training accuracy score of my baseline model is: {acc_train}')
print(f'The validation accuracy score of my baseline model is: {acc_val}')

The training ROC AUC score of my baseline model is: 0.9991496598639455
The validation ROC AUC score of my baseline model is: 0.7756471716203259

The training accuracy score of my baseline model is: 0.9983870967741936
The validation accuracy score of my baseline model is: 0.9615384615384616


In [102]:
# Let's have a look at which validation observations the model got incorrect
X_val_check = X_val.set_index([indexes['val']])
X_val_check['is_song_pred'] = y_pred
X_val_check['is_song'] = y_val
X_val_check = X_val_check[X_val_check['is_song_pred'] != X_val_check['is_song']]
print(f'The number of observations that were misclassified: {X_val_check.shape[0]}\n'
    f'{(1-round(X_val_check.shape[0] /  X_val.shape[0], 2))*100}% accuracy')

titles = df[['Title']]
to_inspect = X_val_check.merge(titles, how='inner', left_index=True, right_index=True)
to_inspect.head(50)

The number of observations that were misclassified: 6
96.0% accuracy


Unnamed: 0,cleanTitles,Duration,is_song_pred,is_song,Title
450,morning rain,131,0,1,Morning Rain
906,crash test for dogs,171,1,0,Crash Test For Dogs
393,the nomad,180,0,1,The Nomad
876,the boy who cried literally,154,1,0,The Boy Who Cried Literally
891,photoshop has gone too far,101,1,0,Photoshop Has Gone Too Far
383,india lament,162,0,1,India's Lament


In [104]:
### Me trying to check permutation importances using sklearn's library

from sklearn.inspection import permutation_importance

r = permutation_importance(clf, X_val_dtm, y_val, n_repeats=40)
for i in r.importances_mean.argsort()[::-1]:
     if r.importances_mean[i] - 2 * r.importances_std[i] > 0:
         print(f"{X.columns[i]} "
               f"{r.importances_mean[i]:.3f}"
               f" +/- {r.importances_std[i]:.3f}")

IndexError: index 103 is out of bounds for axis 0 with size 2

In [129]:
# Hyperparameter tuning

from scipy.stats import randint, uniform
from sklearn.model_selection import RandomizedSearchCV

param_distributions = { 
    'n_estimators': randint(50, 500), 
    'max_depth': [5, 10, 15, 20, None], 
    'max_features': uniform(0, 1), 
}

search = RandomizedSearchCV(
    clf, 
    param_distributions=param_distributions, 
    n_iter=50, 
    cv=5, 
    scoring='roc_auc', 
    verbose=10, 
    return_train_score=True, 
    n_jobs=-1
)

search.fit(X_train_dtm, y_train);

Fitting 5 folds for each of 50 candidates, totalling 250 fits


In [130]:
print('Best hyperparameters', search.best_params_)
print('Cross-validation ROC-AUC score', search.best_score_)
tuned_model = search.best_estimator_

Best hyperparameters {'max_depth': 5, 'max_features': 0.8997099441178155, 'n_estimators': 488}
Cross-validation ROC-AUC score 0.9333943833943834


In [131]:
# Double checking my ROC-AUC scores
# Results: 

y_pred = tuned_model.predict(X_val_dtm)
y_pred_train = tuned_model.predict(X_train_dtm)

roc_auc_val = roc_auc(y_val, y_pred)
roc_auc_train = roc_auc(y_train, y_pred_train)

print(f'The training ROC AUC score of my baseline model is: {roc_auc_train}')
print(f'The validation ROC AUC score of my baseline model is: {roc_auc_val}')

# Calculating accuracy
acc_val = acc(y_val, y_pred)
acc_train = acc(y_train, y_pred_train)

print(f'\nThe training accuracy score of my baseline model is: {acc_train}')
print(f'The validation accuracy score of my baseline model is: {acc_val}')

The training ROC AUC score of my baseline model is: 0.640625
The validation ROC AUC score of my baseline model is: 0.6428571428571428

The training accuracy score of my baseline model is: 0.9629032258064516
The validation accuracy score of my baseline model is: 0.967948717948718


### Best I could seem to get the randomized search CV tuned hyperparamter model is an ROC-AUCscore of 64.3, worse by 13 points than the naive model

Why is this? 
 - Not exhaustive enough a search
 - Doing something wrong?