In [42]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
from sklearn.pipeline import make_pipeline
from sklearn import preprocessing
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix,mean_squared_error
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import os
import math

In [3]:
path = "../data"

In [12]:
df = pd.read_csv(os.path.join(path,"spotifyqueriesresults.csv"),index_col = None).drop("Unnamed: 0",axis =1)

In [13]:
df.groupby("category").mean()

Unnamed: 0_level_0,acousticness,danceability,energy,instrumentalness,key,loudness,mode,speechiness,tempo,valence
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
joyful activation,0.180035,0.664345,0.6945,0.083653,5.529013,-7.001051,0.532027,0.110526,119.468653,0.519891
nostalgia,0.46972,0.561393,0.45411,0.210555,5.203631,-10.738125,0.686661,0.067877,116.28088,0.371597
peacefullness,0.679829,0.484904,0.284925,0.493209,5.140549,-16.803195,0.66559,0.065582,109.501991,0.279912
power,0.217506,0.627859,0.662724,0.097844,5.196121,-7.594899,0.633621,0.105222,121.271582,0.491749
sadness,0.40443,0.589158,0.507265,0.029341,5.296602,-8.320311,0.684114,0.086441,118.146968,0.34866
tenderness,0.396266,0.597379,0.486002,0.106779,5.149522,-8.911364,0.549043,0.089847,114.084594,0.376336
tension,0.180047,0.60416,0.706699,0.053508,5.134168,-6.47129,0.653846,0.107523,123.787971,0.48769
transcendence,0.403627,0.506168,0.498595,0.23323,5.141631,-11.564384,0.644635,0.059453,115.233833,0.448944
wonder,0.216284,0.656309,0.65819,0.058842,5.242748,-6.717692,0.677863,0.0884,119.137156,0.549995


## Baseline randomforest

In [17]:
model_frame = df.drop(["artist","class","playlist_id","title"],axis = 1)

In [18]:
X = model_frame.drop("category",axis =1)
y = model_frame["category"]

In [19]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2)

In [23]:
clf = RandomForestClassifier()

In [24]:
clf.fit(X_train,y_train)

RandomForestClassifier()

In [25]:
y_pred = clf.predict(X_test)

In [26]:
print(classification_report(y_test,y_pred))

                   precision    recall  f1-score   support

joyful activation       0.40      0.41      0.40       285
        nostalgia       0.27      0.31      0.29       248
    peacefullness       0.54      0.48      0.51       255
            power       0.20      0.11      0.14       188
          sadness       0.35      0.37      0.36       221
       tenderness       0.23      0.19      0.21       145
          tension       0.30      0.28      0.29       229
    transcendence       0.40      0.45      0.42       222
           wonder       0.30      0.37      0.33       263

         accuracy                           0.34      2056
        macro avg       0.33      0.33      0.33      2056
     weighted avg       0.34      0.34      0.34      2056



## Baseline SVM

In [27]:
svm = SVC()

In [28]:
svm.fit(X_train,y_train)

SVC()

In [29]:
y_pred = svm.predict(X_test)
print(classification_report(y_test,y_pred))

                   precision    recall  f1-score   support

joyful activation       0.19      0.73      0.30       285
        nostalgia       0.18      0.31      0.23       248
    peacefullness       0.54      0.40      0.46       255
            power       0.00      0.00      0.00       188
          sadness       0.00      0.00      0.00       221
       tenderness       0.00      0.00      0.00       145
          tension       0.15      0.11      0.13       229
    transcendence       0.00      0.00      0.00       222
           wonder       0.19      0.13      0.15       263

         accuracy                           0.22      2056
        macro avg       0.14      0.19      0.14      2056
     weighted avg       0.16      0.22      0.16      2056



  _warn_prf(average, modifier, msg_start, len(result))


## Baseline XGBoost

In [46]:
xgb = XGBClassifier()
xgb.fit(X_train,y_train)
y_pred = xgb.predict(X_test)

In [48]:
print(classification_report(y_test,y_pred))

                   precision    recall  f1-score   support

joyful activation       0.38      0.46      0.42       285
        nostalgia       0.26      0.22      0.24       248
    peacefullness       0.60      0.46      0.52       255
            power       0.29      0.11      0.16       188
          sadness       0.28      0.41      0.33       221
       tenderness       0.31      0.17      0.22       145
          tension       0.31      0.25      0.28       229
    transcendence       0.33      0.41      0.36       222
           wonder       0.27      0.38      0.31       263

         accuracy                           0.33      2056
        macro avg       0.34      0.32      0.31      2056
     weighted avg       0.34      0.33      0.33      2056



## TF-IDF
$w$ = $tf$ $\times$ $log{\frac{N}{docfreq}}$

"TF-IDF compares the number of times a word appears in a doc with the number of docs the word appears in"

in our case,words are musicians and docss are playlist categories

N = total playlists with label $j$

tf = number of times artists $i$ is in playlist category $j$

docfreq = number of playlists that have artist $i$

In [51]:
def tf_idf(artist,label):
    tf = len(df[df.category == label ][df.artist == artist])
    N = len(df[df.category == label ])
    docfreq = len(df[df.artist == "Drake" ])
    
    return tf * math.log(N/docfreq)

In [61]:
#example
tf_idf("Meek Mill","wonder")

  


10.620588769406435

In [62]:
tf_idf("Meek Mill","sadness")

  


0.0

In [63]:
df[df.artist == "Meek Mill" ]

Unnamed: 0,acousticness,artist,class,danceability,energy,instrumentalness,key,loudness,mode,playlist_id,speechiness,tempo,title,valence,category
957,0.0038,Meek Mill,smile,0.901,0.65,0.0,2,-4.137,1,6s9wratabPxp4NwDZAbnwl,0.0981,100.298,1 AM,0.295,wonder
965,0.468,Meek Mill,smile,0.631,0.774,0.0,11,-2.459,1,6s9wratabPxp4NwDZAbnwl,0.3,79.745,Dreams and Nightmares,0.44,wonder
969,0.0854,Meek Mill,smile,0.851,0.702,0.0,1,-4.172,1,6s9wratabPxp4NwDZAbnwl,0.268,126.94,Millidelphia (feat. Swizz Beatz),0.179,wonder
2684,0.0581,Meek Mill,sensual,0.763,0.441,0.0,0,-8.211,1,29sDXBnOYG5ECGA1q10QtQ,0.256,86.093,Dangerous (feat. Jeremih and PnB Rock),0.538,tenderness
5889,0.259,Meek Mill,hype,0.889,0.496,0.0,4,-6.365,0,75SM1OtJ03mrmfWSIQIHkV,0.0905,86.003,Going Bad (feat. Drake),0.544,power
8094,0.468,Meek Mill,pissed,0.631,0.774,0.0,11,-2.459,1,36lqaSvXXuBxi3oYIaEI4B,0.3,79.745,Dreams and Nightmares,0.44,tension
8112,0.317,Meek Mill,pissed,0.727,0.599,0.0,1,-6.941,1,36lqaSvXXuBxi3oYIaEI4B,0.327,167.909,R.I.C.O. (feat. Drake),0.386,tension
