In [42]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
from sklearn.pipeline import make_pipeline
from sklearn import preprocessing
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix,mean_squared_error
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import os
import math

In [3]:
path = "../data"

In [159]:
df = pd.read_csv(os.path.join(path,"spotifyqueriesresults.csv"),index_col = None).drop("Unnamed: 0",axis =1)
#prune bad lists
df = df[df['playlist_id'] != "44ihc7hk5ewgfWsy5x9QKL"] 
df = df[df['playlist_id'] != "5LjyNDmQhoODlJZaNV7fxj"]
df = df[df["playlist_id"] != "6s9wratabPxp4NwDZAbnwl"]
df = df[df["playlist_id"] != "69CBhCYkS3jzhmzRoghVWG"]
df = df[df["playlist_id"] != "4MRjoBGxVSA8JcBiJr6C3Y"]
df = df[df["playlist_id"] != "171HLt90V1fhorEJWF45VO"]
df = df[df["playlist_id"] != "4XHLVq8ke4mxKeZvqgJjNZ"]
df = df[df["playlist_id"] != "2fFSLnFhz5nem6mWRWEJaD"]

In [160]:
df.groupby("category").mean()

Unnamed: 0_level_0,acousticness,danceability,energy,instrumentalness,key,loudness,mode,speechiness,tempo,valence
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
joyful activation,0.168452,0.67313,0.711562,0.09074,5.498336,-6.984493,0.537438,0.112456,119.615384,0.536217
nostalgia,0.479898,0.555012,0.44956,0.217544,5.205546,-10.844982,0.693312,0.0643,115.925433,0.372304
peacefullness,0.679829,0.484904,0.284925,0.493209,5.140549,-16.803195,0.66559,0.065582,109.501991,0.279912
power,0.217689,0.623068,0.668679,0.104055,5.213365,-7.638308,0.630715,0.10297,121.651104,0.49326
sadness,0.40443,0.589158,0.507265,0.029341,5.296602,-8.320311,0.684114,0.086441,118.146968,0.34866
tenderness,0.408639,0.593442,0.480337,0.109199,5.10401,-8.967726,0.547619,0.087788,113.502149,0.368376
tension,0.183656,0.599337,0.708396,0.056457,5.171239,-6.524758,0.663198,0.108359,123.853668,0.48492
transcendence,0.403627,0.506168,0.498595,0.23323,5.141631,-11.564384,0.644635,0.059453,115.233833,0.448944
wonder,0.220666,0.651592,0.66045,0.060559,5.260664,-6.736567,0.676935,0.083493,119.018748,0.554548


In [161]:
df.shape

(9894, 15)

## Baseline randomforest

In [162]:
model_frame = df.drop(["artist","class","playlist_id","title"],axis = 1)

In [163]:
X = model_frame.drop("category",axis =1)
y = model_frame["category"]

In [164]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2)

In [165]:
clf = RandomForestClassifier()

In [166]:
clf.fit(X_train,y_train)

RandomForestClassifier()

In [167]:
y_pred = clf.predict(X_test)

In [168]:
print(classification_report(y_test,y_pred))

                   precision    recall  f1-score   support

joyful activation       0.41      0.43      0.42       255
        nostalgia       0.29      0.27      0.28       256
    peacefullness       0.56      0.52      0.54       260
            power       0.19      0.12      0.14       173
          sadness       0.28      0.36      0.31       196
       tenderness       0.23      0.20      0.21       147
          tension       0.31      0.28      0.29       202
    transcendence       0.42      0.50      0.46       231
           wonder       0.34      0.38      0.36       259

         accuracy                           0.36      1979
        macro avg       0.34      0.34      0.34      1979
     weighted avg       0.35      0.36      0.35      1979



## Baseline XGBoost

In [169]:
xgb = XGBClassifier()
xgb.fit(X_train,y_train)
y_pred = xgb.predict(X_test)

In [170]:
print(classification_report(y_test,y_pred))

                   precision    recall  f1-score   support

joyful activation       0.40      0.48      0.44       255
        nostalgia       0.29      0.25      0.27       256
    peacefullness       0.64      0.50      0.56       260
            power       0.23      0.08      0.12       173
          sadness       0.24      0.42      0.30       196
       tenderness       0.30      0.15      0.20       147
          tension       0.30      0.21      0.25       202
    transcendence       0.42      0.52      0.46       231
           wonder       0.32      0.40      0.35       259

         accuracy                           0.35      1979
        macro avg       0.35      0.34      0.33      1979
     weighted avg       0.36      0.35      0.34      1979



## TF-IDF
$w$ = $tf$ $\times$ $log{\frac{N}{docfreq}}$

"TF-IDF compares the number of times a word appears in a doc with the number of docs the word appears in"

in our case,words are musicians and docss are playlist categories

N = total playlists with label $j$

tf = number of times artists $i$ is in playlist category $j$

docfreq = number of playlists that have artist $i$

In [222]:
categories = ["wonder","transcendence","nostalgia","tenderness","peacefullness","power","joyful activation","tension","sadness"]

In [261]:
def tf_idf(artist,label):
    tf = len(df[df.category == label ][df.artist == artist])
    N = len(df[df.category == label ])
    docfreq = len(df[df.artist == artist ])
    
    return tf * math.log(N/docfreq)

In [262]:
def get_tf_idf_wonder(row):
    return tf_idf(row["artist"],"wonder")
def get_tf_idf_transcendence(row):
    return tf_idf(row["artist"],"transcendence")
def get_tf_idf_nostalgia(row):
    return tf_idf(row["artist"],"nostalgia")
def get_tf_idf_tenderness(row):
    return tf_idf(row["artist"],"tenderness")
def get_tf_idf_peacefullness(row):
    return tf_idf(row["artist"],"peacefullness")
def get_tf_idf_power(row):
    return tf_idf(row["artist"],"power")
def get_tf_idf_joy(row):
    return tf_idf(row["artist"],"joyful activation")
def get_tf_idf_tension(row):
    return tf_idf(row["artist"],"tension")
def get_tf_idf_sadness(row):
    return tf_idf(row["artist"],"sadness")

In [263]:
df["tf_idf_wonder"] = df.apply(get_tf_idf_wonder, axis = 1)

  


In [265]:
df["tf_idf_transcendence"] = df.apply(get_tf_idf_transcendence, axis = 1)
df["tf_idf_nostalgia"] = df.apply(get_tf_idf_nostalgia, axis = 1)
df["tf_idf_tenderness"] = df.apply(get_tf_idf_tenderness, axis = 1)
df["tf_idf_peacefullness"] = df.apply(get_tf_idf_peacefullness, axis = 1)
df["tf_idf_power"] = df.apply(get_tf_idf_power, axis = 1)
df["tf_idf_joyful"] = df.apply(get_tf_idf_joy, axis = 1)
df["tf_idf_tension"] = df.apply(get_tf_idf_tension, axis = 1)
df["tf_idf_sadness"] = df.apply(get_tf_idf_sadness, axis = 1)

  


In [None]:
df.to_csv(os.path.join(path,"spotifyquerieswithtfidflabels.csv"))

## Modelling again

In [266]:
model_frame = df.drop(["artist","class","playlist_id","title"],axis = 1)

In [267]:
X = model_frame.drop("category",axis =1)
y = model_frame["category"]

In [268]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2)

In [269]:
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)

In [270]:
print(classification_report(y_test,y_pred))

                   precision    recall  f1-score   support

joyful activation       0.76      0.81      0.78       219
        nostalgia       0.76      0.71      0.73       248
    peacefullness       0.76      0.82      0.79       243
            power       0.64      0.58      0.61       177
          sadness       0.64      0.67      0.66       231
       tenderness       0.68      0.58      0.63       170
          tension       0.70      0.68      0.69       211
    transcendence       0.83      0.82      0.82       227
           wonder       0.67      0.72      0.69       253

         accuracy                           0.72      1979
        macro avg       0.71      0.71      0.71      1979
     weighted avg       0.72      0.72      0.72      1979



In [271]:
df

Unnamed: 0,acousticness,artist,class,danceability,energy,instrumentalness,key,loudness,mode,playlist_id,...,category,tf_idf_wonder,tf_idf_transcendence,tf_idf_nostalgia,tf_idf_tenderness,tf_idf_peacefullness,tf_idf_power,tf_idf_joyful,tf_idf_tension,tf_idf_sadness
0,0.069700,Ava Max,happy,0.614,0.9340,0.000000,9,-3.709,0,37i9dQZF1DXdPec7aLTmlC,...,wonder,9.697801,0.000000,0.000000,0.000000,0.000000,17.831318,4.797442,9.340042,4.699571
1,0.054400,Sia,happy,0.572,0.6230,0.000000,8,-4.828,1,37i9dQZF1DXdPec7aLTmlC,...,wonder,22.944422,0.000000,4.557111,0.000000,4.566749,0.000000,13.612650,4.410465,8.879865
2,0.086300,Sam Feldt,happy,0.667,0.8270,0.000000,11,-5.631,1,37i9dQZF1DXdPec7aLTmlC,...,wonder,14.263121,0.000000,0.000000,0.000000,0.000000,4.363677,23.514783,0.000000,9.210340
3,0.105000,Jason Derulo,happy,0.738,0.8500,0.000000,8,-3.049,1,37i9dQZF1DXdPec7aLTmlC,...,wonder,14.860431,0.000000,0.000000,4.496099,0.000000,9.124061,4.901977,0.000000,9.608042
4,0.217000,Tones And I,happy,0.806,0.5130,0.000000,4,-6.208,1,37i9dQZF1DXdPec7aLTmlC,...,wonder,6.452049,0.000000,0.000000,0.000000,0.000000,0.000000,6.400257,0.000000,0.000000
5,0.151000,24kGoldn,happy,0.721,0.7450,0.000001,7,-3.508,0,37i9dQZF1DXdPec7aLTmlC,...,wonder,6.452049,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,6.301703
6,0.017000,Engelwood,happy,0.570,0.9520,0.579000,7,-5.365,1,37i9dQZF1DXdPec7aLTmlC,...,wonder,7.144407,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
7,0.025500,Nelly,happy,0.707,0.6890,0.000000,4,-7.097,0,37i9dQZF1DXdPec7aLTmlC,...,wonder,5.760478,0.000000,0.000000,10.601628,0.000000,0.000000,0.000000,5.580673,0.000000
8,0.058000,Ritt Momney,happy,0.399,0.4920,0.001010,6,-10.777,0,37i9dQZF1DXdPec7aLTmlC,...,wonder,12.904098,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
9,0.131000,Shawn Mendes,happy,0.333,0.6370,0.000018,1,-4.904,0,37i9dQZF1DXdPec7aLTmlC,...,wonder,15.777186,0.000000,3.912823,17.470402,39.223693,0.000000,3.893452,3.767691,11.390510
