<a href="https://colab.research.google.com/github/ryanczhang7/spotifyproject/blob/master/Spotify_Machine_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction import text

In [0]:
df_personality_sentiment = pd.read_csv(
    "/content/drive/My Drive/df_personality_sentiment.csv")
df_songs = pd.read_csv("/content/drive/My Drive/df_songs.csv")

**I am trying to predict the genre of a song. Besides all the feature selection and engineering, model selection, and hyperparameter tuning, I have audio features and textual features of a song. Between audio features and textual features, which one will be better for predicting popularity and genre of a song? Because I am comparing audio features and textual features, I will not use artist or album as categorical features.**

# Audio Features

In [67]:
df_songs.head()

Unnamed: 0,artist,album,name,explicit,key,mode,time_signature,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,genre,duration_s
0,Kendrick Lamar,Black Panther The Album Music From And Inspire...,Black Panther,1,1,1,4,0.625,0.618,0.582,4e-06,0.265,-9.454,0.297,90.035,0.48,57,hiphop,130.613
1,Kendrick Lamar,Black Panther The Album Music From And Inspire...,All The Stars,1,8,1,4,0.0605,0.698,0.633,0.000194,0.0926,-4.946,0.0597,96.924,0.552,78,hiphop,232.186
2,Kendrick Lamar,Black Panther The Album Music From And Inspire...,X,1,2,1,4,0.0201,0.768,0.471,0.0,0.268,-8.406,0.259,131.023,0.405,69,hiphop,267.426
3,Kendrick Lamar,Black Panther The Album Music From And Inspire...,The Ways,1,11,0,4,0.0626,0.727,0.72,1e-06,0.176,-5.856,0.0488,140.08,0.589,65,hiphop,238.893
4,Kendrick Lamar,Black Panther The Album Music From And Inspire...,Opps,1,1,1,4,0.152,0.706,0.775,3.3e-05,0.416,-6.819,0.335,127.929,0.847,59,hiphop,180.893


**I will mainly be choosing between Logistic Regression, KNeighbors Classifier, and Random Forest Classifier. I start out with just Logistic and KNeighbors because I haven't chosen features yet, and Random Forest won't work well on just one feature.**

In [27]:
pipeline = make_pipeline(
    StandardScaler(),
    LogisticRegression()
)

X_train = df_songs[["danceability"]]
y_train = df_songs["genre"]

pipeline.fit(X_train, y_train)
y_train_ = pipeline.predict(X_train)
(f1_score(y_train, y_train_, pos_label="hiphop"), 
 f1_score(y_train, y_train_, pos_label="pop"))

(0.4046094750320103, 0.6463878326996197)

In [28]:
pipeline = make_pipeline(
    StandardScaler(),
    KNeighborsClassifier()
)

X_train = df_songs[["danceability"]]
y_train = df_songs["genre"]

pipeline.fit(X_train, y_train)
y_train_ = pipeline.predict(X_train)
(f1_score(y_train, y_train_, pos_label="hiphop"), 
 f1_score(y_train, y_train_, pos_label="pop"))

(0.6719999999999999, 0.7007299270072992)

**KNeighbors works better so I stick with KNeighbors when choosing features. I am not using explicit as an audio feature because it is more accurate to describe it as a textual feature.**

In [64]:
pop_f1 = []
hiphop_f1 = []
feature = []
for features in [["danceability"],
                 ["key"],
                 ["mode"],
                 ["energy"],
                 ["instrumentalness"],
                 ["liveness"],
                 ["loudness"],
                 ["acousticness"],
                 ["speechiness"],
                 ["tempo"],
                 ["valence"],
                 ["time_signature"],
                 ["duration_s"],
                 ["popularity"]]:

  ct = make_column_transformer(
      (StandardScaler(), features),
      remainder = "drop"
  )

  pipeline = make_pipeline(
      ct,
      KNeighborsClassifier()
  )

  X_train = df_songs[["danceability", "key", "mode", "energy", "acousticness",
                      "instrumentalness", "liveness", "loudness", "speechiness", 
                      "tempo", "valence", "time_signature", "duration_s",
                      "popularity"]]
  y_train = df_songs["genre"]

  pipeline.fit(X_train, y_train)
  y_train_ = pipeline.predict(X_train)
  feature.append(features)
  hiphop_f1.append(f1_score(y_train, y_train_, pos_label="hiphop"))
  pop_f1.append(f1_score(y_train, y_train_, pos_label="pop"))

df = {}
df["feature"] = feature
df["pop"] = pop_f1
df["hiphop"] = hiphop_f1
df1 = pd.DataFrame(df)
df1["avg"] = (df1["pop"] + df1["hiphop"]) / 2
df1

Unnamed: 0,feature,pop,hiphop,avg
0,[danceability],0.70073,0.672,0.686365
1,[key],0.541863,0.528558,0.53521
2,[mode],0.430584,0.486388,0.458486
3,[energy],0.717668,0.689379,0.703524
4,[instrumentalness],0.43202,0.658518,0.545269
5,[liveness],0.721739,0.661734,0.691736
6,[loudness],0.697842,0.658537,0.678189
7,[acousticness],0.702997,0.671357,0.687177
8,[speechiness],0.795394,0.761117,0.778256
9,[tempo],0.71916,0.663169,0.691165


**Explicit and speechiness are the best. Duration_s and energy are above 70.Liveness, tempo,  and valence aren't bad.**

In [42]:
pop_f1 = []
hiphop_f1 = []
feature = []
for features in [["speechiness"],
                 ["speechiness", "duration_s", "energy"],
                 ["speechiness", "duration_s", "energy", "valence"],
                 ["speechiness", "duration_s", "energy", "valence", "liveness"],
                 ["speechiness", "duration_s", "energy", "valence", "tempo"],
                 ["speechiness", "duration_s", "energy", "liveness", 
                  "tempo", "valence"]]:
  ct = make_column_transformer(
      (StandardScaler(), features),
      remainder = "drop"
  )

  pipeline = make_pipeline(
      ct,
      KNeighborsClassifier()
  )

  X_train = df_songs[["energy", "liveness", "speechiness", "tempo", 
                      "valence", "duration_s", "explicit"]]
  y_train = df_songs["genre"]

  pipeline.fit(X_train, y_train)
  y_train_ = pipeline.predict(X_train)
  feature.append(features)
  hiphop_f1.append(f1_score(y_train, y_train_, pos_label="hiphop"))
  pop_f1.append(f1_score(y_train, y_train_, pos_label="pop"))

df = {}
df["feature"] = feature
df["pop"] = pop_f1
df["hiphop"] = hiphop_f1
df1 = pd.DataFrame(df)
df1["avg"] = (df1["pop"] + df1["hiphop"]) / 2
df1

Unnamed: 0,feature,pop,hiphop,avg
0,[speechiness],0.795394,0.761117,0.778256
1,"[speechiness, duration_s, energy]",0.813743,0.791919,0.802831
2,"[speechiness, duration_s, energy, valence]",0.825737,0.800409,0.813073
3,"[speechiness, duration_s, energy, valence, liv...",0.819469,0.78882,0.804144
4,"[speechiness, duration_s, energy, valence, tempo]",0.817621,0.784599,0.80111
5,"[speechiness, duration_s, energy, liveness, te...",0.817467,0.780231,0.798849


**So I choose speechiness, duration_s, and energy and valence. Now that I have features, I want to try RandomForest. And I want to estimate test error now.**

In [61]:
pipeline = make_pipeline(
    StandardScaler(),
    RandomForestClassifier()
)

cv_errs = cross_val_score(pipeline,
             df_songs[["speechiness", "duration_s", "energy", "valence"]],
             df_songs["genre"], scoring="f1_macro", cv=10)

cv_errs.mean()

0.7466510479697905

# Textual Features

In [71]:
df_personality_sentiment.head()

Unnamed: 0,artist,album,name,explicit,genre,chain_lyrics,lyrics,polarity,magnitude,artistic,emotion,imagination,assertive,cheeful,outgoing,modesty,sympathy,fiery,melancholy
0,Kendrick Lamar,Black Panther The Album Music From And Inspire...,Black Panther,1,hiphop,why i go easy \n know why i go easy \n wait \n...,why i go easy know why i go easy wait king of ...,-1.0,1.2,0.999983,0.931594,0.998384,0.095272,0.387043,0.101374,0.421143,0.985013,0.386352,0.921542
1,Kendrick Lamar,Black Panther The Album Music From And Inspire...,All The Stars,1,hiphop,love lets talk about love \n is it anything an...,love lets talk about love is it anything and e...,-1.0,1.3,0.996511,0.986234,0.991554,0.747748,0.98604,0.705842,0.441873,0.842244,0.157679,0.626636
2,Kendrick Lamar,Black Panther The Album Music From And Inspire...,I Am,1,hiphop,everybody put three fingers in the air \n the ...,everybody put three fingers in the air the sky...,-1.0,2.2,0.906971,0.318395,0.99498,0.574279,0.469771,0.320161,0.02456,0.785487,0.582447,0.669157
3,Kendrick Lamar,Black Panther The Album Music From And Inspire...,Big Shot,1,hiphop,wakanda welcome \n big shot hol up wait peanut...,wakanda welcome big shot hol up wait peanut bu...,-1.0,1.3,0.967339,0.229457,0.975867,0.382183,0.880943,0.477181,0.000747,0.270318,0.787414,0.197281
4,Kendrick Lamar,Black Panther The Album Music From And Inspire...,Pray For Me,1,hiphop,im always ready for a war again \n go down tha...,im always ready for a war again go down that r...,-1.0,1.1,0.98011,0.889895,0.977369,0.307116,0.962168,0.668731,0.864149,0.875111,0.464351,0.871338


**Testing between Logistic Regression, KNeighbors, and Random Forest again.**

In [73]:
pipeline = make_pipeline(
    TfidfVectorizer(),
    LogisticRegression()
)

X_train = df_personality_sentiment["lyrics"]
y_train = df_personality_sentiment["genre"]

pipeline.fit(X_train, y_train)
y_train_ = pipeline.predict(X_train)
(f1_score(y_train, y_train_, pos_label="hiphop"),
    f1_score(y_train, y_train_, pos_label="pop"))

(0.9146853146853148, 0.9341963322545848)

In [74]:
pipeline = make_pipeline(
    TfidfVectorizer(),
    KNeighborsClassifier()
)

X_train = df_personality_sentiment["lyrics"]
y_train = df_personality_sentiment["genre"]

pipeline.fit(X_train, y_train)
y_train_ = pipeline.predict(X_train)
(f1_score(y_train, y_train_, pos_label="hiphop"),
    f1_score(y_train, y_train_, pos_label="pop"))

(0.8333333333333334, 0.8577878103837472)

In [75]:
pipeline = make_pipeline(
    TfidfVectorizer(),
    RandomForestClassifier(max_features = 'sqrt')
)

X_train = df_personality_sentiment["lyrics"]
y_train = df_personality_sentiment["genre"]

pipeline.fit(X_train, y_train)
y_train_ = pipeline.predict(X_train)
(f1_score(y_train, y_train_, pos_label="hiphop"),
 f1_score(y_train, y_train_, pos_label="pop"))

(1.0, 1.0)

**Random Forest wins again, but a score of 1 is too good. This is likely just overfitting. I want to try to estimate the test error.**

In [76]:
pipeline = make_pipeline(
    TfidfVectorizer(),
    RandomForestClassifier(max_features = 'sqrt')
)

cv_errs = cross_val_score(pipeline,
             df_personality_sentiment["lyrics"],
             df_personality_sentiment["genre"], scoring="f1_macro", cv=10)

cv_errs.mean()

0.8327213212954568

**Now I try using the same stopwords that I used for my ngram bar chart.**

In [77]:
my_stop_words = text.ENGLISH_STOP_WORDS.union(
      ["oh", "yeah", "im", "hey"])

pipeline = make_pipeline(
    TfidfVectorizer(stop_words=my_stop_words),
    RandomForestClassifier(max_features = 'sqrt')
)

cv_errs = cross_val_score(pipeline,
             df_personality_sentiment["lyrics"],
             df_personality_sentiment["genre"], scoring="f1_macro", cv=10)

cv_errs.mean()

0.8412252156376174

**This lyric model has already beat the audio feature model, but now we can check textual features.**

In [78]:
f1 = []
feature = []
for features in [["polarity"],
                 ["magnitude"],
                 ["artistic"],
                 ["imagination"],
                 ["emotion"],
                 ["assertive"],
                 ["cheeful"],
                 ["outgoing"],
                 ["modesty"],
                 ["sympathy"],
                 ["fiery"],
                 ["melancholy"]]:
  
  ct = make_column_transformer(
      (StandardScaler(), features),
      remainder = "drop"
  )

  pipeline = make_pipeline(
      ct,
      RandomForestClassifier(max_features = 'sqrt')
  )
  cv_errs = cross_val_score(pipeline,
              df_personality_sentiment[
                      ["polarity", "magnitude", "artistic", 
                      "imagination", "emotion", "assertive", "cheeful", 
                      "outgoing", "outgoing", "modesty", "sympathy", 
                      "fiery", "melancholy"]
                      ],
              df_personality_sentiment["genre"], scoring="f1_macro", cv=10)
  
  feature.append(features)
  f1.append(cv_errs.mean())

df = {}
df["feature"] = feature
df["f1"] = f1
df1 = pd.DataFrame(df)
df1

Unnamed: 0,feature,f1
0,[polarity],0.609134
1,[magnitude],0.670388
2,[artistic],0.522308
3,[imagination],0.50502
4,[emotion],0.611611
5,[assertive],0.61283
6,[cheeful],0.519345
7,[outgoing],0.579484
8,[modesty],0.708254
9,[sympathy],0.552419


In [79]:
f1 = []
feature = []
for features in [["modesty"],
                 ["emotion"],
                 ["assertive"],
                 ["modesty", "assertive"],
                 ["modesty", "emotion"],
                 ["modesty", "assertive", "emotion"],]:
  
  ct = make_column_transformer(
      (StandardScaler(), features),
      remainder = "drop"
  )

  pipeline = make_pipeline(
      ct,
      RandomForestClassifier(max_features = 'sqrt')
  )
  cv_errs = cross_val_score(pipeline,
              df_personality_sentiment[ ["emotion", "assertive", "modesty"]],
              df_personality_sentiment["genre"], scoring="f1_macro", cv=10)
  
  feature.append(features)
  f1.append(cv_errs.mean())

df = {}
df["feature"] = feature
df["f1"] = f1
df1 = pd.DataFrame(df)
df1

Unnamed: 0,feature,f1
0,[modesty],0.708254
1,[emotion],0.610225
2,[assertive],0.61283
3,"[modesty, assertive]",0.755847
4,"[modesty, emotion]",0.733876
5,"[modesty, assertive, emotion]",0.740625


**Modesty and assertive is best. However, I remember that I decided to consider explicit as a textul feature.**

In [81]:
f1 = []
feature = []
for features in [["modesty", "explicit"],
                 ["modesty", "assertive", "explicit"],
                 ["modesty", "assertive", "emotion", "explicit"]]:
  
  ct = make_column_transformer(
      (StandardScaler(), features),
      remainder = "drop"
  )

  pipeline = make_pipeline(
      ct,
      RandomForestClassifier(max_features = 'sqrt')
  )
  cv_errs = cross_val_score(pipeline,
              df_personality_sentiment[["emotion", "assertive", "modesty", 
                                        "explicit"]],
              df_personality_sentiment["genre"], scoring="f1_macro", cv=10)
  
  feature.append(features)
  f1.append(cv_errs.mean())

df = {}
df["feature"] = feature
df["f1"] = f1
df1 = pd.DataFrame(df)
df1

Unnamed: 0,feature,f1
0,"[modesty, explicit]",0.763685
1,"[modesty, assertive, explicit]",0.803143
2,"[modesty, assertive, emotion, explicit]",0.804934


**Hyperparameter tuning using random search cv to randomly select different combinations of parameters at the same time.**

In [91]:
pipeline = make_pipeline(
    StandardScaler(),
    RandomForestClassifier(max_features = 'sqrt')
)

n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1000, num = 10)]
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)] + [None]
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]
random_grid = {'randomforestclassifier__n_estimators': n_estimators,
               'randomforestclassifier__max_depth': max_depth,
               'randomforestclassifier__min_samples_split': min_samples_split,
               'randomforestclassifier__min_samples_leaf': min_samples_leaf,
               'randomforestclassifier__bootstrap': bootstrap}

rs = RandomizedSearchCV(
    pipeline, param_distributions = random_grid, n_iter = 100, 
    scoring="f1_macro", cv = 5)

model = rs.fit(
    df_personality_sentiment[["emotion", "assertive", "modesty", "explicit"]],
    df_personality_sentiment["genre"])

model.best_params_

{'randomforestclassifier__bootstrap': True,
 'randomforestclassifier__max_depth': 10,
 'randomforestclassifier__min_samples_leaf': 2,
 'randomforestclassifier__min_samples_split': 2,
 'randomforestclassifier__n_estimators': 900}

In [98]:
pipeline = make_pipeline(
    StandardScaler(),
    RandomForestClassifier(max_features = 'sqrt', bootstrap=True, max_depth=10,
                           min_samples_leaf=2, min_samples_split=2,
                           n_estimators=900)
)
cv_errs = cross_val_score(pipeline,
            df_personality_sentiment[["emotion", "assertive", "modesty", 
                                      "explicit"]],
            df_personality_sentiment["genre"], scoring="f1_macro", cv=10)
cv_errs.mean()

0.8096435003675915

**Hyperparameter tuning the lyric model.**

In [95]:
my_stop_words = text.ENGLISH_STOP_WORDS.union(
      ["oh", "yeah", "im", "hey"])

pipeline = make_pipeline(
    TfidfVectorizer(stop_words=my_stop_words),
    RandomForestClassifier(max_features = 'sqrt')
)

n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1000, num = 10)]
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)] + [None]
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]
random_grid = {'randomforestclassifier__n_estimators': n_estimators,
               'randomforestclassifier__max_depth': max_depth,
               'randomforestclassifier__min_samples_split': min_samples_split,
               'randomforestclassifier__min_samples_leaf': min_samples_leaf,
               'randomforestclassifier__bootstrap': bootstrap}

rs = RandomizedSearchCV(
    pipeline, param_distributions = random_grid, n_iter = 100, 
    scoring="f1_macro", cv = 5)

model = rs.fit(
    df_personality_sentiment["lyrics"],
    df_personality_sentiment["genre"])

model.best_params_

{'randomforestclassifier__bootstrap': False,
 'randomforestclassifier__max_depth': 70,
 'randomforestclassifier__min_samples_leaf': 2,
 'randomforestclassifier__min_samples_split': 2,
 'randomforestclassifier__n_estimators': 100}

**The best lyric model.**

In [99]:
pipeline = make_pipeline(
    TfidfVectorizer(),
    RandomForestClassifier(max_features = 'sqrt', bootstrap=False, max_depth=70,
                           min_samples_leaf=2, min_samples_split=2,
                           n_estimators=100)
)
cv_errs = cross_val_score(pipeline,
            df_personality_sentiment["lyrics"],
            df_personality_sentiment["genre"], scoring="f1_macro", cv=10)
cv_errs.mean()

0.8535691829107798