In [105]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import BaggingClassifier, VotingClassifier
from sklearn.pipeline import Pipeline
from sklearn import metrics

In [32]:
spotify = pd.read_csv('./Data/Cleaned_SpotifyFeatures-Copy1.csv')
spotify.head()

Unnamed: 0,popularity,acousticness,danceability,energy,instrumentalness,liveness,loudness,mode,speechiness,tempo,...,key_B,key_C,key_C#,key_D,key_D#,key_E,key_F,key_F#,key_G,key_G#
0,13,0.234,0.617,0.862,0.976,0.141,-12.855,1,0.0514,129.578,...,0,0,0,0,0,0,0,0,1,0
1,5,0.249,0.518,0.805,0.0,0.333,-6.248,1,0.0407,79.124,...,0,0,0,0,0,0,1,0,0,0
2,30,0.366,0.631,0.513,4e-06,0.109,-6.376,1,0.0293,120.365,...,0,0,0,1,0,0,0,0,0,0
3,39,0.815,0.768,0.137,0.922,0.113,-13.284,0,0.0747,76.43,...,0,0,1,0,0,0,0,0,0,0
4,70,0.131,0.748,0.627,0.0,0.0852,-6.029,1,0.0644,120.963,...,0,0,0,0,0,0,0,0,1,0


In [37]:
spotify['popularity'] = np.where(spotify['popularity'] <= 33, 0, spotify['popularity']) 
spotify['popularity'] = np.where((spotify['popularity'] >= 34) & (spotify['popularity'] <= 66), 1, spotify['popularity']) 
spotify['popularity'] = np.where(spotify['popularity'] >= 67, 2, spotify['popularity']) 

spotify.head()

Unnamed: 0,popularity,acousticness,danceability,energy,instrumentalness,liveness,loudness,mode,speechiness,tempo,...,key_B,key_C,key_C#,key_D,key_D#,key_E,key_F,key_F#,key_G,key_G#
0,0,0.234,0.617,0.862,0.976,0.141,-12.855,1,0.0514,129.578,...,0,0,0,0,0,0,0,0,1,0
1,0,0.249,0.518,0.805,0.0,0.333,-6.248,1,0.0407,79.124,...,0,0,0,0,0,0,1,0,0,0
2,0,0.366,0.631,0.513,4e-06,0.109,-6.376,1,0.0293,120.365,...,0,0,0,1,0,0,0,0,0,0
3,1,0.815,0.768,0.137,0.922,0.113,-13.284,0,0.0747,76.43,...,0,0,1,0,0,0,0,0,0,0
4,2,0.131,0.748,0.627,0.0,0.0852,-6.029,1,0.0644,120.963,...,0,0,0,0,0,0,0,0,1,0


In [119]:
spotify['popularity'].value_counts(normalize=True).sort_index()

popularity
0    0.393900
1    0.567331
2    0.038769
Name: proportion, dtype: float64

In [122]:
X = spotify[['loudness', 'energy', 'danceability', 'time_signature', 'tempo', 'valence', 'acousticness', 'instrumentalness', 'liveness', 'mode', 'speechiness']]
y = spotify['popularity']


X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=2024)

0.35513147977556314

In [123]:
tree = DecisionTreeClassifier(max_depth=10, min_samples_split=5)

In [124]:
tree.fit(X_train, y_train)

In [125]:
tree.feature_importances_

array([0.05279019, 0.06687141, 0.07329532, 0.00165421, 0.01606584,
       0.08130697, 0.46513408, 0.05968493, 0.03660016, 0.00262978,
       0.14396712])

In [126]:
pd.DataFrame({
    'variable': X.columns,
    'importance': tree.feature_importances_
    }).sort_values('importance', ascending=False)

Unnamed: 0,variable,importance
6,acousticness,0.465134
10,speechiness,0.143967
5,valence,0.081307
2,danceability,0.073295
1,energy,0.066871
7,instrumentalness,0.059685
0,loudness,0.05279
8,liveness,0.0366
4,tempo,0.016066
9,mode,0.00263


In [128]:
tree.score(X_train, y_train), tree.score(X_test, y_test)

(0.7008765318370879, 0.6802194121095385)

In [129]:
rf = RandomForestClassifier(oob_score=True, max_features='sqrt')
rf.fit(X_train, y_train)

In [130]:
rf.score(X_train, y_train), rf.score(X_test, y_test)

(0.99623845015772, 0.7040448873628674)

In [51]:
rf.oob_score_

0.6692767216592692

In [54]:
p = X.shape[1]

In [59]:
rf2 = RandomForestClassifier(n_estimators=150, oob_score=True, max_features='sqrt')

params = {
    'max_depth': np.append(np.arange(1, 21), None),
    'max_features': np.arange(1, p + 1),
    'min_samples_leaf': np.arange(1, 31)
}

rs = RandomizedSearchCV(rf2, params, n_iter=50, cv=5, n_jobs=4)

In [60]:
rs.fit(X_train, y_train)

In [61]:
rs.score(X_train, y_train), rs.score(X_test, y_test)

(0.7570555229880245, 0.6727032911816431)

In [62]:
rs.best_params_

{'min_samples_leaf': 4, 'max_features': 6, 'max_depth': 15}

In [147]:
et = ExtraTreesClassifier(n_estimators=100, max_depth=35, min_samples_leaf=3)
et.fit(X_train, y_train)

In [148]:
et.score(X_train, y_train), et.score(X_test, y_test)

(0.860515590542389, 0.6986852022443681)

In [76]:
spotify2 = pd.read_csv('./Data/Cleaned_SpotifyFeatures-Copy1.csv')

In [72]:
spotify2['popularity'].value_counts()

popularity
0      6252
47     4188
42     4183
41     4175
49     4171
       ... 
94        5
96        4
99        2
100       1
98        1
Name: count, Length: 101, dtype: int64

In [77]:
spotify2['popularity'] = np.where(spotify2['popularity'] <= 20, 0, spotify2['popularity']) 
spotify2['popularity'] = np.where((spotify2['popularity'] >= 21) & (spotify2['popularity'] <= 40), 1, spotify2['popularity']) 
spotify2['popularity'] = np.where((spotify2['popularity'] >= 41) & (spotify2['popularity'] <= 60), 2, spotify2['popularity'])
spotify2['popularity'] = np.where((spotify2['popularity'] >= 61) & (spotify2['popularity'] <= 80), 3, spotify2['popularity'])
spotify2['popularity'] = np.where(spotify2['popularity'] >= 81, 4, spotify2['popularity']) 


spotify2.head()

Unnamed: 0,popularity,acousticness,danceability,energy,instrumentalness,liveness,loudness,mode,speechiness,tempo,...,key_B,key_C,key_C#,key_D,key_D#,key_E,key_F,key_F#,key_G,key_G#
0,0,0.234,0.617,0.862,0.976,0.141,-12.855,1,0.0514,129.578,...,0,0,0,0,0,0,0,0,1,0
1,0,0.249,0.518,0.805,0.0,0.333,-6.248,1,0.0407,79.124,...,0,0,0,0,0,0,1,0,0,0
2,1,0.366,0.631,0.513,4e-06,0.109,-6.376,1,0.0293,120.365,...,0,0,0,1,0,0,0,0,0,0
3,1,0.815,0.768,0.137,0.922,0.113,-13.284,0,0.0747,76.43,...,0,0,1,0,0,0,0,0,0,0
4,3,0.131,0.748,0.627,0.0,0.0852,-6.029,1,0.0644,120.963,...,0,0,0,0,0,0,0,0,1,0


In [120]:
spotify2['popularity'].value_counts(normalize=True)

popularity
2    0.373445
1    0.369154
0    0.169312
3    0.085179
4    0.002910
Name: proportion, dtype: float64

In [121]:
spotify2.columns

Index(['popularity', 'acousticness', 'danceability', 'energy',
       'instrumentalness', 'liveness', 'loudness', 'mode', 'speechiness',
       'tempo', 'time_signature', 'valence', 'genre_A Capella',
       'genre_Alternative', 'genre_Anime', 'genre_Blues',
       'genre_Children's Music', 'genre_Children’s Music', 'genre_Classical',
       'genre_Comedy', 'genre_Country', 'genre_Dance', 'genre_Electronic',
       'genre_Folk', 'genre_Hip-Hop', 'genre_Indie', 'genre_Jazz',
       'genre_Movie', 'genre_Opera', 'genre_Pop', 'genre_R&B', 'genre_Rap',
       'genre_Reggae', 'genre_Reggaeton', 'genre_Rock', 'genre_Ska',
       'genre_Soul', 'genre_Soundtrack', 'genre_World', 'key_A#', 'key_B',
       'key_C', 'key_C#', 'key_D', 'key_D#', 'key_E', 'key_F', 'key_F#',
       'key_G', 'key_G#'],
      dtype='object')

In [79]:
X2 = spotify2[['loudness', 'energy', 'danceability', 'time_signature', 'tempo', 'valence']]
y2 = spotify2['popularity']

In [80]:
logreg = LogisticRegression()

X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, random_state=2024)

In [82]:
logreg.fit(X2_train, y2_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [87]:
logreg.score(X2_train, y2_train), logreg.score(X2_test, y2_test)

(0.44726153588476675, 0.44768026128464955)

In [115]:
pipe = Pipeline([
    ('sc', StandardScaler()),
    ('logreg', LogisticRegression())
])

param_grid = {
    'logreg__C': np.logspace(-3, 3, 20),
    'logreg__penalty': ['l1', 'l2'],
    'logreg__solver': ['liblinear'] 
}

gs=GridSearchCV(pipe, param_grid, cv=5, n_jobs=4)

In [116]:
gs.fit(X2_train, y2_train)

In [117]:
gs.best_params_

{'logreg__C': 12.742749857031322,
 'logreg__penalty': 'l2',
 'logreg__solver': 'liblinear'}

In [118]:
gs.score(X2_train, y2_train), gs.score(X2_test, y2_test)

(0.45849035535828936, 0.45865086676157774)

In [101]:
et2 = ExtraTreesClassifier(n_estimators=200)
et2.fit(X2_train, y2_train)
et2.score(X2_train, y2_train), et2.score(X2_test, y2_test)

(0.991597576975686, 0.5104681349970689)

In [103]:
rf3 = RandomForestClassifier(n_estimators=150, oob_score=True, max_features='sqrt')

params2 = {
    'max_depth': np.append(np.arange(1, 21), None),
    'max_features': np.arange(1, p + 1),
    'min_samples_leaf': np.arange(1, 31)
}

rs2 = RandomizedSearchCV(rf3, params2, n_iter=50, cv=5, n_jobs=4)
rs2.fit(X2_train, y2_train), rs2.score(X2_test, y2_test)

(RandomizedSearchCV(cv=5,
                    estimator=RandomForestClassifier(n_estimators=150,
                                                     oob_score=True),
                    n_iter=50, n_jobs=4,
                    param_distributions={'max_depth': array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
        20, None], dtype=object),
                                         'max_features': array([1, 2, 3, 4, 5, 6]),
                                         'min_samples_leaf': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30])}),
 0.5050665773385814)

In [104]:
rs2.score(X2_test, y2_test)

0.5050665773385814