In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn import metrics

In [4]:
spotify = pd.read_csv('./Data/Cleaned_SpotifyFeatures-Copy1.csv')
spotify.head()

Unnamed: 0,popularity,acousticness,danceability,energy,instrumentalness,liveness,loudness,mode,speechiness,tempo,...,key_B,key_C,key_C#,key_D,key_D#,key_E,key_F,key_F#,key_G,key_G#
0,13,0.234,0.617,0.862,0.976,0.141,-12.855,1,0.0514,129.578,...,0,0,0,0,0,0,0,0,1,0
1,5,0.249,0.518,0.805,0.0,0.333,-6.248,1,0.0407,79.124,...,0,0,0,0,0,0,1,0,0,0
2,30,0.366,0.631,0.513,4e-06,0.109,-6.376,1,0.0293,120.365,...,0,0,0,1,0,0,0,0,0,0
3,39,0.815,0.768,0.137,0.922,0.113,-13.284,0,0.0747,76.43,...,0,0,1,0,0,0,0,0,0,0
4,70,0.131,0.748,0.627,0.0,0.0852,-6.029,1,0.0644,120.963,...,0,0,0,0,0,0,0,0,1,0


In [5]:
pop_corr = spotify.corr()[['popularity']].sort_values(by='popularity', ascending = False)
pop_corr

Unnamed: 0,popularity
popularity,1.0
genre_Pop,0.405307
loudness,0.340087
genre_Rock,0.332752
genre_Rap,0.314598
genre_Children’s Music,0.298279
genre_Dance,0.283525
genre_Alternative,0.281333
genre_Hip-Hop,0.279976
energy,0.251953


In [6]:
spotify_pop = spotify[spotify['popularity'

In [7]:
X = spotify[['loudness', 'energy', 'danceability', 'time_signature', 'tempo', 'valence', 'acousticness', 'speechiness', 'mode']]
y = spotify['popularity']
over50 = spotify[spotify['popularity'] >= 51] 
X50 = over50[['loudness', 'energy', 'danceability', 'time_signature', 'tempo', 'valence']]
y50 = over50['popularity']
# loudness
# energy
# danceability
# time_signature
# tempo
# valence

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2024)

In [8]:
linreg = LinearRegression()
linreg.fit(X, y)

In [9]:
print(linreg.coef_)
print(linreg.intercept_)

[ 6.55357537e-01 -5.43588922e+00  1.59080692e+01  3.93026258e+00
 -3.25325284e-03 -1.24421403e+01 -1.19413907e+01 -1.14688129e+01
 -1.78088875e+00]
48.03344263788385


In [10]:
set(zip(X.columns, linreg.coef_))

{('acousticness', -11.941390668676458),
 ('danceability', 15.908069176347084),
 ('energy', -5.435889215412164),
 ('loudness', 0.6553575367185144),
 ('mode', -1.7808887526934307),
 ('speechiness', -11.46881294784629),
 ('tempo', -0.003253252836209697),
 ('time_signature', 3.930262579616929),
 ('valence', -12.442140342909301)}

In [11]:
X.corr()

Unnamed: 0,loudness,energy,danceability,time_signature,tempo,valence,acousticness,speechiness,mode
loudness,1.0,0.823795,0.44565,0.202477,0.242821,0.416866,-0.695177,-0.005159,-0.025732
energy,0.823795,1.0,0.345714,0.185818,0.240676,0.440561,-0.733269,0.15207,-0.043849
danceability,0.44565,0.345714,1.0,0.186259,0.034382,0.580372,-0.363322,0.131468,-0.054986
time_signature,0.202477,0.185818,0.186259,1.0,0.038446,0.147956,-0.224078,-0.100139,-0.019455
tempo,0.242821,0.240676,0.034382,0.038446,1.0,0.152234,-0.256391,-0.097891,0.011332
valence,0.416866,0.440561,0.580372,0.147956,0.152234,1.0,-0.339131,0.019026,0.004959
acousticness,-0.695177,-0.733269,-0.363322,-0.224078,-0.256391,-0.339131,1.0,0.173311,0.056179
speechiness,-0.005159,0.15207,0.131468,-0.100139,-0.097891,0.019026,0.173311,1.0,-0.020682
mode,-0.025732,-0.043849,-0.054986,-0.019455,0.011332,0.004959,0.056179,-0.020682,1.0


In [12]:
y_pred = linreg.predict(X)
y_pred

array([32.05866877, 35.79119422, 44.33573723, ..., 44.98856883,
       47.84668282, 40.3095113 ])

In [13]:
linreg.score(X, y)

0.2060010575895509

In [14]:
metrics.r2_score(y, y_pred)

0.2060010575895509

In [15]:
np.log2(X_train.shape[0])

17.128598540544168

In [16]:
rf = RandomForestRegressor(n_estimators=300, oob_score=True, max_features='sqrt')

params = {
    'max_depth': np.append(np.arange(1, 17), None),
    'max_features': np.arange(1, 11),
    'min_samples_leaf': np.arange(1, 31)
}

rs = RandomizedSearchCV(rf, params, n_iter=100, cv=5, n_jobs=4)

In [18]:
rs.fit(X_train, y_train)

In [19]:
print(rs.score(X_train, y_train))
print(rs.score(X_test, y_test))

0.8490723442018786
0.358635670231346


In [17]:
rf2 = RandomForestRegressor(n_estimators=300, random_state=2024, **rs.best_params_)


AttributeError: 'RandomizedSearchCV' object has no attribute 'best_params_'

In [62]:
rf2.fit(X_train, y_train)
rf2.score(X_test, y_test)

0.2985453467368806

In [67]:
params = {
    'n_estimators':[100],
    'max_depth': np.append(np.arange(1, 17), None)
}

gb=GradientBoostingRegressor(random_state=2024)
gs=GridSearchCV(gb, params, cv=5, n_jobs=4)

In [68]:
%%time
gs.fit(X_train, y_train)

CPU times: user 1min 33s, sys: 1.47 s, total: 1min 34s
Wall time: 30min 19s


In [73]:
gs.best_params_

{'max_depth': 12, 'n_estimators': 100}

In [69]:
gs.score(X_train, y_train), gs.score(X_test, y_test)

(0.5386912220881853, 0.2595061641811066)

In [70]:
X2 = spotify[['loudness', 'energy', 'danceability', 'time_signature', 'tempo', 'valence',]]

In [72]:
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y, random_state=2024)

In [75]:
gs.fit(X2_train, y2_train)
gs.score(X2_test, y2_test)

0.25765080717503497

In [86]:
logreg = LogisticRegression()
logreg.fit(X2_train, y2_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [113]:
logreg.best_params_

AttributeError: 'LogisticRegression' object has no attribute 'best_params_'

In [87]:
logreg.score(X2_train, y2_train), logreg.score(X2_test, y2_test)

(0.03966027412556179, 0.03839711916924881)

In [88]:
et = ExtraTreesClassifier(n_estimators=100, random_state=2024)
et.fit(X2_train, y2_train)

In [89]:
et.score(X2_train, y2_train), et.score(X2_test, y2_test)

(0.9393197108003238, 0.031927811741060214)

In [94]:
rf_grid = RandomForestRegressor(n_estimators=500, random_state=2024, max_depth=12)
gs=GridSearchCV(rf_grid, params, cv=5, n_jobs=4)

In [95]:
gs.fit(X2_train, y2_train)

In [96]:
gs.score(X2_test, y2_test)

0.2915917490171356

In [105]:
gs.best_params_

{'max_depth': None, 'n_estimators': 100}

In [106]:
gbr = GradientBoostingRegressor(random_state=2024)
params={
    'max_depth': [None, 12]
}   
gs2=GridSearchCV(gbr, params, cv=5, n_jobs=4)
gs2.fit(X2_train, y2_train)

In [107]:
gs2.score(X2_test, y2_test)

0.25765080717503497

In [102]:
X50.shape

(46847, 6)

In [104]:
X50_train, X50_test, y50_train, y50_test = train_test_split(X50, y50, random_state = 2024)

In [112]:
rf.fit(X50_train, y50_train)
rf.score(X50_test, y50_test)

0.13332465872805455

In [108]:
rf2.fit(X50_train, y50_train)
rf2.score(X50_test, y50_test)

0.12486011793464935

In [109]:
gs2.fit(X50_train, y50_train)
gs2.score(X50_test, y50_test)

0.07125905485854511

In [110]:
gs.fit(X50_train, y50_train)
gs.score(X50_test, y50_test)

0.12486011793464935

In [111]:
rs.fit(X50_train, y50_train)
rs.score(X50_test, y50_test)

0.13390501457317305

In [None]:
params = {
    'max_depth': np.append(np.arange(1, 17), None),
    'max_features': np.arange(1, 11),
    'min_samples_leaf': np.arange(1, 31)
}

rs = RandomizedSearchCV(rf, params, n_iter=100, cv=5, n_jobs=4)

In [None]:
params = {
    'C' : [0.1, 1, 10, 100, 1000]
}

svc = LinearSVC(max_iter=20000)
gs = GridSearchCV(svc, params, cv=5,)
gs.fit(X_train_sc, y_train)