In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns 
from scipy.stats import t,ttest_1samp,ttest_ind
from scipy.stats import t , skewnorm , skew , norm , ttest_1samp , ttest_ind
import matplotlib.ticker as ticker
from sklearn import datasets, linear_model
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.manifold import TSNE

from numpy.random import seed
from matplotlib import pyplot as plt
%matplotlib inline
sns.set_style("whitegrid")

In [None]:
df = pd.read_csv('SpotifyFeatures.csv')
df.head()

# in the above two cells we've imported the data .
## from the data we can see that dataset consists of songs along with
## various attributes like duration,energy ,key, liveness etc.

In [12]:
pd.isnull(df).sum()


genre               0
artist_name         0
track_name          0
track_id            0
popularity          0
acousticness        0
danceability        0
duration_ms         0
energy              0
instrumentalness    0
key                 0
liveness            0
loudness            0
mode                0
speechiness         0
tempo               0
time_signature      0
valence             0
dtype: int64

In [15]:
np.percentile(df['popularity'],90)

63.0

### we can say that any points outside the range [lower_bound,upper_bound] are outliers
### interestingly in outliers also points such lower tempos (i.e slower songs)
### have significantly lesser popularity than points with higher tempos 
### so we can say that songs with higher tempos are more likely to be popular 

## Feature Engineering ##

In [3]:
df.dtypes

genre                object
artist_name          object
track_name           object
track_id             object
popularity            int64
acousticness        float64
danceability        float64
duration_ms           int64
energy              float64
instrumentalness    float64
key                  object
liveness            float64
loudness            float64
mode                 object
speechiness         float64
tempo               float64
time_signature       object
valence             float64
dtype: object

In [4]:
df.select_dtypes(include=['object']).columns

Index(['genre', 'artist_name', 'track_name', 'track_id', 'key', 'mode',
       'time_signature'],
      dtype='object')

In [5]:
# apart from the artist_name,track_name,track_id we have 3 catagorical variables which are
# - key 
# - mode
# - time_signature

In [6]:
list_of_keys = df['key'].unique()

In [7]:
len(df['key'].unique())

12

In [8]:
for i in range(len(df['key'].unique())):
    df.loc[df['key'] == list_of_keys[i], 'key'] = i
    

In [9]:
df.head()

Unnamed: 0,genre,artist_name,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,Movie,Henri Salvador,C'est beau de faire un Show,0BRjO6ga9RKCKjfDqeFgWV,0,0.611,0.389,99373,0.91,0.0,0,0.346,-1.828,Major,0.0525,166.969,4/4,0.814
1,Movie,Martin & les fées,Perdu d'avance (par Gad Elmaleh),0BjC1NfoEOOusryehmNudP,1,0.246,0.59,137373,0.737,0.0,1,0.151,-5.559,Minor,0.0868,174.003,4/4,0.816
2,Movie,Joseph Williams,Don't Let Me Be Lonely Tonight,0CoSDzoNIKCRs124s9uTVy,3,0.952,0.663,170267,0.131,0.0,2,0.103,-13.879,Minor,0.0362,99.488,5/4,0.368
3,Movie,Henri Salvador,Dis-moi Monsieur Gordon Cooper,0Gc6TVm52BwZD07Ki6tIvf,0,0.703,0.24,152427,0.326,0.0,0,0.0985,-12.178,Major,0.0395,171.758,4/4,0.227
4,Movie,Fabien Nataf,Ouverture,0IuslXpMROHdEPvSl1fTQK,4,0.95,0.331,82625,0.225,0.123,3,0.202,-21.15,Major,0.0456,140.576,4/4,0.39


In [10]:
df.loc[df['mode'] == 'Major', 'mode'] = 1
df.loc[df['mode'] == 'Minor', 'mode'] = 0



In [11]:
df.head()

Unnamed: 0,genre,artist_name,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,Movie,Henri Salvador,C'est beau de faire un Show,0BRjO6ga9RKCKjfDqeFgWV,0,0.611,0.389,99373,0.91,0.0,0,0.346,-1.828,1,0.0525,166.969,4/4,0.814
1,Movie,Martin & les fées,Perdu d'avance (par Gad Elmaleh),0BjC1NfoEOOusryehmNudP,1,0.246,0.59,137373,0.737,0.0,1,0.151,-5.559,0,0.0868,174.003,4/4,0.816
2,Movie,Joseph Williams,Don't Let Me Be Lonely Tonight,0CoSDzoNIKCRs124s9uTVy,3,0.952,0.663,170267,0.131,0.0,2,0.103,-13.879,0,0.0362,99.488,5/4,0.368
3,Movie,Henri Salvador,Dis-moi Monsieur Gordon Cooper,0Gc6TVm52BwZD07Ki6tIvf,0,0.703,0.24,152427,0.326,0.0,0,0.0985,-12.178,1,0.0395,171.758,4/4,0.227
4,Movie,Fabien Nataf,Ouverture,0IuslXpMROHdEPvSl1fTQK,4,0.95,0.331,82625,0.225,0.123,3,0.202,-21.15,1,0.0456,140.576,4/4,0.39


In [12]:
time_signatures = df['time_signature'].unique()

In [13]:
for i in range(len(time_signatures)):
    df.loc[df['time_signature']==time_signatures[i],'time_signature'] = i

In [14]:
df.head()

Unnamed: 0,genre,artist_name,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,Movie,Henri Salvador,C'est beau de faire un Show,0BRjO6ga9RKCKjfDqeFgWV,0,0.611,0.389,99373,0.91,0.0,0,0.346,-1.828,1,0.0525,166.969,0,0.814
1,Movie,Martin & les fées,Perdu d'avance (par Gad Elmaleh),0BjC1NfoEOOusryehmNudP,1,0.246,0.59,137373,0.737,0.0,1,0.151,-5.559,0,0.0868,174.003,0,0.816
2,Movie,Joseph Williams,Don't Let Me Be Lonely Tonight,0CoSDzoNIKCRs124s9uTVy,3,0.952,0.663,170267,0.131,0.0,2,0.103,-13.879,0,0.0362,99.488,1,0.368
3,Movie,Henri Salvador,Dis-moi Monsieur Gordon Cooper,0Gc6TVm52BwZD07Ki6tIvf,0,0.703,0.24,152427,0.326,0.0,0,0.0985,-12.178,1,0.0395,171.758,0,0.227
4,Movie,Fabien Nataf,Ouverture,0IuslXpMROHdEPvSl1fTQK,4,0.95,0.331,82625,0.225,0.123,3,0.202,-21.15,1,0.0456,140.576,0,0.39


## set the threshold for popularity##

In [1]:
# Definition of Popularity 
# we define a song to be popular if it is 
# more popular than 90 percentile of popularity

In [16]:
np.percentile(df['popularity'],90)

63.0

In [17]:
# so in our case we define a song to be popular if its popularity is greater than 63 else not 
# this is turning into a classification problem 


## Machine Learning ##

In [18]:
df.loc[df['popularity'] < 63, 'popularity'] = 0 
df.loc[df['popularity'] >= 63, 'popularity'] = 1

In [19]:
df.loc[df['popularity'] == 1]


Unnamed: 0,genre,artist_name,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
135,R&B,Mary J. Blige,Be Without You - Kendu Mix,2YegxR5As7BeQuVp2U6pek,1,0.083000,0.724,246333,0.689,0.000000,8,0.3040,-5.922,0,0.1350,146.496,0,0.693
136,R&B,Rihanna,Desperado,6KFaHC9G178beAp7P0Vi5S,1,0.323000,0.685,186467,0.610,0.000000,2,0.1020,-5.221,0,0.0439,94.384,2,0.323
139,R&B,Olivia O'Brien,Love Myself,4XzgjxGKqULifVf7mnDIQK,1,0.596000,0.653,213947,0.621,0.000000,11,0.0811,-5.721,0,0.0409,100.006,0,0.466
141,R&B,Nao,Make It Out Alive (feat. SiR),21Ft8ME799DMZjxNwmui6Z,1,0.667000,0.670,239147,0.649,0.000130,2,0.1090,-7.426,0,0.0875,147.935,0,0.618
142,R&B,Frank Ocean,Seigfried,1BViPjTT585XAhkUUrkts0,1,0.975000,0.377,334570,0.255,0.000208,5,0.1020,-11.165,0,0.0387,125.004,1,0.370
144,R&B,Usher,You Make Me Wanna...,47TqCCnEliDp8NRDyIQoQq,1,0.035900,0.761,219120,0.639,0.000000,3,0.0945,-7.577,0,0.0539,164.088,0,0.922
145,R&B,Mariah Carey,Hero,4FCb4CUbFCMNRkI6lYc1zI,1,0.735000,0.501,257733,0.378,0.000000,5,0.1190,-9.370,1,0.0290,119.987,0,0.178
146,R&B,Beyoncé,Flawless Remix,0zVMzJ37VQNFUNvdxxat2E,1,0.241000,0.639,234413,0.659,0.000008,7,0.4840,-7.273,1,0.2150,136.801,0,0.457
147,R&B,Jason Derulo,Tip Toe (feat. French Montana),2z4pcBLQXF2BXKFvd0BuB6,1,0.023300,0.845,187521,0.709,0.000000,9,0.0940,-4.547,0,0.0714,98.062,0,0.620
149,R&B,Big Sean,I Know,6rje9f1wRFJDO2iTORw0lH,1,0.288000,0.741,319973,0.345,0.000000,0,0.1170,-11.310,1,0.0754,120.015,0,0.277


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier


from sklearn.metrics import make_scorer, accuracy_score, roc_auc_score 
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split

In [None]:
# pip install xgboost

In [None]:
features = ["acousticness", "danceability", "duration_ms", "energy", "instrumentalness", "key", "liveness", 
            "mode", "speechiness", "tempo", "time_signature", "valence"]

In [None]:
training = df.sample(frac = 0.8,random_state = 420)
X_train = training[features]
y_train = training['popularity']
X_test = df.drop(training.index)[features]

In [None]:
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size = 0.2, random_state = 420)

In [34]:
LR_Model = LogisticRegression()
LR_Model.fit(X_train, y_train)
LR_Predict = LR_Model.predict(X_valid)
LR_Accuracy = accuracy_score(y_valid, LR_Predict)
print("Accuracy: " + str(LR_Accuracy))

LR_AUC = roc_auc_score(y_valid, LR_Predict) 
print("AUC: " + str(LR_AUC*100))



Accuracy: 0.8895692340745515
AUC: 50.0


In [35]:
KNN_Model = KNeighborsClassifier()
KNN_Model.fit(X_train, y_train)
KNN_Predict = KNN_Model.predict(X_valid)
KNN_Accuracy = accuracy_score(y_valid, KNN_Predict)
print("Accuracy: " + str(KNN_Accuracy))

KNN_AUC = roc_auc_score(y_valid, KNN_Predict) 
print("AUC: " + str(KNN_AUC*100))

Accuracy: 0.8747985820174025
AUC: 59.234086855334766


In [36]:
DT_Model = DecisionTreeClassifier()
DT_Model.fit(X_train, y_train)
DT_Predict = DT_Model.predict(X_valid)
DT_Accuracy = accuracy_score(y_valid, DT_Predict)
print("Accuracy: " + str(DT_Accuracy))

DT_AUC = roc_auc_score(y_valid, DT_Predict) 
print("AUC: " + str(DT_AUC))

Accuracy: 0.9157804275432377
AUC: 0.8334886227532248


In [37]:
RFC_Model = RandomForestClassifier()
RFC_Model.fit(X_train, y_train)
RFC_Predict = RFC_Model.predict(X_valid)
RFC_Accuracy = accuracy_score(y_valid, RFC_Predict)
print("Accuracy: " + str(RFC_Accuracy))

RFC_AUC = roc_auc_score(y_valid, RFC_Predict) 
print("AUC: " + str(RFC_AUC))



Accuracy: 0.9542109786228381
AUC: 0.8122760596076434


In [38]:

model_performance_accuracy = pd.DataFrame({'Model': ['LogisticRegression', 
                                                      'RandomForestClassifier', 
                                                      'KNeighborsClassifier',
                                                      'DecisionTreeClassifier'
                                                    ],
                                            'Accuracy': [LR_Accuracy,
                                                         RFC_Accuracy,
                                                         KNN_Accuracy,
                                                         DT_Accuracy
                                                         ]})

model_performance_AUC = pd.DataFrame({'Model': ['LogisticRegression', 
                                                      'RandomForestClassifier', 
                                                      'KNeighborsClassifier',
                                                      'DecisionTreeClassifier'],
                                            'AUC': [LR_AUC,
                                                         RFC_AUC,
                                                         KNN_AUC,
                                                         DT_AUC]})

In [39]:
model_performance_accuracy.sort_values(by = "Accuracy", ascending = False)


Unnamed: 0,Model,Accuracy
1,RandomForestClassifier,0.954211
3,DecisionTreeClassifier,0.91578
0,LogisticRegression,0.889569
2,KNeighborsClassifier,0.874799


### random forest got the most accuracy , lets see if we can optimize it further using hyperparameter tuning 

In [44]:
from sklearn.model_selection import RandomizedSearchCV
import pprint # for pretty printing


# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

pprint.pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}


In [50]:

improved_model = RandomForestClassifier(bootstrap = True,
 max_depth = 70,
 max_features =  'auto',
 min_samples_leaf= 4,
 min_samples_split =  10,
 n_estimators = 400)

In [51]:
improved_model.fit(X_train, y_train)
RFC_Predict = improved_model.predict(X_valid)
RFC_Accuracy = accuracy_score(y_valid, RFC_Predict)
print("Accuracy: " + str(RFC_Accuracy))

RFC_AUC = roc_auc_score(y_valid, RFC_Predict) 
print("AUC: " + str(RFC_AUC))

Accuracy: 0.9200236330432915
AUC: 0.6456636441443707


In [53]:
improved_model.fit(X_train, y_train)
RFC_Predict = improved_model.predict(X_test)
RFC_Accuracy = accuracy_score(y_test, RFC_Predict)
print("Accuracy: " + str(RFC_Accuracy))

RFC_AUC = roc_auc_score(y_test, RFC_Predict) 
print("AUC: " + str(RFC_AUC))

NameError: name 'y_test' is not defined

### strange .. after hyperparameter tuning the accuracy went down from 95% to 92%