In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv('SpotifyFeatures.csv')

## Data Preprocessing 

In [3]:
df2 = pd.get_dummies(df,columns=['genre'])

In [4]:
keylists = df['key'].unique()
for i in range(len(keylists)):
    df.loc[df['key'] == keylists[i], 'key'] = i

df.loc[df["mode"] == 'Major', "mode"] = 1
df.loc[df["mode"] == 'Minor', "mode"] = 0

time_signature = df['time_signature'].unique()
for item in range(len(time_signature)):
    df.loc[df['time_signature']==time_signature[item],'time_signature'] = item

In [5]:
pop_class= []
for item in df['popularity']:
    if(item >= 0 and item <= 50):
        pop_class.append('Not Popular')
    elif(item >= 51 and item <= 100):
        pop_class.append('Popular')

df['popularity_class'] = pop_class

In [6]:
df.popularity_class.value_counts(normalize=True)

Not Popular    0.657957
Popular        0.342043
Name: popularity_class, dtype: float64

In [7]:
df2 = pd.get_dummies(df,columns=['genre'])

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.metrics import classification_report,accuracy_score
from xgboost import XGBClassifier
from imblearn.over_sampling import RandomOverSampler, SMOTE

Using TensorFlow backend.


In [52]:
x = df2.drop(['artist_name','track_name','track_id','popularity','popularity_class'],axis=1)
y = df2['popularity_class']

In [53]:
def report(x,y,model):
    X_train,X_test,y_train,y_test = train_test_split(x,y)
    cr = model()
    cr.fit(X_train,y_train)
    
    prediction_test = cr.predict(X_test)
    prediction_train = cr.predict(X_train)
    print('Class Report Data Test')
    print(classification_report(y_test,prediction_test))
    print('=====================================================')
    print('Class Report Data Train')
    print(classification_report(y_train,prediction_train))

In [54]:
report(x,y,LogisticRegression)

Class Report Data Test
              precision    recall  f1-score   support

 Not Popular       0.66      1.00      0.79     38296
     Popular       0.00      0.00      0.00     19886

    accuracy                           0.66     58182
   macro avg       0.33      0.50      0.40     58182
weighted avg       0.43      0.66      0.52     58182

Class Report Data Train
              precision    recall  f1-score   support

 Not Popular       0.66      1.00      0.79    114827
     Popular       0.00      0.00      0.00     59716

    accuracy                           0.66    174543
   macro avg       0.33      0.50      0.40    174543
weighted avg       0.43      0.66      0.52    174543



In [55]:
report(x,y,DecisionTreeClassifier)

Class Report Data Test
              precision    recall  f1-score   support

 Not Popular       0.84      0.84      0.84     38193
     Popular       0.69      0.69      0.69     19989

    accuracy                           0.78     58182
   macro avg       0.76      0.76      0.76     58182
weighted avg       0.78      0.78      0.78     58182

Class Report Data Train
              precision    recall  f1-score   support

 Not Popular       1.00      1.00      1.00    114930
     Popular       1.00      1.00      1.00     59613

    accuracy                           1.00    174543
   macro avg       1.00      1.00      1.00    174543
weighted avg       1.00      1.00      1.00    174543



In [56]:
report(x,y,GradientBoostingClassifier)

Class Report Data Test
              precision    recall  f1-score   support

 Not Popular       0.85      0.93      0.89     38421
     Popular       0.84      0.69      0.75     19761

    accuracy                           0.85     58182
   macro avg       0.85      0.81      0.82     58182
weighted avg       0.85      0.85      0.84     58182

Class Report Data Train
              precision    recall  f1-score   support

 Not Popular       0.85      0.93      0.89    114702
     Popular       0.84      0.69      0.76     59841

    accuracy                           0.85    174543
   macro avg       0.85      0.81      0.82    174543
weighted avg       0.85      0.85      0.84    174543



In [57]:
report(x,y,XGBClassifier)

Class Report Data Test
              precision    recall  f1-score   support

 Not Popular       0.85      0.93      0.89     38144
     Popular       0.84      0.68      0.75     20038

    accuracy                           0.85     58182
   macro avg       0.85      0.81      0.82     58182
weighted avg       0.85      0.85      0.84     58182

Class Report Data Train
              precision    recall  f1-score   support

 Not Popular       0.85      0.93      0.89    114979
     Popular       0.84      0.68      0.75     59564

    accuracy                           0.85    174543
   macro avg       0.85      0.81      0.82    174543
weighted avg       0.85      0.85      0.84    174543



## Undersampling 

In [15]:
data_minor = df2[df2['popularity_class'] == 'Popular']
data_mayor = df2[df2['popularity_class'] == 'Not Popular']

In [16]:
len(data_minor)

79602

In [17]:
data_mayor_index = np.random.choice(data_mayor.index, len(data_minor))

In [18]:
df_class_mayor = df2.loc[data_mayor_index]

In [19]:
undersample_df = pd.concat([df_class_mayor,data_minor])

In [20]:
undersample_df['popularity_class'].value_counts(normalize=True)

Not Popular    0.5
Popular        0.5
Name: popularity_class, dtype: float64

## Undersampled Modelling 

In [59]:
x_under = undersample_df.drop(['artist_name','track_name','track_id','popularity','popularity_class'],axis=1)
y_under = undersample_df['popularity_class']

In [60]:
report(x_under,y_under,LogisticRegression)

Class Report Data Test
              precision    recall  f1-score   support

 Not Popular       0.64      0.55      0.59     19930
     Popular       0.60      0.69      0.64     19871

    accuracy                           0.62     39801
   macro avg       0.62      0.62      0.62     39801
weighted avg       0.62      0.62      0.62     39801

Class Report Data Train
              precision    recall  f1-score   support

 Not Popular       0.64      0.55      0.59     59672
     Popular       0.61      0.69      0.65     59731

    accuracy                           0.62    119403
   macro avg       0.62      0.62      0.62    119403
weighted avg       0.62      0.62      0.62    119403



In [61]:
report(x_under,y_under,DecisionTreeClassifier)

Class Report Data Test
              precision    recall  f1-score   support

 Not Popular       0.80      0.82      0.81     19916
     Popular       0.82      0.79      0.80     19885

    accuracy                           0.81     39801
   macro avg       0.81      0.81      0.81     39801
weighted avg       0.81      0.81      0.81     39801

Class Report Data Train
              precision    recall  f1-score   support

 Not Popular       1.00      1.00      1.00     59686
     Popular       1.00      1.00      1.00     59717

    accuracy                           1.00    119403
   macro avg       1.00      1.00      1.00    119403
weighted avg       1.00      1.00      1.00    119403



In [62]:
report(x_under,y_under,RandomForestClassifier)

Class Report Data Test
              precision    recall  f1-score   support

 Not Popular       0.83      0.89      0.86     19990
     Popular       0.88      0.82      0.85     19811

    accuracy                           0.85     39801
   macro avg       0.86      0.85      0.85     39801
weighted avg       0.86      0.85      0.85     39801

Class Report Data Train
              precision    recall  f1-score   support

 Not Popular       0.99      1.00      0.99     59612
     Popular       1.00      0.99      0.99     59791

    accuracy                           0.99    119403
   macro avg       0.99      0.99      0.99    119403
weighted avg       0.99      0.99      0.99    119403



In [63]:
report(x_under,y_under,XGBClassifier)

Class Report Data Test
              precision    recall  f1-score   support

 Not Popular       0.81      0.87      0.84     19933
     Popular       0.86      0.80      0.83     19868

    accuracy                           0.83     39801
   macro avg       0.84      0.83      0.83     39801
weighted avg       0.84      0.83      0.83     39801

Class Report Data Train
              precision    recall  f1-score   support

 Not Popular       0.81      0.87      0.84     59669
     Popular       0.86      0.80      0.83     59734

    accuracy                           0.83    119403
   macro avg       0.84      0.83      0.83    119403
weighted avg       0.84      0.83      0.83    119403



In [64]:
report(x_under,y_under,GradientBoostingClassifier)

Class Report Data Test
              precision    recall  f1-score   support

 Not Popular       0.82      0.86      0.84     19968
     Popular       0.85      0.81      0.83     19833

    accuracy                           0.84     39801
   macro avg       0.84      0.84      0.84     39801
weighted avg       0.84      0.84      0.84     39801

Class Report Data Train
              precision    recall  f1-score   support

 Not Popular       0.82      0.86      0.84     59634
     Popular       0.85      0.81      0.83     59769

    accuracy                           0.84    119403
   macro avg       0.84      0.84      0.84    119403
weighted avg       0.84      0.84      0.84    119403



## Oversampling 

In [27]:
ros = RandomOverSampler()
xros, yros = ros.fit_sample(x,y)

In [28]:
df_over_all = pd.DataFrame(xros,columns=x.columns)
df_over_all['popularity_class'] = yros

In [29]:
df_over_all['popularity_class'].value_counts()

Not Popular    153123
Popular        153123
Name: popularity_class, dtype: int64

In [66]:
newx, newy = df_over_all.drop(['popularity_class','popularity'],axis=1), df_over_all['popularity_class']

In [67]:
report(newx,newy,LogisticRegression)

Class Report Data Test
              precision    recall  f1-score   support

 Not Popular       0.64      0.55      0.59     38282
     Popular       0.60      0.68      0.64     38280

    accuracy                           0.62     76562
   macro avg       0.62      0.62      0.62     76562
weighted avg       0.62      0.62      0.62     76562

Class Report Data Train
              precision    recall  f1-score   support

 Not Popular       0.64      0.56      0.59    114841
     Popular       0.61      0.68      0.64    114843

    accuracy                           0.62    229684
   macro avg       0.62      0.62      0.62    229684
weighted avg       0.62      0.62      0.62    229684



In [68]:
report(newx,newy,RandomForestClassifier)

Class Report Data Test
              precision    recall  f1-score   support

 Not Popular       0.91      0.88      0.90     38160
     Popular       0.89      0.91      0.90     38402

    accuracy                           0.90     76562
   macro avg       0.90      0.90      0.90     76562
weighted avg       0.90      0.90      0.90     76562

Class Report Data Train
              precision    recall  f1-score   support

 Not Popular       0.99      1.00      0.99    114963
     Popular       1.00      0.99      0.99    114721

    accuracy                           0.99    229684
   macro avg       0.99      0.99      0.99    229684
weighted avg       0.99      0.99      0.99    229684



In [69]:
report(newx,newy,DecisionTreeClassifier)

Class Report Data Test
              precision    recall  f1-score   support

 Not Popular       0.90      0.82      0.86     38270
     Popular       0.83      0.90      0.87     38292

    accuracy                           0.86     76562
   macro avg       0.86      0.86      0.86     76562
weighted avg       0.86      0.86      0.86     76562

Class Report Data Train
              precision    recall  f1-score   support

 Not Popular       1.00      1.00      1.00    114853
     Popular       1.00      1.00      1.00    114831

    accuracy                           1.00    229684
   macro avg       1.00      1.00      1.00    229684
weighted avg       1.00      1.00      1.00    229684



In [70]:
report(newx,newy,GradientBoostingClassifier)

Class Report Data Test
              precision    recall  f1-score   support

 Not Popular       0.82      0.86      0.84     38332
     Popular       0.85      0.81      0.83     38230

    accuracy                           0.83     76562
   macro avg       0.83      0.83      0.83     76562
weighted avg       0.83      0.83      0.83     76562

Class Report Data Train
              precision    recall  f1-score   support

 Not Popular       0.82      0.86      0.84    114791
     Popular       0.85      0.82      0.83    114893

    accuracy                           0.84    229684
   macro avg       0.84      0.84      0.84    229684
weighted avg       0.84      0.84      0.84    229684



In [71]:
report(newx,newy,XGBClassifier)

Class Report Data Test
              precision    recall  f1-score   support

 Not Popular       0.82      0.86      0.84     38210
     Popular       0.86      0.81      0.83     38352

    accuracy                           0.83     76562
   macro avg       0.84      0.83      0.83     76562
weighted avg       0.84      0.83      0.83     76562

Class Report Data Train
              precision    recall  f1-score   support

 Not Popular       0.81      0.86      0.84    114913
     Popular       0.85      0.80      0.83    114771

    accuracy                           0.83    229684
   macro avg       0.83      0.83      0.83    229684
weighted avg       0.83      0.83      0.83    229684



In [36]:
def report_oversampling(x,y,model):
    X_train,X_test,y_train,y_test = train_test_split(x,y)
    cr = model()
    ros = RandomOverSampler()
    xros, yros = ros.fit_sample(X_train,y_train)
    
    cr.fit(xros,yros)
    
    prediction_test = cr.predict(X_test)
    prediction_train = cr.predict(xros)
    print('Class Report Data Test')
    print(classification_report(y_test,prediction_test))
    print('====================')
    print('Class Report Data Train')
    print(classification_report(yros,prediction_train))

In [37]:
report_oversampling(x,y,DecisionTreeClassifier)

Class Report Data Test
              precision    recall  f1-score   support

 Not Popular       1.00      1.00      1.00     38327
     Popular       1.00      1.00      1.00     19855

    accuracy                           1.00     58182
   macro avg       1.00      1.00      1.00     58182
weighted avg       1.00      1.00      1.00     58182

Class Report Data Train
              precision    recall  f1-score   support

 Not Popular       1.00      1.00      1.00    114796
     Popular       1.00      1.00      1.00    114796

    accuracy                           1.00    229592
   macro avg       1.00      1.00      1.00    229592
weighted avg       1.00      1.00      1.00    229592



In [38]:
report_oversampling(x,y,RandomForestClassifier)

Class Report Data Test
              precision    recall  f1-score   support

 Not Popular       1.00      1.00      1.00     38208
     Popular       1.00      1.00      1.00     19974

    accuracy                           1.00     58182
   macro avg       1.00      1.00      1.00     58182
weighted avg       1.00      1.00      1.00     58182

Class Report Data Train
              precision    recall  f1-score   support

 Not Popular       1.00      1.00      1.00    114915
     Popular       1.00      1.00      1.00    114915

    accuracy                           1.00    229830
   macro avg       1.00      1.00      1.00    229830
weighted avg       1.00      1.00      1.00    229830



## SMOTE 

In [39]:
smot = SMOTE()

In [40]:
xsm,ysm = smot.fit_sample(x,y)

In [41]:
df_smote = pd.DataFrame(xsm,columns=x.columns)
df_smote['popularity_class'] = ysm

In [42]:
df_smote['popularity_class'].value_counts()

Not Popular    153123
Popular        153123
Name: popularity_class, dtype: int64

In [43]:
newx_smote,newy_smote = df_smote.drop('popularity_class',axis=1), df_smote['popularity_class']

In [44]:
report(newx_smote,newy_smote,DecisionTreeClassifier)

Class Report Data Test
              precision    recall  f1-score   support

 Not Popular       1.00      1.00      1.00     38368
     Popular       1.00      1.00      1.00     38194

    accuracy                           1.00     76562
   macro avg       1.00      1.00      1.00     76562
weighted avg       1.00      1.00      1.00     76562

Class Report Data Train
              precision    recall  f1-score   support

 Not Popular       1.00      1.00      1.00    114755
     Popular       1.00      1.00      1.00    114929

    accuracy                           1.00    229684
   macro avg       1.00      1.00      1.00    229684
weighted avg       1.00      1.00      1.00    229684



In [45]:
report(newx_smote,newy_smote,RandomForestClassifier)

Class Report Data Test
              precision    recall  f1-score   support

 Not Popular       1.00      1.00      1.00     38252
     Popular       1.00      1.00      1.00     38310

    accuracy                           1.00     76562
   macro avg       1.00      1.00      1.00     76562
weighted avg       1.00      1.00      1.00     76562

Class Report Data Train
              precision    recall  f1-score   support

 Not Popular       1.00      1.00      1.00    114871
     Popular       1.00      1.00      1.00    114813

    accuracy                           1.00    229684
   macro avg       1.00      1.00      1.00    229684
weighted avg       1.00      1.00      1.00    229684



In [46]:
def report_smote(x,y,model):
    X_train,X_test,y_train,y_test = train_test_split(x,y)
    cr = model()
    cr.fit(X_train,y_train)
    smot = SMOTE()
    xsm, ysm = smot.fit_sample(X_train,y_train)
    
    cr.fit(xsm,ysm)
    
    prediction_test = cr.predict(X_test)
    prediction_train = cr.predict(xsm)
    print('Class Report Data Test')
    print(classification_report(y_test,prediction_test))
    print('=====================================================')
    print('Class Report Data Train')
    print(classification_report(ysm,prediction_train))

In [47]:
report_smote(x,y,LogisticRegression)

Class Report Data Test
              precision    recall  f1-score   support

 Not Popular       0.95      0.85      0.90     38319
     Popular       0.76      0.92      0.83     19863

    accuracy                           0.87     58182
   macro avg       0.86      0.88      0.86     58182
weighted avg       0.89      0.87      0.88     58182

Class Report Data Train
              precision    recall  f1-score   support

 Not Popular       0.92      0.85      0.88    114804
     Popular       0.86      0.93      0.89    114804

    accuracy                           0.89    229608
   macro avg       0.89      0.89      0.89    229608
weighted avg       0.89      0.89      0.89    229608



In [48]:
report_smote(x,y,DecisionTreeClassifier)

Class Report Data Test
              precision    recall  f1-score   support

 Not Popular       1.00      1.00      1.00     38179
     Popular       1.00      1.00      1.00     20003

    accuracy                           1.00     58182
   macro avg       1.00      1.00      1.00     58182
weighted avg       1.00      1.00      1.00     58182

Class Report Data Train
              precision    recall  f1-score   support

 Not Popular       1.00      1.00      1.00    114944
     Popular       1.00      1.00      1.00    114944

    accuracy                           1.00    229888
   macro avg       1.00      1.00      1.00    229888
weighted avg       1.00      1.00      1.00    229888



In [49]:
report_smote(x,y,RandomForestClassifier)

Class Report Data Test
              precision    recall  f1-score   support

 Not Popular       1.00      1.00      1.00     38294
     Popular       1.00      1.00      1.00     19888

    accuracy                           1.00     58182
   macro avg       1.00      1.00      1.00     58182
weighted avg       1.00      1.00      1.00     58182

Class Report Data Train
              precision    recall  f1-score   support

 Not Popular       1.00      1.00      1.00    114829
     Popular       1.00      1.00      1.00    114829

    accuracy                           1.00    229658
   macro avg       1.00      1.00      1.00    229658
weighted avg       1.00      1.00      1.00    229658



In [50]:
report_smote(x,y,GradientBoostingClassifier)

Class Report Data Test
              precision    recall  f1-score   support

 Not Popular       1.00      1.00      1.00     38153
     Popular       1.00      1.00      1.00     20029

    accuracy                           1.00     58182
   macro avg       1.00      1.00      1.00     58182
weighted avg       1.00      1.00      1.00     58182

Class Report Data Train
              precision    recall  f1-score   support

 Not Popular       1.00      1.00      1.00    114970
     Popular       1.00      1.00      1.00    114970

    accuracy                           1.00    229940
   macro avg       1.00      1.00      1.00    229940
weighted avg       1.00      1.00      1.00    229940

