# Model Training


### Importing Important Libraries


In [163]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
# Modelling
from sklearn.metrics import accuracy_score, average_precision_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import warnings

In [164]:
df = pd.read_csv("dataset-of-10s-CLEANED.csv")
df.head()

Unnamed: 0,track,artist,uri,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,chorus_hit,sections,target
0,Wild Things,Alessia Cara,spotify:track:2ZyuwVvV6Z3XJaXIFbspeE,0.741,0.626,1,-4.826,0,0.0886,0.02,0.0,0.0828,0.706,108.029,188493,4,41.18681,10,1
1,Surfboard,Esquivel!,spotify:track:61APOtq25SCMuK0V5w2Kgp,0.447,0.247,5,-14.661,0,0.0346,0.871,0.814,0.0946,0.25,155.489,176880,3,33.18083,9,0
2,Love Someone,Lukas Graham,spotify:track:2JqnpexlO9dmvjUMCaLCLJ,0.55,0.415,9,-6.557,0,0.052,0.161,0.0,0.108,0.274,172.065,205463,4,44.89147,9,1
3,Music To My Ears (feat. Tory Lanez),Keys N Krates,spotify:track:0cjfLhk8WJ3etPTCseKXtk,0.502,0.648,0,-5.698,0,0.0527,0.00513,0.0,0.204,0.291,91.837,193043,4,29.52521,7,0
4,Juju On That Beat (TZ Anthem),Zay Hilfigerrr & Zayion McCall,spotify:track:1lItf5ZXJc1by9SbPeljFd,0.807,0.887,1,-3.892,1,0.275,0.00381,0.0,0.391,0.78,160.517,144244,4,24.99199,8,1


Keeping only numerical and categorical features


In [165]:
df = df.drop(['track', 'artist','uri'], axis=1)
df.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,chorus_hit,sections,target
0,0.741,0.626,1,-4.826,0,0.0886,0.02,0.0,0.0828,0.706,108.029,188493,4,41.18681,10,1
1,0.447,0.247,5,-14.661,0,0.0346,0.871,0.814,0.0946,0.25,155.489,176880,3,33.18083,9,0
2,0.55,0.415,9,-6.557,0,0.052,0.161,0.0,0.108,0.274,172.065,205463,4,44.89147,9,1
3,0.502,0.648,0,-5.698,0,0.0527,0.00513,0.0,0.204,0.291,91.837,193043,4,29.52521,7,0
4,0.807,0.887,1,-3.892,1,0.275,0.00381,0.0,0.391,0.78,160.517,144244,4,24.99199,8,1


In [166]:
print("Categories in 'key' variable:     ",end=" " )
print(df['key'].unique())

print("Categories in 'time_signature' variable:     ",end=" " )
print(df['time_signature'].unique())

print("Categories in 'mode' variable:     ",end=" " )
print(df['mode'].unique())


Categories in 'key' variable:      [ 1  5  9  0  2  7  8 11  3  4  6 10]
Categories in 'time_signature' variable:      [4 3 5 1 0]
Categories in 'mode' variable:      [0 1]


In [167]:
X = df.drop('target', axis=1)
X

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,chorus_hit,sections
0,0.741,0.626,1,-4.826,0,0.0886,0.02000,0.000000,0.0828,0.7060,108.029,188493,4,41.18681,10
1,0.447,0.247,5,-14.661,0,0.0346,0.87100,0.814000,0.0946,0.2500,155.489,176880,3,33.18083,9
2,0.550,0.415,9,-6.557,0,0.0520,0.16100,0.000000,0.1080,0.2740,172.065,205463,4,44.89147,9
3,0.502,0.648,0,-5.698,0,0.0527,0.00513,0.000000,0.2040,0.2910,91.837,193043,4,29.52521,7
4,0.807,0.887,1,-3.892,1,0.2750,0.00381,0.000000,0.3910,0.7800,160.517,144244,4,24.99199,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6353,0.172,0.358,9,-14.430,1,0.0342,0.88600,0.966000,0.3140,0.0361,72.272,150857,4,24.30824,7
6354,0.910,0.366,1,-9.954,1,0.0941,0.09960,0.000000,0.2610,0.7400,119.985,152000,4,32.53856,8
6355,0.719,0.804,10,-4.581,1,0.0355,0.01320,0.000003,0.1390,0.6050,119.999,227760,4,20.73371,7
6356,0.600,0.177,7,-16.070,1,0.0561,0.98900,0.868000,0.1490,0.5600,120.030,213387,4,21.65301,14


In [168]:
y = df["target"]
y

0       1
1       0
2       1
3       0
4       1
       ..
6353    0
6354    1
6355    1
6356    0
6357    0
Name: target, Length: 6358, dtype: int64

In [169]:
X.dtypes

danceability        float64
energy              float64
key                   int64
loudness            float64
mode                  int64
speechiness         float64
acousticness        float64
instrumentalness    float64
liveness            float64
valence             float64
tempo               float64
duration_ms           int64
time_signature        int64
chorus_hit          float64
sections              int64
dtype: object

In [170]:
X['time_signature'] = X['time_signature'].astype('O')
X['mode'] = X['mode'].astype('O')
X['key'] = X['key'].astype('O')
X.dtypes

danceability        float64
energy              float64
key                  object
loudness            float64
mode                 object
speechiness         float64
acousticness        float64
instrumentalness    float64
liveness            float64
valence             float64
tempo               float64
duration_ms           int64
time_signature       object
chorus_hit          float64
sections              int64
dtype: object

In [171]:
# Create Column Transformer with 3 types of transformers
num_features = X.select_dtypes(exclude="object").columns
cat_features = X.select_dtypes(include="object").columns

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, cat_features),
         ("StandardScaler", numeric_transformer, num_features),        
    ]
)

In [172]:

X = preprocessor.fit_transform(X)


In [173]:
pd.DataFrame(X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21,22,23,24,25,26,27,28,29,30
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.528782,-0.094072,-0.664037,-0.520689,-0.687003,1.069577,-0.481093,-0.563191,0.005996,-0.083256
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,-1.346898,-0.650832,2.199598,2.027499,-0.615976,-0.784987,1.108362,-0.698571,-0.402662,-0.347795
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.198654,-0.471432,-0.189569,-0.520689,-0.535317,-0.687378,1.663498,-0.365360,0.195097,-0.347795
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.362478,-0.464215,-0.714075,-0.520689,0.042536,-0.618239,-1.023369,-0.510149,-0.589260,-0.876874
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.706909,1.827781,-0.718517,-0.520689,1.168146,1.370537,1.276751,-1.079031,-0.820655,-0.612335
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6353,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,-1.302843,-0.654957,2.250074,2.503328,0.704660,-1.654924,-1.678609,-1.001939,-0.855556,-0.876874
6354,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.449204,-0.037365,-0.396182,-0.520689,0.385637,1.207856,-0.080682,-0.988614,-0.435447,-0.612335
6355,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.575507,-0.641553,-0.686920,-0.520679,-0.348718,0.658807,-0.080213,-0.105430,-1.038015,-0.876874
6356,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,-1.615615,-0.429160,2.596671,2.196544,-0.288525,0.475791,-0.079175,-0.272985,-0.991090,0.974901


In [174]:
# separate dataset into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
X_train.shape, X_test.shape

((5086, 31), (1272, 31))

Making a competition between different models


In [175]:
def evaluate_model(true, predicted):
    acc = accuracy_score(true, predicted)
    ap = average_precision_score(true, predicted)
    return acc, ap

In [176]:
models = {
    "Logistic Regression": LogisticRegression(),
    "Stochastic GD Classifier": SGDClassifier(),
    "Gaussian Naive Bayes": GaussianNB(),
    "K-Neighbors Classifier": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest Classifier": RandomForestClassifier(),
    "XGB Classifier": XGBClassifier(), 
    "CatBoosting Classifier": CatBoostClassifier(verbose=False, iterations=100, depth=6),
    "AdaBoost Classifier": AdaBoostClassifier(),
    "SVM": SVC()
}
model_list = []
acc_list =[]
ap_list =[]

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Evaluate Train and Test dataset
    model_train_acc , model_train_ap = evaluate_model(y_train, y_train_pred)

    model_test_acc , model_test_ap = evaluate_model(y_test, y_test_pred)

    
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Accuracy Score: {:.4f}".format(model_train_acc))
    print("- Average Precision Score: {:.4f}".format(model_train_ap))
    
    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Accuracy Score: {:.4f}".format(model_test_acc))
    print("- Average Precision Score: {:.4f}".format(model_test_ap))
    
    acc_list.append(model_test_acc)
    ap_list.append(model_test_ap)
    
    print('='*35)
    print('\n')


Logistic Regression
Model performance for Training set
- Accuracy Score: 0.8101
- Average Precision Score: 0.7374
----------------------------------
Model performance for Test set
- Accuracy Score: 0.8082
- Average Precision Score: 0.7301


Stochastic GD Classifier
Model performance for Training set
- Accuracy Score: 0.7873
- Average Precision Score: 0.7107
----------------------------------
Model performance for Test set
- Accuracy Score: 0.7862
- Average Precision Score: 0.7051


Gaussian Naive Bayes
Model performance for Training set
- Accuracy Score: 0.7236
- Average Precision Score: 0.6462
----------------------------------
Model performance for Test set
- Accuracy Score: 0.7248
- Average Precision Score: 0.6443




K-Neighbors Classifier
Model performance for Training set
- Accuracy Score: 0.8411
- Average Precision Score: 0.7676
----------------------------------
Model performance for Test set
- Accuracy Score: 0.7783
- Average Precision Score: 0.6966


Decision Tree
Model performance for Training set
- Accuracy Score: 1.0000
- Average Precision Score: 1.0000
----------------------------------
Model performance for Test set
- Accuracy Score: 0.7618
- Average Precision Score: 0.6928


Random Forest Classifier
Model performance for Training set
- Accuracy Score: 1.0000
- Average Precision Score: 1.0000
----------------------------------
Model performance for Test set
- Accuracy Score: 0.8412
- Average Precision Score: 0.7737


XGB Classifier
Model performance for Training set
- Accuracy Score: 0.9959
- Average Precision Score: 0.9918
----------------------------------
Model performance for Test set
- Accuracy Score: 0.8459
- Average Precision Score: 0.7776


CatBoosting Classifier
Model performanc

In [177]:
results = pd.DataFrame(list(zip(model_list, acc_list, ap_list)), columns=['Model Name', 'Accuracy Score', 'Average Precision'])
results.sort_values(by="Accuracy Score")

Unnamed: 0,Model Name,Accuracy Score,Average Precision
2,Gaussian Naive Bayes,0.724843,0.644257
4,Decision Tree,0.761792,0.692762
3,K-Neighbors Classifier,0.778302,0.6966
1,Stochastic GD Classifier,0.786164,0.7051
0,Logistic Regression,0.808176,0.730093
9,SVM,0.819969,0.741495
8,AdaBoost Classifier,0.839623,0.767677
5,Random Forest Classifier,0.841195,0.773684
6,XGB Classifier,0.845912,0.777617
7,CatBoosting Classifier,0.849843,0.779629


In [179]:
'''
from sklearn.model_selection import GridSearchCV
classifier = CatBoostClassifier()
parameters = [{'iterations': [1, 10, 50, 100], 'depth': [6, 7, 8, 9, 10], 'learning_rate': [0.01, 0.05, 0.1]}]
grid_search = GridSearchCV(estimator = classifier,
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 10,
                           n_jobs = None)
grid_search = grid_search.fit(X_train, y_train)
'''

0:	learn: 0.6864369	total: 6.5ms	remaining: 0us
0:	learn: 0.6861512	total: 5.34ms	remaining: 0us
0:	learn: 0.6862874	total: 5.45ms	remaining: 0us
0:	learn: 0.6864659	total: 5.54ms	remaining: 0us
0:	learn: 0.6862812	total: 7.02ms	remaining: 0us
0:	learn: 0.6867481	total: 4.42ms	remaining: 0us
0:	learn: 0.6862028	total: 4.41ms	remaining: 0us
0:	learn: 0.6868083	total: 5.11ms	remaining: 0us
0:	learn: 0.6862844	total: 9.43ms	remaining: 0us
0:	learn: 0.6854936	total: 7.81ms	remaining: 0us
0:	learn: 0.6607714	total: 10.3ms	remaining: 0us
0:	learn: 0.6594605	total: 5.74ms	remaining: 0us
0:	learn: 0.6600932	total: 7.11ms	remaining: 0us
0:	learn: 0.6609053	total: 12.8ms	remaining: 0us
0:	learn: 0.6600696	total: 9.38ms	remaining: 0us
0:	learn: 0.6622192	total: 7.48ms	remaining: 0us
0:	learn: 0.6597310	total: 6.27ms	remaining: 0us
0:	learn: 0.6624866	total: 8.49ms	remaining: 0us
0:	learn: 0.6600783	total: 5.35ms	remaining: 0us
0:	learn: 0.6564489	total: 4.45ms	remaining: 0us
0:	learn: 0.6313022	t

In [180]:
grid_search.best_params_

{'depth': 8, 'iterations': 100, 'learning_rate': 0.1}

In [181]:
grid_search.best_score_

0.8454538774499947