In [19]:
# basic stuff
import numpy as np
import pandas as pd
import time

# other sklearn libraries
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, LabelBinarizer
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import *

# models
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.mixture import GaussianMixture
import xgboost as xgb
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# plotting
import seaborn as sns

# Data

## Loading Data

In [2]:
data = pd.read_csv('data/train.csv')
data.head()

Unnamed: 0,Artist Name,Track Name,Popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_in min/ms,time_signature,Class
0,Bruno Mars,That's What I Like (feat. Gucci Mane),60.0,0.854,0.564,1.0,-4.964,1,0.0485,0.0171,,0.0849,0.899,134.071,234596.0,4,5
1,Boston,Hitch a Ride,54.0,0.382,0.814,3.0,-7.23,1,0.0406,0.0011,0.00401,0.101,0.569,116.454,251733.0,4,10
2,The Raincoats,No Side to Fall In,35.0,0.434,0.614,6.0,-8.334,1,0.0525,0.486,0.000196,0.394,0.787,147.681,109667.0,4,6
3,Deno,Lingo (feat. J.I & Chunkz),66.0,0.853,0.597,10.0,-6.528,0,0.0555,0.0212,,0.122,0.569,107.033,173968.0,4,5
4,Red Hot Chili Peppers,Nobody Weird Like Me - Remastered,53.0,0.167,0.975,2.0,-4.279,1,0.216,0.000169,0.0161,0.172,0.0918,199.06,229960.0,4,10


In [3]:
data[data['Artist Name'] == 'Bruno Mars']

Unnamed: 0,Artist Name,Track Name,Popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_in min/ms,time_signature,Class
0,Bruno Mars,That's What I Like (feat. Gucci Mane),60.0,0.854,0.564,1.0,-4.964,1,0.0485,0.0171,,0.0849,0.899,134.071,234596.0,4,5
2742,Bruno Mars,Talking to the Moon,82.0,0.523,0.606,1.0,-4.754,0,0.0301,0.512,,0.106,0.065,145.837,217867.0,4,9
13085,Bruno Mars,Marry You,75.0,0.62,0.832,10.0,-4.848,1,0.036,0.332,,0.104,0.481,144.926,230120.0,4,9
16996,Bruno Mars,Just The Way You Are,79.0,0.637,0.843,5.0,-5.413,1,0.0432,0.0151,,0.0876,0.434,109.012,220733.0,4,9


In [4]:
X = data.iloc[:, 2:15].to_numpy()
y = data.iloc[:, 16].to_numpy()

## Preprocessing Data

In [5]:
# # One-hot encoding the y values since this is a multiclass classification problem
# enc = LabelBinarizer()
# y = enc.fit_transform(y.reshape(-1, 1))

In [6]:
# Splitting into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [7]:
# Filling in missing values
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
X_train = imp_mean.fit_transform(X_train)
X_test = imp_mean.transform(X_test)

In [8]:
# Standard Scaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## Exploratory Data Analysis

In [9]:
n_samples, n_features = X_train.shape

In [10]:
pca = PCA(n_components = n_features)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

# Training Models

## Model 1 - Logistic Regression

In [66]:
clf = LogisticRegression(solver='newton-cg', multi_class='ovr')
clf_lr = clf.fit(X_train, y_train)

In [67]:
y_test_pred = clf_lr.predict(X_test)
accuracy_score(y_test, y_test_pred)

0.3889753278506335

In [37]:
y_test_pred

array([5, 8, 0, ..., 9, 9, 2], dtype=int64)

## Model 2 - Linear Discriminant Analysis

In [44]:
clf_lda = LinearDiscriminantAnalysis()
clf_lda.fit(X_train, y_train)

LinearDiscriminantAnalysis()

In [45]:
y_test_pred = clf_lda.predict(X_test)
accuracy_score(y_test, y_test_pred)

0.47699488775283394

## Model 3 - Quadratic Discriminant Analysis

In [46]:
clf_qda = QuadraticDiscriminantAnalysis()
clf_qda.fit(X_train, y_train)



QuadraticDiscriminantAnalysis()

In [47]:
y_test_pred = clf_qda.predict(X_test)
accuracy_score(y_test, y_test_pred)

0.4872193820849078

## Model 4 - Naive Bayes

In [52]:
clf_gnb = gnb = GaussianNB()
clf_gnb.fit(X_train, y_train)

GaussianNB()

In [53]:
y_test_pred = clf_gnb.predict(X_test)
accuracy_score(y_test, y_test_pred)

0.4445432318292954

## Model 5 - Support Vector Machine

In [49]:
params_svc = {'kernel':('linear', 'rbf'), 'C':[1, 10], 'decision_function_shape':['ovr']}
svc = SVC()
clf_svc = GridSearchCV(svc, params_svc)

In [50]:
clf_svc.fit(X_train, y_train)

GridSearchCV(estimator=SVC(),
             param_grid={'C': [1, 10], 'decision_function_shape': ['ovr'],
                         'kernel': ('linear', 'rbf')})

In [51]:
y_test_pred = clf_svc.predict(X_test)
accuracy_score(y_test, y_test_pred)

0.5298955323405201

## Model 6 - K Nearest Neighbors

In [54]:
params_knn = {'n_neighbors':[1,3,5,10,20,50]}
knn = KNeighborsClassifier()
clf_knn = GridSearchCV(knn, params_knn)
clf_knn.fit(X_train, y_train)

GridSearchCV(estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': [1, 3, 5, 10, 20, 50]})

In [55]:
y_test_pred = clf_knn.predict(X_test)
accuracy_score(y_test, y_test_pred)

0.49499888864192043

## Model 7 - Random Forest

## Model 8 - Gaussian Mixture Model

## Second Last Model - XGBoost (eXtreme Gradient Boosting)

In [21]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

#setting tree and tree depth
num_round = 50
maxdepth = 8
param = {
  'colsample_bylevel': 1,
  'colsample_bytree': 1,
  'gamma': 0,
  'learning_rate': 0.1, 
  'random_state': 1010,
  'objective': 'multi:softmax', 
  'num_class': 7, 
}

param['tree_method'] = 'gpu_hist'
param['grow_policy'] = 'depthwise'
param['max_depth'] = maxdepth
param['max_leaves'] = 0
param['verbosity'] = 0
param['gpu_id'] = 0
param['updater'] = 'grow_gpu_hist'
param['predictor'] = 'gpu_predictor'

gpu_result = {} 
start_time = time.time()
# Training with the above parameters
xgb.train(param, dtrain, num_round, evals=[(dtest, 'test')], evals_result=gpu_result, verbose_eval=20)

print("GPU Training Time: %s seconds" % (str(time.time() - start_time)))


XGBoostError: [05:11:29] D:\bld\xgboost-split_1637426510059\work\src\tree\tree_updater.cc:20: Unknown tree updater grow_gpu_hist

In [14]:
params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5],
        'objective=multi':'softmax',
        'tree_method':'gpu_hist'
        }

xgb = XGBClassifier(learning_rate=0.02, n_estimators=600, objective='binary:logistic', silent=True, nthread=-1, tree_method='gpu_hist')
random_search = RandomizedSearchCV(xgb, param_distributions=params, n_iter=5, scoring='roc_auc', n_jobs=-1, cv=5, verbose=3, random_state=42 )
random_search.fit(X_train, y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


25 fits failed out of a total of 25.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\rohan\Anaconda3\envs\quant_general\lib\site-packages\sklearn\model_selection\_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\rohan\Anaconda3\envs\quant_general\lib\site-packages\xgboost\core.py", line 506, in inner_f
    return f(**kwargs)
  File "C:\Users\rohan\Anaconda3\envs\quant_general\lib\site-packages\xgboost\sklearn.py", line 1261, in fit
    callbacks=callbacks,
  File "C:\Users\rohan\Anaconda3\envs\quant_general\lib\site-packages\xgboost\training.py", line 196, in train
    early_stopping_ro

XGBoostError: Invalid Input: '_', valid values are: {'approx', 'auto', 'exact', 'gpu_hist', 'hist'}

In [13]:
y_train_pred = random_search.predict(X_train)
print(f"Train accuracy : {accuracy_score(y_train, y_train_pred)}")
y_test_pred = random_search.predict(X_test)
print(f"Test accuracy : {accuracy_score(y_test, y_test_pred)}")

Train accuracy : 0.5619026450322294
Test accuracy : 0.5034452100466771


## Last Model - LightGBM

In [68]:
params_lgbm = {
    'n_estimators': [400, 700, 1000],
    'colsample_bytree': [0.7, 0.8],
    'max_depth': [15,20,25],
    'num_leaves': [50, 100, 200],
    'reg_alpha': [1.1, 1.2, 1.3],
    'reg_lambda': [1.1, 1.2, 1.3],
    'min_split_gain': [0.3, 0.4],
    'subsample': [0.7, 0.8, 0.9],
    'subsample_freq': [20]
}

lgbm = LGBMClassifier()
clf_lgbm = RandomizedSearchCV(lgbm, param_distributions=params, n_iter=20, scoring='accuracy', n_jobs=-1, cv=5, random_state=42 )
clf_lgbm.fit(X_train, y_train)



RandomizedSearchCV(cv=5, estimator=LGBMClassifier(), n_iter=20, n_jobs=-1,
                   param_distributions={'colsample_bytree': [0.6, 0.8, 1.0],
                                        'gamma': [0.5, 1, 1.5, 2, 5],
                                        'max_depth': [3, 4, 5],
                                        'min_child_weight': [1, 5, 10],
                                        'objective=multi': 'softmax',
                                        'subsample': [0.6, 0.8, 1.0]},
                   random_state=42, scoring='accuracy')

In [69]:
y_test_pred = clf_lgbm.predict(X_test)
accuracy_score(y_test, y_test_pred)

0.4136474772171594

# Using Pycaret for Classification

In [22]:
from pycaret.classification import *

In [26]:
classifier = setup(data = data, target = 'Class',ignore_features = ['Artist Name','Track Name'], fold_shuffle=True, session_id=123) 

IntProgress(value=0, description='Processing: ', max=3)

Text(value="Following data types have been inferred automatically, if they are correct press enter to continue…

Unnamed: 0,Data Type
Popularity,Numeric
danceability,Numeric
energy,Numeric
key,Categorical
loudness,Numeric
mode,Categorical
speechiness,Numeric
acousticness,Numeric
instrumentalness,Numeric
liveness,Numeric


KeyboardInterrupt: Interrupted by user

In [None]:
compare_models()

In [None]:
tuned_gbc = tune_model(dt)