With this model, the intent is to classify a song using its data as "stamina" or "tech" to add a feature to the app which removes user input and instead detects which rating model should be used instead.

In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

from sklearn.preprocessing import OneHotEncoder, StandardScaler, PowerTransformer
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import plot_confusion_matrix, precision_score, recall_score, accuracy_score, f1_score,\
                            roc_curve, auc, classification_report, log_loss, mean_squared_error, mean_squared_log_error
from sklearn.decomposition import PCA

from statsmodels.api import qqplot

from xgboost import XGBClassifier, XGBRegressor, plot_importance, plot_tree, DMatrix
import shap


import warnings
warnings.filterwarnings('ignore')
%load_ext autoreload
%autoreload 2
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [20]:
##display all dataframe columns
pd.set_option('display.max_columns', None)

stam = pd.read_csv("../data/stam.csv")
tech = pd.read_csv("../data/not_stam.csv")

In [21]:
import sys
sys.path.append("..")

from src.data_cleaning import data_cleaner

In [22]:
cleaned_tech = data_cleaner(tech)

Initialized Tech DataFrame with 4808 rows.

Removing outliers and filling NaN values...

The songs in this Tech dataset are up to 164.002 seconds (2.733 minutes) long.
The songs in this Tech dataset have up to 826 steps.
The songs in this Tech dataset have a max  bpm weighted average up to 175 bpm.
The songs in this Tech dataset have up to 21 bpm changes.
The songs in this Tech dataset have up to 6.387 NPS.

Returning cleaned Tech DataFrame with 3209 rows and 31 columns.


In [23]:
cleaned_stam = data_cleaner(stam, is_stamina = True)

Initialized Stamina DataFrame with 2926 rows.

Removing outliers and filling NaN values...

The songs in this Stamina dataset are up to 1695.086 seconds (28.251 minutes) long.
The songs in this Stamina dataset have up to 16804steps.
The songs in this Stamina dataset have a max  bpm weighted average up to 216 bpm.
The songs in this Stamina dataset have up to 4 bpm changes.
The songs in this Stamina dataset have up to 10.866 NPS.

Returning cleaned Stamina DataFrame with 2255 rows and 40 columns.


In [24]:
cleaned_tech['target'] = 0
cleaned_stam['target'] = 1

In [25]:
all_songs = pd.concat([cleaned_tech, cleaned_stam], ignore_index=True)

In [26]:
#due to some columns being removed from the tech dataset for being irrelevant, I will fill the NaN values with 0

all_songs = all_songs.fillna(0)

#shuffling the dataframe to mix the target values

all_songs = all_songs.sample(frac=1).reset_index(drop=True)

In [27]:
data = all_songs.drop(columns=['target', 'title', 'artist', 'difficulty'], axis=1)
target = all_songs.target

In [28]:
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42)
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, random_state=42)

In [32]:
y_val

5444    0
4907    0
2320    1
2596    0
3481    1
       ..
3236    1
4100    0
3350    0
243     1
4681    0
Name: target, Length: 1025, dtype: int64

In [11]:
classifier_pipe = Pipeline([('scaler', StandardScaler()), ('classifier', XGBClassifier())])

In [12]:
classifier_pipe.fit(X_tr, y_tr)

Pipeline(steps=[('scaler', StandardScaler()),
                ('classifier',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, gamma=0, gpu_id=-1,
                               importance_type='gain',
                               interaction_constraints='',
                               learning_rate=0.300000012, max_delta_step=0,
                               max_depth=6, min_child_weight=1, missing=nan,
                               monotone_constraints='()', n_estimators=100,
                               n_jobs=0, num_parallel_tree=1, random_state=0,
                               reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
                               subsample=1, tree_method='exact',
                               validate_parameters=1, verbosity=None))])

In [17]:
y_preds = classifier_pipe.predict(X_val)

In [18]:
sum(y_preds != y_val)

0

In [33]:
classifier_pipe.fit(X_train, y_train)

Pipeline(steps=[('scaler', StandardScaler()),
                ('classifier',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, gamma=0, gpu_id=-1,
                               importance_type='gain',
                               interaction_constraints='',
                               learning_rate=0.300000012, max_delta_step=0,
                               max_depth=6, min_child_weight=1, missing=nan,
                               monotone_constraints='()', n_estimators=100,
                               n_jobs=0, num_parallel_tree=1, random_state=0,
                               reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
                               subsample=1, tree_method='exact',
                               validate_parameters=1, verbosity=None))])

In [36]:
y_predictions = classifier_pipe.predict(X_test)

In [37]:
sum(y_predictions != y_test)

0