# Data Cleaning

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, AdaBoostRegressor

from sklearn import metrics

In [16]:
merged_df = pd.read_csv('./data/merged_tracks.csv')

In [17]:
merged_df['artist_name'].value_counts()

Johann Sebastian Bach         123
YoungBoy Never Broke Again    110
Workout Music                 109
Taylor Swift                   91
David Bowie                    89
                             ... 
Ignorant Bull                   1
your best friend jippy          1
Meech                           1
Champion                        1
ZP                              1
Name: artist_name, Length: 3738, dtype: int64

In [18]:
merged_df['artist_name'].value_counts()

Johann Sebastian Bach         123
YoungBoy Never Broke Again    110
Workout Music                 109
Taylor Swift                   91
David Bowie                    89
                             ... 
Ignorant Bull                   1
your best friend jippy          1
Meech                           1
Champion                        1
ZP                              1
Name: artist_name, Length: 3738, dtype: int64

In [19]:
merged_df.columns

Index(['artist_name', 'track_name', 'track_id', 'popularity', 'danceability',
       'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms',
       'time_signature'],
      dtype='object')

In [21]:
X = merged_df[['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature']]
y = merged_df['popularity']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [22]:
def model(model):
    models = model
    models.fit(X_train, y_train)
    print(f'{model}')
    print(f'Training r2 score: {models.score(X_train, y_train)}')
    print(f'Testing r2 score: {models.score(X_test, y_test)}')
    print(f'Testing MSE: {metrics.mean_squared_error(y_test, models.predict(X_test))}')
    return print('='*40)

In [23]:
model(LinearRegression())
model(KNeighborsRegressor())
model(DecisionTreeRegressor())
model(BaggingRegressor())
model(RandomForestRegressor())
model(AdaBoostRegressor())

LinearRegression()
Training r2 score: 0.23317069415733127
Testing r2 score: 0.2388739919183922
Testing MSE: 719.0519767347585
KNeighborsRegressor()
Training r2 score: 0.37417013100813434
Testing r2 score: 0.057724544427296465
Testing MSE: 890.1877241929055
DecisionTreeRegressor()
Training r2 score: 0.9958816676555867
Testing r2 score: -0.2816475122969202
Testing MSE: 1210.799745914707
BaggingRegressor()
Training r2 score: 0.8725323988551213
Testing r2 score: 0.28105206310072905
Testing MSE: 679.2054531151545
RandomForestRegressor()
Training r2 score: 0.9073833292778355
Testing r2 score: 0.35009485548316055
Testing MSE: 613.9792542798206
AdaBoostRegressor()
Training r2 score: 0.23051127904865365
Testing r2 score: 0.21903121847960294
Testing MSE: 737.7978681030265


In [24]:
X = merged_df[['danceability', 'energy', 'key', 'mode', 'valence', 'tempo', 'time_signature']]
y = merged_df['popularity']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

model(LinearRegression())
model(KNeighborsRegressor())
model(DecisionTreeRegressor())
model(BaggingRegressor())
model(RandomForestRegressor())
model(AdaBoostRegressor())

LinearRegression()
Training r2 score: 0.05062455084640671
Testing r2 score: 0.052534656990840145
Testing MSE: 895.0907215686235
KNeighborsRegressor()
Training r2 score: 0.28725173533647275
Testing r2 score: -0.10741733560521904
Testing MSE: 1046.200781187724
DecisionTreeRegressor()
Training r2 score: 0.9956184663580941
Testing r2 score: -0.5606928335163084
Testing MSE: 1474.419813670785
BaggingRegressor()
Training r2 score: 0.832054461904949
Testing r2 score: 0.08322000924147366
Testing MSE: 866.1016146949365
RandomForestRegressor()
Training r2 score: 0.8790570309684865
Testing r2 score: 0.16037349172287363
Testing MSE: 793.213073899898
AdaBoostRegressor()
Training r2 score: 0.1105010105533617
Testing r2 score: 0.09417702464671629
Testing MSE: 855.7502884984888


In [25]:
knn = KNeighborsRegressor()
knn_params = {
    'n_neighbors': range(1, 32),
    'metric': ['euclidean', 'manhattan', 'minkowski'],
    'leaf_size': [5, 10, 15, 30, 45]
}
gs = GridSearchCV(knn, 
                  knn_params,
                  cv = 10,
                  verbose=1)
gs.fit(X_train, y_train)
print(f'Training r2 score: {gs.score(X_train, y_train)}')
print(f'Testing r2 score: {gs.score(X_test, y_test)}')
print(f'Testing MSE: {metrics.mean_squared_error(y_test, gs.predict(X_test))}')

Fitting 10 folds for each of 465 candidates, totalling 4650 fits
Training r2 score: 0.11327401915890833
Testing r2 score: 0.0535665857302462
Testing MSE: 894.1158364746434


In [26]:
gs.best_params_

{'leaf_size': 5, 'metric': 'manhattan', 'n_neighbors': 31}

In [27]:
knn.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

In [28]:
lr = LinearRegression()
lr.fit(X_train, y_train)
print(f'Training r2 score: {lr.score(X_train, y_train)}')
print(f'Testing r2 score: {lr.score(X_test, y_test)}')
print(f'Testing MSE: {metrics.mean_squared_error(y_test, lr.predict(X_test))}')
print(f'Baseline MSE: {metrics.mean_squared_error(y_test, pd.Series(y.mean(), index=range(len(y_test))))}')

Training r2 score: 0.05062455084640671
Testing r2 score: 0.052534656990840145
Testing MSE: 895.0907215686235
Baseline MSE: 944.835611047945


In [29]:
rf = RandomForestRegressor()
rf.fit(X_train, y_train)
print(f'Training r2 score: {rf.score(X_train, y_train)}')
print(f'Testing r2 score: {rf.score(X_test, y_test)}')
print(f'Testing MSE: {metrics.mean_squared_error(y_test, rf.predict(X_test))}')
print(f'Baseline MSE: {metrics.mean_squared_error(y_test, pd.Series(y.mean(), index=range(len(y_test))))}')

Training r2 score: 0.8803308121563803
Testing r2 score: 0.1586437791663623
Testing MSE: 794.8471702515352
Baseline MSE: 944.835611047945


In [31]:
rf = RandomForestRegressor()
rf_params = {
    'n_estimators': [50, 100, 150, 200],
    'min_samples_leaf': [2, 3, 4, 5],
    'max_leaf_nodes': [None, 2, 3, 5]
}
gs = GridSearchCV(rf, 
                  rf_params,
                  cv = 5,
                  verbose=1)
gs.fit(X_train, y_train)
print(f'Training r2 score: {gs.score(X_train, y_train)}')
print(f'Testing r2 score: {gs.score(X_test, y_test)}')
print(f'Testing MSE: {metrics.mean_squared_error(y_test, gs.predict(X_test))}')

Fitting 5 folds for each of 64 candidates, totalling 320 fits
Training r2 score: 0.6053014287935046
Testing r2 score: 0.1750180240223832
Testing MSE: 779.3780718286123


In [32]:
rf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [33]:
gs.best_params_

{'max_leaf_nodes': None, 'min_samples_leaf': 5, 'n_estimators': 200}