## Cross-validation y mejora de modelos de ML

In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
from sklearn import linear_model, tree, ensemble

In [16]:
# Load the cleaned Spotify spreadsheet (first column becomes the index) and
# drop the columns the original note describes as string columns.
df = (
    pd.read_excel("spotify_dataset_clean.xlsx", index_col=[0])
    .drop(columns=['Genre', 'Title', 'Artist', 'explicit', 'release_date', 'duration_ms'])
)
# Show the dtype of every remaining column.
df.dtypes

popularity               int64
danceability           float64
energy                 float64
key                      int64
loudness               float64
mode                     int64
speechiness            float64
acousticness           float64
instrumentalness       float64
liveness               float64
valence                float64
tempo                  float64
time_signature           int64
release_year             int64
years_since_release      int64
dtype: object

In [17]:
# Preview the first five rows of the frame.
df.head(n=5)

Unnamed: 0,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,release_year,years_since_release
0,75,0.489,0.724,5,-8.367,1,0.0352,0.313,0.185,0.287,0.15,117.292,4,1971,51
1,78,0.377,0.681,7,-8.039,1,0.0298,0.00088,0.0023,0.0504,0.285,108.789,4,1976,46
2,76,0.333,0.927,9,-8.55,0,0.0733,0.0029,0.000208,0.297,0.385,141.466,4,1976,46
3,78,0.572,0.835,0,-6.219,1,0.0317,0.171,0.000377,0.0702,0.795,129.981,4,1984,38
4,79,0.338,0.34,9,-12.049,0,0.0339,0.58,0.0032,0.116,0.197,82.433,4,1971,50


In [18]:
# Target variable: the 'popularity' column of df.
y = df["popularity"]
# Preview the first five target values.
y.head(n=5)

0    75
1    78
2    76
3    78
4    79
Name: popularity, dtype: int64

In [19]:
# Input data: every column of df except the target 'popularity'.
# X is built from df without reassigning df. Overwriting df with the dropped
# frame made this cell fail with a KeyError when executed a second time, and
# also broke re-running the target-variable cell (df no longer had the
# 'popularity' column). drop() returns a new frame, so df itself is unchanged.
X = df.drop(columns=['popularity'])
X.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,release_year,years_since_release
0,0.489,0.724,5,-8.367,1,0.0352,0.313,0.185,0.287,0.15,117.292,4,1971,51
1,0.377,0.681,7,-8.039,1,0.0298,0.00088,0.0023,0.0504,0.285,108.789,4,1976,46
2,0.333,0.927,9,-8.55,0,0.0733,0.0029,0.000208,0.297,0.385,141.466,4,1976,46
3,0.572,0.835,0,-6.219,1,0.0317,0.171,0.000377,0.0702,0.795,129.981,4,1984,38
4,0.338,0.34,9,-12.049,0,0.0339,0.58,0.0032,0.116,0.197,82.433,4,1971,50


In [27]:
# Report the dimensions of the feature matrix and of the target vector.
shape_message = "Shape of input data: {} and shape of target variable: {}"
print(shape_message.format(X.shape, y.shape))

Shape of input data: (5807, 14) and shape of target variable: (5807,)


#### Model Score Using KFold

In [20]:
# Split the data into 5 shuffled folds; random_state fixes the shuffle so the
# folds are the same on every run.
# This 'kf' (KFold splitting strategy) object is passed as `cv` to
# cross_val_score() in the cells that follow.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# split() generates the train/test index arrays for each fold.
# enumerate(start=1) numbers the folds instead of a hand-maintained counter.
for cnt, (train_index, test_index) in enumerate(kf.split(X, y), start=1):
    print(f'Fold:{cnt}, Train set: {len(train_index)}, Test set:{len(test_index)}')

Fold:1, Train set: 4645, Test set:1162
Fold:2, Train set: 4645, Test set:1162
Fold:3, Train set: 4646, Test set:1161
Fold:4, Train set: 4646, Test set:1161
Fold:5, Train set: 4646, Test set:1161


In [21]:
def rmse(score):
    """Print the root-mean-squared error implied by a negated MSE score.

    Args:
        score: A negated mean-squared-error value, e.g. the mean of the fold
            scores produced with ``scoring="neg_mean_squared_error"``.

    Returns:
        None. The value is printed as ``rmse= <value>`` with two decimals.
    """
    # Flip the sign back to a positive MSE before taking the square root.
    root_error = np.sqrt(-score)
    print(f'rmse= {root_error:.2f}')

In [22]:
# Linear Regression: one negated-MSE score per fold of kf.
score = cross_val_score(
    linear_model.LinearRegression(),
    X,
    y,
    scoring="neg_mean_squared_error",
    cv=kf,
)
print(f'Scores for each fold: {score}')
# Summarize the folds as a single RMSE computed from the mean fold score.
rmse(score.mean())

Scores for each fold: [-176.39626939 -165.59819869 -146.89301507 -166.07655284 -161.73875847]
rmse= 12.78


In [23]:
# Decision Tree Regressor (seeded): one negated-MSE score per fold of kf.
score = cross_val_score(
    tree.DecisionTreeRegressor(random_state=42),
    X,
    y,
    scoring="neg_mean_squared_error",
    cv=kf,
)
print(f'Scores for each fold: {score}')
# Summarize the folds as a single RMSE computed from the mean fold score.
rmse(score.mean())

Scores for each fold: [-307.11531842 -319.5        -287.29371232 -281.33161068 -315.9121447 ]
rmse= 17.38


In [24]:
# Random Forest Regressor (seeded): one negated-MSE score per fold of kf.
score = cross_val_score(
    ensemble.RandomForestRegressor(random_state=42),
    X,
    y,
    scoring="neg_mean_squared_error",
    cv=kf,
)
print(f'Scores for each fold are: {score}')
# Summarize the folds as a single RMSE computed from the mean fold score.
rmse(score.mean())

Scores for each fold are: [-162.13858838 -152.88115818 -135.56844376 -151.53373023 -144.96630026]
rmse= 12.22


#### Model Tuning using KFold 

In [25]:
# Decision Tree Regressor tuning: cross-validate every max_depth from 1 to 10
# on the same kf folds and print the RMSE for each depth.
max_depth = list(range(1, 11))

for val in max_depth:
    score = cross_val_score(
        tree.DecisionTreeRegressor(max_depth=val, random_state=42),
        X,
        y,
        scoring="neg_mean_squared_error",
        cv=kf,
    )
    print(f'For max depth: {val}')
    rmse(score.mean())

For max depth: 1
rmse= 12.95
For max depth: 2
rmse= 12.68
For max depth: 3
rmse= 12.52
For max depth: 4
rmse= 12.53
For max depth: 5
rmse= 12.62
For max depth: 6
rmse= 12.72
For max depth: 7
rmse= 12.91
For max depth: 8
rmse= 13.16
For max depth: 9
rmse= 13.56
For max depth: 10
rmse= 14.05


In [26]:
# Random Forest Regressor tuning: cross-validate forests of 50 to 350 trees
# (step 50) on the same kf folds and print the RMSE for each size.
estimators = list(range(50, 351, 50))

for count in estimators:
    score = cross_val_score(
        ensemble.RandomForestRegressor(n_estimators=count, random_state=42),
        X,
        y,
        scoring="neg_mean_squared_error",
        cv=kf,
    )
    print(f'For estimators: {count}')
    rmse(score.mean())

For estimators: 50
rmse= 12.31
For estimators: 100
rmse= 12.22
For estimators: 150
rmse= 12.21
For estimators: 200
rmse= 12.20
For estimators: 250
rmse= 12.19
For estimators: 300
rmse= 12.20
For estimators: 350
rmse= 12.20
