In [None]:
## Importing Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
import scipy.stats as stats
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [None]:
## Loading in Data
# The cleaned dataset is read from an absolute path.
# NOTE(review): '/content/...' looks like a Google Colab location - adjust when running elsewhere.
spotify_csv_path = '/content/spotify_clean.csv'
spotify = pd.read_csv(spotify_csv_path)
spotify.head()

Unnamed: 0,popularity,duration_ms,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,explicit_True,key_Ab,key_B,key_Bb,key_C,key_D,key_Db,key_E,key_Eb,key_F,key_G,key_Gb,mode_Minor,track_genre_2.0,track_genre_3.0,track_genre_4.0,track_genre_5.0
0,1.727981,0.217042,0.59938,-0.827804,0.143116,1.483006,-0.933744,-0.593842,1.261167,0.935621,-1.15883,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0
1,0.975759,-0.994192,-0.886325,-1.675387,-1.717659,0.507178,1.492172,-0.593753,-0.701486,-0.779562,-1.547171,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0
2,1.060686,-0.044824,-0.792279,-1.158407,-0.621018,-0.05998,-0.018783,-0.593862,-0.475311,-1.414828,-1.590995,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0
3,1.645697,-0.168963,-1.628365,-1.904202,-1.8266,-0.793222,1.471943,-0.592475,-0.282743,-1.312492,1.890019,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0
4,2.09452,-0.212977,0.234509,-0.889175,-0.60951,-0.162439,0.778256,-0.593862,-0.986411,-1.206921,-0.034733,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0


In [None]:
## Separating the target from the features
# `popularity` is the regression target; every remaining column is used as a predictor.
y = spotify['popularity']
X = spotify.drop(columns=['popularity'])

In [None]:
## Splitting data into training and testing
# 80/20 hold-out split; the fixed random_state keeps the partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Random Forest Hyperparameter Tuning

In [None]:
## Base random forest regressor (not fitted here)
# Only instantiated in this cell; it serves as the template estimator for the grid search.
rf_regressor = RandomForestRegressor(random_state=42)

In [None]:
## Parameters to hypertune
# Two candidate values per hyperparameter -> 2**5 = 32 combinations in total.
param_grid_rf = dict(
    max_depth=[10, 20],
    min_samples_split=[20, 50],
    min_samples_leaf=[10, 40],
    n_estimators=[50, 100],
    max_features=['sqrt', 'log2'],
)

In [None]:
## Grid Searching
# Exhaustive 5-fold cross-validated search over param_grid_rf, scored by negative MSE
# (scikit-learn maximises scores, so the least-negative value wins).
# n_jobs=-1 runs the independent candidate/fold fits on all available cores instead of
# serially; the selected parameters are unchanged because rf_regressor has a fixed
# random_state.
grid_search_rf = GridSearchCV(
    estimator=rf_regressor,
    param_grid=param_grid_rf,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)
grid_search_rf.fit(X_train, y_train)

# Keep the winning combination for the final refit and show it as the cell output.
best_params_rf = grid_search_rf.best_params_
best_params_rf


{'max_depth': 20,
 'max_features': 'sqrt',
 'min_samples_leaf': 10,
 'min_samples_split': 20,
 'n_estimators': 100}

In [None]:
## Final model: refit on the whole training split with the grid-search winners
# Same seed as the base estimator used during the search.
rf_model = RandomForestRegressor(random_state=42, **best_params_rf)
rf_model.fit(X_train, y_train)

In [None]:
# Predict the target for the held-out test features with the refit forest.
rf_preds = rf_model.predict(X_test)

In [None]:
def calculate_regression_metrics(y_true, y_pred):
    """Compute standard regression error metrics for a set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.

    Returns
    -------
    tuple
        ``(mae, mse, rmse, r2)`` in that order, where ``rmse`` is the square
        root of ``mse``.
    """
    squared_error = mean_squared_error(y_true, y_pred)
    return (
        mean_absolute_error(y_true, y_pred),
        squared_error,
        np.sqrt(squared_error),  # RMSE derived from the MSE computed above
        r2_score(y_true, y_pred),
    )


In [None]:
## Test-set metrics for the tuned random forest
# Labels are listed in the same order calculate_regression_metrics returns its values.
# NOTE(review): popularity appears standardized in the displayed head, so MAE/MSE/RMSE are
# presumably in standardized units - confirm.
metric_labels = ['MAE', 'MSE', 'RMSE', 'R2']
metric_values = calculate_regression_metrics(y_test, rf_preds)
metrics = pd.DataFrame({'Metrics': metric_labels, 'Values': metric_values})
metrics

Unnamed: 0,Metrics,Values
0,MAE,0.670699
1,MSE,0.686922
2,RMSE,0.828808
3,R2,0.311728
