In [2]:
import pandas as pd
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
from sklearn.model_selection import GridSearchCV, cross_validate

In [3]:
# Load the pre-cleaned Spotify dataset from the working directory.
# NOTE(review): the preview of this frame shows an 'Unnamed: 0' column holding
# 0, 1, 2, ... - presumably a row index saved by an earlier to_csv() call;
# confirm, and consider reading with index_col=0 so it is not treated as data.
spotify = pd.read_csv("spotify_clean.csv")

In [3]:
# Preview the first rows to sanity-check the columns that were loaded.
spotify.head()

Unnamed: 0,popularity,duration_ms,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,...,key_E,key_Eb,key_F,key_G,key_Gb,mode_Minor,track_genre_2.0,track_genre_3.0,track_genre_4.0,track_genre_5.0
0,1.727981,0.217042,0.59938,-0.827804,0.143116,1.483006,-0.933744,-0.593842,1.261167,0.935621,...,0,0,0,0,0,1,0,0,1,0
1,0.975759,-0.994192,-0.886325,-1.675387,-1.717659,0.507178,1.492172,-0.593753,-0.701486,-0.779562,...,0,0,0,0,0,0,0,0,1,0
2,1.060686,-0.044824,-0.792279,-1.158407,-0.621018,-0.05998,-0.018783,-0.593862,-0.475311,-1.414828,...,0,0,0,0,0,0,0,0,1,0
3,1.645697,-0.168963,-1.628365,-1.904202,-1.8266,-0.793222,1.471943,-0.592475,-0.282743,-1.312492,...,0,0,0,0,0,0,0,0,1,0
4,2.09452,-0.212977,0.234509,-0.889175,-0.60951,-0.162439,0.778256,-0.593862,-0.986411,-1.206921,...,0,0,0,0,0,0,0,0,1,0


In [4]:
# Build the feature matrix and the target vector.
# Besides the target, drop 'Unnamed: 0': the data preview shows it holding
# 0, 1, 2, ..., i.e. it looks like a row index written out by an earlier
# to_csv() call. Left in, the model can learn from row order instead of from
# audio features. errors='ignore' keeps this cell working if the CSV is later
# re-exported without that column.
X = spotify.drop(columns=['popularity', 'Unnamed: 0'], errors='ignore')
y = spotify['popularity']

# Hold out 20% of the rows for the final test; the fixed seed makes the split
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 33)

In [5]:
# Base XGBoost regressor constructed with no arguments (library defaults).
model = XGBRegressor()

In [None]:
# Candidate hyperparameters: 3 x 3 x 3 = 27 combinations in total.
param_grid = dict(
    n_estimators=[100, 200, 300],     # number of boosted trees
    learning_rate=[0.01, 0.05, 0.1],  # step size
    max_depth=[3, 6, 9],              # depth of each tree
)

# Exhaustive search with 10-fold cross-validation on the training split,
# ranking candidates by R². fit() returns the fitted search object itself.
grid_search = GridSearchCV(model, param_grid, scoring='r2', cv=10).fit(X_train, y_train)

print("Best hyperparameters:", grid_search.best_params_)

# Take the best estimator found by the grid search and score it on the
# held-out test split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print(f"Test Mean Squared Error (MSE): {mse}")
print(f"Test R²: {r2}")

# Cross-validate the tuned model on the full dataset with 10 folds.
#
# Use scikit-learn's built-in scorer names. The previous
# make_scorer(mean_squared_error) defaulted to greater_is_better=True, so the
# values stored under "neg_mean_squared_error" were actually *positive* MSE;
# negating them when printing produced a negative error and a negative
# standard deviation.
scoring = {
    "neg_mean_squared_error": "neg_mean_squared_error",
    "r2": "r2",
}

# NOTE(review): an integer cv on a regressor means consecutive, unshuffled
# folds. If the rows of the CSV are ordered (the preview's first rows all share
# one genre flag), consider passing a shuffled KFold here - confirm.
cv_results = cross_validate(best_model, X, y, cv=10, scoring=scoring, return_train_score=False)

neg_mse_scores = cv_results['test_neg_mean_squared_error']
r2_scores = cv_results['test_r2']

print(f"{best_model} Cross-Validation Results:")
# Flip the sign of the mean to report MSE; a standard deviation is unaffected
# by negating the scores, so it is printed as-is.
print(f"Mean Squared Error: {-neg_mse_scores.mean():.4f} ± {neg_mse_scores.std():.4f}")
print(f"R²: {r2_scores.mean():.4f} ± {r2_scores.std():.4f}")


Best hyperparameters: {'colsample_bytree': 1.0, 'gamma': 0.1, 'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 300, 'subsample': 0.9}
Test Mean Squared Error (MSE): 0.5211755508090399
Test R²: 0.4782346248992069
XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=1.0, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=0.1, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.1, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=9, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=300, n_jobs=None,
             num_parallel_tree=None, random_state=None, ...) Cross-Validation Results:
Mean Squared Erro