In [70]:
# importing libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error, r2_score
import warnings

warnings.filterwarnings("ignore")

In [71]:
# importing the cleaned dataset
imploded_df = pd.read_csv('../data/processed/cleaned_data_imploded.csv')

In [72]:
z_scores = pd.DataFrame()
for col in imploded_df.columns:
    if col not in ['track_id', 'track_genre']:
        z_scores[col] = stats.zscore(imploded_df[col])

threshold = 3

outlier_indices = (z_scores > threshold).any(axis=1)
imploded_df = imploded_df[~outlier_indices]


In [73]:
imploded_df

Unnamed: 0,track_id,track_genre,popularity,duration_s,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
1,000CC8EParg64OmTxVnZ0p,club,47,322.933,0,0.269,0.51600,0,-7.361,1,0.0366,0.406000,0.000000,0.1170,0.341,178.174,4
2,000Iz0K615UepwSJ5z2RE5,minimal-techno,22,515.360,0,0.686,0.56000,5,-13.264,0,0.0462,0.001140,0.181000,0.1110,0.108,119.997,4
3,000RDCYioLteXcutOjeweY,hip-hop,62,190.203,0,0.679,0.77000,0,-3.537,1,0.1900,0.058300,0.000000,0.0825,0.839,161.721,4
4,000qpdoc97IMTBvF8gwcpy,minimal-techno,19,331.240,0,0.519,0.43100,6,-13.606,0,0.0291,0.000964,0.720000,0.0916,0.234,129.971,4
6,001APMDOl3qtx1526T11n1,"chill,soul",0,176.320,0,0.613,0.47100,1,-6.644,0,0.1070,0.316000,0.000001,0.1170,0.406,143.064,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88890,7zxHiMmVLt4LGWpOMqOpUh,pop-film,56,325.156,0,0.766,0.38200,7,-11.464,0,0.0324,0.698000,0.001430,0.1570,0.672,119.992,4
88891,7zxpdh3EqMq2JCkOI0EqcG,disney,23,109.573,0,0.529,0.00879,10,-32.266,1,0.0587,0.996000,0.959000,0.0916,0.510,82.694,4
88892,7zyYmIdjqqiX6kLryb7QBx,mandopop,61,260.573,0,0.423,0.36000,3,-9.458,1,0.0372,0.728000,0.000000,0.1050,0.291,130.576,4
88893,7zybSU9tFO9HNlwmGF7stc,electronic,54,234.300,0,0.649,0.83400,10,-11.430,0,0.0397,0.268000,0.932000,0.0974,0.150,125.004,4


In [74]:
unique_genres = set()
for genres in imploded_df['track_genre'].str.split(','):
    unique_genres.update(genres)

for genre in unique_genres:
    encoded_genre = genre.replace(' ', '_')
    encoded_genre = ''.join(e for e in encoded_genre if e.isalnum() or e == '_')
    imploded_df[encoded_genre] = imploded_df['track_genre'].str.contains(genre).astype(int)

imploded_df.drop(columns=['track_genre'], inplace=True)

In [75]:
imploded_df

Unnamed: 0,track_id,popularity,duration_s,explicit,danceability,energy,key,loudness,mode,speechiness,...,alternative,german,british,jdance,funk,newage,grindcore,goth,triphop,latin
1,000CC8EParg64OmTxVnZ0p,47,322.933,0,0.269,0.51600,0,-7.361,1,0.0366,...,0,0,0,0,0,0,0,0,0,0
2,000Iz0K615UepwSJ5z2RE5,22,515.360,0,0.686,0.56000,5,-13.264,0,0.0462,...,0,0,0,0,0,0,0,0,0,0
3,000RDCYioLteXcutOjeweY,62,190.203,0,0.679,0.77000,0,-3.537,1,0.1900,...,0,0,0,0,0,0,0,0,0,0
4,000qpdoc97IMTBvF8gwcpy,19,331.240,0,0.519,0.43100,6,-13.606,0,0.0291,...,0,0,0,0,0,0,0,0,0,0
6,001APMDOl3qtx1526T11n1,0,176.320,0,0.613,0.47100,1,-6.644,0,0.1070,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88890,7zxHiMmVLt4LGWpOMqOpUh,56,325.156,0,0.766,0.38200,7,-11.464,0,0.0324,...,0,0,0,0,0,0,0,0,0,0
88891,7zxpdh3EqMq2JCkOI0EqcG,23,109.573,0,0.529,0.00879,10,-32.266,1,0.0587,...,0,0,0,0,0,0,0,0,0,0
88892,7zyYmIdjqqiX6kLryb7QBx,61,260.573,0,0.423,0.36000,3,-9.458,1,0.0372,...,0,0,0,0,0,0,0,0,0,0
88893,7zybSU9tFO9HNlwmGF7stc,54,234.300,0,0.649,0.83400,10,-11.430,0,0.0397,...,0,0,0,0,0,0,0,0,0,0


In [76]:
# split train and test
imploded_df.drop(columns=['track_id'], inplace=True)

X = imploded_df.drop(columns=['danceability'])
y = imploded_df['danceability']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [77]:
models = [RandomForestRegressor(n_jobs=-1),
          GradientBoostingRegressor(),
          AdaBoostRegressor(),
          LinearRegression(),
          Lasso(),
          Ridge()]

for model in models:
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, n_jobs=-1)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    model_name = model.__class__.__name__
    print(f"{model_name} Mean Squared Error: {mse:.3f}")
    print(f"{model_name} R-squared: {r2:.3f}")
    print("\n")


RandomForestRegressor Mean Squared Error: 0.009
RandomForestRegressor R-squared: 0.692


GradientBoostingRegressor Mean Squared Error: 0.011
GradientBoostingRegressor R-squared: 0.614


AdaBoostRegressor Mean Squared Error: 0.017
AdaBoostRegressor R-squared: 0.440


LinearRegression Mean Squared Error: 0.013
LinearRegression R-squared: 0.559


Lasso Mean Squared Error: 0.030
Lasso R-squared: -0.000


Ridge Mean Squared Error: 0.013
Ridge R-squared: 0.559


