In [24]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import pickle

In [25]:
df= pd.read_csv('spotify_data_clean.csv')
df.head()                
df.dtypes

track_id               object
track_name             object
track_number            int64
track_popularity        int64
explicit                 bool
artist_name            object
artist_popularity       int64
artist_followers        int64
artist_genres          object
album_id               object
album_name             object
album_release_date     object
album_total_tracks      int64
album_type             object
track_duration_min    float64
dtype: object

In [26]:
#check for nulls in the values
df.isna().sum()

track_id                 0
track_name               0
track_number             0
track_popularity         0
explicit                 0
artist_name              3
artist_popularity        0
artist_followers         0
artist_genres         3361
album_id                 0
album_name               0
album_release_date       0
album_total_tracks       0
album_type               0
track_duration_min       0
dtype: int64

In [27]:
#fill artist_genres columns with 'N/A'
df['artist_genres']= df['artist_genres'].fillna('NA')

In [8]:
#ML Task: Create and evaluate models to predict track popularity
# Filter the dataset with the best predictor columns and reduce noise
#feature/target column: track_popularity
#predictor colums: artist_popularity,artist_followers,explicit,album_total_tracks,track_duration_min,artist_genres (encoded),
#release_year (converted to year or track age)

In [28]:
#format album_release_date to year format to a release year column
df['album_release_date'] = pd.to_datetime(df['album_release_date'], errors='coerce')
df['release_year'] = df['album_release_date'].dt.year

In [29]:
#create a new dataset for the training full data
df_train_full = df [['track_popularity','artist_popularity','artist_followers','explicit','album_total_tracks','track_duration_min','artist_genres','release_year']]

In [30]:
df_train_full.dtypes

track_popularity        int64
artist_popularity       int64
artist_followers        int64
explicit                 bool
album_total_tracks      int64
track_duration_min    float64
artist_genres          object
release_year            int32
dtype: object

In [31]:
#identify the categorial and numerical columns
#do the train, test and validate split with random_state =42
#remove the target columns from the dataset
num_cols = df_train_full.select_dtypes(include=['number']).columns
cat_cols = df_train_full.select_dtypes(include=['object', 'category']).columns
df_train_data_full, df_test = train_test_split(df_train_full, test_size=0.2,random_state=42)
df_train, df_val = train_test_split(df_train_data_full, test_size=0.25,random_state=42)
df_train.reset_index(drop=True)
df_val.reset_index(drop=True)
df_test.reset_index(drop=True)
y_train= df_train.track_popularity.values
y_val =df_val.track_popularity.values
y_test =df_test.track_popularity.values
del df_train['track_popularity']
del df_val['track_popularity']
del df_test['track_popularity']

In [32]:
len(df_train), len(df_test), len(df_val)

(5148, 1717, 1717)

In [33]:
cat_cols

Index(['artist_genres'], dtype='object')

In [28]:
#we will be evaluating the best regression model for the task
#We will encode the categorical columns first for train,test and validation

In [43]:
train_dicts = df_train[cat_cols].to_dict(orient='records')
dv = DictVectorizer(sparse=False)
encoded_cat =dv.fit_transform(train_dicts)
encoded_df = pd.DataFrame(encoded_cat, columns=dv.get_feature_names_out(cat_cols))
X_train = pd.concat([df_train[['artist_popularity','artist_followers','explicit','album_total_tracks','track_duration_min','release_year']].reset_index(drop=True),encoded_df.reset_index(drop=True)], axis=1)

In [44]:
val_dicts =  df_val[cat_cols].to_dict(orient='records')
encoded_cat_val =dv.transform(val_dicts)
encoded_df_val = pd.DataFrame(encoded_cat_val, columns=dv.get_feature_names_out(cat_cols))
X_val = pd.concat([df_val[['artist_popularity','artist_followers','explicit','album_total_tracks','track_duration_min','release_year']].reset_index(drop=True),encoded_df_val.reset_index(drop=True)], axis=1)

In [None]:
# train on multiple models and evaluate the scores from those models

In [45]:
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(),
    "XGBoost": XGBRegressor()
}

In [46]:
# breakfix Nan values introduced after one-hot encoding
#cleap up using imputer
X_train.replace(['NA', 'NaN', 'null', ''], np.nan, inplace=True)
X_val.replace(['NA', 'NaN', 'null', ''], np.nan, inplace=True)

In [47]:
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    rmse = mean_squared_error(y_val, preds) ** 0.5
    r2 = r2_score(y_val, preds)
    results[name] = {"model": model,"rmse": rmse,"r2": r2}
    print(f"{name}: RMSE={rmse:.3f}, R2={r2:.3f}")

Linear Regression: RMSE=20.663, R2=0.228
Random Forest: RMSE=19.272, R2=0.328
XGBoost: RMSE=19.311, R2=0.326


In [48]:
# According to the value Random forest is performing the best
#Lowest rmse value and highest r squared value
# Pick best model based on RMSE
best_model_name = min(results, key=lambda x: results[x]["rmse"])
best_model = results[best_model_name]["model"]

print(f"Best model: {best_model_name}")

Best model: Random Forest


In [None]:
#save the mode in a .bin file using pickle

In [49]:
with open ("RandomForest.bin", "wb") as f:
    pickle.dump((dv,best_model),f)

In [42]:
df_test

Unnamed: 0,artist_popularity,artist_followers,explicit,album_total_tracks,track_duration_min,artist_genres,release_year
4951,72,4697897,False,20,3.64,"norwegian pop, art pop",2016
4825,85,47716988,True,18,3.65,"pop, hip hop, country",2016
6741,86,17938214,True,15,5.09,"rap, hip hop",2011
222,20,696,False,4,3.57,,2025
5746,49,764106,True,12,4.85,"groove metal, metal",2014
...,...,...,...,...,...,...,...
1737,75,2223332,False,1,3.10,,2023
1128,79,9676425,True,1,2.34,"uk drill, drill, uk grime",2024
7481,71,5155421,False,13,3.64,"alternative metal, post-grunge, rock, hard rock",2006
4747,44,190329,False,6,3.02,"art pop, baroque pop",2017
