In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report, accuracy_score, f1_score
from sklearn.metrics import mean_squared_error, recall_score, r2_score
from sklearn import metrics
from sklearn.metrics import roc_curve, auc, roc_auc_score
import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.neural_network import MLPRegressor

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Stage 1: Data Preprocessing
# Loading the dataset with 'ISO-8859-1' encoding
df_path = "/content/drive/MyDrive/Spotify_ML/spotify-2023.csv"
df = pd.read_csv(df_path, encoding='ISO-8859-1')
df

Unnamed: 0,track_name,artist(s)_name,artist_count,released_year,released_month,released_day,in_spotify_playlists,in_spotify_charts,streams,in_apple_playlists,...,bpm,key,mode,danceability_%,valence_%,energy_%,acousticness_%,instrumentalness_%,liveness_%,speechiness_%
0,Seven (feat. Latto) (Explicit Ver.),"Latto, Jung Kook",2,2023,7,14,553,147,141381703,43,...,125,B,Major,80,89,83,31,0,8,4
1,LALA,Myke Towers,1,2023,3,23,1474,48,133716286,48,...,92,C#,Major,71,61,74,7,0,10,4
2,vampire,Olivia Rodrigo,1,2023,6,30,1397,113,140003974,94,...,138,F,Major,51,32,53,17,0,31,6
3,Cruel Summer,Taylor Swift,1,2019,8,23,7858,100,800840817,116,...,170,A,Major,55,58,72,11,0,11,15
4,WHERE SHE GOES,Bad Bunny,1,2023,5,18,3133,50,303236322,84,...,144,A,Minor,65,23,80,14,63,11,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
948,My Mind & Me,Selena Gomez,1,2022,11,3,953,0,91473363,61,...,144,A,Major,60,24,39,57,0,8,3
949,Bigger Than The Whole Sky,Taylor Swift,1,2022,10,21,1180,0,121871870,4,...,166,F#,Major,42,7,24,83,1,12,6
950,A Veces (feat. Feid),"Feid, Paulo Londra",2,2022,11,3,573,0,73513683,2,...,92,C#,Major,80,81,67,4,0,8,6
951,En La De Ella,"Feid, Sech, Jhayco",3,2022,10,20,1320,0,133895612,29,...,97,C#,Major,82,67,77,8,0,12,5


In [3]:
df.info()
df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 953 entries, 0 to 952
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   track_name            953 non-null    object
 1   artist(s)_name        953 non-null    object
 2   artist_count          953 non-null    int64 
 3   released_year         953 non-null    int64 
 4   released_month        953 non-null    int64 
 5   released_day          953 non-null    int64 
 6   in_spotify_playlists  953 non-null    int64 
 7   in_spotify_charts     953 non-null    int64 
 8   streams               953 non-null    object
 9   in_apple_playlists    953 non-null    int64 
 10  in_apple_charts       953 non-null    int64 
 11  in_deezer_playlists   953 non-null    object
 12  in_deezer_charts      953 non-null    int64 
 13  in_shazam_charts      903 non-null    object
 14  bpm                   953 non-null    int64 
 15  key                   858 non-null    ob

track_name               0
artist(s)_name           0
artist_count             0
released_year            0
released_month           0
released_day             0
in_spotify_playlists     0
in_spotify_charts        0
streams                  0
in_apple_playlists       0
in_apple_charts          0
in_deezer_playlists      0
in_deezer_charts         0
in_shazam_charts        50
bpm                      0
key                     95
mode                     0
danceability_%           0
valence_%                0
energy_%                 0
acousticness_%           0
instrumentalness_%       0
liveness_%               0
speechiness_%            0
dtype: int64

In [4]:
# Combining release year, month, day into one column
df['release_date'] = pd.to_datetime(df['released_year'].astype(str) + '-' +
                                      df['released_month'].astype(str) + '-' +
                                      df['released_day'].astype(str), errors='coerce')
df.drop(['released_year', 'released_month', 'released_day'], axis=1, inplace=True)

# Check the new 'release_date' column
df.head(20)

Unnamed: 0,track_name,artist(s)_name,artist_count,in_spotify_playlists,in_spotify_charts,streams,in_apple_playlists,in_apple_charts,in_deezer_playlists,in_deezer_charts,...,key,mode,danceability_%,valence_%,energy_%,acousticness_%,instrumentalness_%,liveness_%,speechiness_%,release_date
0,Seven (feat. Latto) (Explicit Ver.),"Latto, Jung Kook",2,553,147,141381703,43,263,45,10,...,B,Major,80,89,83,31,0,8,4,2023-07-14
1,LALA,Myke Towers,1,1474,48,133716286,48,126,58,14,...,C#,Major,71,61,74,7,0,10,4,2023-03-23
2,vampire,Olivia Rodrigo,1,1397,113,140003974,94,207,91,14,...,F,Major,51,32,53,17,0,31,6,2023-06-30
3,Cruel Summer,Taylor Swift,1,7858,100,800840817,116,207,125,12,...,A,Major,55,58,72,11,0,11,15,2019-08-23
4,WHERE SHE GOES,Bad Bunny,1,3133,50,303236322,84,133,87,15,...,A,Minor,65,23,80,14,63,11,6,2023-05-18
5,Sprinter,"Dave, Central Cee",2,2186,91,183706234,67,213,88,17,...,C#,Major,92,66,58,19,0,8,24,2023-06-01
6,Ella Baila Sola,"Eslabon Armado, Peso Pluma",2,3090,50,725980112,34,222,43,13,...,F,Minor,67,83,76,48,0,8,3,2023-03-16
7,Columbia,Quevedo,1,714,43,58149378,25,89,30,13,...,F,Major,67,26,71,37,0,11,4,2023-07-07
8,fukumean,Gunna,1,1096,83,95217315,60,210,48,11,...,C#,Minor,85,22,62,12,0,28,9,2023-05-15
9,La Bebe - Remix,"Peso Pluma, Yng Lvcas",2,2953,44,553634067,49,110,66,13,...,D,Minor,81,56,48,21,0,8,33,2023-03-17


In [5]:
# Converting Streams, in_shazam_charts, and in_deezer_playlists to integers
df['in_deezer_playlists'] = pd.to_numeric(df['in_deezer_playlists'].replace(',', '', regex=True), errors='coerce')
df['in_shazam_charts'] = pd.to_numeric(df['in_shazam_charts'].replace(',', '', regex=True), errors='coerce')
df['streams'] = pd.to_numeric(df['streams'], errors='coerce')

df.head(30)


Unnamed: 0,track_name,artist(s)_name,artist_count,in_spotify_playlists,in_spotify_charts,streams,in_apple_playlists,in_apple_charts,in_deezer_playlists,in_deezer_charts,...,key,mode,danceability_%,valence_%,energy_%,acousticness_%,instrumentalness_%,liveness_%,speechiness_%,release_date
0,Seven (feat. Latto) (Explicit Ver.),"Latto, Jung Kook",2,553,147,141381700.0,43,263,45,10,...,B,Major,80,89,83,31,0,8,4,2023-07-14
1,LALA,Myke Towers,1,1474,48,133716300.0,48,126,58,14,...,C#,Major,71,61,74,7,0,10,4,2023-03-23
2,vampire,Olivia Rodrigo,1,1397,113,140004000.0,94,207,91,14,...,F,Major,51,32,53,17,0,31,6,2023-06-30
3,Cruel Summer,Taylor Swift,1,7858,100,800840800.0,116,207,125,12,...,A,Major,55,58,72,11,0,11,15,2019-08-23
4,WHERE SHE GOES,Bad Bunny,1,3133,50,303236300.0,84,133,87,15,...,A,Minor,65,23,80,14,63,11,6,2023-05-18
5,Sprinter,"Dave, Central Cee",2,2186,91,183706200.0,67,213,88,17,...,C#,Major,92,66,58,19,0,8,24,2023-06-01
6,Ella Baila Sola,"Eslabon Armado, Peso Pluma",2,3090,50,725980100.0,34,222,43,13,...,F,Minor,67,83,76,48,0,8,3,2023-03-16
7,Columbia,Quevedo,1,714,43,58149380.0,25,89,30,13,...,F,Major,67,26,71,37,0,11,4,2023-07-07
8,fukumean,Gunna,1,1096,83,95217320.0,60,210,48,11,...,C#,Minor,85,22,62,12,0,28,9,2023-05-15
9,La Bebe - Remix,"Peso Pluma, Yng Lvcas",2,2953,44,553634100.0,49,110,66,13,...,D,Minor,81,56,48,21,0,8,33,2023-03-17


In [6]:
# Checking if relevant columns have been converted to numeric data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 953 entries, 0 to 952
Data columns (total 22 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   track_name            953 non-null    object        
 1   artist(s)_name        953 non-null    object        
 2   artist_count          953 non-null    int64         
 3   in_spotify_playlists  953 non-null    int64         
 4   in_spotify_charts     953 non-null    int64         
 5   streams               952 non-null    float64       
 6   in_apple_playlists    953 non-null    int64         
 7   in_apple_charts       953 non-null    int64         
 8   in_deezer_playlists   953 non-null    int64         
 9   in_deezer_charts      953 non-null    int64         
 10  in_shazam_charts      903 non-null    float64       
 11  bpm                   953 non-null    int64         
 12  key                   858 non-null    object        
 13  mode                

In [7]:
df.isnull().sum()

track_name               0
artist(s)_name           0
artist_count             0
in_spotify_playlists     0
in_spotify_charts        0
streams                  1
in_apple_playlists       0
in_apple_charts          0
in_deezer_playlists      0
in_deezer_charts         0
in_shazam_charts        50
bpm                      0
key                     95
mode                     0
danceability_%           0
valence_%                0
energy_%                 0
acousticness_%           0
instrumentalness_%       0
liveness_%               0
speechiness_%            0
release_date             0
dtype: int64

In [None]:
# Handling missing values using Simple Imputer for columns: in_shazam_charts, key

imputer_num = SimpleImputer(strategy='mean')
df['in_shazam_charts'] = imputer_num.fit_transform(df[['in_shazam_charts']])

imputer_cat = SimpleImputer(strategy='most_frequent')
df[['key']] = imputer_cat.fit_transform(df[['key']])

df.isnull().sum()

track_name              0
artist(s)_name          0
artist_count            0
in_spotify_playlists    0
in_spotify_charts       0
streams                 1
in_apple_playlists      0
in_apple_charts         0
in_deezer_playlists     0
in_deezer_charts        0
in_shazam_charts        0
bpm                     0
key                     0
mode                    0
danceability_%          0
valence_%               0
energy_%                0
acousticness_%          0
instrumentalness_%      0
liveness_%              0
speechiness_%           0
release_date            0
dtype: int64

In [None]:
# Handling missing values for stream
# Remove the row with NaN value in 'streams' column
df = df.dropna(subset=['streams'])

# Verify if the row has been removed
df.isnull().sum()

track_name              0
artist(s)_name          0
artist_count            0
in_spotify_playlists    0
in_spotify_charts       0
streams                 0
in_apple_playlists      0
in_apple_charts         0
in_deezer_playlists     0
in_deezer_charts        0
in_shazam_charts        0
bpm                     0
key                     0
mode                    0
danceability_%          0
valence_%               0
energy_%                0
acousticness_%          0
instrumentalness_%      0
liveness_%              0
speechiness_%           0
release_date            0
dtype: int64

In [None]:
# Check if there's duplicate rows
duplicates = df.duplicated()

# Remove duplicate rows
df = df[~duplicates]

In [None]:
#Encoding categorical variables
from sklearn.preprocessing import LabelEncoder

label_enc = LabelEncoder()
df['key'] = label_enc.fit_transform(df['key'])
df['mode'] = label_enc.fit_transform(df['mode'])


# One-hot encoding for non-ordinal categorical features
df = pd.get_dummies(df, columns=['track_name', 'artist(s)_name'])

In [None]:
df.to_csv('/content/drive/MyDrive/Spotify_ML/spotify_updated_data.csv')

In [None]:
# Machine Learning Stage

# Split the data into features and target - only using columns which would be relevant to the target variable
x = df[['artist_count', 'in_spotify_playlists', 'in_spotify_charts', 'in_apple_playlists',
    'in_apple_charts', 'in_deezer_playlists', 'in_deezer_charts', 'in_shazam_charts',
    'bpm', 'key', 'danceability_%', 'valence_%', 'energy_%', 'acousticness_%',
    'instrumentalness_%', 'liveness_%', 'speechiness_%']]
y = df['streams']

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Feature Scaling
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.fit_transform(x_test)

In [None]:
# Initialize RandomForestRegressor with specified parameters
rf_model = RandomForestRegressor(n_estimators=900, max_depth=22, random_state=42)

# Fit the model on the training data
rf_model.fit(x_train, y_train)

# Predict on the test set
y_pred = rf_model.predict(x_test)

# Calculate metrics - Adjusted R-Squared, R-Squared, RMSE
r2 = r2_score(y_test, y_pred)
adj_r2 = 1 - (1 - r2) * ((len(y_test) - 1) / (len(y_test) - x_test.shape[1] - 1))
rmse = mean_squared_error(y_test, y_pred, squared=False)

# Print the metrics
print(f"Adjusted R-Squared: {adj_r2:.4f}")
print(f"R-Squared: {r2:.4f}")
print(f"RMSE: {rmse:.2f}")

Adjusted R-Squared: 0.7386
R-Squared: 0.7620
RMSE: 241386934.93


In [None]:
# Initialize GradientBoostingRegressor with specified parameters
gb_model = GradientBoostingRegressor(n_estimators=900, max_depth=22, random_state=42)

# Fit the model on the training data
gb_model.fit(x_train, y_train)

# Predict on the test set
y_pred = gb_model.predict(x_test)

# Calculate metrics - Adjusted R-Squared, R-Squared, RMSE
r2 = r2_score(y_test, y_pred)
adj_r2 = 1 - (1 - r2) * ((len(y_test) - 1) / (len(y_test) - x_test.shape[1] - 1))
rmse = mean_squared_error(y_test, y_pred, squared=False)

# Print the metrics
print(f"Adjusted R-Squared: {adj_r2:.4f}")
print(f"R-Squared: {r2:.4f}")
print(f"RMSE: {rmse:.2f}")

Adjusted R-Squared: 0.6273
R-Squared: 0.6607
RMSE: 288207318.59


In [None]:
# Initialize LinearRegressor with specified parameters
lr_model = LogisticRegression(random_state=42)

# Fit the model on the training data
lr_model.fit(x_train, y_train)

# Predict on the test set
y_pred = lr_model.predict(x_test)

# Calculate metrics - Adjusted R-Squared, R-Squared, RMSE
r2 = r2_score(y_test, y_pred)
adj_r2 = 1 - (1 - r2) * ((len(y_test) - 1) / (len(y_test) - x_test.shape[1] - 1))
rmse = mean_squared_error(y_test, y_pred, squared=False)

# Print the metrics
print(f"Adjusted R-Squared: {adj_r2:.4f}")
print(f"R-Squared: {r2:.4f}")
print(f"RMSE: {rmse:.2f}")

Adjusted R-Squared: 0.2894
R-Squared: 0.3530
RMSE: 397982972.70


In [None]:
# Initialize XGBoost Regression with specified parameters
xgb_model = xgb.XGBRegressor(n_estimators=900, max_depth=22, random_state=42)

# Fit the model on the training data
xgb_model.fit(x_train, y_train)

# Predict on the test set
y_pred = xgb_model.predict(x_test)

# Calculate metrics - Adjusted R-Squared, R-Squared, RMSE
r2 = r2_score(y_test, y_pred)
adj_r2 = 1 - (1 - r2) * ((len(y_test) - 1) / (len(y_test) - x_test.shape[1] - 1))
rmse = mean_squared_error(y_test, y_pred, squared=False)

# Print the metrics
print(f"Adjusted R-Squared: {adj_r2:.4f}")
print(f"R-Squared: {r2:.4f}")
print(f"RMSE: {rmse:.2f}")

Adjusted R-Squared: 0.7158
R-Squared: 0.7412
RMSE: 251684937.82


In [None]:
# Initialize Neural Network Regression with specified parameters
nn_model = MLPRegressor(random_state=42)

# Fit the model on the training data
nn_model.fit(x_train, y_train)

# Predict on the test set
y_pred = nn_model.predict(x_test)

# Calculate metrics - Adjusted R-Squared, R-Squared, RMSE
r2 = r2_score(y_test, y_pred)
adj_r2 = 1 - (1 - r2) * ((len(y_test) - 1) / (len(y_test) - x_test.shape[1] - 1))
rmse = mean_squared_error(y_test, y_pred, squared=False)

# Print the metrics
print(f"Adjusted R-Squared: {adj_r2:.4f}")
print(f"R-Squared: {r2:.4f}")
print(f"RMSE: {rmse:.2f}")

Adjusted R-Squared: -1.0974
R-Squared: -0.9097
RMSE: 683726831.40
