# Spotify song popularity prediction

Based on the notebook:

https://www.kaggle.com/thomaskonstantin/top-spotify-songs-analysis-modeling-and-prediction

I built and compared several models to predict song popularity.

In [None]:
import os
import pandas as pd
import numpy as np
import random

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from sklearn.preprocessing import LabelEncoder,MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error as mse

import torch
from torch.utils.data import TensorDataset, DataLoader
from torch.autograd import Variable as V

## Data

In [None]:
# Load the Top-50 Spotify 2019 table. The file is read as ISO-8859-1 and the
# column that pandas labels "Unnamed: 0" is used as the row index.
DATA_PATH = "/kaggle/input/top50spotify2019/top50.csv"
df = pd.read_csv(
    DATA_PATH,
    encoding="ISO-8859-1",
    index_col=["Unnamed: 0"],
)
df.head()

In [None]:
# Print a summary of the loaded frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Turn the textual genre into integer labels and store them as a new column
# at position 3, so the genre can be used by numeric code further down.
genre_encoder = LabelEncoder()
Genre_Enc = genre_encoder.fit_transform(df["Genre"])
df.insert(loc=3, column='Genre_Enc', value=Genre_Enc)

## Correlation

In [None]:
# Correlation heatmap of the numeric features.
plt.figure(figsize=(10, 6))

# The frame still contains string columns (track name, artist name, genre).
# Since pandas 2.0, DataFrame.corr() no longer drops such columns silently and
# raises instead, so restrict the correlation to numeric/boolean columns
# explicitly. This works on old and new pandas versions alike.
numeric_df = df.select_dtypes(include=['number', 'bool'])

heatmap = sns.heatmap(numeric_df.corr(), vmin=-1, vmax=1, annot=True, cmap='viridis')

heatmap.set_title('Correlation Heatmap', fontdict={'fontsize': 12}, pad=12)
plt.show()

Popularity is directly correlated with:
* Genre
* Speechiness
* Beats.Per.Minute

It is inversely correlated with:
* Valence

It is not correlated with the other features.

## Model Popularity 

In [None]:
# Target: the popularity score.
# Features: every remaining column except the raw text columns (the genre is
# represented by its label-encoded copy, Genre_Enc).
non_feature_columns = ['Popularity', 'Track.Name', 'Artist.Name', 'Genre']
Y = df['Popularity']
X = df.drop(columns=non_feature_columns)

In [None]:
# Plot popularity in ascending order.
# The whole frame is sorted (not just the y values) so that the hover text of
# each point -- track, artist, genre -- belongs to the song whose popularity
# is drawn at that position. Sorting only the y values would pair every
# popularity value with the metadata of an unrelated row.
df_by_popularity = df.sort_values('Popularity').reset_index(drop=True)

fig = px.line(df_by_popularity, x=df_by_popularity.index, y='Popularity',
              hover_name='Track.Name', hover_data=['Artist.Name', 'Genre'])

fig.update_layout(title='Popularity sorted',
                  xaxis=dict(title='Song index'),
                  yaxis=dict(title='Popularity'))
fig.show()

### Random Forest Regressor

In [None]:
# Random Forest baseline: 10-fold cross-validated MSE, followed by a fit on
# the full data set for the in-sample comparison below.
RFR = RandomForestRegressor(random_state=42, n_jobs=2)

# The 'neg_mean_squared_error' scorer returns the MSE negated; flip the sign
# to get one positive MSE per fold.
RFR_error = -cross_val_score(RFR, X, Y, cv=10, scoring='neg_mean_squared_error')
RFR.fit(X, Y)

print('RFR max error by CV is: {:.3f} \nRFR min error by CV is: {:.3f} \nRFR mean error by CV is: {:.3f}'.
      format(RFR_error.max(), RFR_error.min(), RFR_error.mean()))

# Predict once and reuse the result for both the plot and the metric.
RFR_train_pred = RFR.predict(X)

fig = plt.figure(figsize=(16, 9))

# Both curves are sorted independently, so the plot compares the distribution
# of predicted values with the distribution of actual values (not per-song
# errors). The x range follows the data instead of assuming exactly 50 rows.
plt.plot(range(len(Y)), sorted(Y), label='Actual Value', lw=2, alpha=0.7)
plt.plot(sorted(RFR_train_pred), label='Random Forest Prediction')

plt.title("Random Forest prediction")
plt.xlabel("Items")
plt.ylabel("Popularity")
fig.legend(loc='lower right', bbox_to_anchor=(0.85, 0.15))
plt.show()

# Error on the same rows the model was fitted on (arguments in the
# (y_true, y_pred) order expected by scikit-learn).
RFR_mse = mse(Y, RFR_train_pred)
print("RFR mean squared error is: {:.4f}".format(RFR_mse))

### Gradient Boosting Regressor

In [None]:
# Gradient Boosting baseline: 10-fold cross-validated MSE, followed by a fit
# on the full data set for the in-sample comparison below.
GBR = GradientBoostingRegressor(random_state=42)

# The scorer returns the MSE negated; flip the sign to get positive errors.
GBR_error = -cross_val_score(GBR, X, Y, cv=10, scoring='neg_mean_squared_error')
GBR.fit(X, Y)

print('GBR max error by CV is: {:.3f} \nGBR min error by CV is: {:.3f} \nGBR mean error by CV is: {:.3f}'.
      format(GBR_error.max(), GBR_error.min(), GBR_error.mean()))

# Predict once and reuse the result for both the plot and the metric.
GBR_train_pred = GBR.predict(X)

fig = plt.figure(figsize=(16, 9))

# Sorted predictions against sorted targets; the x range follows the data
# instead of assuming exactly 50 rows.
plt.plot(range(len(Y)), sorted(Y), label='Actual Value', lw=2, alpha=0.7)
# Legend label spelling corrected ("Gradien" -> "Gradient").
plt.plot(sorted(GBR_train_pred), label='Gradient Boosting Prediction')

plt.title("Gradient Boosting prediction")
plt.xlabel("Items")
plt.ylabel("Popularity")
fig.legend(loc='lower right', bbox_to_anchor=(0.85, 0.15))
plt.show()

# Error on the same rows the model was fitted on.
GBR_mse = mse(Y, GBR_train_pred)
print("GBR mean squared error is: {:.4f}".format(GBR_mse))

### K Neighbours Regressor

In [None]:
# K-nearest-neighbours regressor: score it with 10-fold cross validation,
# refit on all rows, then compare sorted predictions with sorted targets.
KNR = KNeighborsRegressor(n_jobs=2)

# The scorer yields the negated MSE per fold; multiply by -1 to make it positive.
knr_neg_mse = cross_val_score(KNR, X, Y, scoring='neg_mean_squared_error', cv=10)
KNR_error = -1 * knr_neg_mse
KNR.fit(X, Y)

knr_report = 'KNR max error by CV is: {:.3f} \nKNR min error by CV is: {:.3f} \nKNR mean error by CV is: {:.3f}'
print(knr_report.format(KNR_error.max(), KNR_error.min(), KNR_error.mean()))

fig = plt.figure(figsize=(16, 9))

actual_sorted = sorted(Y)
knr_sorted = sorted(KNR.predict(X))
plt.plot(range(50), actual_sorted, label='Actual Value', lw=2, alpha=0.7)
plt.plot(knr_sorted, label='K Neighbors Prediction')

plt.xlabel("Items")
plt.ylabel("Popularity")
plt.title("K Neighbors prediction")
fig.legend(loc='lower right', bbox_to_anchor=(0.85, 0.15))
plt.show()

# MSE measured on the rows the model was fitted on.
KNR_mse = mse(KNR.predict(X), Y)
print("KNR mean squared error is: {:.4f}".format(KNR_mse))

### Support Vector Regressor

In [None]:
# Support-vector regressor with a linear kernel: 10-fold cross-validated MSE,
# a refit on all rows, and a sorted prediction-vs-target plot.
SVRR = SVR(kernel='linear')

# The scorer yields the negated MSE per fold; multiply by -1 to make it positive.
svr_neg_mse = cross_val_score(SVRR, X, Y, scoring='neg_mean_squared_error', cv=10)
SVRR_error = -1 * svr_neg_mse
SVRR.fit(X, Y)

svr_report = 'SVR max error by CV is: {:.3f} \nSVR min error by CV is: {:.3f} \nSVR mean error by CV is: {:.3f}'
print(svr_report.format(SVRR_error.max(), SVRR_error.min(), SVRR_error.mean()))

fig = plt.figure(figsize=(16, 9))

actual_sorted = sorted(Y)
svr_sorted = sorted(SVRR.predict(X))
plt.plot(range(50), actual_sorted, label='Actual Value', lw=2, alpha=0.7)
plt.plot(svr_sorted, label='Support Vector Prediction')

plt.xlabel("Items")
plt.ylabel("Popularity")
plt.title("Support Vector prediction")
fig.legend(loc='lower right', bbox_to_anchor=(0.85, 0.15))
plt.show()

# MSE measured on the rows the model was fitted on.
SVRR_mse = mse(SVRR.predict(X), Y)
print("SVR mean squared error is: {:.4f}".format(SVRR_mse))

### Linear Regressor

In [None]:
# Ordinary least-squares baseline: 10-fold cross-validated MSE, followed by a
# fit on the full data set for the in-sample comparison below.
LR = LinearRegression()

# The scorer returns the MSE negated; flip the sign to get positive errors.
LR_error = -cross_val_score(LR, X, Y, cv=10, scoring='neg_mean_squared_error')
LR.fit(X, Y)

# The first value printed is the maximum, so it is labelled "max"
# (it used to be labelled "min", giving two "min" lines).
print('LR max error by CV is: {:.3f} \nLR min error by CV is: {:.3f} \nLR mean error by CV is: {:.3f}'.
      format(LR_error.max(), LR_error.min(), LR_error.mean()))

# Predict once and reuse the result for both the plot and the metric.
LR_train_pred = LR.predict(X)

fig = plt.figure(figsize=(16, 9))

# Sorted predictions against sorted targets; the x range follows the data
# instead of assuming exactly 50 rows.
plt.plot(range(len(Y)), sorted(Y), label='Actual Value', lw=2, alpha=0.7)
plt.plot(sorted(LR_train_pred), label='Linear Prediction')

plt.title("Linear prediction")
plt.xlabel("Items")
plt.ylabel("Popularity")
fig.legend(loc='lower right', bbox_to_anchor=(0.75, 0.15))
plt.show()

# Error on the same rows the model was fitted on. The message now names the
# linear model (it used to say "RFR").
LR_mse = mse(Y, LR_train_pred)
print("LR mean squared error is: {:.4f}".format(LR_mse))

## Obtaining a model
### Model of models by Cross Validation error

In [None]:
# One line per model showing its MSE on each of the cross-validation folds.
fig = plt.figure(figsize=(16, 9))

fold_errors_by_model = [
    (RFR_error, 'Random Forest'),
    (GBR_error, 'Gradient Boosting'),
    (KNR_error, 'K Neighbours'),
    (SVRR_error, 'Support Vector'),
    (LR_error, 'Linear'),
]
for fold_errors, model_label in fold_errors_by_model:
    plt.plot(fold_errors, label=model_label)

fig.suptitle("Cross Validation Error")
fig.legend(loc='upper right', bbox_to_anchor=(0.8, 0.75))
plt.show()

In [None]:
# Giving weights to prediction models based on their accuracy.
#
# A model with a LOWER cross-validation error must receive a HIGHER weight, so
# every weight is proportional to the inverse of the model's error and each
# set of weights is normalised to sum to 1. (Weighting by the raw error would
# hand the largest weight to the worst model.)
cv_errors = (RFR_error, GBR_error, KNR_error, SVRR_error, LR_error)

mean_errors = np.array([fold_errors.mean() for fold_errors in cv_errors])
min_errors = np.array([fold_errors.min() for fold_errors in cv_errors])
max_errors = np.array([fold_errors.max() for fold_errors in cv_errors])


def _inverse_error_weights(errors):
    """Return weights proportional to 1/error, normalised to sum to 1."""
    # Guard against a fold with an error of exactly zero (division by zero).
    inverse = 1.0 / np.maximum(errors, np.finfo(float).eps)
    return inverse / inverse.sum()


# w0: from the mean fold error, w1: from the best fold, w2: from the worst fold.
RFR_w0, GBR_w0, KNR_w0, SVRR_w0, LR_w0 = _inverse_error_weights(mean_errors)
RFR_w1, GBR_w1, KNR_w1, SVRR_w1, LR_w1 = _inverse_error_weights(min_errors)
RFR_w2, GBR_w2, KNR_w2, SVRR_w2, LR_w2 = _inverse_error_weights(max_errors)

# Final weight per model: the average of its three weights above.
RFR_w = (RFR_w0 + RFR_w1 + RFR_w2) / 3
GBR_w = (GBR_w0 + GBR_w1 + GBR_w2) / 3
KNR_w = (KNR_w0 + KNR_w1 + KNR_w2) / 3
SVRR_w = (SVRR_w0 + SVRR_w1 + SVRR_w2) / 3
LR_w = (LR_w0 + LR_w1 + LR_w2) / 3

In [None]:
# Predictions of the five fitted base models on X, computed once and shared
# by every weighted blend below.
rfr_pred = RFR.predict(X)
gbr_pred = GBR.predict(X)
knr_pred = KNR.predict(X)
svr_pred = SVRR.predict(X)
lr_pred = LR.predict(X)


def _blend(w_rfr, w_gbr, w_knr, w_svr, w_lr):
    """Return the weighted sum of the five base-model predictions."""
    return w_rfr*rfr_pred + w_gbr*gbr_pred + w_knr*knr_pred + w_svr*svr_pred + w_lr*lr_pred


# Blend using the weights derived from the mean CV error.
RGKSL0_Ypred = _blend(RFR_w0, GBR_w0, KNR_w0, SVRR_w0, LR_w0)
RGKSL0_mse = mse(RGKSL0_Ypred, Y)

# Blend using the weights derived from the minimum CV error.
RGKSL1_Ypred = _blend(RFR_w1, GBR_w1, KNR_w1, SVRR_w1, LR_w1)
RGKSL1_mse = mse(RGKSL1_Ypred, Y)

# Blend using the weights derived from the maximum CV error.
RGKSL2_Ypred = _blend(RFR_w2, GBR_w2, KNR_w2, SVRR_w2, LR_w2)
RGKSL2_mse = mse(RGKSL2_Ypred, Y)

# Blend using the averaged weights.
RGKSL_Ypred = _blend(RFR_w, GBR_w, KNR_w, SVRR_w, LR_w)
RGKSL_mse = mse(RGKSL_Ypred, Y)


for report_line, blend_mse in (
    ('RGKSL0 model has a mean squared error of : {:.3f}', RGKSL0_mse),
    ('RGKSL1 model has a mean squared error of : {:.3f}', RGKSL1_mse),
    ('RGKSL2 model has a mean squared error of : {:.3f}', RGKSL2_mse),
    ('RGKSL model has a mean squared error of : {:.3f}', RGKSL_mse),
):
    print(report_line.format(blend_mse))

In [None]:
# Sorted actual popularity against the sorted output of each weighted blend.
fig = plt.figure(figsize=(16, 9))

plt.plot(range(50), sorted(Y), label='Actual Value')

blends = [
    (RGKSL0_Ypred, 'w with Mean Error'),
    (RGKSL1_Ypred, 'w with Min Error'),
    (RGKSL2_Ypred, 'w with Max Error'),
    (RGKSL_Ypred, 'w with Mean(Min,Max,Mean) Error'),
]
for blend_pred, blend_label in blends:
    plt.plot(sorted(blend_pred), label=blend_label)

fig.suptitle("Comparison between weights choosing method")
plt.ylabel("Popularity")
plt.xlabel("Items")
fig.legend(loc='lower right', bbox_to_anchor=(0.85, 0.15))
plt.show()


The best option for building a linear combination of the previous models is to derive the weights from the mean or the minimum of the cross-validation *mean_squared_error* values.

### Model of models by MSE error

In [None]:
# Weight every base model by the inverse of its MSE, normalised so the five
# weights sum to 1 (a smaller error gives a larger weight).
mse_errors = np.array([RFR_mse, GBR_mse, KNR_mse, SVRR_mse, LR_mse])

inverse_mse = 1/mse_errors
RFR_wmse, GBR_wmse, KNR_wmse, SVRR_wmse, LR_wmse = inverse_mse/(inverse_mse.sum())

# Weighted sum of the five base-model predictions, and its MSE against Y.
RGKSL_mse_Ypred = (
    RFR_wmse*RFR.predict(X)
    + GBR_wmse*GBR.predict(X)
    + KNR_wmse*KNR.predict(X)
    + SVRR_wmse*SVRR.predict(X)
    + LR_wmse*LR.predict(X)
)
RGKSL_mse_MSE = mse(RGKSL_mse_Ypred, Y)

In [None]:
# Sorted actual popularity against the sorted output of the MSE-weighted blend.
fig = plt.figure(figsize=(16, 9))

actual_sorted = sorted(Y)
blend_sorted = sorted(RGKSL_mse_Ypred)
plt.plot(range(50), actual_sorted, label='Actual Value', lw=2, alpha=0.7)
plt.plot(blend_sorted, label='w with previous mse')

plt.xlabel("Items")
plt.ylabel("Popularity")
plt.title("Model of models with MSE prediction")
fig.legend(loc='lower right', bbox_to_anchor=(0.85, 0.15))
plt.show()

blend_report = 'RGKSL_mse model has a mean squared error of : {:.3f}'
print(blend_report.format(RGKSL_mse_MSE))

## Neural Network

In [None]:
# Small fully-connected regressor trained with full-batch Adam on the same
# features and target as the scikit-learn models above.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Features as an (n_samples, n_features) float32 tensor and the target as an
# (n_samples, 1) float32 column, both placed on the selected device.
x = torch.from_numpy(X.values.astype(np.float32)).to(device)
y = torch.from_numpy(np.array(Y).astype(np.float32)).reshape(-1, 1).to(device)

# The input width is taken from the data instead of being hard-coded.
input_shape, hidden_shape1, hidden_shape2, output_shape, EPOCH = x.shape[1], 500, 10, 1, 3001
model = torch.nn.Sequential(
    torch.nn.Linear(input_shape, hidden_shape1),
    torch.nn.ReLU(),
    torch.nn.Linear(hidden_shape1, hidden_shape2),
    torch.nn.ReLU(),
    torch.nn.Linear(hidden_shape2, output_shape),
).to(device)
learning_rate = 1e-3
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
loss_fn = torch.nn.MSELoss()


for epoch in range(EPOCH):

    optimizer.zero_grad()
    pred = model(x)
    loss = loss_fn(pred, y)

    loss.backward()
    optimizer.step()

    if epoch % 500 == 0:
        # Report the loss after this epoch's update; no gradients are needed
        # for logging, so skip building the autograd graph.
        with torch.no_grad():
            current_loss = loss_fn(model(x), y).item()
        print('EPOCH: {}. Training loss: {:.4f}'.format(epoch, current_loss))

# Tensor.numpy() only works on CPU tensors, so move the predictions back to
# the CPU first (otherwise this fails whenever the model runs on CUDA).
# The result is flattened to a 1-D array of one prediction per song.
with torch.no_grad():
    NNR_Ypred = model(x).cpu().numpy().ravel()

fig = plt.figure(figsize=(16, 9))

# Sorted predictions against sorted targets; the x range follows the data
# instead of assuming exactly 50 rows.
plt.plot(range(len(Y)), sorted(Y), label='Actual Value')
plt.plot(sorted(NNR_Ypred), '+-', label='Neural Network')

fig.suptitle("Neural Network Regression")
fig.legend(loc='lower right', bbox_to_anchor=(0.85, 0.15))
plt.xlabel("Items")
plt.ylabel("Popularity")
plt.show()

# Error on the same rows the network was trained on.
NNR_mse = mse(Y, NNR_Ypred)
print("NNR has a mean squared error of: {:.4f}".format(NNR_mse))
# NOTE(review): the 0.034 reference value below is hard-coded, not computed
# from the models above -- confirm it still matches before relying on it.
print("Applying a Neural Network of two hidden layers seems to be the best option but may suffer from overfitting.\nThe error is clearly the best, dropping from 0.034 to {:.4f}.".format(NNR_mse))

## Summary

In [None]:
# Overlay the per-song predictions of every model on the actual popularity.
plt.figure(figsize=(16, 9))

# The actual values are drawn first as a wide, half-transparent black band.
plt.plot(range(50), Y, label='Actual Value', lw=10, alpha=0.5, color='black')

# (predictions, legend label, extra plot options) in drawing order.
# 'Linear' has no explicit colour and falls back to matplotlib's default.
model_curves = [
    (NNR_Ypred, 'Neural Network', {'color': 'green'}),
    (RGKSL0_Ypred, 'Regressor of CV error', {'color': 'orange'}),
    (RGKSL_mse_Ypred, 'Regressor of MSE error', {'color': 'magenta'}),
    (RFR.predict(X), 'Random Forest', {'color': 'blue'}),
    (GBR.predict(X), 'Gradient Boosting', {'color': 'red'}),
    (KNR.predict(X), 'K Neighbors', {'color': 'pink'}),
    (SVRR.predict(X), 'Support Vector', {'color': 'yellow'}),
    (LR.predict(X), 'Linear', {}),
]
for curve, curve_label, plot_options in model_curves:
    plt.plot(curve, label=curve_label, **plot_options)

plt.title("Model predicted values")
plt.legend(loc='best')
plt.xlabel("Items")
plt.ylabel("Popularity")
plt.show()


In [None]:
# Horizontal bar chart ranking all models by their mean squared error.
plt.figure(figsize=(16, 9))

x_mse = ['Random_Forest', 'Gradient_Boosting', 'K_Neighbours', 'Support_Vector', 'Linear',
         'Reg_of_CV', 'Reg_of_MSE', 'Neural_Network']
height = np.array([RFR_mse, GBR_mse, KNR_mse, SVRR_mse, LR_mse, RGKSL0_mse, RGKSL_mse_MSE, NNR_mse])

# One row per model, ordered from the smallest to the largest error.
df_sns = pd.DataFrame({'Model': x_mse, 'Mean Squared Error': height})
df_sns.sort_values(by=['Mean Squared Error'], inplace=True)

sns.barplot(data=df_sns, y='Model', x='Mean Squared Error')

plt.title('Model ranking by MSE')
plt.show()

I'm just getting started with neural networks. If you have any advice, please leave a comment so I can improve.

Thank you for reading. 