## Imports

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from tqdm.autonotebook import tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor

# Use the first CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'

## Data Preparation

In [None]:
# Read the tracks CSV (path is relative to the notebook's working directory)
# and preview the first rows to check that the load worked.
tracks_data_df = pd.read_csv(
    '../input/spotify-dataset-19212020-160k-tracks/data.csv',
)
tracks_data_df.head(n=5)

In [None]:
# Preview the last five rows of the raw table (five is also tail()'s default).
tracks_data_df.tail(n=5)

In [None]:
# Summary statistics for the table; the quartiles are spelled out explicitly
# (they are the same ones describe() reports by default).
tracks_data_df.describe(percentiles=[0.25, 0.5, 0.75])

### Visualizing Data

In [None]:
# Draw the per-column histograms of the table on a 15x15-inch figure,
# then render the figure explicitly.
tracks_data_df.hist(
    color='black',
    figsize=(15, 15),
)
plt.show()

In [None]:
# Heatmap of the pairwise correlations between the numeric columns.
#
# The numeric/bool columns are selected explicitly before calling corr():
# since pandas 2.0, DataFrame.corr() no longer drops non-numeric columns on its
# own and raises if any are present. This CSV presumably contains text columns
# (e.g. track or artist names) -- verify against the loaded data.
numeric_corr = tracks_data_df.select_dtypes(include=['number', 'bool']).corr()

plt.figure(figsize=(20, 10))
sns.heatmap(numeric_corr, annot=True)
plt.show()

As you can see above, year, danceability, energy, loudness and tempo are important features for predicting popularity.

Let's take a look at the correlations between these features and popularity:

In [None]:
# Popularity against year; the very low alpha keeps dense regions readable
# despite heavy overplotting.
sns.scatterplot(
    data=tracks_data_df,
    x="year",
    y="popularity",
    color='blue',
    alpha=0.03,
)

In [None]:
# Popularity against danceability; the very low alpha keeps dense regions
# readable despite heavy overplotting.
sns.scatterplot(
    data=tracks_data_df,
    x="danceability",
    y="popularity",
    color='blue',
    alpha=0.03,
)

In [None]:
# Popularity against energy; the very low alpha keeps dense regions readable
# despite heavy overplotting.
sns.scatterplot(
    data=tracks_data_df,
    x="energy",
    y="popularity",
    color='blue',
    alpha=0.03,
)

In [None]:
# Popularity against loudness; the very low alpha keeps dense regions readable
# despite heavy overplotting.
sns.scatterplot(
    data=tracks_data_df,
    x="loudness",
    y="popularity",
    color='blue',
    alpha=0.03,
)

In [None]:
# Popularity against tempo; the very low alpha keeps dense regions readable
# despite heavy overplotting.
sns.scatterplot(
    data=tracks_data_df,
    x="tempo",
    y="popularity",
    color='blue',
    alpha=0.03,
)

In [None]:
# Columns fed to the model as inputs.
features = [
    'year',
    'danceability',
    'energy',
    'loudness',
    'tempo',
]

# Independent copy of the full table (the target column is read from it later).
tracks_data = tracks_data_df.copy()

# Input matrix: only the selected feature columns, in the order listed above.
features_tracks_data = tracks_data_df.loc[:, features]

We don't need to worry about outliers, because in this example they barely affect the performance of the model.

### Data normalization

In [None]:
# Target: popularity divided by 100 (presumably mapping a 0-100 score onto
# 0-1 -- confirm against the data).
y_tracks_data = tracks_data.popularity.values / 100

# Split BEFORE scaling. Fitting the scaler on the full table would let the
# test rows influence the mean/std applied to the training rows (data leakage).
# The row assignment is the same as before: it depends only on the number of
# rows, test_size and random_state.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    features_tracks_data,
    y_tracks_data,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply the same
# transformation to both splits. `features_tracks_data` is deliberately left
# unscaled so that re-running this cell gives the same result.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

In [None]:
# Sanity check: print the smallest and largest value of every column of X_train,
# one "min max" pair per line.
for lowest, highest in zip(X_train.min(axis=0), X_train.max(axis=0)):
    print(lowest, highest)

## Create Random Forest Regressor

I used a Random Forest Regressor as my model because in this case it actually works better than a Decision Tree Regressor or a simple neural network. 

In [None]:
# Fit the random forest on the training split.
# A fixed random_state makes the forest's randomness repeatable, so a
# Restart & Run All reproduces the same scores; 42 matches the seed already
# used for train_test_split.
clf = RandomForestRegressor(random_state=42)
clf.fit(X_train, y_train)

In [None]:
# Predict popularity (on the same /100 scale as y_test) for the held-out rows.
preds = clf.predict(X_test)

# For a regressor, score() returns the coefficient of determination (R^2), not
# an accuracy, so it is reported under its real name and without the "* 100".
test_r2 = clf.score(X_test, y_test)
accuracy = test_r2  # legacy name, kept in case other cells still read it
print("Test R^2: {:.4f}".format(test_r2))

# Mean absolute error, in the same units as y_test (popularity / 100).
average_error = np.abs(y_test - preds).mean()
print("{:.4f} average error".format(average_error))

In [None]:
# Show the first 100 test targets side by side with the model's predictions.
for actual, pred in zip(y_test[:100], preds[:100]):
    print("Actual / Predicted: {:.4f} / {:.4f}".format(actual, pred))