In [None]:
import numpy as np
import pandas as pd 
import json
import math

# for data visualization
import seaborn as sns
import plotly.express as px 
import matplotlib.pyplot as plt

%matplotlib inline

from sklearn import preprocessing
from sklearn import model_selection
from sklearn import linear_model
from sklearn import svm
from sklearn.model_selection import cross_validate
from sklearn.metrics import mean_squared_error
from sklearn import ensemble

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
# Load the four CSV tables in one pass; the order of the paths below matches
# the order of the names being assigned.
tracks, artists, genres, years = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/spotify-dataset-19212020-160k-tracks/tracks.csv',
        '/kaggle/input/spotify-dataset-19212020-160k-tracks/artists.csv',
        '/kaggle/input/spotify-dataset-19212020-160k-tracks/data_by_genres_o.csv',
        '/kaggle/input/spotify-dataset-19212020-160k-tracks/data_by_year_o.csv',
    )
)

In [None]:
# Load the JSON file shipped alongside the CSV tables into `artists_related`.
# The encoding is pinned to UTF-8 (the JSON interchange encoding) so that
# parsing does not depend on the platform's locale default.
with open('/kaggle/input/spotify-dataset-19212020-160k-tracks/dict_artists.json',
          encoding='utf-8') as artists_file:
    artists_related = json.load(artists_file)

## EDA and Pre-Processing

In [None]:
# Summary of the `tracks` frame (loaded from tracks.csv).
tracks.info()

In [None]:
# Summary of the `artists` frame (loaded from artists.csv).
artists.info()

In [None]:
# Summary of the `genres` frame (loaded from data_by_genres_o.csv).
genres.info()

In [None]:
# Summary of the `years` frame (loaded from data_by_year_o.csv).
years.info()

In [None]:
# Descriptive statistics of the `tracks` frame.
tracks.describe() 

In [None]:
# Distribution of the `popularity` column across all tracks, in 40 bins.
px.histogram(
    tracks,
    x="popularity",
    nbins=40,
    title="Histogram of Tracks' popularity",
).show()

In [None]:
# Parse `release_date` and keep only its calendar year as a new column.
# NOTE(review): if release_date mixes formats (e.g. year-only values next to
# full dates), pandas >= 2.0 may refuse to parse it without an explicit
# format -- confirm against the data.
release_dates = pd.to_datetime(tracks["release_date"])
tracks["release_year"] = release_dates.dt.year
tracks.columns

In [None]:
# Keep only the target (`popularity`, listed first) and the columns used as
# model inputs; every other column of `tracks` is dropped.
ml_columns = [
    'popularity',
    'duration_ms', 'explicit', 'release_year',
    'danceability', 'energy', 'key', 'loudness', 'mode',
    'speechiness', 'acousticness', 'instrumentalness',
    'liveness', 'valence', 'tempo', 'time_signature',
]
tracks_ml = tracks[ml_columns]

In [None]:
# Restrict to tracks released in 2020; the year column is constant after the
# filter, so it is dropped.
released_in_2020 = tracks_ml["release_year"] == 2020
tracks_2020_ml = tracks_ml[released_in_2020].drop(columns=["release_year"])

In [None]:
# Correlation heatmap of the 2020 tracks' columns, lower triangle only.
sns.set(rc={'figure.figsize':(22,14)})

# Compute the correlation matrix once and reuse it for both the plot and the mask.
corr_2020 = tracks_2020_ml.corr()

# Hide the upper triangle (diagonal included), which only mirrors the lower one.
# The builtin `bool` replaces `np.bool`: that alias was removed in NumPy 1.24
# and raises AttributeError there.
upper_triangle = np.triu(np.ones_like(corr_2020, dtype=bool))

sns.heatmap(corr_2020, linewidths=.5, annot=True, cmap="YlGnBu", mask=upper_triangle)\
    .set_title("Correlations Heatmap between Audio Features, Based on 2020's Tracks");

**Analysis:**     
1. Acousticness is strongly negatively correlated with energy and loudness.
2. Energy and loudness are strongly positively correlated with each other.
3. Instrumentalness is strongly negatively correlated with loudness.
4. Danceability and valence are strongly positively correlated.
5. Popularity is strongly positively correlated with explicit and danceability, and strongly negatively correlated with instrumentalness.

In [None]:
# `cols` is reused by later cells to split target (first column) from features.
cols = tracks_2020_ml.columns

# One histogram per column of the 2020 subset.
for feature in cols:
    px.histogram(
        tracks_2020_ml,
        x=feature,
        title=f"Histogram of Tracks' {feature}, 2020",
    ).show()

**Analysis:**     
1. In the popularity histogram, most songs fall in the 0 to 1 range, while the part of the distribution above 20 looks approximately normal.
2. The distributions of duration_ms, danceability, energy, loudness, liveness and valence are approximately normal.
3. Most songs have a time_signature of 4.
4. The distributions of speechiness, acousticness and instrumentalness are right-skewed.

In [None]:
# The first column of `cols` is the target; the remaining columns are features.
target_col, feature_cols = cols[0], cols[1:]
X = tracks_2020_ml[feature_cols]
y = tracks_2020_ml[target_col]

# Hold out 33% of the 2020 tracks for the final evaluation (fixed seed).
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.33, random_state=0
)

## Modeling

In [None]:
# Candidate regressors, keyed by the display name used in the results tables.
# The three ensemble regressors are randomised, so each gets a fixed
# `random_state` (the same seed value used for the train/test split) to make
# the cross-validation scores and final fits reproducible across runs.
models = {
    "OLS": linear_model.LinearRegression(),
    "Ridge": linear_model.Ridge(),
    "Lasso": linear_model.Lasso(),
    "Bayesian": linear_model.BayesianRidge(),
    "SVM": svm.SVR(),
    "RandomForestReg": ensemble.RandomForestRegressor(random_state=0),
    "AdaBoostReg": ensemble.AdaBoostRegressor(random_state=0),
    "GradientBoostingReg": ensemble.GradientBoostingRegressor(random_state=0)
}

In [None]:
# Cross-validate every candidate model on the training split (5 folds) and
# collect the mean train/test scores and timings, one record per model.
cv_records = []
for key, model in models.items():
    cv_res = cross_validate(model, X_train, y_train,
                            return_train_score=True,
                            scoring='neg_root_mean_squared_error',
                            cv=5, n_jobs=-1)
    cv_records.append({
        'model': key,
        'train_score': cv_res["train_score"].mean(),
        'test_score': cv_res["test_score"].mean(),
        'fit_time': cv_res["fit_time"].mean(),
        'score_time': cv_res["score_time"].mean(),
    })
    print("CV for model:", key, "done.")

# Build the frame once from the collected records: DataFrame.append was
# removed in pandas 2.0, and growing a frame row by row is quadratic anyway.
cv_results = pd.DataFrame(
    cv_records,
    columns=['model', 'train_score', 'test_score', 'fit_time', 'score_time'],
)

# show and compare results
# The scorer returns the negated RMSE; take the absolute value so that the
# table shows plain RMSE (lower is better).
cv_results["train_score"] = cv_results["train_score"].abs()
cv_results["test_score"] = cv_results["test_score"].abs()
sorted_cv_results = cv_results.sort_values(['train_score', 'test_score'])
sorted_cv_results

## Evaluate  Best Model on Testing Set
Judging by the gap between the train and test scores in the cross-validation on the training set, the RandomForestReg model is slightly overfitting. However, it gives the best test score, so we use it as our best model and evaluate it on the testing set.

In [None]:
# Select the model with the lowest cross-validated RMSE on the held-out folds.
# `sorted_cv_results` is ordered by training error first, so taking its first
# row would reward whichever model fits the training folds most tightly
# rather than the one that generalises best.
best_model_key = sorted_cv_results.sort_values("test_score")["model"].iloc[0]
print("best_model_choosed:", best_model_key)

# NOTE: fit() trains the estimator stored in `models` in place, so
# `best_model` is that same object; refitting models[best_model_key] later
# also changes `best_model`.
best_model = models[best_model_key].fit(X_train, y_train)
test_rmse = np.sqrt(mean_squared_error(y_test, best_model.predict(X_test)))
print('root_mean_squared_error:', test_rmse)

## Model Error Analysis

In [None]:
# Per-track error table for the test split: true popularity, prediction and
# absolute error, with the largest errors first.
test_set = (
    y_test.reset_index()
    .assign(predicted=best_model.predict(X_test))
    .assign(abs_error=lambda frame: (frame["predicted"] - frame["popularity"]).abs())
    .sort_values("abs_error", ascending=False)
)
test_set

In [None]:
# Descriptive statistics of the error table (true value, prediction, abs_error).
test_set.describe()

In [None]:
# Histogram of the absolute prediction errors on the test split.
test_set["abs_error"].plot.hist(
    title="Testing Set Prediction Error Distribution",
    figsize=(10,6),
);

In [None]:
# Fraction of test predictions whose absolute error exceeds 50.
large_error_count = test_set.query("abs_error > 50")["abs_error"].count()
total_count = test_set["abs_error"].count()
print("abs_error over 50 prediction percentage:", large_error_count / total_count)

In [None]:
# Histogram of the predicted popularity values on the test split.
test_set["predicted"].plot.hist(
    title="Testing Set Prediction Distribution",
    figsize=(10,6),
);

In [None]:
# Histogram of the true popularity values on the test split.
test_set["popularity"].plot.hist(
    title="Testing Set True Value Distribution",
    figsize=(10,6),
);

In [None]:
# Fraction of test predictions that are below zero.
negative_count = test_set.query("predicted < 0")["predicted"].count()
prediction_count = test_set["predicted"].count()
print("nagetive prediction percentage:", negative_count / prediction_count)

**Analysis:**    
In the prediction, only 1.26% of the testing data have an absolute error (abs_error) > 50; most of the errors are between 0 and 20.   
The mean absolute error is 12.897846 and the median is 9.95.    
In the dataset, one part of the true popularity values (popularity over 20) is approximately normally distributed, but the other part (popularity under 20) is not. Because the regression models assume a roughly normally distributed target, the predicted values are themselves roughly normally distributed and cannot reproduce the low-popularity part. This is one of the causes of the error.

## Improved Modeling
Since the regression models assume a roughly normally distributed target, we remove the outliers whose popularity is <= 20 and repeat the modeling.

In [None]:
# Keep only the 2020 tracks with popularity above 20, then split again.
is_popular = tracks_2020_ml["popularity"] > 20
tracks_2020_ml_n = tracks_2020_ml[is_popular]

# The first column of `cols` is the target; the remaining columns are features.
X_n = tracks_2020_ml_n[cols[1:]]
y_n = tracks_2020_ml_n[cols[0]]
X_n_train, X_n_test, y_n_train, y_n_test = model_selection.train_test_split(
    X_n, y_n, test_size=0.33, random_state=1
)

In [None]:
# Cross-validate every candidate model on the filtered training split
# (5 folds) and collect the mean train/test scores and timings per model.
cv_records = []
for key, model in models.items():
    cv_res = cross_validate(model, X_n_train, y_n_train,
                            return_train_score=True,
                            scoring='neg_root_mean_squared_error',
                            cv=5, n_jobs=-1)
    cv_records.append({
        'model': key,
        'train_score': cv_res["train_score"].mean(),
        'test_score': cv_res["test_score"].mean(),
        'fit_time': cv_res["fit_time"].mean(),
        'score_time': cv_res["score_time"].mean(),
    })
    print("CV for model:", key, "done.")

# Build the frame once from the collected records: DataFrame.append was
# removed in pandas 2.0, and growing a frame row by row is quadratic anyway.
cv_results = pd.DataFrame(
    cv_records,
    columns=['model', 'train_score', 'test_score', 'fit_time', 'score_time'],
)

# show and compare results
# The scorer returns the negated RMSE; take the absolute value so that the
# table shows plain RMSE (lower is better).
cv_results["train_score"] = cv_results["train_score"].abs()
cv_results["test_score"] = cv_results["test_score"].abs()
sorted_cv_results = cv_results.sort_values(['train_score', 'test_score'])
sorted_cv_results

## Evaluate NEW Best Model on Testing Set Again
Weighing fit time against the train/test scores of the Random Forest and Gradient Boosting regressors, the Gradient Boosting regressor does not overfit and trains faster, so this section also evaluates the Gradient Boosting regressor on the testing set.

In [None]:
# Select the model with the lowest cross-validated RMSE on the held-out folds.
# `sorted_cv_results` is ordered by training error first, so taking its first
# row would reward whichever model fits the training folds most tightly
# rather than the one that generalises best.
best_model_key = sorted_cv_results.sort_values("test_score")["model"].iloc[0]
print("best_model_choosed:", best_model_key)

# NOTE: fit() retrains the estimator stored in `models` in place; any earlier
# variable that refers to the same estimator object now reflects this new fit.
n_best_model_rf = models[best_model_key].fit(X_n_train, y_n_train)
n_test_rmse = np.sqrt(mean_squared_error(y_n_test, n_best_model_rf.predict(X_n_test)))
print('root_mean_squared_error:', n_test_rmse)

In [None]:
# Fit the gradient boosting regressor on the filtered training split and
# report its RMSE on the filtered test split.
print("model_choosed: GradientBoostingReg")
n_best_model_gb = models["GradientBoostingReg"].fit(X_n_train, y_n_train)
gb_predictions = n_best_model_gb.predict(X_n_test)
gb_test_rmse = np.sqrt(mean_squared_error(y_n_test, gb_predictions))
print('root_mean_squared_error:', gb_test_rmse)

## Error Analysis One More Time

In [None]:
# Per-track error table for the filtered test split, using the gradient
# boosting model; largest absolute errors first.
test_set_n = (
    y_n_test.reset_index()
    .assign(predicted=n_best_model_gb.predict(X_n_test))
    .assign(abs_error=lambda frame: (frame["predicted"] - frame["popularity"]).abs())
    .sort_values("abs_error", ascending=False)
)
test_set_n

In [None]:
# Side-by-side summary statistics: `_o` columns come from the original error
# table, `_n` columns from the filtered (popularity > 20) one.
original_summary = test_set.describe().add_suffix('_o')
filtered_summary = test_set_n.describe().add_suffix('_n')
pd.concat([original_summary, filtered_summary], axis="columns")

After removing outliers, the max abs_error is reduced from 69 to 43, and the mean abs_error is reduced from 12.87 to 9.34.  


## Hyperparameter Tuning

In [None]:
# # Create the parameter grid based on the results of random search 
# param_grid_rf = {
#     'bootstrap': [True],
#     'max_depth': range(50, 101, 50),
#     'max_features': range(5, 10),
#     'min_samples_leaf': range(5, 10),
# #     'min_samples_split': range(2, 7),
#     'n_estimators':  range(10, 101, 20)
# }

# # Instantiate the grid search model
# grid_search_rf = model_selection.GridSearchCV(estimator = models["RandomForestReg"], param_grid = param_grid_rf, 
#                               cv = 2, n_jobs = -1, scoring='neg_root_mean_squared_error')

# # Fit the grid search to the data
# grid_search_rf.fit(X_n_train, y_n_train)
# print("best_params:", grid_search_rf.best_params_)

In [None]:
# best_grid_rf = grid_search_rf.best_estimator_
# print('root_mean_squared_error: ', np.sqrt(mean_squared_error(y_n_test, best_grid_rf.predict(X_n_test))))

In [None]:
# # Create the parameter grid based on the results of random search 
# param_grid_gb = {
#     "loss": ["ls", "lad", "huber", "quantile"],
#     "learning_rate": [0.01, 0.05, 0.1, 0.15, 0.2],
#     "n_estimators": range(10, 201, 50),
# #     "min_samples_leaf": range(5, 10),
# #     "max_depth": range(50, 101, 10),
#     "max_features": range(5, 10)
# }

# # Instantiate the grid search model
# grid_search_gb = model_selection.GridSearchCV(estimator = models["GradientBoostingReg"], param_grid = param_grid_gb, 
#                               cv = 2, n_jobs = -1, scoring='neg_root_mean_squared_error')

# # Fit the grid search to the data
# grid_search_gb.fit(X_n_train, y_n_train)
# print("best_params:", grid_search_gb.best_params_)

In [None]:
# best_grid_gb = grid_search_gb.best_estimator_
# print('root_mean_squared_error: ', np.sqrt(mean_squared_error(y_n_test, best_grid_gb.predict(X_n_test))))

## Can 2020 Model Predict 2019 and 2021 Songs' Popularity?

In [None]:
# Apply the model trained on 2020 tracks to 2021 tracks with popularity > 20.
is_popular_2021 = (tracks_ml["release_year"] == 2021) & (tracks_ml["popularity"] > 20)
tracks_2021_ml = tracks_ml[is_popular_2021].drop(columns=["release_year"])

X_2021 = tracks_2021_ml[cols[1:]]
y_2021 = tracks_2021_ml[cols[0]]
rmse_2021 = np.sqrt(mean_squared_error(y_2021, n_best_model_gb.predict(X_2021)))
print('root_mean_squared_error on 2021 songs:', rmse_2021)

In [None]:
# Apply the model trained on 2020 tracks to 2019 tracks with popularity > 20.
is_popular_2019 = (tracks_ml["release_year"] == 2019) & (tracks_ml["popularity"] > 20)
tracks_2019_ml = tracks_ml[is_popular_2019].drop(columns=["release_year"])

X_2019 = tracks_2019_ml[cols[1:]]
y_2019 = tracks_2019_ml[cols[0]]
rmse_2019 = np.sqrt(mean_squared_error(y_2019, n_best_model_gb.predict(X_2019)))
print('root_mean_squared_error on 2019 songs:', rmse_2019)

Thus, the 2020 model has a similar prediction error on the 2019 and 2021 data sets.