# Importing the libraries and data

In [None]:
# Importing Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
import plotly.figure_factory as ff
from sklearn.model_selection import train_test_split

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [None]:
# Importing Datasets
# NOTE(review): the relative '../input/...' path looks like a Kaggle-style input
# mount — adjust it when running the notebook elsewhere.
all_dataset = pd.read_csv('../input/spotify-dataset-19212020-160k-tracks/data.csv')
# The loads below are disabled; nothing in this notebook reads these frames.
# artist_dataset = pd.read_csv('data_by_artist.csv')
# genres_dataset = pd.read_csv('data_by_genres.csv')
# year_dataset = pd.read_csv('data_by_year.csv')
# w_genres_dataset = pd.read_csv('data_w_genres.csv')

# Exploration

In [None]:
all_dataset.shape

In [None]:
all_dataset.describe()

## EDA Plan of attack and some expectations

#### Univariate Analysis
Overall Valence on songs? Tendency of songs? <br>
Song distribution by years <br>
Acousticness distribution (Most songs acoustic or not?) <br>
Danceability distribution (Most songs danceable or not?) <br>
Song duration distribution <br>
Explicit songs count? <br>
Instrumentalness distribution? <br>
key count? <br>
liveness distribution? <br>
loudness distribution? <br>
mode count? <br>
popularity distribution? <br>
speechiness distribution? <br>
tempo distribution? <br>


#### Bivariate Analysis
##### Popularity
year vs popularity <br>
valence vs popularity (valence=sentiment [depression 0 or happy 1]) <br>
mode vs popularity <br>
energy vs popularity <br>
key vs popularity <br>
explicit vs popularity <br> 

###### Valence
tempo vs valence (expect low tempo low valence -- high tempo high valence) <br>
acousticness (not electronically amplified) vs valence <br>
energy vs valence (expect high energy high valence) <br>
key vs valence <br>
valence vs year? (song valence differences in the last century) <br>
valence vs loudness (expect low loudness low valence) <br>

###### Others
instrumentalness(instruments) vs year <br>
tempo vs danceability (expect high tempo high danceability) <br>
danceability vs explicit <br>
loudness vs explicit <br>


#### More info for the audio features you can find them in the spotify API docs here: [Audio Features](https://developer.spotify.com/documentation/web-api/reference/tracks/get-audio-features/)

In [None]:
all_dataset.tail()

In [None]:
# Fraction of missing values per column
null_counts = all_dataset.isnull().sum()
null_counts / all_dataset.shape[0]

## Univariate Analysis

In [None]:
# Valence Histogram
fig = px.histogram(
    all_dataset,
    x="valence",
    nbins=1000,
    title="Valence Histogram",
    color_discrete_sequence=['indianred'],
)
fig.show()

If we look at the whole dataset, most songs are above 0.5 valence so we can say the <b>tendency of creating a positive song is a little bit higher </b>. But there is not a category of the two (sad-0, happy-1) that stands out. <br><br> Also <b>highest count of sad songs is between 0.03-0.04</b> and <b>highest count of happy songs is between 0.96-0.972</b>. These are the spikes you can see in the histogram.

In [None]:
# Year Histogram (number of songs per release year)
fig = px.histogram(
    all_dataset,
    x="year",
    nbins=100,
    title="Year Song Count Histogram",
    color_discrete_sequence=['indianred'],
)
fig.show()

##### After 1945 songs in the dataset are almost the same count each year until 2020.

In [None]:
# Acousticness Histogram
fig = px.histogram(
    all_dataset,
    x="acousticness",
    nbins=1000,
    title="Acousticness Histogram",
    color_discrete_sequence=['indianred'],
)
fig.show()

Most songs tend to have <b>extreme values of acousticness</b> (either 0 or 1) and we can see that in the above histogram.

In [None]:
# Danceability Histogram
fig = px.histogram(all_dataset, x="danceability", title = "Danceability Histogram",color_discrete_sequence=['indianred'])
fig.show()

We can see that the <b>danceability histogram</b> follows a <b>normal distribution</b>. <br>
Most songs around 0.5 - 0.7 danceability.

In [None]:
# Song Duration Histogram (milliseconds are converted to minutes before plotting)
duration_minutes = all_dataset.duration_ms/ (1000 * 60)
fig = px.histogram(
    all_dataset,
    x=duration_minutes,
    range_x=[0,15],
    title="Song Duration Histogram",
    color_discrete_sequence=['indianred'],
    labels={"x": "Song Duration(minutes)"},
)
fig.show()

Again we see that the <b>duration of the songs</b> in the dataset follows a <b>normal distribution</b>, and most values are between about 2.90 and 3.35 minutes (the axis is in decimal minutes).

In [None]:
# Explicit Songs Count/Percentage
# Relabel the 0/1 flag in one vectorized step. The earlier per-row loop wrote
# through chained indexing (df["explicit"][i] = ...), which is slow on a frame
# of this size and may end up writing to a temporary copy instead of the frame.
# Values other than 0 and 1 are left unchanged, as before.
df_explicit_labels = all_dataset.copy()
df_explicit_labels["explicit"] = all_dataset["explicit"].replace(
    {0: "non-explicit", 1: "explicit"}
)


fig = px.histogram(df_explicit_labels,
                   x=df_explicit_labels.explicit,
                   histnorm="percent",
                   color_discrete_sequence=['indianred'],
                   title = "Explicit Song Percentage")
fig.show()
print("Explicit Songs Count\n", df_explicit_labels["explicit"].value_counts())

#### Most songs in the dataset are non-explicit. (91.54 % non explicit and 8.46% explicit)

In [None]:
# Instrumentalness Histogram (metric for how instrumental the song is)
fig = px.histogram(
    all_dataset,
    x=all_dataset.instrumentalness,
    nbins=100,
    title="Instrumentalness Histogram",
    color_discrete_sequence=['indianred'],
)
fig.show()

We can see from the above histogram that the songs in the dataset are <b>mostly non-instrumental</b>.

In [None]:
# Key Count of Songs
# Position in `keys` is the numeric key code (0 -> "C", 1 -> "C#", ...).
keys = ["C","C#","D","D#","E","F","F#","G","G#","A","A#","B"]
# Translate the codes to note names in one vectorized step instead of a nested
# Python loop with chained assignment (df["key"][i] = ...), which is slow and
# may write to a temporary copy. Codes outside 0-11 are left unchanged, as before.
df_keys_labels = all_dataset.copy()
df_keys_labels["key"] = all_dataset["key"].replace(dict(enumerate(keys)))

fig = px.histogram(df_keys_labels,
                   x=df_keys_labels.key,
                   title = "Key of Songs Count",
                   color_discrete_sequence=['indianred'],
                   category_orders={
                "key": keys
            })
fig.show()
print("Key of Songs Count\n", df_keys_labels["key"].value_counts())

From the above plot we can see that the <b>most used key notes are C and G</b> and that all key notes with <b>#</b> (sharp notes) are mainly <b>used less by artists</b>.

In [None]:
# Liveness Histogram
# From the Spotify docs: higher liveness values represent an increased
# probability that the track was performed live.
fig = px.histogram(
    all_dataset,
    x=all_dataset.liveness,
    nbins=1000,
    title="Liveness Histogram",
    color_discrete_sequence=['indianred'],
)
fig.show()

In most songs there is probably <b>not a live audience</b>. For Example : live concert song recording etc.

In [None]:
# Loudness Histogram
fig = px.histogram(
    all_dataset,
    x=all_dataset.loudness,
    nbins=1000,
    title="Loudness Histogram",
    color_discrete_sequence=['indianred'],
)
fig.show()

Most values lie between -20 and -5 dB.

In [None]:
# Modality of Songs Count/Percentage
# Relabel the 0/1 mode flag in one vectorized step rather than a per-row loop
# with chained assignment (df["mode"][i] = ...), which is slow and may write to
# a temporary copy. Values other than 0 and 1 are left unchanged, as before.
df_mode_labels = all_dataset.copy()
df_mode_labels["mode"] = all_dataset["mode"].replace({0: "Minor", 1: "Major"})


fig = px.histogram(df_mode_labels, x="mode", histnorm = "percent", title = "Modality of Songs Percentage",color_discrete_sequence=['indianred'])
fig.show()
print("Modality of Songs Count\n", df_mode_labels["mode"].value_counts())

Songs in the dataset are <b>mostly Major</b> (approximately 70% of the whole dataset).

In [None]:
# Speechiness Histogram
# From the Spotify docs: 1.0 - 0.66 probably songs made entirely of words,
# 0.66-0.33 speech and music (rap songs), 0.33-0 other melodic songs.
fig = px.histogram(
    all_dataset,
    x=all_dataset.speechiness,
    nbins=1000,
    title="Speechiness Histogram",
    color_discrete_sequence=['indianred'],
)
fig.show()

We can see that most of the songs have <b>low speechiness</b>, i.e. they contain little spoken (non-melodic) content.

In [None]:
# Tempo Distribution (Beats Per Minute BPM)
fig = px.histogram(
    all_dataset,
    x=all_dataset.tempo,
    nbins=1000,
    title="Tempo Histogram",
    color_discrete_sequence=['indianred'],
)
fig.show()

In [None]:
# Popularity Distribution
fig = px.histogram(
    all_dataset,
    x=all_dataset.popularity,
    nbins=100,
    title="Popularity Histogram",
    color_discrete_sequence=['indianred'],
)
fig.show()

## Bivariate Analysis

In [None]:
# Collect the string-labelled versions of explicit / key / mode in one dataframe
labeled_df = all_dataset.copy()
for label_col, label_source in (
    ("explicit", df_explicit_labels),
    ("key", df_keys_labels),
    ("mode", df_mode_labels),
):
    labeled_df[label_col] = label_source[label_col]

# Work on a random half of the rows to keep the interactive plots responsive,
# with the duration expressed in minutes under the column name "duration"
sub_labeled_df = (
    labeled_df
    .sample(frac=0.5, random_state=42, axis=0)
    .assign(duration_ms=lambda frame: frame["duration_ms"] / (1000*60))
    .rename(columns={"duration_ms" : "duration"})
)

In [None]:
# Separate the labelled (categorical) columns from the remaining columns
categorical_names = ["explicit", "key", "mode"]
labeled_cat_df = sub_labeled_df[categorical_names].copy()
labeled_con_df = sub_labeled_df.drop(columns=categorical_names)

In [None]:
# Numeric columns of sub_labeled_df to plot against popularity.
# Index.drop is used instead of np.delete so the result is guaranteed to stay a
# pandas Index; it raises KeyError if "popularity" is missing, as get_loc did.
target_columns = (
    sub_labeled_df.select_dtypes(include=["float64","int64"])
    .columns
    .drop("popularity")
)
target_columns

In [None]:
# Creating the indexes for the categorical columns
cat_target_columns = labeled_cat_df.columns
cat_target_columns

In [None]:
# # Defining function to plot boxplots or Scatterplots dropdown menus
# """
# df = data, 
# target_columns are the different columns that we apply the plots against popularity
# """
# def plot_graph(df,target_columns,boxplot=False):
#     fig = go.Figure()
#     #buttons are the things you see in the dropdown 
#     buttons = []
    
#     if (boxplot == True):
#         fig.add_trace(go.Box(x= df[target_columns[0]],
#                          y= df["popularity"],
#                          marker_color = 'indianred',
#                          boxmean=True))
#     else:
#         fig.add_trace(go.Scattergl(x= df[target_columns[0]], y=df["popularity"], mode="markers",
#                                marker = go.Marker(size=5,
#                                color =df["popularity"],
#                                colorscale= 'YlOrRd',
#                                symbol = 'circle',
#                                showscale=True,
#                                cmax=100,
#                                cmin=0)))

    

#     for index in range(len(target_columns)):
#           buttons.append(dict(method='restyle',
#                       label=f"{target_columns[index]} vs Popularity".title(),
#                       visible=True,
#                       args=[{'y':[df["popularity"].values],
#                              'x':[df[target_columns[index]]]},[0]]
#                       )
#                 )

#   #to get a menu to show, you need to create an updatemenu.

#     updatemenu = []
#     your_menu = {}
#     updatemenu.append(your_menu)

#     updatemenu[0]['buttons'] = buttons
#     updatemenu[0]['direction'] = 'down'
#     updatemenu[0]['showactive'] = True
#     updatemenu[0]['yanchor'] = "top"

#     # add dropdown menus to the figure
#     fig.update_layout(showlegend=False, updatemenus=updatemenu)
#     fig.show()

In [None]:
# Defining function to plot boxplots or Scatterplots dropdown menus
def plot_graph(df,target_columns,plot_against,cmax=100,cmin=0,boxplot=False):
    """Plot `plot_against` against each column in `target_columns`, selectable via a dropdown.

    The figure starts on the first entry of `target_columns`; every dropdown
    button restyles the single trace with another column on the x axis.

    df : data containing both the target columns and `plot_against`
    target_columns : the different columns that we apply the plots against `plot_against`
    plot_against : string of vs variable (plotted on the y axis)
    cmax : max value of y value for marker colorscale (scatter only)
    cmin : min value of y value for marker colorscale (scatter only)
    boxplot : if truthy, plot Boxplot instead of a scatter plot
    """
    y_values = df[plot_against]
    first_x = df[target_columns[0]]

    fig = go.Figure()
    if boxplot:
        fig.add_trace(go.Box(x=first_x,
                             y=y_values,
                             marker_color='indianred',
                             boxmean=True))
    else:
        # The marker is passed as a plain dict: go.Marker is a deprecated legacy
        # class, and a dict carries the same properties.
        fig.add_trace(go.Scattergl(x=first_x, y=y_values, mode="markers",
                                   marker=dict(size=5,
                                               color=y_values,
                                               colorscale='YlOrRd',
                                               symbol='circle',
                                               showscale=True,
                                               cmax=cmax,
                                               cmin=cmin)))

    # buttons are the entries you see in the dropdown; each one swaps the x data
    # of trace 0 and keeps plot_against on the y axis
    buttons = [
        dict(method='restyle',
             label=f"{column} vs {plot_against}".title(),
             visible=True,
             args=[{'y': [y_values.values],
                    'x': [df[column]]}, [0]])
        for column in target_columns
    ]

    # to get a menu to show, it has to be registered as an updatemenu
    dropdown = {
        'buttons': buttons,
        'direction': 'down',
        'showactive': True,
        'yanchor': "top",
    }

    # add dropdown menus to the figure
    fig.update_layout(showlegend=False, updatemenus=[dropdown])
    fig.show()

In [None]:
plot_graph(sub_labeled_df,target_columns,"popularity")

## Insights from the above Dropdown menu Graphs

**Acousticness vs Popularity** : More songs with popularity 60 or above tend to have low values of acousticness, 0 - 0.4 .
<br>
**Danceability vs Popularity** : Popular songs (>60) tend to have values of danceability between 0.4 - 0.8 .
<br>
**Duration vs Popularity** : Most Popular songs have duration of 2-5 minutes.
<br>
**Energy vs Popularity** : Popularity values are a little bit higher in the second half of the plot (0.5 - 1).
<br>
**Instrumentalness vs Popularity** : Most Popular songs have low values of instrumentalness (0 - 0.1).
<br>
**Liveness vs Popularity** : Most Popular songs have values between 0 and 0.4.
<br>
**Loudness vs Popularity** : Most Popular songs have values between -20 and 0 db.
<br>
**Speechiness vs Popularity** : Tracks with low speechiness tend to be more popular.
<br>
**Tempo vs Popularity** : Tracks with tempo between 60 and 200 are more popular.
<br>
**Valence vs Popularity** : Can't distinguish any correlation with the popularity. 
<br>
**Year vs Popularity** : Songs popularity is correlated with the year of the song. Newer songs have more popularity which is normal.
<br>

In [None]:
# Plotting boxplots dropdown menu
plot_graph(sub_labeled_df,cat_target_columns,"popularity",boxplot=True)

## Insights from above Boxplots Dropdown menu

**Explicit vs Popularity** : Explicit songs tend to be more popular than the non-explicit ones. It is worth noting that we have very few explicit songs compared with non-explicit ones.
<br>
**Key vs Popularity** : More popular songs tend to have key values of C# and F# but in general the popularity amongst keys is the same (with the exception of D#)
<br>
**Mode vs Popularity** : Not any significant differences in mode and popularity boxplot.

In [None]:
# Numeric columns of sub_labeled_df except valence and popularity.
# Both names are removed in one Index.drop call. The previous two-step np.delete
# version called .get_loc on the first np.delete result, which only works while
# that result happens to still be a pandas Index.
target_columns_val = (
    sub_labeled_df.select_dtypes(include=["float64","int64"])
    .columns
    .drop(["popularity", "valence"])
)
target_columns_val

In [None]:
plot_graph(sub_labeled_df,target_columns_val,"valence", cmax=1)

## Insights from above Scatterplot (Valence)

**Acousticness vs Valence** : We can't distinguish any correlation or pattern on the data.
<br>
**Danceability vs Valence** : We can see that in general, as the danceability goes high, the valence also goes high, as expected and noted in the Plan of Attack.
<br>
**Duration vs Valence** : If we consider that there aren't many tracks that last more than 10 minutes then we can't make any conclusions.
<br>
**Energy vs Valence** : We can see that songs with energy below 0.1 also have lower valence than songs with higher energy.
<br>
**Instrumentalness vs Valence** : Can't see any correlation between the two variables.
<br>
**Liveness vs Valence** : Even though there are less tracks with high liveness and high valence, we can't see any correlation.
<br>
**Loudness vs Valence** : More loud songs tend to have higher values of valence.
<br>
**Speechiness vs Valence** : We can see that most tracks gather in high or low speechiness values(near 1 or near 0). Additionally, tracks with higher speechiness tend to have lower valence than the tracks near 0.
<br>
**Tempo vs Valence** : The only thing we can notice is that tracks near 200BPM don't have valence values of 0.2 and below.
<br>
**Year vs Valence** : We can't distinguish any correlation or pattern on the data.
<br>

In [None]:
plot_graph(sub_labeled_df,cat_target_columns,"valence", boxplot=True)

## Insights from above Boxplots Dropdown menu (Valence)

**Explicit vs Valence** : Valence boxplots of explicit and non-explicit tracks are almost identical with the exception of the values in 75th percentile (explicit songs lower 75th percentile valence values).
<br>
**Key vs Valence** : Can't distinguish any correlation between keys and valence values.
<br>
**Mode vs Valence** : Boxplots of Major and Minor Modes almost identical.

In [None]:
def plot_single_scatter(x,y,cmax=100,cmin=0,title=""):
    """Show a WebGL scatter plot of `y` against `x`, with markers coloured by `y`.

    x, y : equal-length sequences / Series of values to plot
    cmax : upper bound of the marker colorscale
    cmin : lower bound of the marker colorscale
    title : figure title
    """
    fig = go.Figure()
    # The marker is passed as a plain dict: go.Marker is a deprecated legacy
    # class, and a dict carries the same properties.
    fig.add_trace(go.Scattergl(x=x, y=y, mode="markers",
                               marker=dict(size=5,
                                           color=y,
                                           colorscale='YlOrRd',
                                           symbol='circle',
                                           showscale=True,
                                           cmax=cmax,
                                           cmin=cmin)))
    fig.update_layout(title=title,
                      yaxis_zeroline=False, xaxis_zeroline=False)
    fig.show()

In [None]:
def plot_single_box(x,y,title=""):
    """Show box plots of `y` grouped by the values of `x`, with `boxmean` enabled.

    x : grouping values (one box per distinct value)
    y : values summarised by each box
    title : figure title
    """
    box_trace = go.Box(x=x, y=y, marker_color='indianred', boxmean=True)
    fig = go.Figure()
    fig.add_trace(box_trace)
    fig.update_layout(
        title=title,
        yaxis_zeroline=False,
        xaxis_zeroline=False,
    )
    fig.show()

In [None]:
plot_single_scatter(x=sub_labeled_df.year,y=sub_labeled_df.instrumentalness, cmax=1,title="Year Vs Instrumentalness Scatterplot")

In [None]:
plot_single_scatter(sub_labeled_df.tempo, sub_labeled_df.danceability,cmax=1,title="Tempo Vs Danceability Scatterplot")

We can see that until 110 BPM the danceability grows and after 110, declines. So the peak is at around 110 BPM.

In [None]:
plot_single_box(sub_labeled_df.explicit, sub_labeled_df.danceability,title="Explicit Vs Danceability Boxplot")

According to the boxplots above, we can deduce that explicit songs tend to be more danceable than non-explicit ones.

In [None]:
plot_single_box(sub_labeled_df.explicit, sub_labeled_df.loudness,title="Explicit Vs Loudness Boxplot")

Additionally, we can see that explicit songs tend to have loudness values closer to 0 dB (i.e. they are louder) than the non-explicit tracks.

## Plotting the Correlation Matrix

In [None]:
import seaborn as sns

# Correlations are only defined for numeric columns. The frame also holds text
# columns (artists, name, ...), and recent pandas versions raise when corr() is
# called on them, so the numeric columns are selected explicitly first.
numeric_corr = all_dataset.select_dtypes(include="number").corr()

plt.figure(figsize=(15,7))
heatmap = sns.heatmap(numeric_corr,vmin=-1, vmax=1, annot=True)
heatmap.set_title("Correlation Heatmap")
plt.show()

# Data Preprocessing Feature Engineering and cleanup

In [None]:
all_dataset.head()

In [None]:
# Get value counts of top 10 artists features
all_dataset["artists"].value_counts().nlargest(10)

In [None]:
# Get average popularity of an artist eg. Eminem
all_dataset.loc[all_dataset['artists']=="['Eminem']","popularity"].mean()

We will create 2 features for artists based on the number of tracks they released and on the mean value of their popularity amongst all their songs.<br>
I think these two variables can accurately describe each artist in terms of how **famous** he/she is and how many **tracks** he/she has released.

In [None]:
# Work on a copy of the dataset, reduced to a random 40% of the rows and
# re-indexed from 0 so rows can be addressed by position
transf_dataset = all_dataset.copy()
transf_dataset_sub = (
    transf_dataset
    .sample(frac=0.4, random_state=42, axis=0)
    .reset_index(drop=True)
)

In [None]:
# Assign series object of artists and song counts
all_artist_counts = transf_dataset_sub["artists"].value_counts()

In [None]:
# Total songs of eminem in the sampled dataset.
# Looked up directly by label: the previous [[label]][0] form relied on
# positional integer indexing of a label-indexed Series, which pandas deprecates.
all_artist_counts["['Eminem']"]

In [None]:
# Creating two new feature zero columns
transf_dataset_sub["tracks_number"] = 0
transf_dataset_sub["artist_popularity"] = 0

In [None]:
# Artist of first song in the dataframe
transf_dataset_sub["artists"][0]

In [None]:
# Per-artist features, computed once per artist instead of once per row.
# The previous loop re-filtered the whole frame for every row (quadratic time)
# and wrote float means into an integer column through chained assignment
# (df[col][i] = ...), whose outcome depends on the pandas version.
# tracks_number: how many rows share this row's "artists" value.
transf_dataset_sub["tracks_number"] = transf_dataset_sub["artists"].map(all_artist_counts)
# artist_popularity: mean popularity over all rows sharing this "artists" value.
# NOTE(review): this feature is derived from the target column (including the
# row's own popularity) and is computed before the train/test split — that looks
# like target leakage; confirm whether it should be fitted on training rows only.
transf_dataset_sub["artist_popularity"] = (
    transf_dataset_sub.groupby("artists")["popularity"].transform("mean")
)

In [None]:
transf_dataset_sub.head()

In [None]:
# If an artist has only one song, artist_popularity equals that song's popularity.
# For those rows both engineered features are reset to 0.
# A boolean mask with .loc replaces the per-row loop, which used chained
# assignment (df[col][i] = ...) that is slow and may write to a temporary copy.
single_track_rows = transf_dataset_sub["tracks_number"] == 1
transf_dataset_sub.loc[single_track_rows, ["artist_popularity", "tracks_number"]] = 0

In [None]:
transf_dataset_sub.head()

For the name of the song, i'll use the length of the name of each song as one feature. For example the first track has name "Il barbiere di Siviglia: Overture (Sinfonia)" and characters size of 44. So the new feature here will be 44.

In [None]:
# Character Length for first song
len(transf_dataset_sub["name"][0])

In [None]:
# Creating new column for new feature
transf_dataset_sub["song_length"] = 0

In [None]:
transf_dataset_sub.head()

In [None]:
# Assigning values to the song_length feature: number of characters in the name.
# The vectorized .str.len() replaces a per-row loop with chained assignment
# (df["song_length"][i] = ...), which is slow and may write to a temporary copy.
transf_dataset_sub["song_length"] = transf_dataset_sub["name"].str.len()

In [None]:
transf_dataset_sub.head()

Now we will drop the columns : Artists, Id, name and release_date.

The Id column doesn't contain anything useful, and the release date is already described by the year of the song (Year column).

In [None]:
# Remove the identifier / text columns that are not used as model inputs
unused_columns = ["artists", "id", "name", "release_date"]
transf_dataset_sub = transf_dataset_sub.drop(columns=unused_columns)

In [None]:
transf_dataset_sub.head()

In [None]:
# One hot encoding the key column: append the key_* indicator columns,
# then remove the original key column
key_dummies = pd.get_dummies(transf_dataset_sub["key"], prefix="key")
final_dataset = pd.concat([transf_dataset_sub, key_dummies], axis=1).drop(columns=["key"])

In [None]:
final_dataset.head()

In [None]:
# Express the duration in minutes instead of milliseconds and rename the column
ms_per_minute = 1000 * 60
final_dataset = (
    final_dataset
    .assign(duration_ms=final_dataset["duration_ms"]/ms_per_minute)
    .rename(columns={"duration_ms" : "duration"})
)

In [None]:
final_dataset.head()

# Training and Testing various models

### Splitting into Train and Test data

In [None]:
# Separate the label (popularity) from the input features
target_name = "popularity"
y = final_dataset[target_name]
X = final_dataset.drop(columns=[target_name])

In [None]:
y.head()

In [None]:
# Splitting into Train and Test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42)

### Benchmarking with Simple Regression Models

#### Ridge Regression

In [None]:
# Ridge Regression (l2 norm)
from sklearn.linear_model import Ridge
ridge_reg = Ridge(alpha=400,solver='cholesky')
ridge_reg.fit(X_train,y_train)
y_pred = ridge_reg.predict(X_test)

In [None]:
# Applying the RMSE Metric
from sklearn.metrics import mean_squared_error
import math

def rmse(y_test,y_pred):
    """Return the root of sklearn's mean_squared_error for the given true and predicted targets."""
    mse = mean_squared_error(y_test, y_pred)
    return math.sqrt(mse)

print(rmse(y_test,y_pred))

#### Lasso Regression

In [None]:
from sklearn.linear_model import Lasso
lasso_reg = Lasso(alpha = 0.05)
lasso_reg.fit(X_train,y_train)
y_pred = lasso_reg.predict(X_test)

In [None]:
print(rmse(y_test,y_pred))

#### ElasticNet

In [None]:
from sklearn.linear_model import ElasticNet
elastic_reg = ElasticNet(l1_ratio=0.8,alpha=0.05)
elastic_reg.fit(X_train, y_train)
y_pred = elastic_reg.predict(X_test)

In [None]:
print(rmse(y_test,y_pred))

### Gradient Boosting Regressors

#### Sci-kit Gradient Boosting Regressor

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
# random_state fixes the tree randomness and the random validation split used
# for early stopping (validation_fraction / n_iter_no_change), so the reported
# scores can be reproduced; 42 matches the seed used elsewhere in the notebook.
gbr = GradientBoostingRegressor(n_estimators=200,
                                learning_rate=0.2,
                                validation_fraction=0.1,
                                n_iter_no_change=15,
                                random_state=42)
gbr.fit(X_train,y_train)
y_pred = gbr.predict(X_test)

In [None]:
# Evaluation on train set
print(rmse(y_train, gbr.predict(X_train)))

In [None]:
# Evaluation on test set
print(rmse(y_test,y_pred))

In [None]:
gbr.feature_importances_

We can actually see that with the gradient boosting regressor we are not overfitting in the training set, which is a good thing.

Let's see the importance of each feature in our model.

In [None]:
# Defining function for plotting feature importances
def plot_feature_importances(feat_imp_array, dataframe,title):
    """Show a bar chart of feature importances, largest first.

    feat_imp_array : array of importances, one per column of `dataframe`
    dataframe : frame whose column names label the bars
    title : chart title
    """
    # Positions of the importances in descending order
    descending = np.argsort(-feat_imp_array)
    ranked = pd.DataFrame({
        'cols': dataframe.columns[descending],
        'imps': feat_imp_array[descending],
    })
    px.bar(ranked, x='cols', y='imps', title=title).show()

In [None]:
plot_feature_importances(gbr.feature_importances_,X_train,"GBR Feature Importances")

We have some interesting insights here. We can see that our engineered features "artist_popularity" and "tracks_number" strongly describe our dependent variable. This seems reasonable: a well-known artist is more likely to release a popular track, since he/she is already famous.

#### CatBoost Regressor

In [None]:
!pip install catboost

In [None]:
X_train.head()

In [None]:
from catboost import CatBoostRegressor, Pool, cv

# Categorical inputs: the two binary flags plus every one-hot key column.
# The key_* names are read from X_train rather than hard-coded, so a sample in
# which some key never occurs (and therefore has no dummy column) does not
# raise a KeyError. Column order follows X_train.
key_columns = [column for column in X_train.columns if column.startswith("key_")]
cat_features = ["explicit", "mode"] + key_columns

train_pool = Pool(X_train,
                  y_train,
                  cat_features)

In [None]:
catboost_reg = CatBoostRegressor(iterations=1000,
                                #task_type='GPU',
                                #devices='0:1',
                                loss_function='RMSE',
                                eval_metric='RMSE',
                                random_seed=42)

catboost_reg.fit(train_pool,plot=True)
y_pred = catboost_reg.predict(X_test)

In [None]:
print(rmse(y_train,catboost_reg.predict(X_train)))

In [None]:
print(rmse(y_test,y_pred))

With an RMSE of 8.66 on the training set and 9.76 on the test set, we don't have significant overfitting, and this is actually our best performance yet on the test data (9.76).

In [None]:
plot_feature_importances(catboost_reg.feature_importances_,X_train,"CatBost Regressor Feature Importances")

With the CatBoost Regressor the predictions are not so artist-dependent (artist popularity and artist songs); more features play a role in the final prediction. We can also see that the year is the most important feature here.

### Tensorflow DNN Regression Model

In [None]:
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
print(tf.__version__)

In [None]:
# Normalizing our Train and Test data to feed them in our DNN
mm_scaler = MinMaxScaler()
X_train_sc = mm_scaler.fit_transform(X_train)
X_test_sc = mm_scaler.transform(X_test)

In [None]:
# The shape of our train set
X_train_sc.shape

In [None]:
# Defining our DNN Model: two hidden ReLU layers with dropout after the first,
# and a single linear output unit for the regression target
model = tf.keras.Sequential([
                             tf.keras.layers.Dense(128,activation="relu"),
                             tf.keras.layers.Dropout(0.2),
                             tf.keras.layers.Dense(64,activation="relu"),
                             tf.keras.layers.Dense(1)
])

# Defining RMSE Metric
# Bound to its own name so it does not overwrite the rmse() helper function
# defined earlier in the notebook (re-running earlier cells would otherwise
# call the Keras metric object instead of the helper).
rmse_metric = tf.keras.metrics.RootMeanSquaredError()

# Compiling our Model
model.compile(loss="mean_squared_error", optimizer="adam", metrics=[rmse_metric])

In [None]:
# Training the model
history = model.fit(x=X_train_sc,y=y_train,epochs=40, validation_data=(X_test_sc,y_test))

In [None]:
# Print test-set RMSE of the DNN Regression Model
# model.predict returns one column per output unit; flatten it to match y_test.
y_pred = model.predict(X_test_sc).ravel()
# Computed directly from the predictions instead of calling the Keras metric
# object: that object is stateful (each call adds to its running totals), so
# its value depends on what was fed to it before.
print(math.sqrt(mean_squared_error(y_test, y_pred)))

In [None]:
# Plot function for training and testing
def plot_loss(history):
    """Plot the training and validation RMSE curves stored in a Keras History object.

    history : object whose `.history` dict holds the keys
        'root_mean_squared_error' and 'val_root_mean_squared_error'
    """
    plt.figure(figsize=(13,7))
    for curve_key in ('root_mean_squared_error', 'val_root_mean_squared_error'):
        plt.plot(history.history[curve_key], label=curve_key)
    plt.xlabel('Epoch')
    plt.ylabel('Error')
    plt.legend()
    plt.grid(True)

In [None]:
plot_loss(history)

We can see that the training line and the validation line fit perfectly with each other. So there's no overfitting here. 

# Conclusions

To sum up, we first performed an Exploratory Data Analysis on the Spotify 1921-2020 Dataset, created some new features and then tested various regression models to predict the popularity of each track. The best performance was achieved by CatBoostRegressor with 9.76 RMSE on the test set. Note that, due to limited computational resources, we used random subsets of the original dataset: a fraction of 0.5 to plot the bivariate EDA graphs and a fraction of 0.4 to train our models.

#### Models and Results

Ridge Regression : 14.23<br>
Lasso Regression : 14.24<br>
ElasticNet : 14.25<br>
Gradient Boosting Regressor : 10.29<br>
CatBoost Regressor : 9.76<br>
Deep Neural Network Regression : 10.29<br>