In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import *
import warnings

In [None]:
# Silence every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so it also hides any pandas / scikit-learn
# deprecation or chained-assignment warnings that later cells may raise — consider
# narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [None]:
# Read the two CSV files of the Spotify songs dataset from the ../input directory.
genre_csv_path = "../input/dataset-of-songs-in-spotify/genres_v2.csv"
playlist_csv_path = "../input/dataset-of-songs-in-spotify/playlists.csv"

df_genre = pd.read_csv(genre_csv_path)
df_playlist = pd.read_csv(playlist_csv_path)

# EDA

In [None]:
# Show the column labels of the genre table.
df_genre.columns

In [None]:
# (number of rows, number of columns) of the genre table.
df_genre.shape

In [None]:
# Render the boolean missing-value mask (True where a cell is NaN) as an image,
# one image row per DataFrame row and one image column per feature.
px.imshow(img=df_genre.isna(), title='Missing values(yellow: missing, blue: not missing)')

Interesting. The song_name column has missing values when `Unnamed: 0` and title columns are both null and vice versa. There's a little line for song_name at 20k but if you zoom in, it's gone. But the last 3 columns are a mystery. Why is there a missing pattern like that?  Let's see their content real quick.

In [None]:
# Display, as one tuple, the first 10 `song_name` values and the last 10 values of
# `Unnamed: 0` and `title` (the three columns with the missing-value pattern).
df_genre['song_name'].head(10), df_genre['Unnamed: 0'].tail(10), df_genre['title'].tail(10)

So, `title` looks like it contains the name of playlist, `Unnamed: 0` is just increasing numbers and `song_name` is, well, song's name. We're just not going to take these features into consideration anymore. We'll also drop `id`, `uri`, `track_href` and `analysis_url`.

## What % values are missing for each feature?

In [None]:
# Percentage of missing entries per column, laid out as a two-column frame
# ('feature', '% missing') so it can be fed straight to plotly express.
n_songs = df_genre.shape[0]
missing_pct = df_genre.isna().sum(axis=0) / n_songs * 100
df_pcmiss = (
    missing_pct
    .rename('% missing')
    .rename_axis('feature')
    .reset_index()
)
px.bar(df_pcmiss, x='feature', y='% missing', title='% of missing values for each feature')

Let's look at the data distribution for these features.

In [None]:
# Columns whose distributions are plotted as histograms (one subplot each).
dist_columns = [
    'danceability',
    'energy',
    'key',
    'loudness',
    'mode',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
    'type',
    'duration_ms',
    'time_signature',
    'genre',
]

In [None]:
# Number of columns to plot; used to pick the subplot grid size in the next cell.
len(dist_columns)

In [None]:
# One histogram per entry of dist_columns on a 3 x 5 subplot grid, filled
# left-to-right, top-to-bottom.
rows, cols = 3, 5

fig = make_subplots(rows=rows, cols=cols, subplot_titles=dist_columns)

for cell_idx in range(rows * cols):
    # Turn the flat cell index into plotly's 1-based (row, col) position.
    grid_row, grid_col = divmod(cell_idx, cols)
    feature_values = df_genre[dist_columns[cell_idx]].values
    fig.add_trace(
        go.Histogram(x=feature_values),
        row=grid_row + 1, col=grid_col + 1
    )

fig.update_layout(height=900, width=900, title_text="Feature distribution", showlegend=False)
fig.show()

# Observations:

1. danceability - (almost) has a normal distribution
2. energy - most of the songs are highly energetic
3. key - many songs are in the key of 1, for others, key is distributed equally
4. loudness - also distributed normally
5. mode - not much interesting
6. speechiness - follows a chi-square-esque distribution
7. acousticness - also follows chi-square-esque distribution
8. instrumentalness - most of the songs are not instrumental, as expected. Very few instrumental songs make it to the top. Most songs need to have vocals to be popular.
9. liveness - distribution is weird, there's a peak at 0.11.
10. valence - valence in music describes the musical positiveness conveyed by the song. Tracks with high valence sound more positive (e.g. happy, cheerful, euphoric), while tracks with low valence sound more negative (e.g. sad, depressed, angry). (Source: https://towardsdatascience.com/what-makes-a-song-likeable-dbfdb7abe404#:~:text=Valence%3A%20Describes%20the%20musical%20positiveness,measure%20of%20intensity%20and%20activity). The distribution is linear with downward slope.
11. tempo - (almost) follows a normal distribution
12. type - there's just one value to this feature, redundant
13. duration_ms - most songs are 2:30 min to 4:10. There's also a list of longer songs
14. time_signature - no song has time_signature = 2. Most common time signature is 4.
15. genre - most popular genres are Dark Trap and Underground Rap

In [None]:
# Numeric columns to summarise with box plots (one subplot each).
box_columns = [
    'danceability',
    'energy',
    'key',
    'loudness',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
    'duration_ms',
]

In [None]:
# Number of columns to plot; used to pick the subplot grid size in the next cell.
len(box_columns)

In [None]:
# One box plot per entry of box_columns, filled left-to-right, top-to-bottom.
# The loop walks the feature list itself, so it stops naturally after the last
# feature instead of relying on a bare `except` to swallow an IndexError
# (which would also have hidden genuine errors such as a misspelled column).
cols = 4
rows = -(-len(box_columns) // cols)  # ceiling division: just enough rows for every feature

fig = make_subplots(rows=rows, cols=cols, subplot_titles=box_columns)

for idx, feature in enumerate(box_columns):
    # Turn the flat index into plotly's 1-based (row, col) position.
    grid_row, grid_col = divmod(idx, cols)
    fig.add_trace(
        go.Box(x=df_genre[feature].values, name=''),
        row=grid_row + 1, col=grid_col + 1
    )

fig.update_layout(height=900, width=900, title_text="Boxplots", showlegend=False)
fig.show()

## Observation:

Apart from instrumentalness, valence and key, all other features have a lot of outliers.

# Model Building: Predicting Genre

## Feature Engineering

## Scaling

A lot of features have values between 0 and 1 (e.g. instrumentalness), while others have values in the hundreds of thousands (e.g. duration_ms). We need to scale these features to the range 0-1.

But first, we need to divide the dataset into train and test sets (a separate validation set is left for future work).

In [None]:
# Model inputs (audio descriptors) and the target column.
features = [
    'danceability',
    'energy',
    'key',
    'loudness',
    'mode',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
    'duration_ms',
    'time_signature',
]
label = 'genre'

In [None]:
# Hold out 10% of the songs for testing. A fixed random_state makes the split
# (and every number computed from it) reproducible across kernel restarts.
X = df_genre[features]
y = df_genre[label]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Work on independent copies so later cells can modify the splits without
# pandas treating them as views of df_genre.
X_train, X_test = X_train.copy(), X_test.copy()
# X_train_, X_val, y_train_, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

In [None]:
# Report the (rows, columns) of the train split, then of the test split.
for split_frame in (X_train, X_test):
    print(split_frame.shape)
# (add X_val to the tuple above once a validation split exists)

In [None]:
# Rescale `loudness` to [0, 1]. The scaler is fitted on the training split only
# and then reused on the test split, so test statistics never influence training.
# `assign` builds a new frame instead of writing into the existing one with
# `.loc`, which avoids chained-assignment warnings on a split-derived frame.
loudness_scaler = MinMaxScaler(feature_range=(0, 1))
X_train = X_train.assign(
    loudness=loudness_scaler.fit_transform(X_train[['loudness']].to_numpy()).ravel()
)
X_test = X_test.assign(
    loudness=loudness_scaler.transform(X_test[['loudness']].to_numpy()).ravel()
)
# (apply loudness_scaler.transform to X_val the same way if a validation split is added)

In [None]:
# Rescale `tempo` to [0, 1]. The scaler is fitted on the training split only
# and then reused on the test split, so test statistics never influence training.
# `assign` builds a new frame instead of writing into the existing one with
# `.loc`, which avoids chained-assignment warnings on a split-derived frame.
tempo_scaler = MinMaxScaler(feature_range=(0, 1))
X_train = X_train.assign(
    tempo=tempo_scaler.fit_transform(X_train[['tempo']].to_numpy()).ravel()
)
X_test = X_test.assign(
    tempo=tempo_scaler.transform(X_test[['tempo']].to_numpy()).ravel()
)
# (apply tempo_scaler.transform to X_val the same way if a validation split is added)

In [None]:
# Rescale `duration_ms` to [0, 1]. The scaler is fitted on the training split
# only and then reused on the test split.
# `assign` replaces the whole column with the scaled float values. Writing them
# in place with `.loc[:, 'duration_ms'] = ...` would try to keep the column's
# existing dtype — presumably integer milliseconds (TODO confirm) — which recent
# pandas versions deprecate / reject for non-integer values.
duration_ms_scaler = MinMaxScaler(feature_range=(0, 1))
X_train = X_train.assign(
    duration_ms=duration_ms_scaler.fit_transform(X_train[['duration_ms']].to_numpy()).ravel()
)
X_test = X_test.assign(
    duration_ms=duration_ms_scaler.transform(X_test[['duration_ms']].to_numpy()).ravel()
)
# (apply duration_ms_scaler.transform to X_val the same way if a validation split is added)

## One Hot Encoding

In [None]:
# One-hot encode `key`. The dummy columns are defined by the training split
# (first level dropped); the test dummies are then re-indexed onto exactly those
# columns. Encoding each split independently would give train and test different
# column sets whenever a key value is absent from one of them.
key_dummies_train = pd.get_dummies(X_train['key'], prefix='key', drop_first=True)
key_dummies_test = (
    pd.get_dummies(X_test['key'], prefix='key')
    .reindex(columns=key_dummies_train.columns, fill_value=0)
)

X_train = pd.concat([X_train, key_dummies_train], axis=1)
X_test = pd.concat([X_test, key_dummies_test], axis=1)
# (encode X_val the same way as X_test if a validation split is added)

In [None]:
# The `key` dummies were already appended by the previous cell. Concatenating
# them a second time here gave every key_* column twice, so this cell now only
# guarantees that each column name appears once. It is safe to re-run.
X_train = X_train.loc[:, ~X_train.columns.duplicated()].copy()
X_test = X_test.loc[:, ~X_test.columns.duplicated()].copy()

assert X_train.columns.is_unique and X_test.columns.is_unique, "duplicate feature columns"

In [None]:
# One-hot encode `time_signature`. The dummy columns are defined by the training
# split (first level dropped); the test dummies are then re-indexed onto exactly
# those columns, so a time signature missing from one split cannot make the two
# feature matrices disagree.
ts_dummies_train = pd.get_dummies(X_train['time_signature'], prefix='time_signature', drop_first=True)
ts_dummies_test = (
    pd.get_dummies(X_test['time_signature'], prefix='time_signature')
    .reindex(columns=ts_dummies_train.columns, fill_value=0)
)

X_train = pd.concat([X_train, ts_dummies_train], axis=1)
X_test = pd.concat([X_test, ts_dummies_test], axis=1)
# (encode X_val the same way as X_test if a validation split is added)

Now, drop `key` and `time_signature`

In [None]:
# The raw categorical columns are no longer needed once their dummies exist.
raw_categoricals = ['key', 'time_signature']
X_train = X_train.drop(columns=raw_categoricals)
X_test = X_test.drop(columns=raw_categoricals)
# (drop the same columns from X_val once a validation split exists)

In [None]:
# One-hot encode the genre labels: learn the categories from the training labels,
# then turn both label vectors into dense indicator matrices (one column per genre).
# The fitted encoder is kept because later cells use it to decode predictions.
ohe_label = OneHotEncoder()

train_labels_2d = y_train.values.reshape(-1, 1)
test_labels_2d = y_test.values.reshape(-1, 1)

ohe_label.fit(train_labels_2d)
y_train = ohe_label.transform(train_labels_2d).toarray()
y_test = ohe_label.transform(test_labels_2d).toarray()
# (transform y_val the same way once a validation split exists)

In [None]:
# Hyper-parameter grid for the random forest:
# 5 forest sizes x 2 split criteria x 8 depth limits (8, 12, ..., 36).
params_dict = dict(
    n_estimators=[50, 75, 100, 125, 150],
    criterion=['gini', 'entropy'],
    max_depth=np.arange(8, 40, 4),
)

In [None]:
# 5-fold grid search over params_dict. The forest gets a fixed random_state so
# that repeated runs select the same best parameters and produce the same model;
# without it every run of the search yields different scores.
gs_cv = GridSearchCV(estimator=RandomForestClassifier(n_jobs=-1, random_state=42),
                     param_grid=params_dict,
                     cv=5,
                     verbose=10,   # log progress for every fit
                     n_jobs=-1,    # evaluate candidates in parallel on all cores
                    )

In [None]:
# Run the search on the training split: every combination in params_dict is
# cross-validated with cv=5 folds. This is the expensive cell of the notebook.
gs_cv.fit(X_train, y_train)

In [None]:
import pickle

In [None]:
# Persist the fitted search object next to the notebook so it can be reloaded
# without repeating the search.
with open('grid_search_result.pkl', mode='wb') as out_file:
    out_file.write(pickle.dumps(gs_cv))

In [None]:
# Predict exactly one genre per test song.
# The forest was trained on one-hot targets, so scikit-learn treats each genre
# column as an independent binary output: plain predict() can return rows that
# are all zeros (or contain several 1s), and decoding an all-zero row with the
# one-hot encoder silently yields the first genre. Instead, take the genre with
# the highest positive-class probability and rebuild a proper one-hot row.
best_rf = gs_cv.best_estimator_
per_genre_proba = best_rf.predict_proba(X_test)
per_genre_classes = best_rf.classes_
if best_rf.n_outputs_ == 1:
    # Single-output forests return bare arrays rather than one entry per output.
    per_genre_proba, per_genre_classes = [per_genre_proba], [per_genre_classes]

# Column j holds P(song belongs to genre j); 0 if the forest never saw a positive.
genre_scores = np.column_stack([
    proba[:, list(classes).index(1)] if 1 in classes else np.zeros(len(proba))
    for proba, classes in zip(per_genre_proba, per_genre_classes)
])

test_preds = np.zeros_like(genre_scores)
test_preds[np.arange(len(genre_scores)), genre_scores.argmax(axis=1)] = 1.0

In [None]:
# Map the one-hot rows back to genre names, flattened to 1-D arrays.
decode_genres = ohe_label.inverse_transform
meaningfull_preds_test = decode_genres(test_preds).ravel()
meaningfull_true_test = decode_genres(y_test).ravel()

In [None]:
# Confusion matrix on the test set (rows: true genre, columns: predicted genre).
# `labels=` pins the matrix order to the encoder's category order; without it the
# matrix only contains the genres present in the data, so the tick labels below
# could be attached to the wrong rows/columns.
plt.rcParams['figure.figsize'] = 14, 14
genre_names = ohe_label.categories_[0]
genre_confusion = confusion_matrix(meaningfull_true_test, meaningfull_preds_test,
                                   labels=genre_names)

fig, ax = plt.subplots()
sns.heatmap(genre_confusion,
            annot=True,
            xticklabels=genre_names,
            yticklabels=genre_names,
            fmt='d',
            ax=ax
           )
ax.set(xlabel='Predicted genre', ylabel='True genre',
       title='Confusion matrix on the test set');

# FUTURE WORK

* Optimize the RF model on validation set
* Try some more ML models
* Try Deep Learning