In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings

In [None]:
warnings.filterwarnings('ignore')

In [None]:
df = pd.read_csv('../input/dataset-of-songs-in-spotify/genres_v2.csv')

In [None]:
df.head()

In [None]:
df.columns

# Exploratory Data Analysis

In [None]:
df.isna().sum()

Only the columns `song_name`, `Unnamed: 0` and `title` have missing values.

In [None]:
df[['type','id','uri','track_href','analysis_url','song_name','Unnamed: 0', 'title']].head(10)

The columns `type`, `id`, `uri`, `track_href`, `analysis_url`, `song_name`, `Unnamed: 0` and `title` have no significance for this analysis, so they are dropped.

In [None]:
# Columns judged above to carry no significance for the analysis.
unused_columns = ['type', 'id', 'uri', 'track_href', 'analysis_url', 'song_name', 'Unnamed: 0', 'title']
df.drop(columns=unused_columns, inplace=True)

In [None]:
df.head()

## Genre

In [None]:
# Number of songs per genre, shown as a one-column table.
df['genre'].value_counts().to_frame()

In [None]:
# Bar chart of the per-genre song counts.
genre_counts = df['genre'].value_counts().to_frame()
ax = genre_counts.plot.bar(figsize=(8, 5))
ax.set_xlabel('Genre')
ax.set_title('Songs per Each Genre')
ax.set_ylabel('Count')

The Pop genre has very few songs, while Underground Rap has nearly 6K songs.

## Danceability

In [None]:
# distplot is deprecated in seaborn; a density-scaled histogram with a KDE
# overlay is its replacement and draws the same kind of figure.
sns.histplot(df['danceability'], kde=True, stat='density')

## Energy

In [None]:
# distplot is deprecated in seaborn; a density-scaled histogram with a KDE
# overlay is its replacement and draws the same kind of figure.
sns.histplot(df['energy'], kde=True, stat='density')

## key

In [None]:
df['key'].value_counts()

In [None]:
# Spread of the musical key (x) within each genre (y) on an 8x8 inch canvas.
fig, ax = plt.subplots(figsize=(8, 8))
sns.boxplot(data=df, x='key', y='genre', ax=ax)

## Loudness

In [None]:
# distplot is deprecated in seaborn; a density-scaled histogram with a KDE
# overlay is its replacement and draws the same kind of figure.
sns.histplot(df['loudness'], kde=True, stat='density')

## Mode

In [None]:
df['mode'].value_counts()

In [None]:
# Songs per mode, split by genre; the legend is anchored just outside the axes.
mode_ax = sns.countplot(data=df, x='mode', hue='genre', palette='bright')
mode_ax.legend(bbox_to_anchor=(1.01, 1), borderaxespad=0)
plt.tight_layout()

## Speechiness

In [None]:
# distplot is deprecated in seaborn; a density-scaled histogram with a KDE
# overlay is its replacement and draws the same kind of figure.
sns.histplot(df['speechiness'], kde=True, stat='density')

## Acousticness

In [None]:
# distplot is deprecated in seaborn; a density-scaled histogram with a KDE
# overlay is its replacement and draws the same kind of figure.
sns.histplot(df['acousticness'], kde=True, stat='density')

## Instrumentalness

In [None]:
# distplot is deprecated in seaborn; a density-scaled histogram with a KDE
# overlay is its replacement and draws the same kind of figure.
sns.histplot(df['instrumentalness'], kde=True, stat='density')

## Liveness

In [None]:
# distplot is deprecated in seaborn; a density-scaled histogram with a KDE
# overlay is its replacement and draws the same kind of figure.
sns.histplot(df['liveness'], kde=True, stat='density')

## Valence

In [None]:
# distplot is deprecated in seaborn; a density-scaled histogram with a KDE
# overlay is its replacement and draws the same kind of figure.
sns.histplot(df['valence'], kde=True, stat='density')

## Tempo

In [None]:
# distplot is deprecated in seaborn; a density-scaled histogram with a KDE
# overlay is its replacement and draws the same kind of figure.
sns.histplot(df['tempo'], kde=True, stat='density')

## duration_ms

In [None]:
# distplot is deprecated in seaborn; a density-scaled histogram with a KDE
# overlay is its replacement and draws the same kind of figure.
sns.histplot(df['duration_ms'], kde=True, stat='density')

## time signature

In [None]:
df['time_signature'].value_counts()

In [None]:
# Songs per time signature, split by genre, with the legend outside the axes.
fig, ax = plt.subplots(figsize=(8, 4))
sns.countplot(data=df, x='time_signature', hue='genre', palette='bright', ax=ax)
ax.legend(bbox_to_anchor=(1.01, 1), borderaxespad=0)
fig.tight_layout()

1. The columns time_signature, mode and key do not provide any clear insight, so they are dropped.
2. The distribution of danceability looks roughly Gaussian, though not exactly.
3. No conclusion can be drawn from the distribution of energy.
4. The distribution of loudness looks like a Gaussian distribution.
5. The distributions of speechiness and acousticness resemble a chi-square-like distribution.
6. Instrumentalness has a large number of zero values.
7. Valence - valence in music describes the musical positiveness conveyed by the song. Tracks with high valence sound more positive (e.g. happy, cheerful, euphoric), while tracks with low valence sound more negative (e.g. sad, depressed, angry). (Source: https://towardsdatascience.com/what-makes-a-song-likeable-dbfdb7abe404#:~:text=Valence%3A%20Describes%20the%20musical%20positiveness,measure%20of%20intensity%20and%20activity)
8. Tempo (almost) follows a normal distribution.

In [None]:
# Remove the three columns the EDA found no useful insight in.
df.drop(columns=['time_signature', 'mode', 'key'], inplace=True)

## Label Encoding the Genre column

In [None]:
from sklearn.preprocessing import LabelEncoder,StandardScaler

# Map each genre name to an integer class id and store it in a new column.
genre_encoder = LabelEncoder()
df['genre_encoded'] = genre_encoder.fit_transform(df['genre'])

In [None]:
# Reference table: genre name -> integer code assigned by the label encoder.
le = LabelEncoder().fit(df['genre'])
genre_names = le.classes_
genre_codes = le.transform(genre_names)
le_name_mapping = dict(zip(genre_names, genre_codes))
print(le_name_mapping)

In [None]:
df.head()

In [None]:
# `df` still contains the string column `genre`. Recent pandas versions raise
# when asked to correlate non-numeric data, so restrict to numeric columns
# explicitly (this is also what older pandas did implicitly).
fig, ax = plt.subplots(figsize=(10, 8))
numeric_corr = df.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, annot=True, ax=ax)

In [None]:
# Correlate numeric columns only: the string `genre` column would make
# DataFrame.corr() raise on recent pandas versions.
df.select_dtypes(include='number').corr()['genre_encoded']

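Instrumentalness, duration_ms and energy are positively correlated with genre_encoded, while acousticness and tempo are negatively correlated. Note that genre_encoded is an arbitrary (alphabetically assigned) integer code for a nominal category, so these correlations should be interpreted with caution.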

In [None]:
df.head()

In [None]:
# Target is the encoded genre; features are everything except both genre columns.
target_column = 'genre_encoded'
y = df[target_column]
X = df.drop(columns=[target_column, 'genre'])

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

In [None]:
X_train.describe() ## Before Standardization

Tempo and duration_ms should be standardized

In [None]:
# Work on explicit copies: the frames returned by train_test_split are derived
# from X, and assigning columns into them can trigger pandas'
# SettingWithCopy behaviour (hidden here by the global warning filter).
X_train = X_train.copy()
X_test = X_test.copy()

# Fit each scaler on the training split only, then apply it to both splits so
# that no information from the test set leaks into the scaling statistics.
for column in ['tempo', 'duration_ms']:
    std = StandardScaler().fit(X_train[[column]])
    # transform() returns an (n, 1) array; flatten it before storing as a column.
    X_train[column] = std.transform(X_train[[column]]).ravel()
    X_test[column] = std.transform(X_test[[column]]).ravel()

In [None]:
X_train.describe() ## After Standardization

# Model Building

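### The metrics used are the confusion matrix and the F1 score, since this is a multi-class classification problem. Accuracy on its own is hard to interpret for a multi-class problem with imbalanced classes. Note that the micro-averaged F1 score used below is numerically equal to accuracy for single-label multi-class problems; the macro-averaged F1 score is more sensitive to per-class performance.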

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix,f1_score, accuracy_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [None]:
def plot_confusion_matrix(test_y, predict_y, plot_title, labels=None):
    """Plot the confusion matrix of true vs. predicted labels as a heatmap.

    Parameters
    ----------
    test_y : array-like
        True class labels.
    predict_y : array-like
        Predicted class labels, aligned with ``test_y``.
    plot_title : str
        Title drawn above the heatmap.
    labels : sequence, optional
        Class labels in the order used for the matrix rows/columns. When
        omitted, the sorted union of the labels found in ``test_y`` and
        ``predict_y`` is used, so the tick labels always match the matrix
        instead of assuming exactly 15 classes.
    """
    if labels is None:
        observed = np.concatenate([np.asarray(test_y), np.asarray(predict_y)])
        labels = np.unique(observed).tolist()
    else:
        labels = list(labels)

    # Pass the labels explicitly so matrix order and tick labels cannot diverge.
    C = confusion_matrix(test_y, predict_y, labels=labels)
    cmap = sns.light_palette("green")
    print("-"*50, "Confusion matrix", "-"*50)
    plt.figure(figsize=(15,12))
    # Cells are integer counts, so format them as integers rather than floats.
    sns.heatmap(C, annot=True, cmap=cmap, fmt="d", xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted Class')
    plt.ylabel('Original Class')
    plt.title(plot_title)
    plt.show()

### The confusion matrix function plots the confusion matrix for the given inputs

## Decision Tree

In [None]:
# Exhaustive 5-fold grid search over tree depth and minimum split size.
# min_samples_split must be an integer >= 2 (or a float fraction); the previous
# value of 1 is rejected by scikit-learn, so those candidates could never be
# fitted. The smallest valid value, 2, is used instead.
parameters = {"max_depth":np.array([1, 2, 5, 10, 50, 100]), "min_samples_split":np.array([2, 5, 10, 50, 100, 500])}
dt_clf = DecisionTreeClassifier(random_state=42)
clf = GridSearchCV(dt_clf, parameters, cv=5, scoring='f1_micro', n_jobs=-1, verbose=10)
clf.fit(X_train, y_train)
print(clf.best_estimator_)
print(clf.best_params_)

In [None]:
# Fit a decision tree with fixed hyper-parameters and predict on both splits.
clf = DecisionTreeClassifier(random_state=42, max_depth=10, min_samples_split=10)
clf.fit(X_train, y_train)
y_train_pred = clf.predict(X_train)
y_pred = clf.predict(X_test)

# Draw the train matrix first, then the test matrix.
confusion_inputs = (
    (y_train, y_train_pred, 'Train Confusion Matrix'),
    (y_test, y_pred, 'Test Confusion Matrix'),
)
for actual, predicted, title in confusion_inputs:
    plot_confusion_matrix(actual, predicted, title)

In [None]:
# Micro-averaged F1 on the training and test predictions of the current model.
train_f1 = f1_score(y_train, y_train_pred, average='micro')
test_f1 = f1_score(y_test, y_pred, average='micro')
print('Train F1 Score is {0}'.format(train_f1))
print('Test F1 Score is {0}'.format(test_f1))

## Random Forest

In [None]:
# Randomized 5-fold search over forest size, depth and minimum split size.
parameters = {"n_estimators":np.array([10,50,100,250,350,500,750,1000,2000,3000]) ,"max_depth":np.array([1,5,8,10,50,70,100]),
              "min_samples_split":np.array([2,5,10,100,500])}
rf_clf = RandomForestClassifier(random_state=42, n_jobs=-1)
# Seed the search itself: without random_state the sampled candidates (and so
# the reported best parameters) change on every run of the notebook.
clf = RandomizedSearchCV(rf_clf, parameters, cv=5, scoring='f1_micro', n_jobs=-1, verbose=10, random_state=42)
clf.fit(X_train, y_train)
print(clf.best_estimator_)
print(clf.best_params_)

In [None]:
# Fit a random forest with fixed hyper-parameters and predict on both splits.
clf = RandomForestClassifier(n_estimators=750, max_depth=10, min_samples_split=10, random_state=42, n_jobs=-1)
clf.fit(X_train, y_train)
y_train_pred = clf.predict(X_train)
y_pred = clf.predict(X_test)

# Confusion matrices: train first, then test.
confusion_inputs = (
    (y_train, y_train_pred, 'Train Confusion Matrix'),
    (y_test, y_pred, 'Test Confusion Matrix'),
)
for actual, predicted, title in confusion_inputs:
    plot_confusion_matrix(actual, predicted, title)

# Micro-averaged F1 for both splits.
train_f1 = f1_score(y_train, y_train_pred, average='micro')
test_f1 = f1_score(y_test, y_pred, average='micro')
print('Train F1 Score is {0}'.format(train_f1))
print('Test F1 Score is {0}'.format(test_f1))

## XGBoost

In [None]:
# Randomized 5-fold search over the main XGBoost hyper-parameters.
params = {
    'learning_rate':[0.01,0.03,0.05,0.1,0.15,0.2],
     'n_estimators':[100,200,350,500,1000,2000],
     'max_depth':[2,3,5,8,10],
    'colsample_bytree':[0.1,0.3,0.5,1],
    'colsample_bylevel':[0.1,0.3,0.5,1],
    'reg_alpha' : [0.001,0.01,0.1,1,10],
    'reg_lambda' : [0.001,0.01,0.1,1,10],
    'subsample':[0.1,0.3,0.5,1]
    }
# Seed both the estimator (row/column subsampling is stochastic) and the search
# (candidate sampling is stochastic) so the reported best parameters are
# reproducible across runs.
xgb = XGBClassifier(n_jobs = -1, random_state=42)
clf = RandomizedSearchCV(xgb, params, cv=5, scoring='f1_micro', n_jobs=-1, verbose=10, random_state=42)
clf.fit(X_train, y_train)
print(clf.best_estimator_)
print(clf.best_params_)

In [None]:
# Fit an XGBoost classifier with a fully spelled-out parameter set, then
# evaluate it on the train and test splits.
# NOTE(review): the argument list looks like a pasted repr of a previously
# printed best_estimator_ - confirm it still matches the search above.
# NOTE(review): some keyword names here (e.g. gpu_id) may not be accepted by
# every xgboost release - verify against the installed version.
clf = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.15, max_delta_step=0, max_depth=2,
              min_child_weight=1, monotone_constraints='()',
              n_estimators=500, n_jobs=-1, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=1,
              reg_lambda=0.001, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)
clf.fit(X_train,y_train)
# Predictions on both splits are reused for the plots and the F1 scores below.
y_train_pred = clf.predict(X_train)
y_pred = clf.predict(X_test)
# Train Confusion Matrix
plot_confusion_matrix(y_train,y_train_pred, 'Train Confusion Matrix')
# Test Confusion Matrix
plot_confusion_matrix(y_test,y_pred, 'Test Confusion Matrix')
# Micro-averaged F1 for the train and test predictions.
print('Train F1 Score is {0}'.format(f1_score(y_train,y_train_pred,average='micro')))
print('Test F1 Score is {0}'.format(f1_score(y_test,y_pred,average='micro')))

### The model gets confused when classifying certain data points belonging to some genres, such as Dark Trap, Hiphop and RnB

# Results

In [None]:
from prettytable import PrettyTable

# Header row followed by one row of (model, train F1, test F1) per model.
results = [
    ['Model','Train F1','Test F1'],
    ['Decision Tree',0.679,0.631],
    ['Random Forest',0.727,0.683],
    ['XGBoost',0.741,0.697],
]

In [None]:
# Render the results: the first entry is the header, the rest are data rows.
table = PrettyTable()
header, *score_rows = results
table.field_names = header
for score_row in score_rows:
    table.add_row(score_row)
print(table)

# Conclusion

### 1. The best F1 score obtained is 0.697 (69.7%), with the XGBoost model. The features do not show a specific pattern that would allow a good F1 score; they follow different distributions, which makes it hard for the machine learning models to learn from them

### 2. Further feature engineering techniques may help to get better prediction results