# Genre classification using Spotify dataset

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from matplotlib.colors import ListedColormap

# Get data

In [None]:
# Load the Spotify genres CSV from the Kaggle input directory into a DataFrame.
genre = pd.read_csv(
    filepath_or_buffer='../input/dataset-of-songs-in-spotify/genres_v2.csv',
)

# Clean dataset
1. Notice that some songs have names but others do not. The ones that do not have a name instead have a reference to where they are found under the **title** column. We merge those into the name column.

2. For purposes of classification, we separate the text columns (type, id, etc.) and create another dataframe containing only numeric values 

3. Standardize and remove NaN values

In [None]:
# Unify song name: rows without a `song_name` carry their label in `title`,
# so fill only the missing names from that column. A single column assignment
# is used instead of a chained-indexing write (`df[col][i:] = ...`), which
# pandas may apply to a temporary copy and which would also overwrite every
# later name once the first missing one is found.
genre['song_name'] = genre['song_name'].fillna(genre['title'])
genre.drop(['Unnamed: 0', 'title'], axis=1, inplace=True)
# Drop NaN
genre.dropna(inplace=True)
# Produce a numerical version dataset (text/identifier columns and the
# `genre` target are removed).
genre_data = genre.drop(['type', 'id', 'uri', 'track_href', 'analysis_url', 'song_name', 'genre'], axis=1)
# Standardize. StandardScaler returns a bare array, so the column labels and
# the row index of `genre` are re-attached; sharing the index keeps the two
# frames aligned row-for-row after `dropna` left gaps in it.
num_column = genre_data.columns
genre_data = pd.DataFrame(
    StandardScaler().fit_transform(genre_data),
    columns=num_column,
    index=genre.index,
)
# Coordinates of every row on the first two principal components
# (array with one row per song and two columns).
pca = PCA(n_components=2).fit_transform(genre_data)

# View dataset using pairplot

In [None]:
# Pairwise scatter plots (and per-feature distributions) of the standardized features.
sns.pairplot(data=genre_data)

# Using K-NN to build model

In [None]:
val_accuracies = []
train_accuracies = []
KNN_models = []
# Hold out 20% of the data as the final test set, then carve the validation
# set out of the remaining 80% only. Splitting the full dataset a second time
# would let test rows end up in the training set and inflate test accuracy.
X_train, X_test, Y_train, Y_test = train_test_split(genre_data, genre['genre'], test_size=0.2)
# 0.25 of the remaining 80% is 20% of the whole dataset -> 60/20/20 split.
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.25)
# We try neighbors from 1 to 10
for i in range(1, 11):
    neigh = KNeighborsClassifier(n_neighbors=i)
    neigh.fit(X_train, Y_train)
    # `score` returns the mean accuracy of the predictions on the given data.
    val_accuracies.append(neigh.score(X_val, Y_val))
    train_accuracies.append(neigh.score(X_train, Y_train))
    # Keep every fitted model so one can be picked after inspecting the curves.
    KNN_models.append(neigh)

# View results and choose which model to use

In [None]:
# Accuracy versus number of neighbors (1..10) for the validation and training sets.
neighbor_counts = np.arange(1, 11)
plt.plot(neighbor_counts, val_accuracies, c='y', label='Validation set')
plt.plot(neighbor_counts, train_accuracies, c='b', label='Train set')
plt.legend()

## From the plot, we can see that 1-3 neighbors are ideal values to use. They give good validation results as well as training results.

In [None]:
# Report the accuracy of the 1-, 2- and 3-neighbor models on X_test / Y_test.
for k in range(1, 4):
    candidate = KNN_models[k - 1]
    predictions = candidate.predict(X_test)
    # Count the positions where the predicted label equals the true label.
    hits = sum(guess == truth for guess, truth in zip(predictions, Y_test))
    print('Model {} accuracy:'.format(k), hits / len(X_test))


# Plot the first two pcs and their explained variances

In [None]:
# Share of the total variance captured by each of the two leading components.
two_pc_model = PCA(n_components=2)
two_pc_model.fit(genre_data)
print(two_pc_model.explained_variance_ratio_)

# Attach the projected coordinates (already stored in `pca`) to the frame so
# they can be referenced by column name when plotting.
for axis, column in enumerate(('pc1', 'pc2')):
    genre[column] = pca[:, axis]

sns.set(rc={'figure.figsize': (20,15)})
sns.scatterplot(x='pc1', y='pc2', hue='genre', data=genre, alpha=0.6)

## We choose the 1-neighbor model and plot decision boundaries

In [None]:
h = .02  # step size in the mesh

# One colour per genre. The same list drives the background regions and the
# legend, so the region of genre i and its legend entry share a colour.
cmap_bold = ['orange', 'blue', 'red', 'brown', 'yellow', 'green', 'aqua', 'purple', 'pink', 'silver', 'black', 'dimgray', 'darkred', 'linen', 'lawngreen']

# We train the model under pca to plot 2d boundaries
# NOTE(review): the heading above says "1-neighbor model" but 5 neighbors are
# used here -- confirm which one is intended.
clf = KNeighborsClassifier(n_neighbors=5).fit(pca, genre['genre'])

# Map every genre label to a small consecutive class code (0..n_classes-1).
# Enumerating the rows of genre['genre'] instead would store a row number for
# each genre, so the contour levels would not correspond to classes.
genre_names = list(clf.classes_)
n_classes = len(genre_names)
codemap = {genre_name: code for code, genre_name in enumerate(genre_names)}
cmap_light = ListedColormap(cmap_bold[:n_classes])
palette = dict(zip(genre_names, cmap_bold))

# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max]x[y_min, y_max].
x_min, x_max = pca[:, 0].min() - 1, pca[:, 0].max() + 1
y_min, y_max = pca[:, 1].min() - 1, pca[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
Z = np.array([codemap[genre_name] for genre_name in Z])
# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure(figsize=(20,16))
# Level edges sit halfway between consecutive class codes, so every class
# gets its own filled band and its own colour from `cmap_light`.
plt.contourf(xx, yy, Z, levels=np.arange(n_classes + 1) - 0.5, cmap=cmap_light)

# Plot also the training points
# NOTE(review): alpha=0 draws the markers fully transparent -- confirm that
# hiding the points is intended.
pts = sns.scatterplot(x=pca[:, 0], y=pca[:, 1], hue=genre['genre'],
                hue_order=genre_names, palette=palette,
                alpha=0, edgecolor="black", size=1)
pts.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())

plt.show()

# Analysis

## Limitation  

1. Genres are generally not independent of one another. We have names that imply relationships (Dark Trap vs. Trap). This is also shown in the plot of the first two PCs.  

2. There are significant differences between number of samples of each genre as shown in the cell below.

In [None]:
from collections import Counter

# Tally the number of rows per genre. Counter is a dict subclass that keeps
# keys in first-seen order, so the printed lines match a manual tally.
count = Counter(genre['genre'])
for name, n_rows in count.items():
    print(name, n_rows)

# Further questions

What is the relationship between these genres? 
Primary idea: Using K-Means to generate clusters and analyze the elements of those clusters