
This notebook builds on ideas from the following Kaggle notebooks:
* https://www.kaggle.com/giorgosdiamantis/song-genre-classification
* https://www.kaggle.com/jubsadeghi/spotify
* https://www.kaggle.com/botirrakhimov/genre-prediction-using-sklearn-classification


In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn import covariance
from sklearn import compose
from sklearn import model_selection
from sklearn import pipeline
from sklearn import linear_model
from sklearn import metrics
from sklearn import decomposition
from sklearn.svm import LinearSVC

In [None]:
# Read the Spotify genre dataset from the Kaggle input directory.
df = pd.read_csv(
    "/kaggle/input/dataset-of-songs-in-spotify/genres_v2.csv"
)
# Print the (rows, columns) tuple, then preview the first records.
print(df.shape)
df.head()

# Data description

In [None]:
# Print column names, non-null counts and dtypes of the freshly loaded frame.
df.info()

In [None]:
# Remove the columns that the rest of the notebook does not use.
# The drop happens in place, so `df` itself is modified.
df.drop(
    columns=[
        'type',
        'id',
        'uri',
        'track_href',
        'analysis_url',
        'song_name',
        'Unnamed: 0',
        'title',
    ],
    inplace=True,
)
# Confirm the new shape and preview the remaining columns.
print(df.shape)
df.head()

In [None]:
# Summary statistics of the remaining columns.
df.describe()

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Re-check dtypes and non-null counts now that the unused columns are gone.
df.info()

In [None]:
# Number of rows per genre label (shows how balanced the target is).
df['genre'].value_counts()

In [None]:
# Number of rows per distinct time_signature value.
df['time_signature'].value_counts()

In [None]:
# Draw a 10x10-inch grid of histograms for the frame and render it.
histograms = df.hist(figsize=(10,10))
plt.show()

In [None]:
# One box plot per numeric column, with a separate box for every genre.
for col in df.select_dtypes(include=np.number).columns:
  fig = px.box(df, y=col, color="genre", title=col)
  fig.show()

In [None]:
# Scatter-matrix drawing helper.
def scatterMatrix(df):
  """Show a Plotly scatter matrix of every column except ``genre``.

  Args:
    df: DataFrame holding the features plus a ``genre`` column, which is
      used to colour the points.

  The figure is displayed; nothing is returned.
  """
  # Pick the plotted dimensions by name instead of slicing off the last
  # column, so the result does not depend on where 'genre' sits in the frame.
  dimensions = [column for column in df.columns if column != "genre"]

  figure = px.scatter_matrix(df, dimensions=dimensions, color="genre")
  figure.update_layout(autosize=True, width=2000, height=2000)
  figure.show()

In [None]:
# Render the scatter matrix for the cleaned frame.
scatterMatrix(df)

# Data preparation

## Outliers - IQR

In [None]:
### outlayers - IQR
def IQR_outliers(data, outlayers):
  """Flag the values of ``data`` that lie outside the 1.5 * IQR fences.

  Args:
    data: numeric values of one feature (NumPy array or pandas Series).
    outlayers: boolean NumPy array with one flag per row. It is updated
      in place; flags that are already True stay True.

  Returns:
    The same ``outlayers`` array, with True set at every position whose
    value is above Q3 + 1.5 * IQR or below Q1 - 1.5 * IQR.
  """
  # nanpercentile ignores missing values. With plain percentile a single NaN
  # turns both fences into NaN and no outlier would ever be flagged.
  Q1, Q3 = np.nanpercentile(data, [25, 75])
  IQR = Q3 - Q1

  u_band = Q3 + (1.5 * IQR)
  l_band = Q1 - (1.5 * IQR)

  # Comparisons with NaN are False, so missing values are never flagged.
  idx = np.where((data > u_band) | (data < l_band))
  outlayers[idx] = True

  return outlayers

# Outlier mask: every row starts out as "not an outlier".
outlayers = np.zeros(df.shape[0], dtype=bool)

# Flag outliers feature by feature. The last numeric column is skipped.
# NOTE(review): presumably because it is the categorical time_signature - confirm.
cols = df.select_dtypes(include=np.number).columns.tolist()
for col in cols[:-1]:
  outlayers = IQR_outliers(df[col], outlayers)

# Row positions flagged for at least one feature; drop them from a copy.
outIdx = np.flatnonzero(outlayers).tolist()
df_nonoutlayers_by_iqr = df.drop(outIdx)
df_nonoutlayers_by_iqr.head(10)
df_nonoutlayers_by_iqr.shape

## Outliers - elliptic envelope

In [None]:
### Outliers - EllipticEnvelope

# The detector only sees the features, not the target label.
toOutlayers = df.drop(columns="genre")

# contamination=0.1 asks the detector to mark 10% of the rows as outliers.
detector = covariance.EllipticEnvelope(contamination=0.1, support_fraction=1)
ol_flag = detector.fit(toOutlayers).predict(toOutlayers)

# predict() marks outliers with -1; collect their row positions and drop them.
outIdx = np.flatnonzero(ol_flag == -1).tolist()

df_nonoutlayers_by_envelope = df.drop(outIdx)
df_nonoutlayers_by_envelope.head(10)
df_nonoutlayers_by_envelope.shape

## Select data for further processing

In [None]:
# Choose which variant of the data feeds the modelling steps below.
# Exactly one of the three assignments should be active.

# Variant 1: the original frame with all rows.
df_pre_model = df.copy()

# Variant 2: rows flagged by EllipticEnvelope removed.
#df_pre_model = df_nonoutlayers_by_envelope.copy()

# Variant 3: rows flagged by the IQR rule removed.
#df_pre_model = df_nonoutlayers_by_iqr.copy()
df_pre_model.shape

## Variance analysis

In [None]:
# Variance of every numeric feature. numeric_only=True skips the string
# 'genre' column, which pandas 2.x refuses to aggregate (TypeError).
df_pre_model.var(numeric_only=True)

In [None]:
# Low-variance features: numeric columns whose variance is below 0.02.
# numeric_only=True keeps the string 'genre' column out of the computation,
# which pandas 2.x would otherwise reject with a TypeError.
low_var = df_pre_model.var(numeric_only=True)
low_var = low_var[low_var < 0.02]

low_var_cols = low_var.index.tolist()
print(low_var_cols)

In [None]:
# Remove the low-variance features found above.
#
# Careful: the variances change once outlier rows are dropped, so the
# selected columns were chosen for the "all rows" variant of the data.
df_pre_model.drop(columns=low_var_cols, inplace=True)

## Correlation analysis

In [None]:
# No pair of features is highly correlated, so no feature is dropped here.
# The genre labels are integer-encoded on a copy so that they take part in
# the correlation matrix; df_pre_model itself is left untouched.
df2Corr = df_pre_model.assign(
    genre=preprocessing.LabelEncoder().fit_transform(df_pre_model['genre'])
)
corrMx = df2Corr.corr()
corrMx.style.background_gradient(cmap = "RdBu_r")

## Category coding

In [None]:
# Treat time_signature as a categorical value and one-hot encode it.
#
# The low-variance filter above may already have removed time_signature.
# In that case there is nothing to encode and the frame is passed on
# unchanged instead of failing with a KeyError.
if 'time_signature' in df_pre_model.columns:
  df_category = pd.get_dummies(df_pre_model, columns=['time_signature'], prefix_sep='_')
else:
  df_category = df_pre_model.copy()

df_category.shape

# Building models

## Helper functions 

In [None]:
def buildModel(algo, X_train, X_test, y_train, y_test):
  """Fit ``algo`` on the training data and evaluate it on the test data.

  Args:
    algo: estimator exposing ``fit``, ``predict`` and ``score``.
    X_train, y_train: data passed to ``algo.fit``.
    X_test, y_test: data passed to ``algo.predict`` and ``algo.score``.

  Returns:
    dict with the test-set predictions under "predict" and the value of
    ``algo.score(X_test, y_test)`` under "accuracy".
  """
  algo.fit(X_train, y_train)

  result = {"predict": algo.predict(X_test)}
  result["accuracy"] = algo.score(X_test, y_test)
  return result

In [None]:
def modelCrossValidation(X, Y, algo):
  """Cross-validate ``algo`` and return the mean of several scores.

  Args:
    X: feature matrix.
    Y: target labels.
    algo: estimator to evaluate.

  Returns:
    dict mapping 'accuracy', 'balanced_accuracy', 'f1_weighted' and
    'f1_macro' to the mean score over 5 shuffled folds (random_state=1),
    rounded to two decimals.
  """
  validation = model_selection.KFold(
      n_splits=5, shuffle=True, random_state=1
  )
  statsNames = ['accuracy', 'balanced_accuracy', 'f1_weighted', 'f1_macro']

  # A single cross_validate call computes every metric on the same fitted
  # folds, instead of repeating the whole cross-validation once per metric.
  scores = model_selection.cross_validate(algo,
                                          X,
                                          Y,
                                          cv=validation,
                                          scoring=statsNames,
                                          n_jobs=-1)

  # cross_validate stores the per-fold values under 'test_<metric name>'.
  return {sname: round(scores["test_" + sname].mean(), 2) for sname in statsNames}

## Split test and train data

In [None]:
# Target: the genre label. Features: every other column.
Y1 = df_category["genre"]
X1 = df_category.drop(columns="genre")
X1.head()

In [None]:
from imblearn.over_sampling import SMOTE

# Split first, so that nothing computed from the hold-out rows can reach
# training. stratify keeps the genre proportions equal in both parts.
X_train, X_test, y_train, y_test = model_selection.train_test_split(X1,
                                                                    Y1,
                                                                    test_size=.2,
                                                                    random_state=1,
                                                                    shuffle=True,
                                                                    stratify=Y1
                                                                    )

# Fit the scaler on the training rows only and reuse its statistics for the
# test rows; fitting on all rows would leak test information.
scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Oversample the minority genres in the training set only. The test set
# keeps real rows exclusively, so its accuracy is not inflated by synthetic
# samples interpolated from training neighbours. Seeded for reproducibility.
smote = SMOTE(random_state=1)
X_train, y_train = smote.fit_resample(X_train, y_train)

# Results table; the first row is a header that the chart cell drops.
scoring = [['model name','accuracy']]

## Models

In [None]:
## LogisticRegression

# Iteration cap set to 1000.
algLR = linear_model.LogisticRegression(max_iter=1000)
resLR = buildModel(algLR, X_train, X_test, y_train, y_test)

# Record the hold-out accuracy and show the running results table.
scoring.append(['LogisticRegression', resLR["accuracy"]])
print(scoring)

In [None]:
## LinearSVC

# Seeded linear SVM with the iteration cap set to 5000.
algLSVC = LinearSVC(max_iter=5000, random_state=1)
resLSVC = buildModel(algLSVC, X_train, X_test, y_train, y_test)

# Record the hold-out accuracy and show the running results table.
scoring.append(['LinearSVC', resLSVC["accuracy"]])
print(scoring)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

## KNeighborsClassifier

# Nearest-neighbour classifier voting over the 5 closest training samples.
algKNC = KNeighborsClassifier(n_neighbors=5)
resKNC = buildModel(algKNC, X_train, X_test, y_train, y_test)

# Record the hold-out accuracy and show the running results table.
scoring.append(['KNeighborsClassifier', resKNC["accuracy"]])
print(scoring)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# RandomForestClassifier

# Seeded forest whose trees are limited to depth 10.
algRFC = RandomForestClassifier(max_depth=10, random_state=1)
resRFC = buildModel(algRFC, X_train, X_test, y_train, y_test)

# Record the hold-out accuracy and show the running results table.
scoring.append(['RandomForestClassifier', resRFC["accuracy"]])
print(scoring)


In [None]:
from sklearn.ensemble import BaggingClassifier

# BaggingClassifier

# Seeded like the LinearSVC and RandomForest models, so that re-running the
# notebook reproduces the same accuracy in the comparison chart.
algBag = BaggingClassifier(random_state=1)
resBag = buildModel(algBag, X_train, X_test, y_train, y_test)

# Record the hold-out accuracy and show the running results table.
scoring.append([
                  'BaggingClassifier',
                  resBag["accuracy"]
                ])
print(scoring)

# Compare model results

In [None]:
# Draw a bar chart comparing the accuracy of the algorithms.

# scoring[0] is a header row, not a result. Skipping it before building the
# frame keeps the 'accuracy' column numeric instead of object-typed.
toChart = pd.DataFrame(scoring[1:], columns=['algorithm', 'accuracy'])

px.bar(toChart, x="algorithm", y="accuracy")

# Best model validation

In [None]:
# Model carried forward for validation: the k-nearest-neighbours classifier.
bestAlg = algKNC

# 5-fold cross-validation on the training data.
# NOTE(review): X_train contains SMOTE-generated samples, so these scores
# are likely optimistic - confirm before quoting them.
vres = modelCrossValidation(X_train, y_train, bestAlg)
print(vres)

In [None]:
# Per-class precision / recall / F1 on the hold-out set. The predictions
# come from bestAlg rather than a hard-wired model, so the report always
# describes the model selected above.
print(metrics.classification_report(y_test, bestAlg.predict(X_test)))

In [None]:
# Confusion matrix of the selected model on the hold-out set. Rows and
# columns follow the sorted unique labels of y_test. The predictions come
# from bestAlg, so the matrix always matches the model selected above.
print("Confusion matrix: ")
print(metrics.confusion_matrix(y_test, bestAlg.predict(X_test), labels=np.unique(y_test)), sep="\n")

In [None]:

# Plot the confusion matrix of the selected model on the hold-out set.
fig, ax = plt.subplots(figsize=(15, 15))

# metrics.plot_confusion_matrix was removed in scikit-learn 1.2; its
# replacement is ConfusionMatrixDisplay.from_estimator (available since 1.0).
# Fall back to the old function on installations that predate it.
display_cls = getattr(metrics, "ConfusionMatrixDisplay", None)
if hasattr(display_cls, "from_estimator"):
  display_cls.from_estimator(bestAlg, X_test, y_test, ax=ax)
else:
  metrics.plot_confusion_matrix(bestAlg, X_test, y_test, ax=ax)