In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt 
import seaborn as sns 
from warnings import filterwarnings as filt

filt('ignore')
plt.style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = (12,6)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Directory holding the competition CSV files (Kaggle read-only input mount).
base_dir = "/kaggle/input/music-genre-classification/"
# Read the labelled training split and the unlabelled test split from base_dir.
traindf, testdf = (
    pd.read_csv(base_dir + csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [None]:
traindf.head()

In [None]:
traindf['Artist Name'].value_counts()

### Handling null values 

In [None]:
# Pull the target out of the training frame before stacking it with the test frame.
y_train = traindf.Class
traindf = traindf.drop(['Class'], axis = 1)
# Row counts of each part; trainIdx is the boundary used later to split the stack again.
trainIdx, testIdx = traindf.shape[0], testdf.shape[0]
# ignore_index=True gives the stacked frame a unique 0..n-1 index; without it the
# labels of traindf and testdf repeat, which makes label-based selection ambiguous.
df = pd.concat([traindf, testdf], ignore_index = True)
df.head()

In [None]:
df.shape

In [None]:
# Count missing values per column once and reuse the result for both summary columns.
nan_counts = df.isnull().sum()
null_feats = nan_counts.to_frame('nans').sort_values('nans', ascending = False)
# NOTE: despite the column name this is a fraction of rows (0-1), rounded to 2 decimals.
null_feats['nans %'] = np.round(nan_counts / df.shape[0], 2)
null_feats.head()

In [None]:
nulls = null_feats.index[:3]
sns.kdeplot(df[nulls[0]])

In [None]:
sns.kdeplot(df[nulls[1]])

In [None]:
sns.kdeplot(df[nulls[2]])

In [None]:
df[nulls].describe()

In [None]:
df[nulls] = df[nulls].fillna(df[nulls].mean())

In [None]:
df.isnull().sum()

In [None]:
df.head()

In [None]:
df.groupby('Artist Name')['Popularity'].mean().sort_values(ascending = False).head(10)

### Data cleaning and feature engineering

In [None]:
df[df['Artist Name'] == 'The Weeknd, Ariana Grande']

In [None]:
# Solo flag: 0 when 'Artist Name' contains a comma (several artists listed), else 1.
df['Solo'] = df['Artist Name'].apply(lambda name : 0 if ',' in name else 1)
df.head()

In [None]:
sns.kdeplot(df.loudness)

In [None]:
import eli5
from eli5.sklearn import PermutationImportance
from pdpbox import pdp
from sklearn.ensemble import RandomForestClassifier as rfc 
import shap

def pdp_plot(col, val_x, val_y):
    """Draw the pdpbox isolate plot of one feature.

    A 100-tree random forest (random_state 123) is fitted on the non-object
    columns of ``val_x`` against ``val_y``; the plot is computed on that same data.

    col   : name of the feature to isolate (must be a non-object column).
    val_x : feature frame; object-dtype columns are ignored.
    val_y : target values aligned with ``val_x``.
    """
    numeric_x = val_x.select_dtypes(exclude = 'object')
    forest = rfc(n_estimators = 100, random_state = 123)
    forest.fit(numeric_x, val_y)
    isolated = pdp.pdp_isolate(
        forest,
        dataset = numeric_x,
        feature = col,
        model_features = numeric_x.columns,
    )
    pdp.pdp_plot(isolated, col)
    
def pdp_interact(cols, val_x, val_y):
    """Draw the pdpbox interaction contour plot for a pair of features.

    A 100-tree random forest (random_state 123) is fitted on the non-object
    columns of ``val_x`` against ``val_y``; the plot is computed on that same data.

    cols  : feature names passed to pdpbox as ``features``.
    val_x : feature frame; object-dtype columns are ignored.
    val_y : target values aligned with ``val_x``.
    """
    numeric_x = val_x.select_dtypes(exclude = 'object')
    forest = rfc(n_estimators = 100, random_state = 123)
    forest.fit(numeric_x, val_y)
    interaction = pdp.pdp_interact(
        forest,
        dataset = numeric_x,
        features = cols,
        model_features = numeric_x.columns,
    )
    pdp.pdp_interact_plot(interaction, cols, plot_type = 'contour')
            
def permImp(val_x, val_y):
    """Return the eli5 permutation-importance table for a random forest.

    The forest (100 trees, random_state 123) is fitted on the non-object columns
    of ``val_x`` and the importances are computed on that same data.

    val_x : feature frame; object-dtype columns are ignored.
    val_y : target values aligned with ``val_x``.
    Returns whatever ``eli5.show_weights`` produces for the fitted importance object.
    """
    numeric_x = val_x.select_dtypes(exclude = 'object')
    forest = rfc(n_estimators = 100, random_state = 123)
    forest.fit(numeric_x, val_y)
    perm = PermutationImportance(forest).fit(numeric_x, val_y)
    feature_names = numeric_x.columns.tolist()
    return eli5.show_weights(perm, feature_names = feature_names)

def force_plot(x_train, y_train, val_x):
    """Return a SHAP force plot for one randomly drawn row of ``val_x``.

    A 100-tree random forest (random_state 123) is fitted on the non-object
    columns of ``x_train``; a single row is then sampled (unseeded) from the
    non-object columns of ``val_x`` and explained with ``shap.TreeExplainer``.
    The plot uses the last entry of the explainer's expected values and of the
    SHAP values.
    """
    numeric_train = x_train.select_dtypes(exclude = 'object')
    numeric_val = val_x.select_dtypes(exclude = 'object')
    forest = rfc(n_estimators = 100, random_state = 123)
    forest.fit(numeric_train, y_train)
    explainer = shap.TreeExplainer(forest)
    one_row = numeric_val.sample(n = 1)
    row_shap = explainer.shap_values(one_row)
    base_value = explainer.expected_value[-1]
    return shap.force_plot(base_value, row_shap[-1], one_row)
    
def train_val_split(x, y, test_size = 0.2, random_state = None):
    """Randomly hold out a fraction of the rows of ``x`` / ``y``.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature rows.
    y : pandas.Series
        Targets carrying the same index labels as ``x``.
    test_size : float, default 0.2
        Fraction of rows moved to the held-out part.
    random_state : int or None, default None
        Seed forwarded to ``DataFrame.sample``. ``None`` keeps the previous
        unseeded behaviour; pass an int for a reproducible split.

    Returns
    -------
    x_train, x_test, y_train, y_test
    """
    # sample() hands back index *labels*, so the held-out rows must be selected
    # with .loc; .iloc would only be right when the index is exactly 0..n-1.
    idx = x.sample(frac = test_size, random_state = random_state).index
    x_test, y_test = x.loc[idx], y.loc[idx]
    x_train, y_train = x.drop(idx), y.drop(idx)
    return x_train, x_test, y_train, y_test
    

In [None]:
# Renumber the stacked frame 0..n-1 once, then cut it back at the train/test boundary.
df_renumbered = df.reset_index(drop = True)
x, x_test = df_renumbered.iloc[:trainIdx], df_renumbered.iloc[trainIdx:]

In [None]:
train_x, val_x, train_y, val_y = train_val_split(x, y_train)
train_x.shape, val_x.shape, train_y.shape, val_y.shape

In [None]:
permImp(val_x, val_y)

In [None]:
pdp_plot('duration_in min/ms', val_x, val_y)

According to the partial dependence plot, as the duration of the song (min/ms) increases, the chance of it belonging to class 10, 8, 6, 5, 2 or 1 increases.

In [None]:
# Map each column position of submission.csv to the text before the first '_' in its name.
submission_cols = pd.read_csv(f"{base_dir}submission.csv").columns
classes = {position : name.split('_')[0] for position, name in enumerate(submission_cols)}
classes

In [None]:
pdp_interact(['duration_in min/ms', 'Solo'], val_x, val_y)

In [None]:
shap.initjs()
force_plot(train_x, train_y, val_x)

In [None]:
from scipy.stats import norm, skew
def plot(df, rc, kind = 'dist'):
    """Draw one plot per column of ``df`` on a grid of subplots.

    Parameters
    ----------
    df : pandas.DataFrame
        Columns to plot, one per panel, filled row by row.
    rc : sequence of two ints
        ``(rows, cols)`` of the subplot grid. Columns beyond ``rows * cols``
        are not drawn.
    kind : {'dist', 'box'}, default 'dist'
        'dist' draws ``sns.distplot`` with a fitted normal curve, 'box' draws
        ``sns.boxplot``. Any other value leaves the panels empty.
    """
    # squeeze=False keeps the axes array 2-D even for a 1-row or 1-column grid,
    # so flattening it always works.
    fig, axes = plt.subplots(rc[0], rc[1], squeeze = False)
    panels = axes.ravel()
    cols = df.columns
    # zip stops at the shorter of the two, which replaces the manual index/break.
    for col, panel in zip(cols, panels):
        x = df[col]
        if kind == 'dist':
            sns.distplot(x, ax = panel, fit = norm)
        elif kind == 'box':
            # Pass the values by keyword: the first positional slot of boxplot
            # does not mean the same thing in every seaborn version.
            sns.boxplot(x = x, ax = panel)
    # Hide grid cells that have no column to show.
    for panel in panels[len(cols):]:
        panel.set_visible(False)
    # Lay out after drawing so titles and tick labels are taken into account.
    fig.tight_layout()

In [None]:
train_x.head()

In [None]:
feats = [c for c in train_x.select_dtypes(exclude = 'object').columns if train_x[c].nunique() >= 10]
plot(train_x[feats], [4,3])

In [None]:
plot(train_x[feats], [4,3], 'box')

In [None]:
def skew_score(df) :
    """Rank the non-object columns of ``df`` by absolute skewness.

    Returns a one-column DataFrame ('skew_score') indexed by column name,
    sorted from most to least skewed.
    """
    numeric = df.select_dtypes(exclude = 'object')
    magnitudes = np.abs(skew(numeric))
    table = pd.DataFrame(magnitudes, index = numeric.columns, columns = ['skew_score'])
    return table.sort_values('skew_score', ascending = False)

In [None]:
# Keep the features whose absolute skewness exceeds 0.5; filter once and reuse.
skews = skew_score(train_x)
high_skews = skews[skews.skew_score > 0.5]
high_skew_feats = high_skews.index
high_skews

In [None]:
# Set the two name columns aside, then remove them from both feature frames.
feats_to_drop = ['Artist Name', 'Track Name']
train_artist_name, train_song_name = train_x['Artist Name'], train_x['Track Name']
val_artist_name, val_song_name = val_x['Artist Name'], val_x['Track Name']
train_x = train_x.drop(columns = feats_to_drop)
val_x = val_x.drop(columns = feats_to_drop)

In [None]:
train_x.head()

In [None]:
sns.scatterplot(data = train_x, x = 'Popularity', y ='loudness', hue = 'Solo')

In [None]:
sns.scatterplot(data = train_x, x = 'Popularity', y ='duration_in min/ms', hue = 'Solo')

In [None]:
from sklearn.linear_model import LogisticRegression as lrr
from sklearn.ensemble import RandomForestClassifier as rfc 
from sklearn.naive_bayes import GaussianNB as gnb
from sklearn.svm import SVC
from xgboost import XGBRFClassifier as xgb 

from sklearn.model_selection import cross_val_score as cvs, GridSearchCV as gscv, StratifiedKFold as skf
from sklearn.pipeline import Pipeline

from sklearn.metrics import classification_report, confusion_matrix

from sklearn.preprocessing import StandardScaler as ss, MinMaxScaler as mms, RobustScaler as rs

In [None]:
def best_model(xt, yt, scaler = None):
    """Cross-validate a fixed set of baseline classifiers and tabulate their scores.

    Parameters
    ----------
    xt, yt : features and targets passed to ``cross_val_score``.
    scaler : {'std', 'robust', 'mms'} or None, default None
        When given, each model is wrapped in a Pipeline whose first step is the
        matching scaler (StandardScaler / RobustScaler / MinMaxScaler). Any
        other value leaves the models unscaled.

    Returns
    -------
    pandas.DataFrame
        One row per model with its mean 5-fold micro-F1 in column 'f1_score',
        sorted ascending (the best model is the last row).
    """
    models = [lrr(), SVC(), rfc(), gnb(), xgb()]
    names = ['logistic regression','svm','random forest clf', 'naive bayes', 'xgboost']
    scalers = {'std': ss, 'robust': rs, 'mms': mms}
    make_scaler = scalers.get(scaler) if isinstance(scaler, str) else None
    # Integer seed (True == 1, so the folds are the same as before); one splitter
    # is shared so every model is scored on identical folds.
    cv = skf(n_splits = 5, shuffle = True, random_state = 1)
    scores = []
    for model in models:
        if make_scaler is not None:
            model = Pipeline(steps = [(scaler, make_scaler()), ('model', model)])
        score = cvs(model, cv = cv, X = xt, y = yt, scoring = 'f1_micro').mean()
        scores.append(score)
    # Build the table from the full list; passing the last scalar `score` would
    # broadcast one value to every row.
    return pd.DataFrame(scores, index = names, columns = ['f1_score']).sort_values('f1_score', ascending = True)

def get_score(xt, yt, model = lrr(), scaler = None):
    """Print the mean 5-fold stratified cross-validation score of ``model``.

    xt, yt : features and targets passed to ``cross_val_score``.
    model  : estimator to evaluate (defaults to a LogisticRegression instance).
    scaler : 'std', 'robust' or 'mms' to prepend the matching scaler in a
             Pipeline; anything else evaluates the bare model.

    No ``scoring`` is passed, so the estimator's default scorer is used; the
    local name ``auc`` is only a label for that mean score.
    """
    scaler_steps = {'std': ss, 'robust': rs, 'mms': mms}
    make_scaler = scaler_steps.get(scaler) if isinstance(scaler, str) else None
    estimator = model
    if make_scaler is not None:
        estimator = Pipeline(steps = [(scaler, make_scaler()), ('model', model)])
    folds = skf(n_splits = 5, shuffle = True, random_state = True)
    fold_scores = cvs(estimator, cv = folds, X = xt, y = yt)
    auc = fold_scores.mean()
    print(f"Model score :==> {auc}")
    
def gridCv(xt, yt, model, params, scaler = None):
    """Grid-search ``model`` over ``params`` with 5-fold stratified CV.

    xt, yt : features and targets passed to ``GridSearchCV.fit``.
    model  : estimator to tune.
    params : parameter grid forwarded as ``param_grid``.
    scaler : 'std', 'robust' or 'mms' to prepend the matching scaler in a
             Pipeline; anything else tunes the bare model.
             NOTE(review): with a scaler the estimator becomes a Pipeline whose
             last step is named 'model', so grid keys presumably need the
             ``model__`` prefix; confirm against callers.

    Returns ``(best_estimator, best_params, results)`` where ``results`` holds
    mean test score, mean train score and params, best test score first.
    """
    scaler_steps = {'std': ss, 'robust': rs, 'mms': mms}
    make_scaler = scaler_steps.get(scaler) if isinstance(scaler, str) else None
    if make_scaler is not None:
        model = Pipeline(steps = [(scaler, make_scaler()), ('model', model)])
    folds = skf(n_splits = 5, shuffle = True, random_state = True)
    search = gscv(model, param_grid = params, cv = folds, return_train_score = True)
    search.fit(xt, yt)
    wanted = ['mean_test_score','mean_train_score','params']
    ranked = pd.DataFrame(search.cv_results_).sort_values('mean_test_score', ascending = False)
    results = ranked[wanted]
    return search.best_estimator_, search.best_params_, results

def clf_report(yt, pred):
    """Print sklearn's classification report for true labels ``yt`` and predictions ``pred``."""
    report = classification_report(yt, pred)
    print(report)


In [None]:
best_model(train_x, train_y)

In [None]:
best_model(train_x, train_y, 'std')

In [None]:
best_model(train_x, train_y, 'robust')

In [None]:
best_model(train_x, train_y, 'mms')

In [None]:
gridCv(train_x, train_y, gnb(), {})

In [None]:
# Compress the highly skewed features, then rerun the model comparison on them.
# np.log1p returns NaN (or -inf) for values <= -1, which would poison the
# cross-validation scores. The signed form sign(v) * log1p(|v|) is defined for
# every real value and equals log1p(v) whenever v >= 0.
# NOTE(review): assumes some skewed columns can go below -1; confirm on the data.
skewed = train_x[high_skew_feats]
new_x = np.sign(skewed) * np.log1p(np.abs(skewed))
best_model(new_x, train_y)