# Problem D Prediction Models for Task #3.5

A feature importance analysis is also included.

In [None]:
import numpy as np
import pandas as pd
import os
import sys
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
import plotly.offline as py
import math
import itertools
py.init_notebook_mode(connected=True)
import random
import seaborn as sns

"""XGBoost and Other Helper Functions"""
from sklearn.model_selection import GroupKFold, StratifiedKFold, RepeatedStratifiedKFold, KFold
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import eli5
from eli5.sklearn import PermutationImportance
import tensorflow as tf
import tensorflow_addons as tfa
import gc

# NOTE(review): LOOK_AT is not referenced by any of the cells visible here — confirm it is still needed.
LOOK_AT = 10
SEED = 42

# Seed every random number generator the notebook relies on so that a
# Restart & Run All reproduces the same splits and model initialisations.
np.random.seed(SEED)
random.seed(SEED)
# The Keras MLP trained later draws its initial weights and dropout masks from
# TensorFlow's own generator, which the NumPy/stdlib seeds above do not cover.
tf.random.set_seed(SEED)

In [None]:
# Dataset locations, relative to the notebook's working directory. Change these
# two constants to run the notebook against a different copy of the data.
INPUT_DIR = "../input/icm-problem-d"
ICM_DATA_DIR = f"{INPUT_DIR}/2021_ICM_Problem_D_Data"

# Tables shipped with the ICM Problem D data set.
data_by_artist = pd.read_csv(f"{ICM_DATA_DIR}/data_by_artist.csv")
data_by_year = pd.read_csv(f"{ICM_DATA_DIR}/data_by_year.csv")
full_music_data = pd.read_csv(f"{ICM_DATA_DIR}/full_music_data.csv")
influence_data = pd.read_csv(f"{ICM_DATA_DIR}/influence_data.csv")

# Per-song table with a `Genre` column; this is the frame the models below are trained on.
music_df = pd.read_csv(f"{INPUT_DIR}/music_genre.csv")

Should we drop Pop/Rock? The number of Pop/Rock songs in the dataset is a significant outlier compared with every other genre.

In [None]:
# Disabled experiment: uncomment the line below to exclude every Pop/Rock row
# from music_df before the genre list, features and labels are derived from it.
#music_df = music_df.loc[music_df['Genre'] != "Pop/Rock"]

In [None]:
# Distinct genre names in order of first appearance; a genre's position in this
# list is what the later cells use as its integer class label.
genre_list = music_df["Genre"].unique().tolist()
genre_list

In [None]:
# Number of songs per genre, indexed by genre name.
genre_counts = music_df.groupby('Genre').size()

# Centred, enlarged chart title.
title_style = {
    'text': "Distribution of Each Song's Genre",
    'x': 0.5,
    'xanchor': 'center',
    'font': {'size': 20},
}

fig = px.bar(genre_counts)
# update_layout returns the figure, so leaving it as the last expression renders the chart.
fig.update_layout(title=title_style, yaxis_title="Count", showlegend=False)

In [None]:
# Feature frame: every numeric column except `year` and `Unnamed: 0`
# (presumably a saved row index — verify against the CSV).
xnn = music_df.select_dtypes(np.number).drop(columns=["year", "Unnamed: 0"])
X_train = xnn.to_numpy()

# Integer class label per song: the position of its genre in genre_list.
genre_to_label = {name: label for label, name in enumerate(genre_list)}
y_train = np.array([genre_to_label[name] for name in music_df["Genre"]])

<h3> Potential XGBoost CV Function </h3>

In [None]:
# Parameter set for XGBoost's native (non-sklearn) API. Neither `params` nor
# `data_dmatrix` is consumed by the other cells visible in this notebook.
params = {
    "objective": "multi:softprob",
    "num_class": len(genre_list),
    "min_child_weight": 3,
    "colsample_bytree": 0.8,
    "learning_rate": 0.17,
    "max_depth": 10,
    "reg_lambda": 1.5,
    "subsample": 0.8,
    "reg_alpha": 0,
    "gamma": 0,
    "tree_method": "gpu_hist",
}

# Full feature frame and integer genre labels wrapped in XGBoost's own container.
data_dmatrix = xgb.DMatrix(data=xnn, label=y_train)

<h3> XGBoost Implementation with some Hyperparameter Optimization </h3>

In [None]:
%%time

# Hyperparameters shared by the classifier built for every fold.
xgb_fold_params = dict(
    n_estimators=300,
    learning_rate=0.17,
    max_depth=10,
    min_child_weight=3,
    max_delta_step=0,
    subsample=0.8,
    colsample_bytree=0.8,
    missing=-999,
    random_state=SEED,
    n_jobs=-1,
    reg_lambda=1.5,
    reg_alpha=0,
    gamma=0,
    objective='multi:softprob',
    tree_method='gpu_hist',
)

kf = KFold(n_splits=5, shuffle=True, random_state=SEED)
for train_index, test_index in kf.split(X_train, y_train):
    X_fit, y_fit = X_train[train_index], y_train[train_index]
    X_val, y_val = X_train[test_index], y_train[test_index]

    # A fresh model per fold; once the loop ends, `clf` and `test_index`
    # still refer to the last fold and are reused by the cells below.
    clf = xgb.XGBClassifier(**xgb_fold_params)

    # The held-out fold doubles as the early-stopping evaluation set.
    clf.fit(X_fit, y_fit, early_stopping_rounds=10, eval_set=[(X_val, y_val)], verbose=100)

    # Fold accuracy: share of held-out rows whose most probable class equals the label.
    fold_pred = np.argmax(clf.predict_proba(X_val), axis=1)
    print(np.count_nonzero(fold_pred == y_val) / len(y_val))
    gc.collect()

In [None]:
# Predicted class index for each row of the last fold's held-out split.
clf.predict_proba(X_train[test_index]).argmax(axis=1)

<h3> XGBoost Feature Importance </h3>

In [None]:
%%time

# `clf` is the model fitted on the last fold's training split. Score the
# permutation importance on that fold's held-out rows only: shuffling a feature
# on rows the model was trained on measures how much it memorised them rather
# than how much the feature contributes to generalisation.
perm = PermutationImportance(clf, random_state=SEED).fit(X_train[test_index], y_train[test_index])

# Rendered as the cell output; rows are labelled with the feature column names.
eli5.show_weights(perm, feature_names=xnn.columns.tolist())

<h3> Simple Keras Neural Network </h3>

The following network architecture was copied from a Jane Street Market Prediction model. Although it is certainly not optimized for this task, it is sufficient to test the efficacy of MLPs for this classification problem.

In [None]:
def create_mlp(
    num_columns, num_labels, hidden_units, dropout_rates, label_smoothing, learning_rate
):
    """Build and compile a fully connected Keras classifier.

    The input is batch-normalised and passed through dropout, then through one
    Dense -> BatchNormalization -> swish -> Dropout stage per entry of
    ``hidden_units``, and finally through a Dense layer with ``num_labels``
    sigmoid outputs.

    Args:
        num_columns: Number of input features per sample.
        num_labels: Number of output units.
        hidden_units: Width of each hidden Dense layer, in order.
        dropout_rates: Dropout rate for the normalised input (index 0)
            followed by one rate per hidden stage (index ``i + 1`` for hidden
            layer ``i``). Entries beyond ``len(hidden_units) + 1`` are ignored.
        label_smoothing: Label smoothing passed to the binary cross-entropy loss.
        learning_rate: Learning rate passed to the RectifiedAdam optimizer.

    Returns:
        The compiled ``tf.keras`` model, tracking a metric named ``"AUC"``.

    Raises:
        IndexError: If ``dropout_rates`` is a sequence with fewer than
            ``len(hidden_units) + 1`` entries.
    """
    inp = tf.keras.layers.Input(shape=(num_columns,))
    x = tf.keras.layers.BatchNormalization()(inp)
    x = tf.keras.layers.Dropout(dropout_rates[0])(x)

    for layer_idx, units in enumerate(hidden_units):
        x = tf.keras.layers.Dense(units)(x)
        x = tf.keras.layers.BatchNormalization()(x)
        x = tf.keras.layers.Activation(tf.keras.activations.swish)(x)
        # Offset by one: dropout_rates[0] was already used on the input.
        x = tf.keras.layers.Dropout(dropout_rates[layer_idx + 1])(x)

    # NOTE(review): independent sigmoid outputs with binary cross-entropy treat
    # every label as its own yes/no problem. For mutually exclusive classes,
    # softmax with categorical cross-entropy is the usual choice — confirm
    # before switching, since it changes the reported results.
    x = tf.keras.layers.Dense(num_labels)(x)
    out = tf.keras.layers.Activation("sigmoid")(x)

    model = tf.keras.models.Model(inputs=inp, outputs=out)
    model.compile(
        optimizer=tfa.optimizers.RectifiedAdam(learning_rate=learning_rate),
        loss=tf.keras.losses.BinaryCrossentropy(label_smoothing=label_smoothing),
        # `compile` documents `metrics` as a list; pass one rather than a bare metric object.
        metrics=[tf.keras.metrics.AUC(name="AUC")],
    )

    return model

In [None]:
# One-hot encode every song's genre: each row holds a single 1, placed in the
# column given by the genre's position in genre_list.
genre_to_column = {name: col for col, name in enumerate(genre_list)}
n_genres = len(genre_list)
genre_columns = [genre_to_column[name] for name in music_df["Genre"]]
yy_train = np.array(
    [[int(col == target) for col in range(n_genres)] for target in genre_columns]
)

In [None]:
batch_size = 256
hidden_units = [160, 160, 160]
dropout_rates = [0.2, 0.2, 0.2, 0.2]
label_smoothing = 1e-2
learning_rate = 1e-3

clf = create_mlp(len(X_train[0]), len(yy_train[0]), hidden_units, dropout_rates, label_smoothing, learning_rate)

FOLDS = 5
es = tf.keras.callbacks.EarlyStopping(monitor='AUC', mode='max', verbose=0, patience=10, restore_best_weights=True)
kf = KFold(n_splits=FOLDS, shuffle=True, random_state=SEED)
for train_index, test_index in kf.split(X_train, yy_train):
    %time clf.fit(X_train[train_index], yy_train[train_index], validation_data=(X_train[test_index], yy_train[test_index]), callbacks=[es], epochs=128, batch_size=batch_size, verbose=1)
    print(np.count_nonzero(np.argmax(clf(X_train[test_index]), axis=1) == y_train[test_index])/len(y_train[test_index]))
    gc.collect()
    break