In [None]:
# -*- coding: utf-8 -*-
"""
Created on Sun Nov  1 12:07:10 2020
@author: jaket
"""

# Mechanisms of Action - An XGBoost and Neural Network Comparison


### On Kaggle, neural nets and XGBoost classifiers have been the most commonly used approaches to this multi-output classification problem.<br>
### Here, I compare the log loss of both, then tune and use the better one for submission. EDA has already been done extensively elsewhere, so I go straight to modelling. I use PCA to reduce the number of input variables, because the features are numerous and highly correlated.<br>


In [None]:
import math
import pandas as pd
import numpy as np
import os
import warnings 
import calendar

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [None]:
from sklearn.model_selection import KFold
from category_encoders import CountEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import log_loss

In [None]:
from sklearn.multioutput import MultiOutputClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier

In [None]:
warnings.filterwarnings('ignore')

In [None]:
# One seed for the whole notebook; it is pushed into NumPy's global RNG here.
SEED = 42
np.random.seed(seed=SEED)

In [None]:
# Load the three competition tables in one pass: training features, test
# features and the scored training targets.
train, test, targets = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/lish-moa/train_features.csv',
        '../input/lish-moa/test_features.csv',
        '../input/lish-moa/train_targets_scored.csv',
    )
)

## PCA on train features<br>

#### First scale the numerical data, then fit a PCA to it

In [None]:
# Standardiser for the numeric features (defaults spelled out: centre each
# column and scale it to unit variance).
scaler = StandardScaler(with_mean=True, with_std=True)

In [None]:
# Separate the id/categorical columns from the numeric feature columns.
meta_cols = ['sig_id', 'cp_type', 'cp_time', 'cp_dose']
numeric = train.drop(columns=meta_cols)
cats = train.loc[:, meta_cols]

In [None]:
# Learn the scaling statistics from the training features, then apply them
# (fit returns the scaler itself, so the two steps chain).
numeric_sc = scaler.fit(numeric).transform(numeric)

In [None]:
# A fractional n_components asks PCA to keep as many components as are
# needed to reach that share of explained variance.
pca = PCA(n_components=.90).fit(numeric_sc)
pca.n_components_  # number of components retained (shown as cell output)

In [None]:
# Project the scaled training features onto the principal components and
# re-attach the id/categorical columns. Note: `train` is rebound here.
train_pc = pd.DataFrame(pca.transform(numeric_sc))
train = pd.concat(objs=[train_pc, cats], axis='columns')

In [None]:
# Report how many components the fitted PCA kept.
print('90% of the variance is explained in ', pca.n_components_, ' components.')

Let's do the same with the test data...

In [None]:
# Split the test frame the same way as train: numeric features on one side,
# id/categorical columns on the other.
meta_cols = ['sig_id', 'cp_type', 'cp_time', 'cp_dose']
numeric_test = test.drop(meta_cols, axis=1)
cats_test = test.filter(items=meta_cols)

# Reuse the scaler that was fitted on the training features. Refitting it on
# the test set would standardise test rows with different means/variances than
# the ones the PCA (and the downstream models) were fitted on.
numeric_sc_test = scaler.transform(numeric_test)  # Scale data

In [None]:
# Apply the PCA fitted on train to the scaled test features, then re-attach
# the id/categorical columns. Note: `test` is rebound here.
test_pc = pd.DataFrame(pca.transform(numeric_sc_test))
test = pd.concat(objs=[test_pc, cats_test], axis='columns')

### Encode categorical variables

In [None]:
# Mark the two string-valued columns as pandas categoricals in both frames.
category_dtypes = {'cp_type': 'category', 'cp_dose': 'category'}
train = train.astype(category_dtypes)
test = test.astype(category_dtypes)

In [None]:
# One-hot encode the categorical columns.
# - dtype is pinned to uint8: pandas >= 2.0 defaults to bool, and mixing bool
#   with float columns makes DataFrame.to_numpy() return an object array.
# - The test dummies are re-indexed to the training columns so both matrices
#   have identical columns in identical order even if a category level is
#   absent from (or only present in) the test set.
cat_cols = ['cp_type', 'cp_dose']
dummies_train = pd.get_dummies(train[cat_cols], dtype=np.uint8)
dummies_test = (
    pd.get_dummies(test[cat_cols], dtype=np.uint8)
    .reindex(columns=dummies_train.columns, fill_value=0)
)

In [None]:
# Remove the raw (un-encoded) categorical columns from both frames.
raw_cat_cols = ['cp_type', 'cp_dose']
train = train.drop(columns=raw_cat_cols)
test = test.drop(columns=raw_cat_cols)

In [None]:
# Append the one-hot encoded columns to the right of each frame.
train = pd.concat(objs=[train, dummies_train], axis='columns')
test = pd.concat(objs=[test, dummies_test], axis='columns')

Drop the id column and convert everything to NumPy arrays.

In [None]:
# Strip the identifier column and hand plain arrays to the models.
# NOTE(review): y is assumed to be row-aligned with X (same sig_id order in
# the features and targets files) - confirm.
X = train.drop(columns='sig_id').to_numpy()
X_test = test.drop(columns='sig_id').to_numpy()
y = targets.drop(columns='sig_id').to_numpy()

Now that we have X, y and X_test, we don't need anything else in the environment. Kaggle throws error 137 (out of memory) unless the intermediate objects are deleted, so I free them here.

In [None]:
# Release every intermediate frame/array; only X, y and X_test are needed
# from here on.
del (
    train, test,
    numeric, cats, numeric_sc, train_pc,
    numeric_test, cats_test, numeric_sc_test, test_pc,
    dummies_train, dummies_test,
)


## Set up cross-fold validation

In [None]:
kf = KFold(n_splits=5)


def model_validation_loop(X, y, clf, cv=None):
    """Cross-validate a multi-output classifier and report its log loss.

    For every fold, ``clf`` is fitted on the training rows, positive-class
    probabilities are predicted for the held-out rows, and the log loss is
    computed over all (sample, target) pairs flattened together.

    Parameters
    ----------
    X : np.ndarray
        Feature matrix; rows are selected with the fold index arrays.
    y : np.ndarray
        Target matrix with one column per label, row-aligned with ``X``.
    clf : estimator
        Object exposing ``fit`` and a ``predict_proba`` whose output stacks
        into a (n_targets, n_samples, 2) array. It is fitted in place, so
        after the call it holds the model trained on the last fold.
    cv : splitter, optional
        Any object with a ``split(X, y)`` method yielding
        ``(train_idx, test_idx)`` pairs. Defaults to the module-level ``kf``.

    Returns
    -------
    float
        Mean log loss across the folds (also printed).
    """
    splitter = kf if cv is None else cv
    fold_losses = []
    for fold, (train_idx, valid_idx) in enumerate(splitter.split(X, y)):
        print('Starting fold: ', fold)
        X_fit, X_valid = X[train_idx], X[valid_idx]
        y_fit, y_valid = y[train_idx], y[valid_idx]

        clf.fit(X_fit, y_fit)

        # predict_proba gives one array per target; keep column 1 (positive
        # class) and transpose to (n_samples, n_targets) to line up with y.
        proba = np.array(clf.predict_proba(X_valid))[:, :, 1].T

        # Pass the label set explicitly so a fold whose held-out labels are
        # all one class is still scored instead of raising.
        loss = log_loss(np.ravel(y_valid), np.ravel(proba), labels=[0, 1])
        print('Log Loss for this fold:', loss)
        fold_losses.append(loss)

    mean_loss = float(np.mean(fold_losses))
    print('Mean Log Loss:', mean_loss)
    print('_'*50)
    return mean_loss
    


We'll compare these two multi-output classifiers.

In [None]:
# Baseline models with library-default hyper-parameters, each wrapped so that
# one estimator is fitted per target column.
xgb_baseline = MultiOutputClassifier(XGBClassifier())
mlp_baseline = MultiOutputClassifier(MLPClassifier(random_state=42))
classifiers = [xgb_baseline, mlp_baseline]

In [None]:
clf_names = ['GradientBoost', 'NeuralNet']

# Actually run the comparison: cross-validate each baseline under its display
# name. Without this loop the two lists above are never used and the
# notebook's conclusion about which model is better is not reproducible.
for clf_name, clf in zip(clf_names, classifiers):
    print(clf_name)
    model_validation_loop(X, y, clf)

The XGB model has a slight performance advantage over the neural net, so we tune the XGB model next. Instead of running a parameter search, I reuse parameters published on Kaggle, which saves many hours. Thanks to: https://www.kaggle.com/fchmiel/xgboost-baseline-multilabel-classification

In [None]:
# Tuned XGBoost hyper-parameters, passed as keyword arguments to the
# classifier further down.
params = dict(
    colsample_bytree=0.6522,
    gamma=3.6975,
    learning_rate=0.0503,
    max_delta_step=2.0706,
    max_depth=10,
    min_child_weight=31.5800,
    n_estimators=166,
    subsample=0.8639,
)

In [None]:
# XGBoost configured with the tuned hyper-parameters, wrapped so that one
# model is fitted per target column.
tuned_xgb = MultiOutputClassifier(XGBClassifier(**params))

Run the validation function to compare the performance of the tuned model against the basic one.

In [None]:
# Cross-validate the tuned model. The estimator is fitted in place, so after
# this call `tuned_xgb` holds the fit from the last fold.
model_validation_loop(X, y, clf=tuned_xgb)

The tuned model shows improved performance. Let's now predict on the test data.

In [None]:
# The sample submission supplies the expected column layout and row ids.
sample_sub = pd.read_csv(filepath_or_buffer='../input/lish-moa/sample_submission.csv')

Predict

In [None]:
# Refit on every training row. After cross-validation the estimator only
# holds the fit from the last fold (a subset of the data), and relying on
# that leftover state also breaks if the validation cell is skipped.
tuned_xgb.fit(X, y)

# The submission is scored with log loss, so it needs probabilities rather
# than hard 0/1 labels. predict_proba returns one array per target; take the
# positive-class column and transpose to (n_samples, n_targets), exactly as
# the validation loop does.
test_proba = tuned_xgb.predict_proba(X_test)
final_preds = pd.DataFrame(data=np.array(test_proba)[:, :, 1].T)

Format

In [None]:
# Use the id of every test row. The previous `[0:100]` slice kept only the
# first 100 ids, so all later rows were written with a missing sig_id.
# NOTE(review): assumes sample_submission rows are in the same order as the
# test features file - confirm.
ids = sample_sub[['sig_id']].reset_index(drop=True)
if len(ids) != len(final_preds):
    raise ValueError(
        f'Row count mismatch: {len(ids)} ids vs {len(final_preds)} predictions'
    )

final_preds = pd.concat([ids, final_preds.reset_index(drop=True)], axis=1)
final_preds.columns = sample_sub.columns

Write

In [None]:
# Persist the submission file without the DataFrame index column.
final_preds.to_csv(path_or_buf='submission.csv', index=False)