<div>
    <h1 align="center">Tabular Playground Series - Jun 2021</h1>
    <h2 align="center">HistGradientBoosting & CatBoost & Neural Networks</h2>
    <h4 align="center">By: Somayyeh Gholami & Mehran Kazeminia</h4>
</div>

<div class="alert alert-success">  
</div>

## Import

In [None]:
import numpy as np 
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
from catboost import CatBoostClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from mlxtend.preprocessing import minmax_scaling
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import log_loss
from sklearn import preprocessing

In [None]:
import tensorflow as tf
import tensorflow_addons as tfa
import tensorflow.keras.backend as K

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import activations,callbacks
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import categorical_crossentropy
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import initializers

from keras.models import Model
import gc

In [None]:
import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

<div class="alert alert-success">  
</div>

## Data Set

In [None]:
# Load the three competition files from the Kaggle input directory:
#   df1 - train.csv, df2 - test.csv, sam - sample_submission.csv.
df1 = pd.read_csv('../input/tabular-playground-series-jun-2021/train.csv')
df2 = pd.read_csv('../input/tabular-playground-series-jun-2021/test.csv')
sam = pd.read_csv('../input/tabular-playground-series-jun-2021/sample_submission.csv')

In [None]:
# Count missing values per column in each frame, then report only the columns
# that actually contain any.
MV1 = df1.isna().sum()
MV2 = df2.isna().sum()

print(f'Missing Value 1:\n{MV1[MV1 > 0]}\n')
print(f'Missing Value 2:\n{MV2[MV2 > 0]}\n')

In [None]:
# display(df1.info(), df2.info())
# Show each frame followed by its summary statistics, one row per column.
display(df1, df1.describe().T)
display(df2, df2.describe().T)

In [None]:
# Bar chart of how many training rows fall into each target class.
df1['target'].value_counts().plot.bar()

In [None]:
# Pie chart of the class counts; the cell output is the table of class proportions.
df1['target'].value_counts().plot.pie()
df1['target'].value_counts(normalize=True)

In [None]:
# Work on a copy so df1 keeps the original string labels.
data1 = df1.copy()
# Drop the first six characters of each label (the 'Class_' prefix, see the mapping
# below) and shift the remaining number so the integer labels start at 0.
data1['target'] = data1['target'].str[6:].astype(int) - 1

# dic = {'Class_1':0, 'Class_2':1, 'Class_3':2, 'Class_4':3, 'Class_5':4, 'Class_6':5, 'Class_7':6, 'Class_8':7, 'Class_9':8 }
# data1['target'].replace(dic, inplace=True)
display(df1['target'], data1['target'])

In [None]:
# Training feature matrix: everything except the row id and the label.
X = data1.drop(['id', 'target'], axis=1)
display(X)

In [None]:
# Integer-encoded training labels.
y = data1['target']
display(y)

In [None]:
# Test feature matrix: a copy of the test frame without its id column.
data2 = df2.copy()
XX = data2.drop('id', axis=1)
display(XX)

<div class="alert alert-success">  
</div>

## Split

In [None]:
# Hold out half of the labelled rows for validation; the fixed random_state makes
# the partition reproducible.
train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.50, random_state=123) 
# Save the validation half to the working directory (index column omitted).
val_X.to_csv("val_X.csv",index=False)
val_y.to_csv("val_y.csv",index=False)

## Scaling

In [None]:
# Min-max scale every training feature column with mlxtend's minmax_scaling.
# NOTE(review): the min/max are presumably taken from X itself (default target
# range 0-1) - confirm against the mlxtend documentation.
X_scaled = minmax_scaling(X, columns=X.columns)
# display(X_scaled)

In [None]:
# Split the scaled features with the same test_size and random_state as the unscaled
# split, so the row partition should match train_X / val_X.
train_Xs, val_Xs, train_ys, val_ys = train_test_split(X_scaled, y, test_size=0.50, random_state=123) 

In [None]:
# Scale the test features with the TRAINING frame's per-column min and max, so a given
# raw value maps to the same scaled value in X_scaled and in XXs. Scaling the test
# frame by its own range would put the two sets on different scales whenever their
# column extremes differ.
# NOTE: test values outside the training range fall outside [0, 1]; a column that is
# constant in X has a zero range and yields NaN/inf.
XXs = (XX.astype(float) - X.min()) / (X.max() - X.min())
# display(XXs)

## Features

In [None]:
# Inputs for the mutual-information analysis: original string labels as the target,
# every column except id/target as predictors.
yf = df1['target']
Xf = df1.drop(['id', 'target'], axis=1)

# Replace each text column by the integer codes from factorize()
for colname in Xf.select_dtypes(include="object"):
    Xf[colname], _ = Xf[colname].factorize()

# Boolean flag per column: True where the dtype is integer, i.e. the feature is
# treated as discrete (double-check this before using MI!)
discrete_features = Xf.dtypes == int

In [None]:
def make_mi_scores(X, y, discrete_features):
    """Score every column of X by its mutual information with y.

    Returns a Series named "MI Scores", indexed by the column names of X and
    sorted from the highest score to the lowest. The estimator is seeded
    (random_state=123) so repeated calls give the same scores.
    """
    raw_scores = mutual_info_classif(X, y, discrete_features=discrete_features, random_state=123)
    labelled = pd.Series(raw_scores, index=X.columns, name="MI Scores")
    return labelled.sort_values(ascending=False)

mi_scores = make_mi_scores(X=Xf, y=yf, discrete_features=discrete_features)
display(mi_scores)

In [None]:
def plot_mi_scores(scores):
    """Draw the scores as horizontal bars on the current figure.

    Bars are placed in ascending score order and labelled with the index of
    `scores`.
    """
    ordered = scores.sort_values(ascending=True)
    positions = np.arange(len(ordered))
    plt.barh(positions, ordered)
    plt.yticks(positions, list(ordered.index))
    plt.title("Mutual Information Scores")

# The 15 highest-scoring features.
plt.figure(figsize=(10, 4), dpi=100, facecolor='lightgray')
plot_mi_scores(mi_scores.head(15))

In [None]:
# The 15 lowest-scoring features.
plt.figure(figsize=(4, 4), dpi=100, facecolor='lightgray')
plot_mi_scores(mi_scores.tail(15))

In [None]:
def drop_uninformative(df, mi_scores, threshold=0.003):
    """Keep only the columns of `df` whose score is strictly above `threshold`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are filtered.
    mi_scores : pandas.Series
        One score per column of `df`, indexed by column name. The order of the
        index does not have to match the column order: the boolean mask is
        aligned to the columns by label.
    threshold : float, default 0.003
        Columns with a score less than or equal to this value are dropped.

    Returns
    -------
    pandas.DataFrame
        The surviving columns, in the column order of `df`.
    """
    return df.loc[:, mi_scores > threshold]

# Training features that survive the mutual-information filter.
Xff = drop_uninformative(df=Xf, mi_scores=mi_scores)
# display(Xff)

In [None]:
# 50/50 split of the feature-selected frame, using the same seed as the other splits.
train_Xf, val_Xf, train_yf, val_yf = train_test_split(Xff, yf, test_size=0.50, random_state=123) 

In [None]:
# Select exactly the columns kept in Xff so the test frame always matches the
# training frame, instead of hard-coding the name of the dropped feature (which goes
# stale if the MI scores or the threshold change).
XXf = XX.loc[:, Xff.columns]
# display(XXf)

<div class="alert alert-success">  
</div>

## HistGradientBoostingClassifier

### Validation Model

In [None]:
# Validation model: fitted on the training half only, scored on the held-out half.
# validation_fraction=None means no internal hold-out is carved out of the training
# data. NOTE(review): per the scikit-learn docs, any early stopping would then be
# evaluated on the training data itself - confirm for the installed version.
model1v = HistGradientBoostingClassifier(max_iter=250,
                                         validation_fraction=None, 
                                         learning_rate=0.01, 
                                         max_depth=10, 
                                         min_samples_leaf=24, 
                                         max_leaf_nodes=60,
                                         random_state=123)

model1v.fit(train_X, train_y)
# Class probabilities for the validation half; the cell output is their log-loss.
oof_pred1 = model1v.predict_proba(val_X)
log_loss(val_y, oof_pred1)

In [None]:
# Hard class predictions for the validation half, scored by plain accuracy.
predictions1 = model1v.predict(val_X)
accuracy1 = accuracy_score(y_true=val_y, y_pred=predictions1)
display(accuracy1)

### Model 1

In [None]:
# Final model: same hyper-parameters as the validation model (plus verbose output),
# refitted on all labelled rows and used to predict the test set.
model1 = HistGradientBoostingClassifier(max_iter=250,
                                        validation_fraction=None, 
                                        learning_rate=0.01, 
                                        max_depth=10, 
                                        min_samples_leaf=24, 
                                        max_leaf_nodes=60,
                                        random_state=123,
                                        verbose=1)

model1.fit(X, y)
# Class probabilities for every test row.
pred1 = model1.predict_proba(XX)
display(pred1, pred1.shape) 

In [None]:
# Start the first submission from an independent copy of the sample template.
sub1 = sam.copy(deep=True)
display(sub1)

In [None]:
# Write the predicted probabilities into every column after the first (id) column.
# Assign the ndarray itself: `pred1.data` is a raw memoryview of the array's buffer,
# which pandas can only consume through a slow object-conversion fallback.
assert pred1.shape == (len(sub1), sub1.shape[1] - 1), "prediction shape does not match the submission template"
sub1.iloc[:, 1:] = pred1
display(sub1)

# sub1['Class_1'] = pred1[:,0] 
# sub1['Class_2'] = pred1[:,1] 
# sub1['Class_3'] = pred1[:,2] 
# sub1['Class_4'] = pred1[:,3] 
# sub1['Class_5'] = pred1[:,4] 
# sub1['Class_6'] = pred1[:,5] 
# sub1['Class_7'] = pred1[:,6] 
# sub1['Class_8'] = pred1[:,7] 
# sub1['Class_9'] = pred1[:,8] 

In [None]:
# Save the first submission to the working directory (index column omitted),
# then list the directory contents.
sub1.to_csv("submission1.csv",index=False)
# Public Score: 1.75770
!ls

<div class="alert alert-success">  
</div>

## CatBoostClassifier

### Validation Model

In [None]:
# Validation model: multi-class CatBoost fitted on the training half only.
# task_type='GPU' requests GPU training, so this cell needs a GPU-enabled runtime.
model2v = CatBoostClassifier(depth=8,
                             iterations=1000,
                             learning_rate=0.02,                            
                             eval_metric='MultiClass',
                             loss_function='MultiClass', 
                             bootstrap_type= 'Bernoulli',
                             leaf_estimation_method='Gradient',
                             random_state=123,
                             task_type='GPU')                        

# verbose=100 is CatBoost's logging period (progress every 100 iterations).
model2v.fit(train_X, train_y, verbose=100)
# Class probabilities for the validation half; the cell output is their log-loss.
oof_pred2 = model2v.predict_proba(val_X)
log_loss(val_y, oof_pred2)

In [None]:
# Hard class predictions for the validation half, scored by plain accuracy.
predictions2 = model2v.predict(val_X)
accuracy2 = accuracy_score(y_true=val_y, y_pred=predictions2)
display(accuracy2)

In [None]:
# Feature importance values of the fitted validation CatBoost model (cell output).
model2v.feature_importances_

### Model 2

In [None]:
# Final model: same hyper-parameters as the validation model, refitted on all
# labelled rows and used to predict the test set. Requires a GPU-enabled runtime.
model2 = CatBoostClassifier(depth=8,
                            iterations=1000,
                            learning_rate=0.02,                            
                            eval_metric='MultiClass',
                            loss_function='MultiClass', 
                            bootstrap_type= 'Bernoulli',
                            leaf_estimation_method='Gradient',
                            random_state=123,
                            task_type='GPU')   

model2.fit(X, y, verbose=100)
# Class probabilities for every test row.
pred2 = model2.predict_proba(XX)
display(pred2, pred2.shape) 

In [None]:
# Start the second submission from an independent copy of the sample template.
sub2 = sam.copy(deep=True)
# display(sub2)

In [None]:
# Write the predicted probabilities into every column after the first (id) column.
# Assign the ndarray itself: `pred2.data` is a raw memoryview of the array's buffer,
# which pandas can only consume through a slow object-conversion fallback.
assert pred2.shape == (len(sub2), sub2.shape[1] - 1), "prediction shape does not match the submission template"
sub2.iloc[:, 1:] = pred2
display(sub2)

In [None]:
# Save the second submission to the working directory (index column omitted),
# then list the directory contents.
sub2.to_csv("submission2.csv",index=False)
# Public Score: 1.75011
!ls

<div class="alert alert-success">  
</div>

## Neural Networks

Thanks to: @pourchot https://www.kaggle.com/pourchot/blending-neural-networks-weights-optimization

### Model 3

In [None]:
# Independent copies for the neural-network section, so df1/df2 stay untouched.
train = df1.copy()
test = df2.copy()
# Sample submission re-indexed by its `id` column.
submission = sam.set_index('id')
# One-hot encoding of the string class labels: one indicator column per distinct label.
targets = pd.get_dummies(df1['target'])

In [None]:
def custom_metric(y_true, y_pred):
    """Categorical cross-entropy computed on clipped predictions.

    Predictions are clipped to [1e-15, 1 - 1e-15] before being passed to the
    module-level `cce` loss object; the mean of its result is returned.
    """
    clipped = K.clip(y_pred, 1e-15, 1-1e-15)
    return K.mean(cce(y_true, clipped))

# Loss object used inside custom_metric.
cce = tf.keras.losses.CategoricalCrossentropy()

# Stop training once val_loss has not improved by at least 1e-05 for 8 epochs,
# and restore the weights of the best epoch.
es = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', min_delta=1e-05, patience=8, verbose=0,
    mode='min', baseline=None, restore_best_weights=True)

# Multiply the learning rate by 0.7 whenever val_loss stalls for 2 epochs.
plateau = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss', factor=0.7, patience=2, verbose=0)

In [None]:
def get_model(n_features=75, vocab_size=360, embed_dim=8, n_classes=9):
    """Build the (uncompiled) embedding-based network used for cross-validation.

    Every input value is looked up in a shared embedding table, the embeddings
    are flattened, and three weight-normalised dense layers follow. The second
    and third dense layers also receive the flattened embeddings (and earlier
    hidden outputs) through concatenation. The last layer is a softmax.

    Parameters
    ----------
    n_features : int, default 75
        Number of input columns.
    vocab_size : int, default 360
        Size of the embedding table. Inputs are used as indices into this
        table, so they are expected to be integers below this value.
    embed_dim : int, default 8
        Width of each embedding vector.
    n_classes : int, default 9
        Number of units in the softmax output layer.

    Returns
    -------
    keras.Model
        Model named "res_nn_model". Calling the function without arguments
        gives the same architecture as before the parameters were added.
    """
    inputs = layers.Input(shape=(n_features,))

    # One embed_dim-wide vector per input column, flattened into one long vector.
    embed = layers.Embedding(vocab_size, embed_dim)(inputs)
    embed = layers.Flatten()(embed)

    hidden = layers.Dropout(0.2)(embed)
    hidden = tfa.layers.WeightNormalization(
        layers.Dense(units=32, activation='selu', kernel_initializer="lecun_normal"))(hidden)

    # Skip connections: later layers see the embeddings again, next to earlier outputs.
    output = layers.Dropout(0.2)(layers.Concatenate()([embed, hidden]))
    output = tfa.layers.WeightNormalization(layers.Dense(units=32, activation='relu'))(output)

    output = layers.Dropout(0.3)(layers.Concatenate()([embed, hidden, output]))
    output = tfa.layers.WeightNormalization(layers.Dense(units=32, activation='elu'))(output)
    output = layers.Dense(n_classes, activation='softmax')(output)

    model = keras.Model(inputs=inputs, outputs=output, name="res_nn_model")

    return model

In [None]:
# Cross-validation settings.
EPOCH, SEED, N_FOLDS = 50, 123, 10

# Per-fold prediction arrays collected by the training loop.
NN_a_train_preds, NN_a_test_preds, NN_a_oof_pred3 = [], [], []

# Accumulators: out-of-fold predictions for every training row and the
# fold-averaged predictions for every test row (9 class columns each).
oof_NN_a = np.zeros((len(train), 9))
pred_NN_a = np.zeros((len(test), 9))

skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

In [None]:
# Stratified K-fold training of the neural network.
# `train` columns are [id, features..., target], hence the 1:-1 slice; `test` columns
# are [id, features...], hence the 1: slice. Folds are stratified on the target column.
for fold, (tr_idx, ts_idx) in enumerate(skf.split(train, train.iloc[:, -1])):

    X_train = train.iloc[:, 1:-1].iloc[tr_idx]
    y_train = targets.iloc[tr_idx]
    X_test = train.iloc[:, 1:-1].iloc[ts_idx]
    y_test = targets.iloc[ts_idx]

    # Drop the previous fold's graph/state before building a fresh model.
    K.clear_session()

    model_attention = get_model()

    model_attention.compile(loss='categorical_crossentropy',
                            optimizer=keras.optimizers.Adam(learning_rate=2e-4),
                            metrics=[custom_metric])

    model_attention.fit(X_train, y_train,
                        batch_size=256, epochs=EPOCH,
                        validation_data=(X_test, y_test),
                        callbacks=[es, plateau],
                        verbose=0)

    # Out-of-fold predictions for the rows this fold's model has not been fitted on.
    pred_a = model_attention.predict(X_test)
    oof_NN_a[ts_idx] += pred_a
    score_NN_a = log_loss(y_test, pred_a)
    print(f"\nFOLD {fold} Score NN Attention model: {score_NN_a}")

    # Predict the test set once per fold and reuse the array for both the running
    # average and the per-fold list (previously the test set was predicted twice).
    test_pred_a = model_attention.predict(test.iloc[:, 1:])
    pred_NN_a += test_pred_a / N_FOLDS

    NN_a_train_preds.append(oof_NN_a[ts_idx])
    # NOTE: val_X rows belong to `train`, so most of them were part of this fold's
    # fitting data; these predictions are not a clean hold-out estimate.
    NN_a_oof_pred3.append(model_attention.predict(val_X))
    NN_a_test_preds.append(test_pred_a)

# Overall out-of-fold log-loss across all training rows.
score_a = log_loss(targets, oof_NN_a)
print('=' * 60)
print(f"\n===== FINAL SCORE ATTENTION MODEL : {score_a} =====\n")  
print('=' * 60)
display(oof_NN_a, oof_NN_a.shape)

In [None]:
# Validation predictions for the rows of val_X, taken from the out-of-fold matrix.
# Every row of oof_NN_a was predicted by the one fold model that was NOT fitted on
# that row, so this log-loss is leak-free. Averaging the ten fold models' predictions
# on val_X (the previous approach) scored each row mostly with models that had been
# fitted on it, which makes the number look better than it is.
# Assumes the index labels of val_X are row positions in `train` (default RangeIndex
# from read_csv, carried through the copies and the split).
oof_pred3 = oof_NN_a[val_X.index.to_numpy()]
log_loss(val_y, oof_pred3)

In [None]:
# Average the per-fold test predictions: the stacked array is divided by N_FOLDS and
# the built-in sum then adds the fold slices element-wise.
pred3 = sum(np.array(NN_a_test_preds)/N_FOLDS)
display(pred3, pred3.shape)

In [None]:
# Start the third submission from an independent copy of the sample template.
sub3 = sam.copy(deep=True)
# display(sub3)

In [None]:
# Write the predicted probabilities into every column after the first (id) column.
# Assign the ndarray itself: `pred3.data` is a raw memoryview of the array's buffer,
# which pandas can only consume through a slow object-conversion fallback.
assert pred3.shape == (len(sub3), sub3.shape[1] - 1), "prediction shape does not match the submission template"
sub3.iloc[:, 1:] = pred3
display(sub3)

In [None]:
# Save the third submission to the working directory (index column omitted),
# then list the directory contents.
sub3.to_csv("submission3.csv",index=False)
# Public Score: 1.74587
!ls

<div class="alert alert-success">  
</div>

## Ensembling

In [None]:
def generate(main, support, coeff):
    """Blend two submission frames column by column.

    For every column except the first one, the result is
    ``coeff * main + (1 - coeff) * support``. Rows are paired by position, not
    by index label, so both frames must list their rows in the same order.

    Parameters
    ----------
    main : pandas.DataFrame
        Frame weighted by ``coeff``. Its first column and its index are carried
        over unchanged.
    support : pandas.DataFrame
        Frame weighted by ``1 - coeff``. It must contain every blended column
        of ``main`` and at least as many rows; surplus rows are ignored.
    coeff : float
        Weight given to ``main``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``main`` with the blended columns replaced; ``main`` itself
        is not modified.

    Raises
    ------
    IndexError
        If ``support`` has fewer rows than ``main``.
    """
    n_rows = len(main)
    blended = main.copy()

    for column in main.columns[1:]:
        main_values = main[column].to_numpy()
        # Positional pairing: use only the first n_rows of the support column.
        support_values = support[column].to_numpy()[:n_rows]
        if len(support_values) < n_rows:
            raise IndexError(
                f"support has {len(support_values)} rows but main has {n_rows}")
        blended[column] = (main_values * coeff) + (support_values * (1. - coeff))

    return blended

Thanks to: @oxzplvifi https://www.kaggle.com/oxzplvifi/tabular-residual-network

In [None]:
# Load a submission file from the attached `tabular-residual-network` input and
# re-save it in the working directory under a numbered name.
sub4 = pd.read_csv('../input/tabular-residual-network/submission.csv')
sub4.to_csv("submission4.csv",index=False)
# Public Score: 1.74522
# display(sub4)

Thanks to: @bhavikjain https://www.kaggle.com/bhavikjain/tps-june-21-eda-models

In [None]:
# Load a submission file from the attached `tps-june-21-eda-models` input and
# re-save it in the working directory under a numbered name.
sub5 = pd.read_csv('../input/tps-june-21-eda-models/Sol.csv')
sub5.to_csv("submission5.csv",index=False)
# Public Score: 1.74456
# display(sub5)

Thanks to: @fusioncenter https://www.kaggle.com/fusioncenter/residual-network-for-tabular-data

In [None]:
# Load a submission file from the attached `tps6-74442` input and re-save it in the
# working directory under a numbered name.
sub6 = pd.read_csv('../input/tps6-74442/TPS6_74442.csv')
sub6.to_csv("submission6.csv",index=False)
# Public Score: 1.74442
# display(sub6)

In [None]:
# Chain of pairwise blends. Each call keeps `coeff` of its first argument and
# `1 - coeff` of the running blend, so the chain expands to
#   0.55*sub6 + 0.45*(0.85*sub5 + 0.15*(0.85*sub4 + 0.15*(0.85*sub3
#                                        + 0.15*(0.85*sub2 + 0.15*sub1))))
# i.e. sub6 and sub5 together carry about 93% of the final weight (0.55 + 0.3825).
# generate() pairs rows by position, so all six frames must share the same row order.
sub = generate(sub2, sub1, 0.85)

sub = generate(sub3, sub , 0.85)

sub = generate(sub4, sub , 0.85)

sub = generate(sub5, sub , 0.85)

sub = generate(sub6, sub , 0.55)

display(sub)

In [None]:
# Save the final blended submission to the working directory (index column omitted),
# then list the directory contents.
sub.to_csv("submission.csv",index=False)
# Public Score: 
!ls

<div class="alert alert-success">
    <h1 align="center">If you find this work useful, please don't forget to upvote :)</h1>
</div>