# Tabular Playground Series - Jun 2021

1. ANALYZE DATA
2. FASTAI
3. EXTRACT FEATURES FROM THE FASTAI PIPELINE
4. TRAIN XGBOOST
5. TRAIN CATBOOST - with fastai-processed features
6. TRAIN CATBOOST - without fastai processing
7. ENSEMBLE

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
%matplotlib inline
import torch
from catboost import CatBoostClassifier, Pool
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold # For creating folds
from sklearn.metrics import log_loss # Evaluation metrics
import random

In [None]:
# Load the competition tables from the Kaggle input directory:
# the labelled training set, the unlabelled test set, and the sample
# submission that is reused below as the template for every output file.
train_df = pd.read_csv("/kaggle/input/tabular-playground-series-jun-2021/train.csv")
test_df = pd.read_csv("/kaggle/input/tabular-playground-series-jun-2021/test.csv")
ss = pd.read_csv("/kaggle/input/tabular-playground-series-jun-2021/sample_submission.csv")

### Data Visualization

In [None]:
# Names of the 75 model input columns: "feature_0" ... "feature_74".
feature_cols = list(map("feature_{}".format, range(75)))

In [None]:
# Grid of per-feature KDE plots, one curve per target class.
num_cols = 5
num_rows = -(-len(feature_cols) // num_cols)  # ceiling division

class_labels = [f"Class_{k}" for k in range(1, 10)]
# One distinct colour per class so that no two curves share a colour.
palette = sns.color_palette("tab10", n_colors=len(class_labels))

# Filter the rows of each class once instead of once per feature.
rows_by_class = {label: train_df.loc[train_df['target'] == label] for label in class_labels}

# squeeze=False keeps `axes` two-dimensional even for a single row.
f, axes = plt.subplots(nrows=num_rows, ncols=num_cols, figsize=(20, 30), squeeze=False)
for index, col in enumerate(feature_cols):
    ax = axes[index // num_cols, index % num_cols]
    for label, color in zip(class_labels, palette):
        sns.kdeplot(rows_by_class[label][col], color=color, label=label, ax=ax)

# A single legend is enough: every subplot uses the same colour mapping.
axes[0, 0].legend(fontsize='small')

# Figure-level title (plt.title would only label the last subplot).
f.suptitle('Distribution of Target')
# Reserve a strip at the top so the title does not overlap the first row.
plt.tight_layout(rect=(0, 0, 1, 0.98))
plt.show()

In [None]:
# Display the (rows, columns) shape of the train and test frames.
train_df.shape,test_df.shape

In [None]:
# Horizontal bar chart of how many training rows fall into each target class.
class_counts = train_df['target'].value_counts()
class_counts.plot.barh(color="gray")

### FAST AI

In [None]:
from fastai.tabular.all import *

In [None]:
# Split the features by cardinality: a column with fewer than 22 distinct
# values is treated as categorical, everything else as continuous.
cat_vars, cont_vars = [], []

for name in feature_cols:
    bucket = cat_vars if train_df[name].nunique() < 22 else cont_vars
    bucket.append(name)

len(cat_vars)

In [None]:
# Adapted from https://www.kaggle.com/soerendip/fastai

# Random train/validation split over the row positions of train_df,
# holding out 20% of the rows for validation.
splits = RandomSplitter(valid_pct=0.2)(range_of(train_df))

# Wrap the training frame for fastai: "target" is the label, the feature
# columns are divided into the categorical / continuous lists built above,
# and the listed procs are applied as preprocessing.
to = TabularPandas(
    train_df,
    y_names="target",
    cat_names = cat_vars,
    cont_names = cont_vars,
    procs = [Categorify, FillMissing, Normalize],
    splits=splits
)

# Convert it to dataloaders with the batch size below.
batch_size = 4096
dls = to.dataloaders(bs=batch_size)

In [None]:
# Embedding width per categorical column: half its number of distinct
# values in the training data, capped at 100.
cat_cardinalities = train_df[cat_vars].nunique()
categories = cat_cardinalities.keys().to_list()
cardinalities = cat_cardinalities.values

emb_szs = {}
for cat, card in zip(categories, cardinalities):
    emb_szs[cat] = min(100, card//2)
emb_szs

In [None]:
# Tabular network with hidden layers of 250 and 100 units, using the custom
# embedding sizes computed above and accuracy as the reported metric.
# NOTE(review): `ps` is presumably the per-layer dropout probability and
# `wd` the weight decay -- confirm against the fastai version in use.
config = tabular_config(ps=[0.001,0.01])
learn = tabular_learner(dls, emb_szs=emb_szs,  wd=5e-1, layers=[250,100],
                        config=config, metrics=accuracy)

In [None]:
# Run the learner's learning-rate finder.
learn.lr_find()

In [None]:
# Train for 5 epochs with the one-cycle schedule.
# NOTE(review): the learning-rate literal was presumably copied from an
# earlier lr_find() run -- it is not recomputed here.
learn.fit_one_cycle(5, 0.02089296132326126)

In [None]:
# Predict class probabilities for the test set with the fastai learner and
# write them into the sample-submission template.
dl = learn.dls.test_dl(test_df)
pred = learn.get_preds(dl=dl)

# Every column after the first (the id) receives the probabilities.
# .iloc is used because this is a positional selection; an integer slice
# through .loc on string column labels depends on a pandas fallback.
ss.iloc[:, 1:] = pred[0].numpy()
ss.to_csv('submission_fastai.csv', index=False)

### Extract Embeddings

In [None]:
# Pull the feature columns and the target out of the fastai train and
# validation datasets so they can be fed to the tree models below.
# NOTE(review): these look like the TabularPandas-processed columns, not the
# learned embedding vectors the section title suggests -- confirm intent.
X_train = pd.DataFrame(dls.train_ds[feature_cols])
y_train = dls.train_ds['target']

X_valid = pd.DataFrame(dls.valid_ds[feature_cols])
y_valid = dls.valid_ds['target']

In [None]:
# Display the shapes of the train and validation feature frames.
X_train.shape,X_valid.shape

In [None]:
# Full label vector: training labels followed by validation labels, i.e. the
# same row order used when X_train and X_valid are concatenated for CatBoost.
target = pd.concat([y_train, y_valid])

In [None]:
# Test-set feature columns taken from the fastai test dataloader's dataset,
# followed by a display of their shape.
X_test = pd.DataFrame(dl.dataset[:])[feature_cols]
X_test.shape

### XGBoost

In [None]:
from xgboost import XGBClassifier

# Multi-class gradient-boosted trees for the 9-class target.
# - 'multi:softprob' makes the model emit per-class probabilities, which the
#   submission needs via predict_proba.
# - 'mlogloss' is the multi-class log-loss metric ('logloss' is binary).
# - num_class is not passed: the scikit-learn wrapper derives it from the
#   labels given to fit().
# - The former `nrounds='min.error.idx'` and `maximize` arguments were
#   dropped; they are not XGBClassifier parameters.
clf = XGBClassifier(
    objective='multi:softprob',
    booster='gbtree',
    eval_metric='mlogloss',
    learning_rate=0.1,
    max_depth=14,
    colsample_bytree=0.4,
    n_jobs=-1,
)

In [None]:
# Fit on the fastai training split only; the validation split stays held out.
clf.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score
# Accuracy of the XGBoost model on the held-out validation split.
acc = accuracy_score(y_valid, clf.predict(X_valid))
acc

In [None]:
# Class probabilities for the test set from the XGBoost model.
pred = clf.predict_proba(X_test)

# Fill every column after the id column positionally (.iloc); an integer
# slice through .loc on string column labels depends on a pandas fallback.
ss.iloc[:, 1:] = pred
ss.to_csv('submission_XGBoost.csv', index=False)  # 1.78111

### CatBoost 1 (fastai-processed features)

In [None]:
# Positional index, within train_df's columns, of every categorical feature.
train_columns = train_df.columns
categorical_features_indices = [
    np.where(train_columns == col)[0][0] for col in cat_vars
]

# Print the dtype of each categorical column as a quick sanity check.
for col in cat_vars:
    print(train_df[col].dtype)
categorical_features_indices

In [None]:
# Re-assemble the full training matrix (train rows first, then validation
# rows) to match the row order of `target`, and alias the train/test frames
# used by the CatBoost cells below.
train_embed = pd.concat([X_train, X_valid])
df_train = train_embed
df_test = X_test
# CatBoost Pool pairing the feature columns with the labels.
train_data = Pool(df_train[feature_cols],  target)

In [None]:
from sklearn.metrics import accuracy_score
from catboost import cv
import optuna

def objective(trial, model, train_df, y):
    """Optuna objective: cross-validated multi-class loss of a CatBoost model.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.
        model: Unused; kept so existing callers keep working.
        train_df: DataFrame containing the ``feature_cols`` columns.
        y: Labels aligned with ``train_df``.

    Returns:
        float: The lowest mean test-fold loss reached over the boosting
        rounds of a 5-fold stratified CatBoost cross-validation.
    """
    train_data = Pool(train_df[feature_cols], label=y)
    hparams = {
        'loss_function': 'MultiClass',
        'grow_policy': 'Lossguide',

        'learning_rate': trial.suggest_float('learning_rate', 0.005, 1),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 0.001, 3.0),
        'max_bin': trial.suggest_int('max_bin', 150, 250),
        # Single 'bootstrap_type' entry: the former fixed 'Poisson' value was
        # a duplicate key that this sampled value silently replaced.
        'bootstrap_type': trial.suggest_categorical(
            'bootstrap_type', ['Bayesian', 'Bernoulli', 'MVS']
        ),
        'max_depth': trial.suggest_int('max_depth', 5, 12),
        'leaf_estimation_method': 'Gradient',
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 50, 500),
        # NOTE(review): cv() below is also given num_boost_round=2500;
        # confirm which of the two iteration counts CatBoost applies.
        'iterations': 10000,
        'random_state': 2021,
        'task_type': 'GPU',
    }

    # Sampling-strength parameters only exist for some bootstrap types.
    if hparams['bootstrap_type'] == 'Bayesian':
        hparams['bagging_temperature'] = trial.suggest_float('bagging_temperature', 0, 60)
    elif hparams['bootstrap_type'] == 'Bernoulli':
        hparams['subsample'] = trial.suggest_float('subsample', 0.1, 1)

    k = 5
    cv_results = cv(
        params=hparams,
        dtrain=train_data,
        num_boost_round=2500,
        nfold=k,
        stratified=True,
        early_stopping_rounds=100,
        verbose_eval=False,
    )

    # cv() returns one row per boosting round. A single-objective study
    # needs one scalar, so report the best mean test loss instead of the
    # column-wise mean of the whole table (which is a Series).
    test_loss_column = f"test-{hparams['loss_function']}-mean"
    return float(cv_results[test_loss_column].min())


In [None]:
# Hyper-parameter search: minimise the cross-validated loss returned by
# objective(), capped at 100 trials and a 600-second timeout.
study_catboost = optuna.create_study(direction = 'minimize')
study_catboost.optimize(lambda trial: objective(trial, 'catboost',
                                                df_train[feature_cols], target),
                        n_trials = 100, timeout = 600)

In [None]:
# Display the best hyper-parameters found by the study.
study_catboost.best_params

In [None]:
# Fixed CatBoost configuration shared by both cross-validated CatBoost runs.
# NOTE(review): the tuned values were presumably copied by hand from an
# Optuna run -- they are not read from `study_catboost` here.
param_cb = dict(
    max_depth=8,
    l2_leaf_reg=2.9203995625107293,
    bagging_temperature=32.30584785106606,
    loss_function='MultiClass',
    grow_policy='Lossguide',
    bootstrap_type='Bayesian',
    learning_rate=0.27478567078925825,
    max_bin=227,
    min_data_in_leaf=333,
    task_type='GPU',
    iterations=10000,
    random_state=2021,
)

In [None]:
%%time
from sklearn.metrics import accuracy_score,confusion_matrix,roc_auc_score
from catboost import CatBoostClassifier



# 5-fold stratified CatBoost run on the fastai-derived features.
# oofcat holds the out-of-fold class probabilities for every training row;
# predictionscat holds the fold-averaged test probabilities (9 classes).
oofcat         = np.zeros((df_train.shape[0], 9))
predictionscat = np.zeros((df_test.shape[0], 9))

from sklearn.model_selection import KFold,StratifiedKFold
fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2021)

train_features = df_train[feature_cols]
test_features = df_test[feature_cols].values

for dev_index, val_index in fold.split(train_features, target):
    # split() yields POSITIONAL row numbers. df_train and target keep the
    # index inherited from the fastai split (presumably not 0..n-1 in
    # order), so rows must be taken with .iloc. Label-based selection would
    # pick different rows than the stratified split chose and would no
    # longer line up with the positional write into oofcat below.
    dev_X, val_X = train_features.iloc[dev_index], train_features.iloc[val_index]
    dev_y, val_y = target.iloc[dev_index], target.iloc[val_index]

    m = CatBoostClassifier(**param_cb)
    m.fit(dev_X, dev_y, eval_set=[(val_X, val_y)], early_stopping_rounds=100, verbose=100)

    oofcat[val_index] = m.predict_proba(val_X.values)
    predictionscat += m.predict_proba(test_features)

# Average the accumulated test predictions over the folds actually run.
predictionscat = predictionscat / fold.get_n_splits()

In [None]:
# Sanity check: display the shapes of the OOF / test prediction arrays next
# to the shapes of the data they were produced from.
oofcat.shape,predictionscat.shape,df_train[feature_cols].shape,target.shape,df_test[feature_cols].shape

In [None]:
# Copy the nine averaged CatBoost probability columns into the submission
# template (column k of the array -> "Class_{k+1}") and save it.
cat_probs = pd.DataFrame(predictionscat)
for class_pos in range(9):
    ss[f"Class_{class_pos + 1}"] = cat_probs[class_pos]
ss.to_csv("submission.csv", index=False) #1.75322

### CatBoost 2 (raw features)

In [None]:
# Reload the original CSVs so the second CatBoost run starts from the
# unprocessed competition data and a clean submission template.
train_df = pd.read_csv("/kaggle/input/tabular-playground-series-jun-2021/train.csv")
test_df = pd.read_csv("/kaggle/input/tabular-playground-series-jun-2021/test.csv")
ss = pd.read_csv("/kaggle/input/tabular-playground-series-jun-2021/sample_submission.csv")

In [None]:
# Label column of the reloaded training frame.
y = train_df['target']

In [None]:
%%time
from sklearn.metrics import accuracy_score,confusion_matrix,roc_auc_score
from catboost import CatBoostClassifier


# categorical_features_indices = np.where(train_df.dtypes =='category')[0]
# categorical_features_indices
# 5-fold stratified CatBoost run on the raw feature columns.
# oofcat receives the out-of-fold class probabilities; predictionscat_p
# accumulates, then averages, the test-set probabilities (9 classes).
oofcat = np.zeros((train_df.shape[0], 9))
predictionscat_p = np.zeros((test_df.shape[0], 9))

from sklearn.model_selection import KFold,StratifiedKFold
fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2021)
i = 1

raw_train = train_df[feature_cols]
raw_test = test_df[feature_cols].values

for dev_index, val_index in fold.split(raw_train, y):
    dev_X = raw_train.loc[dev_index, :]
    val_X = raw_train.loc[val_index, :]
    dev_y = y[dev_index]
    val_y = y[val_index]

    m = CatBoostClassifier(**param_cb)
    m.fit(dev_X, dev_y, eval_set=[(val_X, val_y)], early_stopping_rounds=100, verbose=100)

    oofcat[val_index] = m.predict_proba(val_X.values)
    predictionscat_p += m.predict_proba(raw_test)

# Mean over the five folds.
predictionscat_p = predictionscat_p / 5

In [None]:
# Copy the nine averaged raw-feature CatBoost probability columns into the
# submission template (column k of the array -> "Class_{k+1}") and save it.
raw_probs = pd.DataFrame(predictionscat_p)
for class_pos in range(9):
    ss[f"Class_{class_pos + 1}"] = raw_probs[class_pos]
ss.to_csv("submission_p.csv", index=False)

### Blend

In [None]:

# Weighted blend of the two CatBoost runs: 40% from the model trained on the
# fastai-derived features, 60% from the model trained on the raw features.
weight_fastai_features = 0.40
weight_raw_features = 0.60

# Blend all nine class columns in one vectorised step instead of rebuilding
# a DataFrame for every column.
blended = (pd.DataFrame(predictionscat) * weight_fastai_features
           + pd.DataFrame(predictionscat_p) * weight_raw_features)

# Column k of the blend corresponds to "Class_{k+1}".
for class_pos in range(9):
    ss[f'Class_{class_pos + 1}'] = blended[class_pos]
ss.to_csv('submission1.csv', index=False)

In [None]:
### finish

### Tab Net

In [None]:
# train_df = df
# test_df = test 
# feature_cols = cat_features
# y = train_df.target

In [None]:
# y = pd.DataFrame(train_df['target'])
# y = y['target'].map({
#       'Class_1':'0',
#       'Class_2':'1',
#       'Class_3':'2',
#       'Class_4':'3',
#       'Class_5':'4',
#       'Class_6':'5',
#       'Class_7':'6',
#       'Class_8':'7',
#       'Class_9':'8'})
# y = np.array(y).astype(int)
# y = y.ravel()

In [None]:
# !pip install -q /kaggle/input/pytorchtabnet/pytorch_tabnet-2.0.1-py3-none-any.whl

In [None]:
# from sklearn.model_selection import train_test_split, KFold
# from pytorch_tabnet.tab_model import TabNetClassifier

In [None]:
# df = pd.concat([train_df,test_df],axis=0)

In [None]:
# from sklearn.preprocessing import LabelEncoder
# categorical_columns = []
# categorical_dims =  {}
# for col in df.columns[df.dtypes == int]:
#     #print(col, train_df[col].nunique())
#     l_enc = LabelEncoder()    
#     df[col] = l_enc.fit_transform(df[col].values)
#     categorical_columns.append(col)
#     categorical_dims[col] = len(l_enc.classes_)

In [None]:
# train_df.shape,test_df.shape,df.shape

In [None]:
# train_df = df.loc[df.target.isnull() == False]
# test_df = df.loc[df.target.isnull() == False][feature_cols]

In [None]:
# cat_idxs = [ i for i, f in enumerate(feature_cols) if f in categorical_columns]

# cat_dims = [ categorical_dims[f] for i, f in enumerate(feature_cols) if f in categorical_columns]

In [None]:
# %%time
# from sklearn.metrics import accuracy_score
# scores = []
# ooftabnet = np.zeros((train_df.shape[0],9))
# predictionstabnet= np.zeros((test_df.shape[0],9))

# fold=StratifiedKFold(n_splits=5,shuffle=True,random_state=2021)
# i=1
# n_d = n_a = 50

# tabnet_params = dict(verbose=40)

# for dev_index, val_index in fold.split(train_df[feature_cols],y): 
#     #print(dev_index)
#     dev_X, val_X = train_df[feature_cols].loc[dev_index,:], train_df[feature_cols].loc[val_index,:]
#     dev_y, val_y = y[dev_index], y[val_index]
    
#     model = TabNetClassifier(
#     n_d=n_d,n_a=n_a, n_steps=5,
#     gamma=1.5, n_independent=2, n_shared=2,
#     cat_idxs=cat_idxs,
#     cat_dims=cat_dims,
#     cat_emb_dim=1,
#     lambda_sparse=1e-2, momentum=0.3, clip_value=2.,
#     optimizer_fn=torch.optim.Adam,
#     optimizer_params=dict(lr=2e-2),
#     scheduler_params = {"gamma": 0.95,
#                      "step_size": 10},
#     scheduler_fn=torch.optim.lr_scheduler.StepLR, epsilon=1e-15)
    
#     model.fit(
#       dev_X.values, dev_y,
#       eval_set=[(val_X.values, val_y)],
#       patience=100,
#       max_epochs=300,
#     )
#     scores.append(accuracy_score(val_y, model.predict(val_X.values)))
#     print("IN 1")
#     pred_val  = model.predict_proba(val_X.values)
#     print("IN 2")
#     pred_test = model.predict_proba(test_df[feature_cols].values)
      
#     ooftabnet[val_index] = pred_val
#     predictionstabnet += pred_test
    
# predictionstabnet /= 5.

In [None]:
# ss["Class_1"] = pd.DataFrame(predictionstabnet)[0]
# ss["Class_2"] = pd.DataFrame(predictionstabnet)[1]
# ss["Class_3"] = pd.DataFrame(predictionstabnet)[2]
# ss["Class_4"] = pd.DataFrame(predictionstabnet)[3]
# ss["Class_5"] = pd.DataFrame(predictionstabnet)[4]
# ss["Class_6"] = pd.DataFrame(predictionstabnet)[5]
# ss["Class_7"] = pd.DataFrame(predictionstabnet)[6]
# ss["Class_8"] = pd.DataFrame(predictionstabnet)[7]
# ss["Class_9"] = pd.DataFrame(predictionstabnet)[8]
# ss.to_csv("submission.csv", index=False)