### Tabular Playground Series - May 2021

## **Tabular Playground Series - May 2021**

1. Data Visualization
*     a. Feature Analysis
*     b. Feature Correlation Maps
2. Feature Selection
*     a. PyCaret - Category Features
3. Model Building
*     a. LGBM
*     b. Tabnet
*     c. CatBoost
4. Ensemble
*     a. Blend

In [None]:
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import torch
import category_encoders as ce
import lightgbm as lgb

from sklearn.model_selection import KFold,StratifiedKFold

In [None]:
# Load the competition files from the Kaggle input directory: the submission
# template, the labelled training table and the unlabelled test table.
sample_submission = pd.read_csv("/kaggle/input/tabular-playground-series-may-2021/sample_submission.csv")
train = pd.read_csv("/kaggle/input/tabular-playground-series-may-2021/train.csv")
test = pd.read_csv("/kaggle/input/tabular-playground-series-may-2021/test.csv")

In [None]:
# for col in train.columns:
#     print(col, " Missing Data Count: ",train[col].isnull().sum())

### Data Visualization

In [None]:
# Horizontal bar chart of the number of training rows per target class.
train['target'].value_counts().plot(kind = 'barh',color="gray")

In [None]:
# Display the (rows, columns) shape of the training and test tables.
train.shape,test.shape

In [None]:
#!pip install pycaret

In [None]:
#!pip show pycaret

In [None]:
# Preview the first rows of the training table.
train.head()

In [None]:
# All training columns whose name starts with 'feature_'; used as the model
# inputs throughout the rest of the notebook.
feature_cols = train.columns[train.columns.str.startswith('feature_')]

In [None]:
# Per-feature kernel density plots, one subplot per feature, with one curve
# per target class.
num_cols = 5
# Derive the number of rows from the feature count (ceil division) instead of
# hard-coding a 10 x 5 grid.
num_rows = -(-len(feature_cols) // num_cols)
class_colors = {
    'Class_1': "gray",
    'Class_2': "red",
    'Class_3': "black",
    'Class_4': "maroon",
}

# A single figure is created here; the previous extra plt.figure() call only
# produced an empty figure.
f, axes = plt.subplots(nrows=num_rows, ncols=num_cols,
                       figsize=(20, 3 * num_rows), squeeze=False)
for index, col in enumerate(feature_cols):
    ax = axes[index // num_cols, index % num_cols]
    for class_name, color in class_colors.items():
        sns.kdeplot(train.loc[train['target'] == class_name, col],
                    color=color, label=class_name, ax=ax)

# One legend is enough because every subplot uses the same colour coding.
axes[0, 0].legend()

# Hide any grid cells left over when the feature count is not a multiple of
# num_cols.
for unused_ax in axes.ravel()[len(feature_cols):]:
    unused_ax.set_visible(False)

# Title the whole figure; plt.title() would only label the last subplot.
f.suptitle('Distribution of Target')
f.tight_layout(rect=[0, 0, 1, 0.98])
plt.show()

In [None]:
# Correlation heat map of the numeric training columns.
# 'target' still holds string labels at this point, and DataFrame.corr()
# raises on non-numeric columns in pandas >= 2.0, so select the numeric
# columns explicitly (this matches what older pandas did implicitly).
numeric_train = train.select_dtypes(include="number")
sns.heatmap(numeric_train.corr(), annot=False, vmin=0, vmax=0.05, linewidths=0.3)
fig = plt.gcf()
fig.set_size_inches(25, 12)
plt.show()

In [None]:
# %%time
# from sklearn.manifold import TSNE
# tsne_model = TSNE(n_components=2)
# transformed = tsne_model.fit_transform(train[feature_cols])

In [None]:
# scatter = plt.scatter(transformed[:,0], transformed[:,1], c=train['target'],color=train.target,labels={'color': 'target'})
# handles, _ = scatter.legend_elements(prop='colors')
# #plt.legend(handles, labels)
# plt.show

In [None]:
# # # initialize setup
# from pycaret.classification import * 
# s = setup(train, target = 'target', log_experiment = True, experiment_name = 'ModelExperiment 1')

In [None]:
#Referenced from https://www.kaggle.com/pranjalverma08/tps-may-21-the-ensemble-approach-with-optuna

# Initial LightGBM parameter set for the 4-class problem.
# NOTE(review): per LightGBM's parameter documentation "subsample" /
# "bagging_fraction" and "colsample_bytree" / "feature_fraction" are aliases
# of each other; both spellings are kept here with their original values --
# confirm which one is intended.
params = {
    "objective": "multiclass",
    "num_class": 4,
    "boosting": "gbdt",
    # Plain string: a stray trailing comma previously turned this into the
    # 1-tuple ("multi_logloss",).
    "metric": "multi_logloss",
    "max_depth": 30,
    "min_child_samples": 77,
    "colsample_bytree": 0.18,
    "subsample": 0.013,
    "cat_l2": 10,
    "max_bin": 8,
    "min_data_per_group": 90,
    "lambda_l1": 8.80654952977217,
    "lambda_l2": 3.918745336956548e-06,
    "learning_rate": 0.001,
    "feature_fraction": 0.40025260416683445,
    "bagging_fraction": 0.7543739297812108,
    "bagging_freq": 1,
    "num_leaves": 11,
    "n_estimators": 1000,
    "nthread": 4,
    "verbosity": -1,
    "early_stopping_rounds": 500,
}
# Number of boosting rounds passed to lgb.train.
num_rounds = 1000

In [None]:
# Encode the string class labels as the integers 0-3.
# A vectorised map replaces the per-row Python loop (which also relied on
# chained train["target"][i] label lookups).
class_to_index = {"Class_1": 0, "Class_2": 1, "Class_3": 2, "Class_4": 3}
encoded_target = train["target"].map(class_to_index)
# Fail loudly on any label outside the four known classes instead of
# silently producing NaN targets.
if encoded_target.isnull().any():
    raise ValueError("train['target'] contains labels other than Class_1..Class_4")
train["target"] = encoded_target.astype("int64")

In [None]:
# Reference https://www.kaggle.com/udbhavpangotra/tps-may-21-extensive-eda-catboost-shap#Model---CATBOOST
# Drop training rows holding any of the listed values in the given feature,
# printing the row count before and after.
print(len(train))

# feature name -> values whose rows are removed from the training set
values_to_drop = {
    'feature_5': [10],
    'feature_6': [26, 27],
    'feature_7': [30, 31],
    'feature_9': [17],
    'feature_10': [16],
    'feature_11': [12],
    'feature_15': [20],
    'feature_16': [18],
    'feature_23': [18, 19],
    'feature_27': [29],
    'feature_28': [23],
    'feature_29': [13],
    'feature_33': [24],
    'feature_32': [26, 27],
    'feature_35': [43, -2, 38, 39],
    'feature_38': [65, 55, -8, -3, -2, 63],
    'feature_39': [65, 66, -5, -3, -2, 63],
    'feature_42': [37, -2, -1],
    'feature_43': [33, 31],
}

# Build one boolean mask and filter once; a row survives only if none of its
# features holds a listed value.
keep_row = pd.Series(True, index=train.index)
for feature_name, dropped in values_to_drop.items():
    keep_row &= ~train[feature_name].isin(dropped)
train = train[keep_row]

print(len(train))

In [None]:
# Feature matrices and label vector used by the models.
# Explicit copies: both frames are assigned into by later cells (sqrt and
# scaling), which would otherwise write into a selection of train/test and
# trigger SettingWithCopyWarning.
train_df = train[feature_cols].copy()
# 1-D array of integer class labels, aligned row-for-row with train_df.
y = train['target'].to_numpy().ravel()
test_df = test[feature_cols].copy()

In [None]:
# Calculate the class weights ('balanced' heuristic) for the label vector y.
from sklearn.utils import class_weight
# Keyword arguments are required by current scikit-learn releases; the
# positional form is rejected there.
class_ratio = class_weight.compute_class_weight(
    class_weight='balanced', classes=np.unique(y), y=y
)
print(class_ratio)

In [None]:
#https://www.kaggle.com/omkarchoulwar/tps-may-21-eda-and-models

# Square-root transform every feature except the ones listed in l_neg
# (NOTE(review): presumably the features that can take negative values --
# confirm against the referenced notebook).
l_neg = [19, 30, 31, 32, 35, 38, 39, 42]
lneg = [f"feature_{number}" for number in l_neg]

cols = [column for column in feature_cols if column not in lneg]
# Apply the same transform to train and test in one vectorised step each.
train_df[cols] = np.sqrt(train_df[cols])
test_df[cols] = np.sqrt(test_df[cols])

In [None]:
# Scale all feature columns with RobustScaler: the scaler is fitted on the
# training features only and the fitted transform is then applied to test.
from sklearn.preprocessing import QuantileTransformer,RobustScaler
rs = RobustScaler()
train_df[feature_cols] = rs.fit_transform(train_df[feature_cols])
test_df[feature_cols] = rs.transform(test_df[feature_cols])

### LGBM

In [None]:
from sklearn.metrics import accuracy_score
from lightgbm import Dataset, cv
import optuna

def objective(trial, model, train_df, y):
    """Optuna objective: cross-validated multi-logloss of a LightGBM model.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.
        model: Unused; kept so existing callers keep working.
        train_df: Training frame; the columns named in the module-level
            ``feature_cols`` are used as model inputs.
        y: Class labels aligned with the rows of ``train_df``.

    Returns:
        The last entry of the mean multi-logloss curve reported by
        ``lightgbm.cv``. The length of that curve is also stored on the
        trial as the ``n_estimators`` user attribute.
    """
    train_set = Dataset(train_df[feature_cols], label = y)
    hparams = {
        'objective': 'multiclass',
        'metric': 'multi_logloss',
        'num_classes': 4,
        'verbosity': -1,
        'boosting_type': 'gbdt',
        # Sampled under its own name. It was previously registered as
        # 'lambda_l1', which collided with the real lambda_l1 suggestion
        # below, so learning_rate was never tuned or reported separately.
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 1),
        'lambda_l1': trial.suggest_float('lambda_l1', 0.001, 12.0),
        'lambda_l2': trial.suggest_float('lambda_l2', 0.001, 12.0),
        'num_leaves': trial.suggest_int('num_leaves', 5, 25),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.1, 0.9),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.1, 0.9),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 5),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'max_depth': trial.suggest_int('max_depth', 5, 12),
        # NOTE(review): LightGBM documents colsample_bytree as an alias of
        # feature_fraction, which is also sampled above -- confirm whether
        # both are meant to be searched.
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 0.5),
        'cat_smooth' : trial.suggest_int('cat_smooth', 10, 100),
        'cat_l2': trial.suggest_int('cat_l2', 1, 10),
        'min_data_per_group': trial.suggest_int('min_data_per_group', 50, 100),
        'device': 'gpu',
        'gpu_platform_id': 0,
        'gpu_device_id': 0,
    }

    k = 5
    cv_results = cv(
            params = hparams,
            train_set = train_set,
            num_boost_round = 2500,
            nfold = k,
            stratified = True,
            early_stopping_rounds = 100,
            verbose_eval = False,
        )

    logloss_curve = cv_results['multi_logloss-mean']
    # Record how many boosting rounds were kept so the best trial can be
    # refitted with the same number of estimators.
    trial.set_user_attr("n_estimators", len(logloss_curve))
    # Extract the best score.
    best_score = logloss_curve[-1]
    return best_score

In [None]:
# Search LightGBM hyper-parameters with Optuna, minimising the value returned
# by objective(); the search is capped at 100 trials and a 600 second timeout.
study_lgbm = optuna.create_study(direction = 'minimize')
study_lgbm.optimize(lambda trial: objective(trial, 'lgbm', train_df[feature_cols], y), n_trials = 100, timeout = 600)

In [None]:
# Summarise the Optuna study: trial count, best objective value, the sampled
# parameters of the best trial and the number of boosting rounds it used.
print("Number of finished trials: ", len(study_lgbm.trials))
print("Best trial:")
trial = study_lgbm.best_trial

print(f"  Value: {trial.value}")
print("  Params: ")
for param_name, param_value in trial.params.items():
    print(f"    {param_name}: {param_value}")

n_estimators_used = trial.user_attrs["n_estimators"]
print(f"  Number of estimators: {n_estimators_used}")

In [None]:
#Referenced from https://www.kaggle.com/pranjalverma08/tps-may-21-the-ensemble-approach-with-optuna

# Tuned LightGBM parameter set used by the cross-validation loop.
# Each key appears once: the repeated assignments of "cat_l2" (10, 10, 4) and
# "min_child_samples" (99, 99) were collapsed to the value that was finally
# in effect.
# NOTE(review): per LightGBM's parameter documentation "subsample" /
# "bagging_fraction" and "colsample_bytree" / "feature_fraction" are aliases
# of each other; both spellings are kept with their original values --
# confirm which one is intended.
params = {
    "objective": "multiclass",
    "num_class": 4,
    "boosting": "gbdt",
    # Plain string: a stray trailing comma previously turned this into the
    # 1-tuple ("multi_logloss",).
    "metric": "multi_logloss",
    "max_depth": 10,
    "min_child_samples": 99,
    "colsample_bytree": 0.18413248040220542,
    "subsample": 0.013,
    "cat_l2": 4,
    "max_bin": 8,
    "min_data_per_group": 50,
    "lambda_l1": 0.029717160559365505,
    "lambda_l2": 3.5230989314953916,
    "learning_rate": 0.001,
    "feature_fraction": 0.14093762554475536,
    "bagging_fraction": 0.8886889549576145,
    "bagging_freq": 1,
    "num_leaves": 25,
    "n_estimators": 1000,
    "cat_smooth": 10,
    "nthread": 4,
    "verbosity": -1,
    # NOTE(review): lgb.train is also called with early_stopping_rounds=300
    # in the training cell -- confirm which patience is intended.
    "early_stopping_rounds": 500,
}
# Number of boosting rounds passed to lgb.train.
num_rounds = 1000

In [None]:
%%time

# 5-fold stratified cross-validation of LightGBM.
# ooflgb collects the out-of-fold class probabilities for the training rows;
# predictionslgb accumulates the test probabilities, averaged over the folds.
ooflgb = np.zeros((train_df.shape[0], 4))
predictionslgb = np.zeros((test_df.shape[0], 4))

fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2021)

# Loop-invariant feature matrices.
lgb_train_X = train_df[feature_cols]
lgb_test_X = test_df[feature_cols]

for dev_index, val_index in fold.split(lgb_train_X, y):

    # fold.split yields POSITIONAL indices, so rows must be taken with .iloc.
    # train_df keeps the gapped index left by the outlier filtering, so the
    # previous label-based .loc lookup selected the wrong rows or raised
    # KeyError, and was out of step with the positional y[dev_index].
    dev_X, val_X = lgb_train_X.iloc[dev_index], lgb_train_X.iloc[val_index]
    dev_y, val_y = y[dev_index], y[val_index]

    lgtrain = lgb.Dataset(dev_X, label=dev_y)
    lgtest = lgb.Dataset(val_X, label=val_y)
    model = lgb.train(params, lgtrain, num_rounds,
                          valid_sets=[lgtest], early_stopping_rounds=300, verbose_eval=50)

    pred_val = model.predict(val_X, num_iteration=model.best_iteration)
    pred_test = model.predict(lgb_test_X, num_iteration=model.best_iteration)

    ooflgb[val_index] = pred_val
    predictionslgb += pred_test

# Average over however many folds were actually run.
predictionslgb /= fold.get_n_splits()

In [None]:
# Write the LightGBM submission: fold-averaged test probabilities, each
# clipped to the range [0.08, 0.95].
sample_submission = pd.read_csv("/kaggle/input/tabular-playground-series-may-2021/sample_submission.csv")
lgb_probabilities = pd.DataFrame(predictionslgb)
for class_position in range(4):
    sample_submission[f'Class_{class_position + 1}'] = np.clip(
        lgb_probabilities[class_position], 0.08, 0.95
    )
sample_submission.to_csv(path_or_buf='submission_lgb.csv', index=False)

### Tabnet

In [None]:
!pip install -q /kaggle/input/pytorchtabnet/pytorch_tabnet-2.0.1-py3-none-any.whl

In [None]:
from sklearn.model_selection import train_test_split, KFold
from pytorch_tabnet.tab_model import TabNetClassifier

In [None]:
# from sklearn.model_selection import train_test_split
# X_train, X_valid, y_train, y_valid = train_test_split(train_df, y, test_size=0.33,shuffle=False,random_state=42)
#X_train.shape, y_train.shape, X_valid.shape, y_valid.shape

In [None]:
#train_df[feature_cols].nunique().sort_values()

In [None]:
# Stack train and test row-wise so the categorical encoding below sees both.
# test has no 'target' column, so its rows get NaN there; that NaN is what is
# later used to split the two sets apart again.
data = pd.concat([train,test],axis=0)
# Display the shapes of the combined frame and the current feature frames.
data.shape,train_df.shape,test_df.shape

In [None]:
# Features analysed using PyCaret
data["feature_0"]  = data["feature_0"].astype('category')
data["feature_2"]  = data["feature_2"].astype('category')
data["feature_5"]  = data["feature_5"].astype('category')
data["feature_9"] = data["feature_9"].astype('category')
data["feature_10"] = data["feature_10"].astype('category')
data["feature_11"] = data["feature_11"].astype('category')
data["feature_12"] = data["feature_12"].astype('category')
data["feature_13"] = data["feature_13"].astype('category')
data["feature_16"] = data["feature_16"].astype('category')
data["feature_17"] = data["feature_17"].astype('category')
data["feature_18"] = data["feature_18"].astype('category')
data["feature_20"] = data["feature_20"].astype('category')
data["feature_22"] = data["feature_22"].astype('category')
data["feature_23"] = data["feature_23"].astype('category')
data["feature_29"] = data["feature_29"].astype('category')
data["feature_36"] = data["feature_36"].astype('category')
data["feature_37"] = data["feature_37"].astype('category')
data["feature_44"] = data["feature_44"].astype('category')
data["feature_49"] = data["feature_49"].astype('category')

In [None]:
# for col in data.columns[data.dtypes == 'category']:
#     print(col, data[col].nunique())

In [None]:
from sklearn.preprocessing import LabelEncoder

# Label-encode every 'category' column of the combined frame, recording the
# column names and the number of distinct classes found in each.
categorical_columns = []
categorical_dims = {}
category_typed_columns = data.select_dtypes(include='category').columns
for col in category_typed_columns:
    print(col, data[col].nunique())
    encoder = LabelEncoder()
    data[col] = encoder.fit_transform(data[col].values)

    categorical_dims[col] = len(encoder.classes_)
    categorical_columns.append(col)

In [None]:
# Split the combined frame back apart: rows without a target are test rows,
# all others are training rows.
target_missing = data.target.isnull()
test_df = data.loc[target_missing]
train_df = data.loc[~target_missing]

In [None]:
#features = feature_cols #[ col for col in train.columns if col not in unused_feat+[target]] 
# Positions, within feature_cols, of the categorical features, and the number
# of classes of each one in that same order.
cat_idxs = [position for position, feature in enumerate(feature_cols)
            if feature in categorical_columns]
cat_dims = [categorical_dims[feature_cols[position]] for position in cat_idxs]

In [None]:
# Rebuild the 1-D label array from the training table.
y = train['target'].to_numpy().ravel()

In [None]:
# Display the shapes of the training frame, test frame and label array.
train_df.shape,test_df.shape,y.shape

In [None]:
# Renumber the training rows 0..n-1 so that row labels and row positions
# agree; the cross-validation cells index train_df with the positional
# indices produced by fold.split.
train_df.reset_index(drop=True, inplace=True)

In [None]:
%%time
from sklearn.metrics import accuracy_score

# 5-fold stratified cross-validation of TabNet.
# scores holds the per-fold validation accuracy, ooftabnet the out-of-fold
# class probabilities, and predictionstabnet the fold-averaged test
# probabilities.
scores = []
ooftabnet = np.zeros((train_df.shape[0], 4))
predictionstabnet = np.zeros((test_df.shape[0], 4))

fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2021)
n_d = n_a = 50

# NOTE(review): tabnet_params is defined but never passed to the classifier
# or to fit() -- confirm whether verbose=40 was meant to be applied.
tabnet_params = dict(verbose=40)

# Loop-invariant feature matrices (previously rebuilt on every fold).
tabnet_train_X = train_df[feature_cols]
tabnet_test_values = test_df[feature_cols].values

for dev_index, val_index in fold.split(tabnet_train_X, y):
    # fold.split yields positional indices; .iloc uses them directly and does
    # not depend on train_df having a freshly reset index.
    dev_X, val_X = tabnet_train_X.iloc[dev_index], tabnet_train_X.iloc[val_index]
    dev_y, val_y = y[dev_index], y[val_index]

    model = TabNetClassifier(
        n_d=n_d, n_a=n_a, n_steps=5,
        gamma=1.5, n_independent=2, n_shared=2,
        cat_idxs=cat_idxs,
        cat_dims=cat_dims,
        cat_emb_dim=1,
        lambda_sparse=1e-2, momentum=0.3, clip_value=2.,
        optimizer_fn=torch.optim.Adam,
        optimizer_params=dict(lr=2e-2),
        scheduler_params={"gamma": 0.95,
                          "step_size": 10},
        scheduler_fn=torch.optim.lr_scheduler.StepLR, epsilon=1e-15)

    model.fit(
        dev_X.values, dev_y,
        eval_set=[(val_X.values, val_y)],
        patience=100,
        max_epochs=300,
    )
    scores.append(accuracy_score(val_y, model.predict(val_X.values)))

    pred_val = model.predict_proba(val_X.values)
    pred_test = model.predict_proba(tabnet_test_values)

    ooftabnet[val_index] = pred_val
    predictionstabnet += pred_test

# Average over however many folds were actually run.
predictionstabnet /= fold.get_n_splits()

In [None]:
# Write the TabNet submission: fold-averaged test probabilities, each clipped
# to the range [0.025, 0.975].
tabnet_probabilities = pd.DataFrame(predictionstabnet)
for class_position in range(4):
    sample_submission[f'Class_{class_position + 1}'] = np.clip(
        tabnet_probabilities[class_position], 0.025, 0.975
    )
sample_submission.to_csv(path_or_buf='submission_tab.csv', index=False)

### Cat Boost Classifier

In [None]:
# Reference the hyper parameters from https://www.kaggle.com/antonellomartiello/tps-may-catboost-optuna-clip-probabilities
# Keyword arguments for CatBoostClassifier; unpacked as **param_cb when the
# classifier is built in the cross-validation cell.
param_cb ={
    'depth': 3, 
    'l2_leaf_reg': 4.287566030099442, 
    'bagging_temperature': 27.174417642203863, 
    #'auto_class_weights': None, 
    'loss_function': 'MultiClassOneVsAll',
    'eval_metric': 'AUC',    
    'grow_policy': 'Lossguide',
    'bootstrap_type': 'Poisson', 
    'learning_rate': 0.06389970558475937, 
    'max_bin': 484, 
    'min_data_in_leaf': 414,
    # positions (within feature_cols) of the label-encoded categorical columns
    'cat_features': cat_idxs,
    # NOTE(review): requires a GPU runtime -- confirm the accelerator is enabled
    'task_type':'GPU',
    'iterations':10000,
    'random_state':2021,
    'subsample': 0.13534551086578891}

In [None]:
%%time
from sklearn.metrics import accuracy_score,confusion_matrix,roc_auc_score
from catboost import CatBoostClassifier


# categorical_features_indices = np.where(train_df.dtypes =='category')[0]
# categorical_features_indices
from sklearn.model_selection import KFold,StratifiedKFold

# 5-fold stratified cross-validation of CatBoost.
# oofcat collects the out-of-fold class probabilities; predictionscat
# accumulates the test probabilities, averaged over the folds.
# The buffers are sized from the frames that are actually indexed below
# (train_df / test_df), consistent with the other model loops.
oofcat = np.zeros((train_df.shape[0], 4))
predictionscat = np.zeros((test_df.shape[0], 4))

fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2021)

# Loop-invariant feature matrices.
cat_train_X = train_df[feature_cols]
cat_test_values = test_df[feature_cols].values

for dev_index, val_index in fold.split(cat_train_X, y):
    # fold.split yields positional indices; .iloc uses them directly and does
    # not depend on train_df having a freshly reset index.
    dev_X, val_X = cat_train_X.iloc[dev_index], cat_train_X.iloc[val_index]
    dev_y, val_y = y[dev_index], y[val_index]

    m = CatBoostClassifier(**param_cb)
    m.fit(dev_X, dev_y, eval_set=[(val_X, val_y)], early_stopping_rounds=100, verbose=100)

    oofcat[val_index] = m.predict_proba(val_X.values)
    pred_test = m.predict_proba(cat_test_values)
    predictionscat += pred_test

# Average over however many folds were actually run.
predictionscat = predictionscat / fold.get_n_splits()

In [None]:
# Write the CatBoost submission from the fold-averaged test probabilities
# (no clipping is applied here).
catboost_probabilities = pd.DataFrame(predictionscat)
for class_position in range(4):
    sample_submission[f'Class_{class_position + 1}'] = catboost_probabilities[class_position]
sample_submission.to_csv(path_or_buf='submission.csv', index=False)

### Ensemble

In [None]:
# Blend: 50% CatBoost + 50% LightGBM test probabilities.
sample_submission = pd.read_csv("/kaggle/input/tabular-playground-series-may-2021/sample_submission.csv")
cat_part = pd.DataFrame(predictionscat)
lgb_part = pd.DataFrame(predictionslgb)
col1, col2, col3, col4 = (
    cat_part[k] * 0.50 + lgb_part[k] * 0.50 for k in range(4)
)
for class_column, blended in zip(['Class_1', 'Class_2', 'Class_3', 'Class_4'],
                                 [col1, col2, col3, col4]):
    sample_submission[class_column] = blended
sample_submission.to_csv(path_or_buf='submission_blend_lgb_cat.csv', index=False)

In [None]:
# Blend: 80% CatBoost + 20% TabNet test probabilities.
sample_submission = pd.read_csv("/kaggle/input/tabular-playground-series-may-2021/sample_submission.csv")
cat_part = pd.DataFrame(predictionscat)
tabnet_part = pd.DataFrame(predictionstabnet)
col1, col2, col3, col4 = (
    cat_part[k] * 0.80 + tabnet_part[k] * 0.20 for k in range(4)
)
for class_column, blended in zip(['Class_1', 'Class_2', 'Class_3', 'Class_4'],
                                 [col1, col2, col3, col4]):
    sample_submission[class_column] = blended
sample_submission.to_csv(path_or_buf='submission_blend_tab_cat.csv', index=False)

In [None]:
# Blend: 80% CatBoost + 20% LightGBM test probabilities.
sample_submission = pd.read_csv("/kaggle/input/tabular-playground-series-may-2021/sample_submission.csv")
cat_part = pd.DataFrame(predictionscat)
lgb_part = pd.DataFrame(predictionslgb)
col1, col2, col3, col4 = (
    cat_part[k] * 0.80 + lgb_part[k] * 0.20 for k in range(4)
)
for class_column, blended in zip(['Class_1', 'Class_2', 'Class_3', 'Class_4'],
                                 [col1, col2, col3, col4]):
    sample_submission[class_column] = blended
sample_submission.to_csv(path_or_buf='submission_blend_lgb_cat1.csv', index=False)

In [None]:
# Blend: CatBoost weighted 0.80 plus LightGBM weighted 0.22.
# NOTE(review): the weights sum to 1.02, so the blended rows no longer sum to
# 1 -- confirm this is intentional.
sample_submission = pd.read_csv("/kaggle/input/tabular-playground-series-may-2021/sample_submission.csv")
cat_part = pd.DataFrame(predictionscat)
lgb_part = pd.DataFrame(predictionslgb)
col1, col2, col3, col4 = (
    cat_part[k] * 0.80 + lgb_part[k] * 0.22 for k in range(4)
)
for class_column, blended in zip(['Class_1', 'Class_2', 'Class_3', 'Class_4'],
                                 [col1, col2, col3, col4]):
    sample_submission[class_column] = blended
sample_submission.to_csv(path_or_buf='submission_blend_lgb_cat2.csv', index=False)

In [None]:
# Finish