<div style="background-color:skyblue;">
    <h1><center>Tabular Playground Series - NOV 2021</center></h1>
</div>

![](https://storage.googleapis.com/kaggle-competitions/kaggle/26480/logos/header.png?t=2021-04-09-00-57-05)

In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import KFold,train_test_split
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier
import optuna

In [None]:
# Read the three competition CSVs (train, test, submission template) in one pass.
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        r'../input/tabular-playground-series-nov-2021/train.csv',
        r'../input/tabular-playground-series-nov-2021/test.csv',
        r'../input/tabular-playground-series-nov-2021/sample_submission.csv',
    )
)

In [None]:
# Report the row/column count of each loaded frame, one line per frame.
print('\n'.join(
    f'{label} set have {frame.shape[0]} rows and {frame.shape[1]} columns.'
    for label, frame in (
        ('train', train),
        ('test', test),
        ('sample_submission', sample_submission),
    )
))

In [None]:
# Display the first rows of the training frame as a quick sanity check of the load.
train.head()

In [None]:
# Remove the row identifier from both frames before plotting and modelling.
# The frames are rebound instead of mutated with inplace=True, and
# errors='ignore' tolerates an already-removed column, so re-running this
# cell no longer raises KeyError.
train = train.drop(columns='id', errors='ignore')
test = test.drop(columns='id', errors='ignore')

<div style="background-color:skyblue;">
    <h1><center>Data visualization</center></h1>
</div>

#### Target Distribution:

In [None]:
# Count the rows per target value, then draw the counts as a bar chart.
target_values = train['target'].value_counts()

plt.figure(figsize=(14, 5))
sns.barplot(
    x=target_values.index,
    y=target_values.values,
    linewidth=1.5,
    facecolor='red',
    errcolor=".2",
    edgecolor=".2",
)
plt.gca().set_title("Target unique values", fontdict={'fontsize': 20})
plt.show()

#### Features Distribution:

In [None]:
# Kernel-density plot for each of the first 100 columns of `train`,
# laid out on a 5-column grid.

# The style is applied before the figure and axes exist so that every panel,
# including the first one, is created with the same settings.
sns.set_style("white")

# Column list is computed once instead of on every loop iteration.
feature_cols = train.columns.tolist()[0:100]
n_grid_cols = 5
# Ceiling division: only as many rows as the plotted columns need, so the
# figure has no blank rows at the bottom.
n_grid_rows = max(1, -(-len(feature_cols) // n_grid_cols))

fig = plt.figure(figsize=(15, 60))
for i, col in enumerate(feature_cols):
    plt.subplot(n_grid_rows, n_grid_cols, i + 1)
    plt.title(col, size=10, fontname='monospace')
    # `fill=True` is the current name of the deprecated `shade=True`.
    a = sns.kdeplot(train[col], fill=True, alpha=0.9, linewidth=1.5,
                    facecolor='red', edgecolor=".2")
    plt.ylabel('')
    plt.xlabel('')
    plt.xticks(fontname='monospace')
    plt.yticks([])
    # Keep only a slightly thicker bottom axis line on each panel.
    for j in ['right', 'left', 'top']:
        a.spines[j].set_visible(False)
    a.spines['bottom'].set_linewidth(1.2)

fig.tight_layout(h_pad=3)
plt.show()

<div style="background-color:skyblue;">
    <h1><center>Building Model - CatBoost</center></h1>
</div>

In [None]:
# Separate the label column from the feature frame.
# The guard makes this cell safe to re-run: once 'target' has been removed,
# the existing `y` and `train` are left untouched instead of raising KeyError.
if 'target' in train.columns:
    y = train['target']
    train = train.drop(columns='target')
elif 'y' not in globals():
    # Nothing was split off earlier and the column is missing: fail loudly here
    # rather than with a NameError in a later cell.
    raise KeyError("'target' column not found in train")

#### Optuna:

In [None]:
def fit_cat(trial, x_train, y_train, x_test, y_test):
    """Fit one CatBoost classifier using hyperparameters sampled from ``trial``.

    Parameters
    ----------
    trial
        Optuna trial (or any object exposing ``suggest_int`` and
        ``suggest_float``) used to sample the hyperparameters.
    x_train, y_train
        Features and labels the model is fitted on.
    x_test, y_test
        Hold-out features and labels. They are passed to CatBoost as
        ``eval_set`` and are also used to compute the validation score.

    Returns
    -------
    tuple
        ``(model, log)`` where ``model`` is the fitted classifier and ``log``
        is a dict with the keys ``"train roc_auc"`` and ``"valid roc_auc"``.
    """
    params = {
        'iterations': trial.suggest_int("iterations", 1000, 100000),
        # NOTE(review): fit() below also passes early_stopping_rounds=150, which
        # presumably overrides this sampled od_wait - confirm against the
        # CatBoost documentation before relying on the tuned value.
        'od_wait': trial.suggest_int('od_wait', 500, 5000),
        'task_type': "GPU",
        # suggest_float replaces the deprecated suggest_uniform / suggest_loguniform;
        # parameter names, ranges and sampling order are unchanged.
        'learning_rate': trial.suggest_float('learning_rate', 0.02, 0.06),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.30, 0.33, log=True),
        'subsample': trial.suggest_float('subsample', 0.8, 1.0),
        'random_strength': trial.suggest_float('random_strength', 10, 50),
        'depth': trial.suggest_int('depth', 1, 15),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 50),
        'leaf_estimation_iterations': trial.suggest_int('leaf_estimation_iterations', 1, 15),
        'bootstrap_type': 'Poisson',
    }

    model = CatBoostClassifier(**params)
    model.fit(x_train, y_train, eval_set=[(x_test, y_test)],
              early_stopping_rounds=150, verbose=False)

    # Positive-class probabilities are scored as-is. ROC AUC depends only on
    # the ranking of the scores, so clipping them to a 0.1 floor (as this
    # function previously did) turned every prediction below 0.1 into a tie
    # and distorted the metric.
    y_train_pred = model.predict_proba(x_train)[:, 1]
    y_test_pred = model.predict_proba(x_test)[:, 1]

    log = {
        "train roc_auc": roc_auc_score(y_train, y_train_pred),
        "valid roc_auc": roc_auc_score(y_test, y_test_pred)
    }

    return model, log

In [None]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one CatBoost fit.

    Splits the module-level ``train`` / ``y`` into an 80/20 train/validation
    pair, fits a model through ``fit_cat`` and returns its validation score.

    The split uses a fixed seed and is stratified on ``y`` so every trial is
    scored on the same hold-out rows; without that, differences between trial
    scores mix hyperparameter effects with split-to-split noise.

    Parameters
    ----------
    trial
        Optuna trial forwarded to ``fit_cat`` for hyperparameter sampling.

    Returns
    -------
    float
        The ``'valid roc_auc'`` entry of the log returned by ``fit_cat``.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        train, y, test_size=0.20, random_state=2021, stratify=y
    )
    _, log = fit_cat(trial, x_train, y_train, x_test, y_test)
    return log['valid roc_auc']

#### CatBoost:

In [None]:
# Fixed CatBoost hyperparameters used for the final cross-validated models.
cat_params = {
    # training length / overfitting detector
    "iterations": 51269,
    "od_wait": 4941,
    # learning rate, regularisation and sampling
    "learning_rate": 0.04213735801762003,
    "reg_lambda": 0.30872600062215705,
    "subsample": 0.8228796292309075,
    "random_strength": 13.402944110034412,
    # tree structure
    "depth": 7,
    "min_data_in_leaf": 45,
    "leaf_estimation_iterations": 6,
    # execution settings
    "task_type": "GPU",
    "bootstrap_type": "Poisson",
}

In [None]:
# 5-fold cross-validation: fit one CatBoost model per fold, print its
# validation ROC AUC, and average the folds' probabilities for the test set.
folds = KFold(n_splits=5, shuffle=True, random_state=2021)
predictions = np.zeros(len(test))

for fold, (trn_idx, val_idx) in enumerate(folds.split(train)):
    print(f"Fold: {fold}")

    # X_test / y_test hold this fold's validation slice of `train`,
    # not the competition test set (that one is `test`).
    X_train, y_train = train.iloc[trn_idx], y.iloc[trn_idx]
    X_test, y_test = train.iloc[val_idx], y.iloc[val_idx]

    model = CatBoostClassifier(**cat_params)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_test, y_test)],
        early_stopping_rounds=400,
        verbose=False,
    )

    pred = model.predict_proba(X_test)[:, 1]
    roc = roc_auc_score(y_test, pred)
    print(f" roc_auc_score: {roc}")
    print("-" * 50)

    # Every fold contributes an equal 1/n_splits share of the final prediction.
    predictions += model.predict_proba(test)[:, 1] / folds.n_splits

In [None]:
# Store the fold-averaged probabilities in the submission template and save
# it as cat.csv without the index column.
sample_submission['target'] = predictions
sample_submission.to_csv('cat.csv', index=False)