<div style="background-color:skyblue;">
    <h1><center>Tabular Playground Series - SEP 2021</center></h1>
</div>

![](https://storage.googleapis.com/kaggle-competitions/kaggle/26480/logos/header.png?t=2021-04-09-00-57-05)

In [None]:
import warnings
warnings.filterwarnings('ignore')  # set before the heavy imports so import-time warnings stay silenced

import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import optuna
from catboost import CatBoostClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold,train_test_split
from sklearn.preprocessing import StandardScaler

%matplotlib inline

In [None]:
# Read the competition files in a fixed order: training set, test set, and the
# sample submission that provides the expected output layout.
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        r'../input/tabular-playground-series-sep-2021/train.csv',
        r'../input/tabular-playground-series-sep-2021/test.csv',
        r'../input/tabular-playground-series-sep-2021/sample_solution.csv',
    )
)

In [None]:
# Report the dimensions of every loaded frame using one shared message template.
for label, frame in (
    ('train', train),
    ('test', test),
    ('sample_submission', sample_submission),
):
    n_rows, n_cols = frame.shape
    print(f'{label} set have {n_rows} rows and {n_cols} columns.')

In [None]:
# Display the first rows of the training frame (rendered as the cell output).
train.head()

In [None]:
# Remove the `id` column from both frames so it is not used as a model input
# (presumably it is only a row identifier -- confirm against the data description).
# `errors='ignore'` keeps this cell safe to re-run: the original in-place drop
# raised KeyError on a second execution because the column was already gone.
train = train.drop(columns='id', errors='ignore')
test = test.drop(columns='id', errors='ignore')

<div style="background-color:powderblue;">
    <h1><center>Data Preprocessing</center></h1>
</div>

In [None]:
# Separate the target: `pop` returns the `claim` column and removes it from
# `train` in one step, leaving only predictor columns behind.
y = train.pop('claim')

In [None]:
# Names of the predictor columns currently in `train`, captured as a plain list
# so later cells can keep addressing exactly this set of columns.
features = train.columns.tolist()
print(features)

* Adding a new column **missing**: the number of missing feature values in each row (counted before imputation).

In [None]:
# Per-row count of NaNs across the feature columns, stored as a new `missing`
# column on both frames.
for frame in (train, test):
    nan_mask = frame[features].isna()
    frame['missing'] = nan_mask.sum(axis=1)

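* Imputing missing values with the per-column mean (the imputer is fitted on the training set and then applied to the test set).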

In [None]:
# Replace NaNs in the feature columns with the column mean.
# The imputer is fitted on the training frame only and then reused for the test
# frame, so both are filled with the same (training) means.
# `SimpleImputer` is imported in the notebook's top import cell.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
train[features] = imputer.fit_transform(train[features])
test[features] = imputer.transform(test[features])

In [None]:
# Standardise the feature columns. The scaler is fitted on the training frame
# only and the same transformation is then applied to the test frame.
# `StandardScaler` is imported in the notebook's top import cell.
scaler = StandardScaler()
train[features] = scaler.fit_transform(train[features])
test[features] = scaler.transform(test[features])

# `X` is an alias of `train` (not a copy): it holds every column of `train`,
# including any column outside `features`, which is left unscaled.
X = train

<div style="background-color:powderblue;">
    <h1><center>Building Model</center></h1>
</div>

<div style="background-color:powderblue;">
    <h2><center>catboost</center></h2>
</div>

In [None]:
def fit_cat(trial, x_train, y_train, x_test, y_test):
    """Fit one CatBoost classifier with hyperparameters sampled from ``trial``.

    Parameters
    ----------
    trial
        Optuna trial (or any object exposing ``suggest_int`` and
        ``suggest_float``) used to sample the hyperparameters.
    x_train, y_train
        Training features and labels.
    x_test, y_test
        Validation features and labels. They are passed to ``fit`` as the
        ``eval_set`` for early stopping and are also used for the reported
        validation score.

    Returns
    -------
    tuple
        ``(model, log)`` where ``model`` is the fitted classifier and ``log``
        is a dict with the keys ``"train roc_auc"`` and ``"valid roc_auc"``.
    """
    # `suggest_float` replaces the deprecated `suggest_uniform` /
    # `suggest_loguniform`; the parameter names and ranges are unchanged.
    params = {
        'iterations': trial.suggest_int("iterations", 1000, 20000),
        # NOTE(review): `early_stopping_rounds` passed to fit() below also
        # configures the overfitting detector, so this sampled value may be
        # overridden -- confirm against the CatBoost documentation.
        'od_wait': trial.suggest_int('od_wait', 500, 2000),
        'task_type': "GPU",
        'learning_rate': trial.suggest_float('learning_rate', 0.03, 0.04),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.32, 0.33, log=True),
        'subsample': trial.suggest_float('subsample', 0.9, 1.0),
        'random_strength': trial.suggest_float('random_strength', 10, 50),
        'depth': trial.suggest_int('depth', 1, 15),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 30),
        'leaf_estimation_iterations': trial.suggest_int('leaf_estimation_iterations', 1, 15),
        'bootstrap_type': 'Poisson',
    }

    model = CatBoostClassifier(**params)
    model.fit(x_train, y_train, eval_set=[(x_test, y_test)], early_stopping_rounds=150, verbose=False)

    # Score the raw positive-class probabilities. ROC AUC depends only on the
    # ranking of the scores, so flooring them (the previous clip at 0.1) would
    # tie every lower prediction together and distort the metric.
    y_train_pred = model.predict_proba(x_train)[:, 1]
    y_test_pred = model.predict_proba(x_test)[:, 1]

    log = {
        "train roc_auc": roc_auc_score(y_train, y_train_pred),
        "valid roc_auc": roc_auc_score(y_test, y_test_pred)
    }

    return model, log

In [None]:
def objective(trial):
    """Optuna objective: validation ROC AUC of one CatBoost fit.

    The notebook-level ``X`` / ``y`` are split 80/20 and the sampled model is
    trained with ``fit_cat``. The split is seeded so that every trial is scored
    on the same hold-out rows; with an unseeded split, differences between
    trials would partly reflect the random partition rather than the
    hyperparameters.

    Parameters
    ----------
    trial
        Optuna trial forwarded to ``fit_cat`` for hyperparameter sampling.

    Returns
    -------
    The ``'valid roc_auc'`` entry of the log returned by ``fit_cat``.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=0.20, random_state=2021
    )
    _, log = fit_cat(trial, x_train, y_train, x_test, y_test)
    return log['valid roc_auc']

* These are the best parameters found by **Optuna**.

In [None]:
# Final CatBoost configuration used for the cross-validated training below.
cat_params = dict(
    iterations=11827,
    od_wait=620,
    learning_rate=0.033785210897265716,
    reg_lambda=0.32796141625302366,
    subsample=0.9036907302202852,
    random_strength=43.508618917973784,
    depth=3,
    min_data_in_leaf=23,
    leaf_estimation_iterations=12,
    task_type="GPU",
    bootstrap_type='Poisson',
)

In [None]:
# 5-fold cross-validation with shuffled, seeded splits. For every fold a fresh
# CatBoost model is trained on the training part, scored on the held-out part,
# and its test-set probabilities contribute 1/n_splits to the final average.
folds = KFold(n_splits=5, shuffle=True, random_state=2021)
predictions = np.zeros(len(test))

for fold, (trn_idx, val_idx) in enumerate(folds.split(X)):
    print(f"Fold: {fold}")
    X_fit, y_fit = X.iloc[trn_idx], y.iloc[trn_idx]
    X_valid, y_valid = X.iloc[val_idx], y.iloc[val_idx]

    model = CatBoostClassifier(**cat_params)
    # The held-out fold doubles as the evaluation set for early stopping.
    model.fit(
        X_fit,
        y_fit,
        eval_set=[(X_valid, y_valid)],
        early_stopping_rounds=400,
        verbose=False,
    )

    # Column 1 of predict_proba is the positive-class probability.
    valid_proba = model.predict_proba(X_valid)[:, 1]
    fold_auc = roc_auc_score(y_valid, valid_proba)
    print(f" roc_auc_score: {fold_auc}")
    print("-" * 50)

    test_proba = model.predict_proba(test)[:, 1]
    predictions += test_proba / folds.n_splits

<div style="background-color:powderblue;">
    <h2><center>Prediction and submission</center></h2>
</div>

In [None]:
# Put the fold-averaged probabilities into the `claim` column and write the
# submission file without the index.
sample_submission = sample_submission.assign(claim=predictions)
sample_submission.to_csv('cat.csv', index=False)

<div class="alert alert-block alert-info">
<h4>If you like this notebook, please upvote it! 
     Thank you! :)</h4>
</div>