<div style="background-color:skyblue;">
    <h1><center>Tabular Playground Series - SEP 2021</center></h1>
</div>

![](https://storage.googleapis.com/kaggle-competitions/kaggle/26480/logos/header.png?t=2021-04-09-00-57-05)

In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import KFold,train_test_split
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier
import optuna

In [None]:
# Directory that holds the competition CSV files. Kept in one place so the
# three reads below cannot drift apart; adjust it here if the data lives
# elsewhere.
DATA_DIR = '../input/tabular-playground-series-sep-2021'

train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
sample_submission = pd.read_csv(f'{DATA_DIR}/sample_solution.csv')

In [None]:
# Report the dimensions of every frame that was just loaded.
for _name, _frame in (('train', train),
                      ('test', test),
                      ('sample_submission', sample_submission)):
    _n_rows, _n_cols = _frame.shape
    print(f'{_name} set have {_n_rows} rows and {_n_cols} columns.')

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Remove the 'id' column from both frames.
# Rebinding with errors='ignore' (instead of an in-place drop) keeps this cell
# idempotent: running it again after 'id' is already gone is a no-op rather
# than a KeyError.
train = train.drop(columns='id', errors='ignore')
test = test.drop(columns='id', errors='ignore')

In [None]:
print('train: ')
# Transposed summary statistics (one row per column), with a bar on the mean
# and colour gradients on the standard deviation and the median.
train_summary = train.describe().T
(
    train_summary.style
    .bar(subset=['mean'], color='#606ff2')
    .background_gradient(subset=['std'], cmap='PuBu')
    .background_gradient(subset=['50%'], cmap='PuBu')
)

In [None]:
print('test: ')
# Transposed summary statistics (one row per column), with a bar on the mean
# and colour gradients on the standard deviation and the median.
test_summary = test.describe().T
(
    test_summary.style
    .bar(subset=['mean'], color='#606ff2')
    .background_gradient(subset=['std'], cmap='PuBu')
    .background_gradient(subset=['50%'], cmap='PuBu')
)

<div style="background-color:powderblue;">
    <h1><center>Data Visualization</center></h1>
</div>

#### **Target Distribution:**

In [None]:
# Number of rows for each distinct value of the target column.
target_values = train['claim'].value_counts()

fig, ax = plt.subplots(figsize=(14, 5))
sns.barplot(
    x=target_values.index,
    y=target_values.values,
    linewidth=1.5,
    facecolor='aquamarine',
    errcolor=".2",
    edgecolor=".2",
    ax=ax,
)
ax.set_title("Target unique values", fontdict={'fontsize': 20})
plt.show()

#### **Features Distribution:**

In [None]:
# Draw a kernel-density estimate for each of the first 118 columns of `train`,
# one small panel per column on a 5-column grid.
feature_columns = train.columns.tolist()[0:118]  # built once instead of on every access
n_grid_cols = 5
n_grid_rows = -(-len(feature_columns) // n_grid_cols)  # ceiling division: 24 rows for 118 panels

# Set the seaborn style before any axes are created; previously this call sat
# inside the loop and ran only after the first subplot already existed.
sns.set_style("white")

fig = plt.figure(figsize = (15, 60))
for i, column in enumerate(feature_columns):
    plt.subplot(n_grid_rows, n_grid_cols, i + 1)
    plt.title(column, size = 10, fontname = 'monospace')
    a = sns.kdeplot(train[column], shade = True, alpha = 0.9, linewidth = 1.5, facecolor='aquamarine', edgecolor=".2")
    plt.ylabel('')
    plt.xlabel('')
    plt.xticks(fontname = 'monospace')
    plt.yticks([])
    for j in ['right', 'left', 'top']:
        a.spines[j].set_visible(False)
    # Thicken the remaining (bottom) spine once per panel, not once per hidden spine.
    a.spines['bottom'].set_linewidth(1.2)

fig.tight_layout(h_pad = 3)
plt.show()

In [None]:
# Draw a horizontal box plot for each of the first 118 columns of `train`,
# one small panel per column on a 5-column grid.
feature_columns = train.columns.tolist()[0:118]  # built once instead of on every access
n_grid_cols = 5
n_grid_rows = -(-len(feature_columns) // n_grid_cols)  # ceiling division: 24 rows for 118 panels

# Set the seaborn style before any axes are created; previously this call sat
# inside the loop and ran only after the first subplot already existed.
sns.set_style("white")

fig = plt.figure(figsize = (15, 60))
for i, column in enumerate(feature_columns):
    plt.subplot(n_grid_rows, n_grid_cols, i + 1)
    plt.title(column, size = 10, fontname = 'monospace')
    # Pass the values as `x=` explicitly: the meaning of the first positional
    # argument of sns.boxplot differs between seaborn releases, which can flip
    # the orientation of the box.
    a = sns.boxplot(x=train[column], linewidth = 1.5, color="aquamarine")
    plt.ylabel('')
    plt.xlabel('')
    plt.xticks(fontname = 'monospace')
    plt.yticks([])
    for j in ['right', 'left', 'top']:
        a.spines[j].set_visible(False)
    # Thicken the remaining (bottom) spine once per panel, not once per hidden spine.
    a.spines['bottom'].set_linewidth(1.2)

fig.tight_layout(h_pad = 3)
plt.show()

<div style="background-color:powderblue;">
    <h1><center>Data Preprocessing</center></h1>
</div>

In [None]:
# Separate the target from the training frame.
# `pop` returns the 'claim' column and removes it from `train` in one step.
# The guard makes the cell idempotent: when 'claim' has already been removed,
# re-running keeps the existing `y` instead of raising a KeyError.
if 'claim' in train.columns:
    y = train.pop('claim')

In [None]:
# Names of the model input columns, in frame order.
# The derived 'missing' counter is excluded explicitly so that re-running this
# cell after that column has been added does not feed it into the NaN count,
# the imputer and the scaler further down.
features = [column for column in train.columns if column != 'missing']
print(features)

* Add a new column, **missing**, that holds the number of missing feature values in each row.

In [None]:
# Per-row count of NaN entries across the feature columns, stored as 'missing'.
for _frame in (train, test):
    _frame['missing'] = _frame[features].isna().sum(axis=1)

* Impute the missing values with the mean of each feature, computed on the training set.

In [None]:
from sklearn.impute import SimpleImputer

# Mean imputation: the per-column means are learned from the training
# features only and then applied to both frames.
ss = SimpleImputer(strategy='mean', missing_values=np.nan).fit(train[features])
for _frame in (train, test):
    _frame[features] = ss.transform(_frame[features])

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the feature columns: the scaling statistics are learned from the
# training features only and then applied to both frames.
scale = StandardScaler().fit(train[features])
for _frame in (train, test):
    _frame[features] = scale.transform(_frame[features])

# `X` is another name for the (now preprocessed) training frame, not a copy.
X = train

<div style="background-color:powderblue;">
    <h1><center>Building Model</center></h1>
</div>

<div style="background-color:powderblue;">
    <h2><center>LightGBM</center></h2>
</div>

In [None]:
def fit_lgb(trial, x_train, y_train, x_test, y_test):
    """Fit one LightGBM classifier with hyper-parameters sampled from ``trial``.

    Parameters
    ----------
    trial : optuna trial
        Used to sample every tunable hyper-parameter below.
    x_train, y_train
        Data the classifier is fitted on.
    x_test, y_test
        Hold-out data. It is passed as the early-stopping ``eval_set`` and is
        also used to compute the validation score.

    Returns
    -------
    model : LGBMClassifier
        The fitted classifier.
    log : dict
        ``{"train roc_auc": ..., "valid roc_auc": ...}`` computed from the
        predicted positive-class probabilities.
    """
    params = {
        'reg_alpha' : trial.suggest_loguniform('reg_alpha' , 1e-4, 1e4),
        'reg_lambda' : trial.suggest_loguniform('reg_lambda' ,1e-4, 1e4),
        'learning_rate' : trial.suggest_uniform('learning_rate' , 0.03 , 0.07),
        'max_depth' : trial.suggest_int('max_depth', 1 , 20),
        'n_estimators' : trial.suggest_int('n_estimators', 100 , 20000),
        'min_child_weight' : trial.suggest_loguniform('min_child_weight', 1e-4, 1e4),
        # `subsample` is a row fraction and must lie in (0, 1]. The previous
        # upper bound of 2.0 let Optuna propose values LightGBM rejects.
        # NOTE(review): LightGBM documents that bagging is only performed when
        # `subsample_freq` is non-zero; it is not set here - confirm intent.
        'subsample' : trial.suggest_uniform('subsample' , 0.01 , 1.0),
        'colsample_bytree' : trial.suggest_loguniform('colsample_bytree', 0.52 , 1),
        'min_child_samples' : trial.suggest_int('min_child_samples', 76, 80),
        'device_type' : 'gpu','n_jobs':4
    }

    model = LGBMClassifier(**params)
    model.fit(x_train, y_train,eval_set=[(x_test,y_test)], early_stopping_rounds=150, verbose=False)

    # Positive-class probabilities. They are scored as-is: ROC-AUC depends only
    # on the ranking of the predictions, so the former clipping at 0.1 could
    # only merge low scores into ties and distort the metric being optimised.
    y_train_pred = model.predict_proba(x_train)[:,1]
    y_test_pred = model.predict_proba(x_test)[:,1]

    log = {
        "train roc_auc": roc_auc_score(y_train, y_train_pred),
        "valid roc_auc": roc_auc_score(y_test, y_test_pred)
    }

    return model, log

In [None]:
def objective(trial):
    """Optuna objective: validation ROC-AUC of a single ``fit_lgb`` run.

    Splits the notebook-level ``X`` / ``y`` 80/20, fits a model with the
    hyper-parameters sampled from ``trial`` and returns the ROC-AUC measured
    on the 20% hold-out part.
    """
    # A fixed seed (the same one the K-fold cell uses) gives every trial the
    # same split, so scores differ because of the sampled hyper-parameters and
    # not because of which rows happened to land in the hold-out part.
    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=0.20, random_state=2021
    )
    model, log = fit_lgb(trial, x_train, y_train, x_test, y_test)

    return log['valid roc_auc']

* These are the best parameters found by **Optuna**.

In [None]:
# Fixed LightGBM configuration used for the final cross-validated fit.
lgb_params = dict(
    # Regularisation and learning rate.
    reg_alpha=555.3212078027055,
    reg_lambda=15.677857553252077,
    learning_rate=0.0458129866340546,
    # Tree shape and ensemble size.
    max_depth=16,
    n_estimators=8917,
    min_child_weight=1168.6272539629065,
    # Row / column sampling and leaf-size constraint.
    subsample=0.1477767833524252,
    colsample_bytree=0.5442132906548389,
    min_child_samples=80,
    # Runtime settings.
    device_type='gpu',
    importance_type='gain',
    n_jobs=4,
)

In [None]:
folds = KFold(n_splits=5, shuffle=True, random_state=2021)

# Running average of the per-fold positive-class probabilities for `test`.
predictions = np.zeros(len(test))

for fold, (trn_idx, val_idx) in enumerate(folds.split(X)):
    print(f"Fold: {fold}")

    # Training part and held-out part of this fold.
    X_train, y_train = X.iloc[trn_idx], y.iloc[trn_idx]
    X_test, y_test = X.iloc[val_idx], y.iloc[val_idx]

    model = LGBMClassifier(**lgb_params)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_test, y_test)],
        early_stopping_rounds=400,
        verbose=False,
    )

    # Score the fold on its held-out part.
    pred = model.predict_proba(X_test)[:, 1]
    roc = roc_auc_score(y_test, pred)
    print(f" roc_auc_score: {roc}")
    print("-"*50)

    # Every fold contributes an equal share to the final test prediction.
    predictions += model.predict_proba(test)[:, 1] / folds.n_splits

<div style="background-color:powderblue;">
    <h2><center>Prediction and submission</center></h2>
</div>

In [None]:
# Put the fold-averaged probabilities into the submission template and save it
# as CSV without the index column.
sample_submission['claim'] = predictions
submission_path = 'lgb.csv'
sample_submission.to_csv(submission_path, index=False)

<div class="alert alert-block alert-info">
<h4>If you like this notebook, please upvote it! 
     Thank you! :)</h4>
</div>