In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# Import Data

In [None]:
import plotly as py
from statistics import mean
import plotly.graph_objects as go
import plotly.express as px 
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=True)
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
from umap import UMAP

#Models
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold ,KFold
from sklearn.ensemble import VotingClassifier


import optuna
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

In [None]:
# Read the competition training set and preview its first rows.
train_path = "/kaggle/input/tabular-playground-series-sep-2021/train.csv"
train_df = pd.read_csv(train_path)
train_df.head()

In [None]:
# Read the competition test set and preview its first rows.
test_path = "/kaggle/input/tabular-playground-series-sep-2021/test.csv"
test_df = pd.read_csv(test_path)
test_df.head()

In [None]:
# Every training column except the row id and the target is a feature.
features = train_df.columns.drop(["id", "claim"]).tolist()
print(features, end="")

In [None]:
 def missing_rate(data):
     """Return the percentage of missing values in each column of ``data``.

     Parameters
     ----------
     data : pandas.DataFrame
         Frame whose columns are inspected for NaN values.

     Returns
     -------
     pandas.DataFrame
         One row per input column. Column ``0`` holds the column name and
         column ``1`` the share of NaN entries in percent, rounded to two
         decimals.
     """
     # The mean of the boolean NaN mask is the missing fraction of each column.
     rates = (data.isna().mean() * 100).round(2)
     # Keep the two-column (name, rate) layout with default integer labels.
     return pd.DataFrame(list(rates.items()), index=None)


In [None]:
# Per-feature missing-value rate (in %) for the train and test sets.
missing_rate_train = missing_rate(train_df[features])
missing_rate_test = missing_rate(test_df[features])

# Put both tables side by side; the keys label each half so the otherwise
# identical column labels (0 and 1) stay distinguishable.
total_missing = pd.concat(
    [missing_rate_train, missing_rate_test],
    axis=1,
    keys=["train", "test"],
)
total_missing

# EDA

In [None]:
# Histogram of the target column.
claim_values = train_df['claim']
sns.histplot(data=claim_values, bins=10, color="red", palette="viridis")


In [None]:
# Share of rows per claim value (counts divided by the number of rows).
claim_counts = train_df["claim"].value_counts()
claim_percentage = pd.DataFrame(claim_counts / len(train_df.claim))
claim_percentage.T

In [None]:
# Pass the target explicitly as `x`: a positional Series is interpreted as
# `data` by newer seaborn releases instead of as the variable to count.
sns.countplot(x=train_df.claim, palette="Set3")

In [None]:
# Overlay the train (red) and test (blue) density of the columns at
# positions 1-60, one column per panel, filling the grid row by row.
fig, axes = plt.subplots(nrows=10, ncols=6, figsize=(30, 36))
column_number = 1
for ax in axes.flat:
    sns.kdeplot(data=train_df, x=str(train_df.columns[column_number]), ax=ax, color="red", label="Train")
    # The second curve comes from test_df, so it must be labelled "Test".
    sns.kdeplot(data=test_df, x=str(test_df.columns[column_number]), ax=ax, color="blue", label="Test")
    # Without a legend the labels above are never shown.
    ax.legend()
    column_number += 1

In [None]:
# Overlay the train (red) and test (blue) density of the remaining features.
# Positions 1-60 are drawn by the first grid, so continue at 61 and stop at
# 118, the last feature position (the feature list is columns[1:119]).
fig, axes = plt.subplots(nrows=10, ncols=6, figsize=(30, 36))
panels = axes.ravel()
remaining_columns = test_df.columns[61:119]
for ax, column in zip(panels, remaining_columns):
    sns.kdeplot(data=train_df, x=str(column), ax=ax, color="red", label="Train")
    sns.kdeplot(data=test_df, x=str(column), ax=ax, color="blue", label="Test")
    # Without a legend the labels above are never shown.
    ax.legend()

# The grid has more panels than remaining features; hide the unused ones.
for ax in panels[len(remaining_columns):]:
    ax.set_visible(False)

In [None]:
# Compute the correlation matrix once and reuse it for the mask and the plot.
correlations = train_df.drop("id", axis=1).corr()

# Upper triangle (diagonal included): non-zero entries act as the heatmap
# mask, so only the lower triangle is drawn.
matrix = np.triu(correlations)

plt.figure(figsize=(20, 10))
sns.heatmap(
    correlations,
    annot=False,
    cmap="icefire",
    mask=matrix,
    linecolor="white",
    cbar=True,
    vmin=-0.05,
    vmax=0.05,
    linewidths=0.1,
)
plt.show()

# Preprocessing

In [None]:
# The columns at positions 1-118 form the base feature set.
features = list(train_df.columns[1:119])

# Add two row-wise statistics to both frames: the number of missing base
# features and the standard deviation across the base features.
for frame in (train_df, test_df):
    base_values = frame[features]
    frame["n_missing"] = base_values.isna().sum(axis=1)
    frame["std"] = base_values.std(axis=1)

features = features + ["n_missing", "std"]

In [None]:
# Fill missing values with the per-column mean learned from the training set,
# then apply the same fitted imputer to both frames.
simple_imputer = SimpleImputer(strategy="mean")
simple_imputer.fit(train_df[features])
train_df[features] = simple_imputer.transform(train_df[features])
test_df[features] = simple_imputer.transform(test_df[features])

In [None]:
# Standardise the features with statistics learned from the training set,
# then apply the same fitted scaler to both frames.
standard_scaler = StandardScaler()
standard_scaler.fit(train_df[features])
train_df[features] = standard_scaler.transform(train_df[features])
test_df[features] = standard_scaler.transform(test_df[features])

In [None]:
# Model inputs: features without the id/target columns, and the target itself.
Y = train_df["claim"]
X = train_df.drop(columns=["id", "claim"])
X_test = test_df.drop(columns="id")

# XGBoost Model

Train with Optuna 

In [None]:
def objective(trial, data=X, target=Y):
    """Optuna objective: mean validation AUC of an XGBoost classifier.

    Samples one hyper-parameter set from ``trial`` and evaluates it with a
    4-fold stratified split of ``data``/``target``, using early stopping on
    each validation fold.

    Parameters
    ----------
    trial : optuna trial
        Used to sample the hyper-parameters.
    data : pandas.DataFrame
        Feature frame; rows are selected with ``.iloc``. Defaults to ``X``.
    target : pandas.Series
        Labels aligned row by row with ``data``. Defaults to ``Y``.

    Returns
    -------
    float
        ROC AUC averaged over the validation folds.
    """
    params = {
        "max_depth": trial.suggest_int("max_depth", 2, 8),
        "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.2),
        "n_estimators": trial.suggest_int("n_estimators", 1000, 5000),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 500),
        "gamma": trial.suggest_float("gamma", 0.0001, 1.0, log=True),
        "alpha": trial.suggest_float("alpha", 0.0001, 10, log=True),
        "lambda": trial.suggest_float("lambda", 0.0001, 10.0, log=True),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.1, 0.8),
        "subsample": trial.suggest_float("subsample", 0.1, 0.9),
        "tree_method": "gpu_hist",
        "booster": "gbtree",
        "random_state": 228,
        "use_label_encoder": False,
        "eval_metric": "auc",
    }

    validation_scores = []
    splitter = StratifiedKFold(n_splits=4, random_state=228, shuffle=True)
    # Split the arguments rather than the notebook globals, so the function
    # honours whatever data the caller passes in.
    for i, (train_idx, val_idx) in enumerate(splitter.split(data, target)):
        X_train, X_val = data.iloc[train_idx], data.iloc[val_idx]
        Y_train, Y_val = target.iloc[train_idx], target.iloc[val_idx]

        model = XGBClassifier(**params)
        model.fit(X_train, Y_train, eval_set=[(X_val, Y_val)], early_stopping_rounds=300, verbose=False)

        # Only the validation AUC feeds the objective value.
        validate_prediction = model.predict_proba(X_val)[:, 1]
        validate_score = roc_auc_score(Y_val, validate_prediction)
        validation_scores.append(validate_score)

        print(f"Fold {i+1} | AUC : {validate_score} ")

    return float(np.mean(validation_scores))

In [None]:
# Run 20 trials that maximise the value returned by `objective`.
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)

finished_trials = len(study.trials)
print("Numbers of finished trials : ", finished_trials)
print("Best Trials : ", study.best_trial.params)
print("Best Values : ", study.best_value)

In [None]:
# Keep the hyper-parameters sampled by the best XGBoost trial.
best_xgb_trial = study.best_trial
xgb_params = best_xgb_trial.params
xgb_params

Predict X_test

In [None]:
# Train one XGBoost model per fold (early stopping on the held-out fold) and
# average the fold models' positive-class probabilities on the test set.
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=228)
predictions = np.zeros(len(X_test))
xgb_fixed_params = dict(
    tree_method="gpu_hist",
    booster="gbtree",
    random_state=228,
    use_label_encoder=False,
    eval_metric="auc",
)
for fold, (train_idx, validate_idx) in enumerate(folds.split(X, Y)):
    X_train, Y_train = X.iloc[train_idx], Y.iloc[train_idx]
    X_validate, Y_validate = X.iloc[validate_idx], Y.iloc[validate_idx]

    xgb_model = XGBClassifier(**xgb_params, **xgb_fixed_params)
    xgb_model.fit(
        X_train,
        Y_train,
        eval_set=[(X_validate, Y_validate)],
        early_stopping_rounds=300,
        verbose=False,
    )

    fold_probabilities = xgb_model.predict_proba(X_test)[:, 1]
    predictions += fold_probabilities / folds.n_splits

In [None]:
# Pair each test id with its averaged XGBoost probability and write the file.
submit = test_df[['id']].assign(claim=predictions)
submit.to_csv("/kaggle/working/xgb_submit.csv", index=False)

# CatBoost

Train model with Optuna

In [None]:
def objective_cb(trial, data=X, target=Y):
    """Optuna objective: mean validation AUC of a CatBoost classifier.

    Samples one hyper-parameter set from ``trial`` and evaluates it with a
    4-fold stratified split of ``data``/``target``, using early stopping on
    each validation fold.

    Parameters
    ----------
    trial : optuna trial
        Used to sample the hyper-parameters.
    data : pandas.DataFrame
        Feature frame; rows are selected with ``.iloc``. Defaults to ``X``.
    target : pandas.Series
        Labels aligned row by row with ``data``. Defaults to ``Y``.

    Returns
    -------
    float
        ROC AUC averaged over the validation folds.
    """
    params = {
        "depth": trial.suggest_int("depth", 2, 6),
        "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.2),
        "iterations": trial.suggest_int("iterations", 10000, 50000),
        "max_bin": trial.suggest_int("max_bin", 1, 300),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 300),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 0.0001, 1.0, log=True),
        "subsample": trial.suggest_float("subsample", 0.1, 0.8),
        "grow_policy": trial.suggest_categorical("grow_policy", ['SymmetricTree', 'Depthwise', 'Lossguide']),
        "leaf_estimation_method": trial.suggest_categorical("leaf_estimation_method", ["Newton", "Gradient"]),
        "bootstrap_type": "Bernoulli",
        "random_seed": 228,
        "loss_function": "Logloss",
        "eval_metric": "AUC",
        "task_type": "GPU",
    }

    validation_scores = []
    splitter = StratifiedKFold(n_splits=4, random_state=228, shuffle=True)
    # Split the arguments rather than the notebook globals, so the function
    # honours whatever data the caller passes in.
    for i, (train_idx, val_idx) in enumerate(splitter.split(data, target)):
        X_train, X_val = data.iloc[train_idx], data.iloc[val_idx]
        Y_train, Y_val = target.iloc[train_idx], target.iloc[val_idx]

        model = CatBoostClassifier(**params)
        model.fit(X_train, Y_train, eval_set=[(X_val, Y_val)], early_stopping_rounds=300, verbose=False)

        # Only the validation AUC feeds the objective value.
        validate_prediction = model.predict_proba(X_val)[:, 1]
        validate_score = roc_auc_score(Y_val, validate_prediction)
        validation_scores.append(validate_score)

        print(f"Fold {i+1} | AUC : {validate_score} ")

    return float(np.mean(validation_scores))

In [None]:
# Run 10 trials that maximise the value returned by `objective_cb`.
study_cb = optuna.create_study(direction="maximize")
study_cb.optimize(objective_cb, n_trials=10)

finished_cb_trials = len(study_cb.trials)
print("Numbers of finished trials : ", finished_cb_trials)
print("Best Trials : ", study_cb.best_trial.params)
print("Best Values : ", study_cb.best_value)

Predict X_test

In [None]:
# Keep the hyper-parameters sampled by the best CatBoost trial.
best_cb_trial = study_cb.best_trial
cb_params = best_cb_trial.params
cb_params

In [None]:
# Train one CatBoost model per fold (early stopping on the held-out fold) and
# average the fold models' positive-class probabilities on the test set.
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=228)
predictions = np.zeros(len(X_test))
cb_fixed_params = dict(
    bootstrap_type="Bernoulli",
    random_seed=228,
    loss_function="Logloss",
    eval_metric="AUC",
    task_type="GPU",
)
for fold, (train_idx, validate_idx) in enumerate(folds.split(X, Y)):
    X_train, Y_train = X.iloc[train_idx], Y.iloc[train_idx]
    X_validate, Y_validate = X.iloc[validate_idx], Y.iloc[validate_idx]

    cb_model = CatBoostClassifier(**cb_params, **cb_fixed_params)
    cb_model.fit(
        X_train,
        Y_train,
        eval_set=[(X_validate, Y_validate)],
        early_stopping_rounds=300,
        verbose=False,
    )

    fold_probabilities = cb_model.predict_proba(X_test)[:, 1]
    predictions += fold_probabilities / folds.n_splits

In [None]:
# Pair each test id with its averaged CatBoost probability and write the file.
submit = test_df[['id']].assign(claim=predictions)
submit.to_csv("/kaggle/working/catboost_submit.csv", index=False)


# LGBM

Train model with Optuna

In [None]:
def objective_lgbm(trial, data=X, Target=Y):
    """Optuna objective: mean validation AUC of a LightGBM classifier.

    Samples one hyper-parameter set from ``trial`` and evaluates it with a
    4-fold stratified split of ``data``/``Target``, using early stopping on
    each validation fold.

    Parameters
    ----------
    trial : optuna trial
        Used to sample the hyper-parameters.
    data : pandas.DataFrame
        Feature frame; rows are selected with ``.iloc``. Defaults to ``X``.
    Target : pandas.Series
        Labels aligned row by row with ``data``. Defaults to ``Y``.

    Returns
    -------
    float
        ROC AUC averaged over the validation folds.
    """
    # NOTE(review): with max_depth capped at 4 a tree can hold at most 16
    # leaves, so the 50-500 num_leaves range probably never binds -- confirm
    # against the LightGBM parameter docs before relying on it.
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 1000, 15000),
        "max_depth": trial.suggest_int("max_depth", 2, 4),
        "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.2),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.001, 10),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.001, 10),
        "num_leaves": trial.suggest_int("num_leaves", 50, 500),
        "min_data_per_group": trial.suggest_int("min_data_per_group", 50, 200),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 200),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.1, 0.8),
        "boosting_type": "gbdt",
        "objective": "binary",
        "random_state": 228,
        "metric": "auc",
        "device": "gpu",
    }

    validation_scores = []
    splitter = StratifiedKFold(n_splits=4, random_state=228, shuffle=True)
    # Split the arguments rather than the notebook globals, so the function
    # honours whatever data the caller passes in.
    for i, (train_idx, val_idx) in enumerate(splitter.split(data, Target)):
        X_train, X_val = data.iloc[train_idx], data.iloc[val_idx]
        Y_train, Y_val = Target.iloc[train_idx], Target.iloc[val_idx]

        model = LGBMClassifier(**params)
        model.fit(X_train, Y_train, eval_set=[(X_val, Y_val)], early_stopping_rounds=300, verbose=False)

        # Only the validation AUC feeds the objective value.
        validate_prediction = model.predict_proba(X_val)[:, 1]
        validate_score = roc_auc_score(Y_val, validate_prediction)
        validation_scores.append(validate_score)

        print(f"Fold {i+1} | AUC : {validate_score} ")

    return float(np.mean(validation_scores))

In [None]:
# Run 10 trials that maximise the value returned by `objective_lgbm`.
study_lgbm = optuna.create_study(direction="maximize")
study_lgbm.optimize(objective_lgbm, n_trials=10)

finished_lgbm_trials = len(study_lgbm.trials)
print("Numbers of finished trials : ", finished_lgbm_trials)
print("Best Trials : ", study_lgbm.best_trial.params)
print("Best Values : ", study_lgbm.best_value)

Predict X_test

In [None]:
# Keep the hyper-parameters sampled by the best LightGBM trial.
best_lgbm_trial = study_lgbm.best_trial
lgbm_params = best_lgbm_trial.params
lgbm_params

In [None]:
# Fixed LightGBM hyper-parameters. This assignment replaces any value that
# `lgbm_params` held before this cell runs.
lgbm_params = dict(
    n_estimators=6630,
    max_depth=3,
    learning_rate=0.053625067203773684,
    reg_alpha=4.618041066261469,
    reg_lambda=7.9389723810790604,
    num_leaves=203,
    min_data_per_group=83,
    min_child_samples=141,
    colsample_bytree=0.13048987522123276,
)

In [None]:
# Train one LightGBM model per fold (early stopping on the held-out fold) and
# average the fold models' positive-class probabilities on the test set.
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=228)
predictions = np.zeros(len(X_test))
lgbm_fixed_params = dict(
    boosting_type="gbdt",
    objective="binary",
    random_state=228,
    metric="auc",
    device="gpu",
)
for fold, (train_idx, validate_idx) in enumerate(folds.split(X, Y)):
    X_train, Y_train = X.iloc[train_idx], Y.iloc[train_idx]
    X_validate, Y_validate = X.iloc[validate_idx], Y.iloc[validate_idx]

    lgbm_model = LGBMClassifier(**lgbm_params, **lgbm_fixed_params)
    lgbm_model.fit(
        X_train,
        Y_train,
        eval_set=[(X_validate, Y_validate)],
        early_stopping_rounds=300,
        verbose=False,
    )

    fold_probabilities = lgbm_model.predict_proba(X_test)[:, 1]
    predictions += fold_probabilities / folds.n_splits

In [None]:
# Pair each test id with its averaged LightGBM probability and write the file.
submit = test_df[['id']].assign(claim=predictions)
submit.to_csv("/kaggle/working/lgbm_submit.csv", index=False)
