# Importing Libraries

In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import optuna

from sklearn.model_selection import train_test_split, StratifiedKFold
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, auc, roc_auc_score

# Lift pandas' display truncation for both columns and rows.
for display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(display_option, None)

# Use seaborn's dark grid theme for the plots that follow.
sns.set_style('darkgrid')

import os

# Print the full path of every file found under the Kaggle input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# Load Data

In [None]:
# Read the competition's train, test and sample-submission CSV files.
train, test, submit = (
    pd.read_csv(f'../input/tabular-playground-series-nov-2021/{stem}.csv')
    for stem in ('train', 'test', 'sample_submission')
)

In [None]:
# Preview the first 10 rows of the training data.
train.head(10)

In [None]:
# Preview the first 10 rows of the test data.
test.head(10)

In [None]:
# Print pandas' summary of the training frame.
train.info()

In [None]:
# Print pandas' summary of the test frame.
test.info()

In [None]:
# Report the (rows, columns) shape of the training and test frames, in that order.
for frame in (train, test):
    print(frame.shape)

In [None]:
# Count the missing values in each training column.
train.isna().sum()

In [None]:
# Count the training rows that are exact duplicates of an earlier row.
train.duplicated().sum()

In [None]:
# Pairwise correlation between all training columns.
train.corr()

In [None]:
# Descriptive statistics for each training column.
train.describe()

# EDA

In [None]:
# Bar chart of how many training rows fall into each target class.
fig, ax = plt.subplots(figsize=(7, 6))
sns.countplot(data=train, x='target', palette="mako", ax=ax)

In [None]:
# Correlation of every column with the target, shown as a bar chart.
# corrwith() correlates each column against the target directly, instead of
# building the full column-by-column matrix only to keep one column of it.
target_corr = train.corrwith(train["target"])
ax = target_corr.plot(kind="bar", figsize=(20, 10), color=['black'])
ax.set(title="Correlation with target", xlabel="column", ylabel="correlation");

In [None]:
# Annotated heat map of the full pairwise correlation matrix.
fig, ax = plt.subplots(figsize=(35, 30))
corr_matrix = train.corr()
sns.heatmap(
    corr_matrix,
    annot=True,
    cmap='vlag',
    linewidths=0.5,
    linecolor='black',
    ax=ax,
)
# Vertical labels on the x axis, horizontal labels on the y axis.
plt.xticks(rotation=90)
plt.yticks(rotation=0)

# Prepare Data (train_test_split)

In [None]:
# Keep the test ids and the training labels, then reduce `train` to features only.
test_id = test['id']

# Guarded so the cell can be re-run: after the first run `train` no longer has
# an 'id' or 'target' column, and an unguarded lookup/drop would raise KeyError.
if 'target' in train.columns:
    train_target = train['target']
train = train.drop(columns=['id', 'target'], errors='ignore')

In [None]:
# Shuffle the training rows and set 25% of them aside as a hold-out split.
holdout_split = train_test_split(
    train,
    train_target,
    test_size=0.25,
    shuffle=True,
    random_state=42,
)
X_train, X_test, y_train, y_test = holdout_split
X_train.shape, X_test.shape

# Model Building

In [None]:
def objective(trial):
    """Optuna objective: score one XGBoost hyperparameter set on the hold-out split.

    A classifier is trained on each of 5 stratified folds of the global
    ``X_train``/``y_train``, early-stopped on that fold's validation part.
    Every fold model predicts the global hold-out ``X_test``; the fold
    predictions are averaged and scored against ``y_test``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        ROC AUC of the fold-averaged hold-out probabilities.
    """
    # Hyperparameters for XGBClassifier
    xgb_params = {
        'random_state': 1,
        'n_jobs': 3,
        'booster': 'gbtree',
        'eval_metric': 'auc',
        'n_estimators': 5000,
        # `eta` is XGBoost's alias for `learning_rate`. Only the tuned value is
        # passed (no second, fixed `learning_rate`) so the rate each trial
        # actually evaluates is unambiguous.
        'eta': trial.suggest_float('eta', 1e-8, 1.0, log=True),
        'lambda': trial.suggest_float('lambda', 1e-8, 1.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-8, 1.0, log=True),
        'subsample': trial.suggest_float('subsample', 0.2, 1.),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 1.),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 7),
        'use_label_encoder': False,
    }

    # K-Fold Split
    # `random_state` is only accepted together with shuffle=True; scikit-learn
    # raises ValueError when a seed is given while shuffling is disabled.
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    holdout_preds = []

    for fit_idx, val_idx in skf.split(X_train, y_train):
        X_fit, X_val = X_train.iloc[fit_idx], X_train.iloc[val_idx]
        y_fit, y_val = y_train.iloc[fit_idx], y_train.iloc[val_idx]

        model = XGBClassifier(**xgb_params, tree_method='gpu_hist')
        model.fit(
            X_fit, y_fit,
            eval_metric='auc',
            early_stopping_rounds=200,
            eval_set=[(X_val, y_val)],
            verbose=0
        )

        # Keep the positive-class probability for the hold-out rows.
        holdout_preds.append(model.predict_proba(X_test)[:, 1])

    # Average the fold models' probabilities before scoring.
    mean_pred = np.mean(holdout_preds, axis=0)
    return roc_auc_score(y_test, mean_pred)

# Optuna (Studying Various Parameters)

In [None]:
# Maximise the objective's ROC AUC over 30 trials. The sampler is seeded so
# that its random draws are the same from one run of the notebook to the next.
study_xgb = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_xgb.optimize(objective, n_trials=30)

In [None]:
# Show the best trial found by the study and its hyperparameters.
study_xgb.best_trial, study_xgb.best_params

# Model Building & Predictions (Using Best Parameters)

In [None]:
# Re-fit on 5 stratified folds of the full training set with the best
# hyperparameters and collect each fold model's test-set probabilities.
final_preds = []

# `best_params` only holds the values sampled by the trials, so the fixed
# settings used while tuning are supplied again here; without them the model
# falls back to XGBoost's default number of trees and an unseeded fit.
final_params = {
    'random_state': 1,
    'n_jobs': 3,
    'booster': 'gbtree',
    'n_estimators': 5000,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'use_label_encoder': False,
    'tree_method': 'gpu_hist',
    **study_xgb.best_params,
}

# The feature view of the test set does not change between folds; build it once.
test_features = test.drop('id', axis=1)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for fit_idx, val_idx in skf.split(train, train_target):
    # Fold-local names: the hold-out X_train/X_test/y_train/y_test globals that
    # `objective` reads must not be overwritten by this loop.
    X_fit, X_val = train.iloc[fit_idx], train.iloc[val_idx]
    y_fit, y_val = train_target.iloc[fit_idx], train_target.iloc[val_idx]

    xgb_model = XGBClassifier(**final_params)
    xgb_model.fit(
        X_fit, y_fit,
        eval_metric='auc',
        early_stopping_rounds=200,
        eval_set=[(X_val, y_val)],
        verbose=0
    )

    # Best validation score reached by this fold's model.
    print(xgb_model.best_score)

    # Keep the positive-class probability for every test row.
    final_preds.append(xgb_model.predict_proba(test_features)[:, 1])

# Submissions

In [None]:
# Average the per-fold test probabilities and write the submission file.
fold_average = np.asarray(final_preds).mean(axis=0)
submit['target'] = fold_average
submit.to_csv('submission.csv', index=False)