In [None]:
# Importing the required libraries
import numpy as np 
from numpy import percentile
import pandas as pd 
from sklearn import preprocessing,model_selection,metrics
from matplotlib import pyplot as plt
import seaborn as sns
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import optuna

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

### Loading the dataset

In [None]:
# Read the competition's train and test CSV files into two DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/tabular-playground-series-may-2022/train.csv",
        "/kaggle/input/tabular-playground-series-may-2022/test.csv",
    )
)

### Understanding the training data 

In [None]:
# Preview the first rows of the training data.
train_df.head()

In [None]:
# Column names, dtypes and non-null counts of the training data.
train_df.info()

In [None]:
# Summary statistics of the training data columns.
train_df.describe()

In [None]:
# Bar chart of how many rows fall in each target class, on an 8x8 figure.
fig, ax = plt.subplots(figsize=(8, 8))
sns.countplot(x=train_df["target"], ax=ax)

### The dataset is balanced. Checking if there are any missing values.

In [None]:
# Number of missing values per column (isna is the same check as isnull).
train_df.isna().sum()

### There are no missing values in the dataset.

#### Plotting a histogram for each numerical column

In [None]:
# Split the columns by dtype: object-typed columns are treated as categorical,
# everything else (apart from the id and the target) as numerical.
is_object_column = train_df.dtypes == 'object'
cat_column = is_object_column[is_object_column].index.tolist()
numerical_columns = [
    name
    for name in is_object_column[~is_object_column].index
    if name not in ('id', 'target')
]

In [None]:
# How many numerical and how many object-typed columns were found.
len(numerical_columns),len(cat_column)

In [None]:
# One histogram per numerical column, drawn on a 25x25 figure.
train_df[numerical_columns].hist(figsize=(25,25))

## Here are some observations from the above plot:<br>
*     ### It can be noticed that the numerical features are not uniformly scaled.
     * #### <b>For instance, "f_28" ranges from -1000 to 1000, whereas many other features lie between -4 and 4 or between -10 and 10.</b>
     * #### <b>Also, features "f_07" to "f_18" are left skewed while other features appear to be normally distributed.</b>
*    ### Most importantly, since the base algorithm will be XGBoost, a tree-based model whose splits depend only on the ordering of feature values, scaling is not required.

In [None]:
# Add a placeholder fold column; -1 means "not yet assigned to a fold".
train_df["kfold"] = -1
# Shuffle the rows. A fixed random_state keeps the row order (and therefore
# everything derived from it) identical across kernel restarts.
train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)
# X keeps every column except the target; y is the target column.
X,y = train_df.drop('target',axis=1),train_df.target

In [None]:
# Shapes of the feature frame and the target series.
X.shape,y.shape

### Performing Stratified K-Fold validation with 10 splits

In [None]:
# 10-fold stratified splitter. shuffle=True needs a fixed random_state,
# otherwise the fold membership changes on every run.
kfold = model_selection.StratifiedKFold(n_splits=10,shuffle=True,random_state=42)

In [None]:
# Label each row with the number of the fold in which it is a validation row,
# then save the labelled training data to disk.
for fold_number, (_train_rows, holdout_rows) in enumerate(kfold.split(X, y)):
    train_df.loc[holdout_rows, 'kfold'] = fold_number
train_df.to_csv("train_folds.csv", index=False)

In [None]:
# Feature columns used for modelling: everything except the id, the fold
# label, the 'f_27' column and the target.
excluded_cols = {'id', 'kfold', 'f_27', 'target'}
useful_cols = [name for name in train_df.columns if name not in excluded_cols]
useful_cols

In [None]:
# Reload the fold-labelled training data from the CSV written earlier.
df_folds = pd.read_csv("./train_folds.csv")

In [None]:
# Number of rows and columns in the fold-labelled data.
df_folds.shape

#### Finding the optimal hyperparameter values with Optuna, using fold 0 as the validation set and the remaining folds for training.

In [None]:
def run(trial):
    """Optuna objective: fit an XGBoost classifier and score it on fold 0.

    The model is trained on every row of ``df_folds`` whose ``kfold`` is not 0
    and evaluated on the rows whose ``kfold`` is 0, using only ``useful_cols``
    as features.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        Accuracy of the fitted model on the fold-0 validation rows.
    """
    fold = 0
    # Defining the hyperparameters
    n_estimators = trial.suggest_int("n_estimators",1000,10000)
    learning_rate = trial.suggest_float("learning_rate",0.01,0.25,log=True)
    max_depth = trial.suggest_int("max_depth",1,10)
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform
    # and matches how learning_rate is sampled above.
    reg_lambda = trial.suggest_float("reg_lambda",1e-8,100.0,log=True)
    reg_alpha = trial.suggest_float("reg_alpha",1e-8,100.0,log=True)
    subsample = trial.suggest_float("subsample",0.1,1.0)
    # Each parameter needs its own name: re-using "subsample" here would make
    # Optuna hand back the subsample value, so colsample_bytree would never be
    # tuned on its own nor reported in study.best_params.
    colsample_bytree = trial.suggest_float("colsample_bytree",0.1,1.0)

    df_train = df_folds[df_folds.kfold!=fold].reset_index(drop=True)
    df_validation = df_folds[df_folds.kfold==fold].reset_index(drop=True)

    X_train,Y_train = df_train.drop(['target'],axis=1),df_train.target
    X_valid,Y_valid = df_validation.drop(['target'],axis=1),df_validation.target

    X_train,X_valid = X_train[useful_cols],X_valid[useful_cols]

    # Building the model
    # NOTE(review): gpu_id=1 addresses the second GPU - confirm the runtime
    # actually exposes more than one device.
    model = XGBClassifier(
                                tree_method="gpu_hist",gpu_id=1,predictor="gpu_predictor",
                                n_estimators=n_estimators,learning_rate=learning_rate,max_depth=max_depth,
                                reg_lambda=reg_lambda,reg_alpha=reg_alpha,subsample=subsample,colsample_bytree=colsample_bytree
                             )

    # Stop adding trees once the validation metric has not improved for 500 rounds.
    model.fit(X_train,Y_train,early_stopping_rounds=500,eval_set=[(X_valid,Y_valid)],verbose=1000)
    Y_pred_valid = model.predict(X_valid)
    accuracy = metrics.accuracy_score(Y_valid,Y_pred_valid)
    print(f'Accuracy on validation data: {accuracy}')
    return accuracy

In [None]:
# Maximize the value returned by run() over 25 trials. TPE is Optuna's default
# sampler; seeding it makes the sequence of sampled hyperparameters repeatable.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(run,n_trials=25)

In [None]:
# Hyperparameters of the best trial found by the study.
study.best_params

In [None]:
# One array of test-set predictions is appended here per processed fold.
test_predictions = []
def run_fold(fold):
    """Train an XGBoost classifier for one fold and store its test predictions.

    Rows of ``df_folds`` whose ``kfold`` differs from ``fold`` are used for
    training and the remaining rows for validation; only ``useful_cols`` are
    used as features. The validation accuracy is printed, and the predicted
    probability of class 1 for every row of ``test_df`` is appended to the
    module-level ``test_predictions`` list.

    Parameters
    ----------
    fold : int
        Fold number to hold out as the validation set.

    Returns
    -------
    None
    """
    print(f'Processing fold:{fold}')
    df_train = df_folds[df_folds.kfold!=fold].reset_index(drop=True)
    df_validation = df_folds[df_folds.kfold==fold].reset_index(drop=True)

    X_train,Y_train = df_train.drop(['target'],axis=1),df_train.target
    X_valid,Y_valid = df_validation.drop(['target'],axis=1),df_validation.target

    X_train,X_valid = X_train[useful_cols],X_valid[useful_cols]
    # Column selection already returns a new frame, so no explicit copy is needed.
    X_test = test_df[useful_cols]

    # Building the model
    # NOTE(review): the values below are hard-coded and presumably come from an
    # earlier tuning run - confirm they still match study.best_params.
    # NOTE(review): gpu_id=1 addresses the second GPU - confirm the runtime
    # actually exposes more than one device.
    clf = XGBClassifier(tree_method='gpu_hist',gpu_id=1,
                        predictor='gpu_predictor',
                        n_estimators = 4526,
                        learning_rate = 0.038442373775622506,
                        max_depth = 10,
                        reg_lambda = 4.4146229302758256e-07,
                        reg_alpha =  3.2287428442732536e-05,
                        subsample = 0.8711371422200682)

    clf.fit(X_train,Y_train)
    Y_pred_valid = clf.predict(X_valid)
    print(f'Accuracy on validation data: {metrics.accuracy_score(Y_valid,Y_pred_valid)}')
    # Store class-1 probabilities rather than hard 0/1 labels: the per-fold
    # predictions are averaged afterwards, and averaging labels discards the
    # model's confidence and yields only a handful of distinct scores.
    Y_pred_test = clf.predict_proba(X_test)[:, 1]
    test_predictions.append(Y_pred_test)

In [None]:
print('Building an XGB classifier model')
# Empty the shared list in place so that re-running this cell does not stack a
# second set of fold predictions on top of the first.
test_predictions.clear()
for index in range(10):
    run_fold(index)

In [None]:
# Put each fold's test predictions in its own column, then average across the
# columns to get one value per test row.
stacked_fold_predictions = np.column_stack(test_predictions)
final_predictions = stacked_fold_predictions.mean(axis=1).tolist()

In [None]:
# Load the sample submission and overwrite its target column with the
# averaged predictions.
submissions_df = pd.read_csv("../input/tabular-playground-series-may-2022/sample_submission.csv")
# Bracket assignment always writes a DataFrame column; attribute-style
# assignment would only set a Python attribute if the column were missing.
submissions_df["target"] = final_predictions

In [None]:
# Write the submission file without the DataFrame index.
submissions_df.to_csv("submission2.csv",index=False)