### In this notebook, I explore `XGBoost` extensively, tuning its hyperparameters with grid search
### Please upvote if you find it helpful. <br> Do comment if you see something that can be improved, or have suggestions on what to try next.

In [None]:
# imports
import os
from pathlib import Path
import warnings

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

In [None]:
# Notebook-wide display and plotting configuration.
pd.set_option('display.max_rows', 100)     # render up to 100 rows of a frame
pd.set_option('display.max_columns', 100)  # render up to 100 columns of a frame
plt.style.use('ggplot')
# Suppress every warning for the rest of the session.
warnings.filterwarnings('ignore')

## Data load

In [None]:
# Directory holding the competition CSV files.
path = Path('/kaggle/input/tabular-playground-series-jun-2021/')
# Read the three CSVs in one pass; names are bound in the same order as the file list.
train_df, test_df, sample_submission = (
    pd.read_csv(path / csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [None]:
# Preview the first rows of the sample submission file.
sample_submission.head()

In [None]:
# (rows, columns) of the train and test frames.
train_df.shape, test_df.shape

In [None]:
# Move the id column out of each frame: the ids are kept in their own Series
# and the frames are left without an 'id' column.
train_ids = train_df.pop('id')
test_ids = test_df.pop('id')

In [None]:
# Detach the label column; train_df keeps only the remaining columns afterwards.
target = train_df.pop('target')

In [None]:
# Encode target variable
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the raw target values and keep the resulting integer
# codes as a Series named 'target'.
label_enc = LabelEncoder()
encoded_target = label_enc.fit_transform(target)
target_labels = pd.Series(encoded_target, name='target')
target_labels.head(5)

In [None]:
# (rows, columns) of the train and test frames at this point in the notebook.
train_df.shape, test_df.shape

In [None]:
# Total number of missing cells in each frame (train, test).
train_df.isna().sum().sum(), test_df.isna().sum().sum()

*There appear to be no null values in either dataset.*

## EDA

In [None]:
# Class balance of the training target: one bar per class, each annotated with
# the class's percentage of the training rows.
target_value_counts = target.value_counts()
target_value_percent = target.value_counts() * 100 / len(train_df)

fig, ax = plt.subplots(figsize=(10, 4))
ax.bar(target_value_counts.index, target_value_counts.values)
bar_annotations = zip(target_value_counts.items(), target_value_percent.values)
for (class_name, class_count), class_share in bar_annotations:
    # A fixed offset of 1000 on the count axis puts the text above the bar top.
    ax.text(class_name, class_count + 1000, f'{class_share:.1f}%', ha='center')
ax.set_ylim(0, 60000)
ax.set_title('Target class counts')
ax.set_xlabel('Target class')
ax.set_ylabel('Count')
ax.grid(False)
fig.tight_layout()
plt.show()

In [None]:
def get_num_unique(x):
    """Return the number of distinct values in the Series `x`.

    Missing values are not dropped: if present, they count as a distinct value.
    """
    return x.nunique(dropna=False)

# Per-feature summary table: descriptive statistics, number of distinct values,
# and the percentage of rows whose value is exactly zero.
feature_unique_df = train_df.apply(get_num_unique, axis=0).to_frame('n_unique')
# Count zeros by comparison rather than `value_counts()[0]`: the label lookup
# raises KeyError for any column that contains no zero at all, whereas this
# simply yields 0.0 for such a column.
zero_percent_df = ((train_df == 0).sum() * 100 / len(train_df)).to_frame('per_zero')

train_describe_df = pd.concat([
    train_df.describe(percentiles=[0.25, 0.5, 0.75, 0.95]).T.drop('count', axis=1),
    feature_unique_df,
    zero_percent_df
], axis=1)
train_describe_df.head()

In [None]:
# Order the summary by spread, then cardinality, and render it with in-cell
# bars for 'mean'/'per_zero' and a colour gradient for 'n_unique'/'std'.
(
    train_describe_df
    .sort_values(by=['std', 'n_unique'])
    .style
    .bar(subset=['mean', 'per_zero'])
    .background_gradient(subset=['n_unique', 'std'])
)

## Data prep

In [None]:
# Toggle: any truthy value removes the listed features from both frames.
drop_the_zero_features = 0
# NOTE(review): presumably the features judged to be dominated by zeros in the
# summary table — the list is hard-coded, so verify it against that table.
zero_feature_columns = [
    'feature_15', 'feature_17', 'feature_22', 'feature_36',
    'feature_47', 'feature_49', 'feature_66', 'feature_74',
]
if drop_the_zero_features:
    for frame in (train_df, test_df):
        frame.drop(columns=zero_feature_columns, inplace=True)

train_df.shape, test_df.shape

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Shuffled hold-out split stratified on the encoded target; 15% of the rows go
# to the X_test / y_test part.
split_options = dict(test_size=0.15, random_state=13, shuffle=True, stratify=target_labels)
X_train, X_test, y_train, y_test = train_test_split(train_df, target_labels, **split_options)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

## XGBoost

In [None]:
from xgboost import XGBClassifier

In [None]:
# Baseline multi-class model. Up to 1000 boosting rounds are allowed, with
# early stopping (10 rounds) monitored on the hold-out split via mlogloss.
baseline_params = dict(
    n_estimators=1000,
    learning_rate=0.1,
    max_depth=5,
    min_child_weight=1,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='multi:softprob',
    eval_metric='mlogloss',
    gamma=0,
    num_class=9,
    use_label_encoder=False,
    tree_method='gpu_hist',
    n_jobs=-1,
    seed=13,
)
xgb_baseline = XGBClassifier(**baseline_params)
xgb_baseline.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    early_stopping_rounds=10,
    verbose=False,
)

In [None]:
# utility function to create submission csv for further predictions
def create_submission_df(test_ids, predictions):
    """Build a submission frame: the ids followed by one column per class.

    Parameters
    ----------
    test_ids : pandas.Series
        Row identifiers, one per prediction row. Its index is ignored; ids are
        paired with prediction rows by position.
    predictions : array-like of shape (n_rows, n_classes)
        Per-class scores, e.g. the output of ``predict_proba``.

    Returns
    -------
    pandas.DataFrame
        ``test_ids`` as the first column, then ``class_1`` .. ``class_<n_classes>``.

    Raises
    ------
    ValueError
        If the number of ids differs from the number of prediction rows.
    """
    # NOTE(review): confirm these column names (including letter case) match
    # the header of the sample submission file.
    predictions_df = pd.DataFrame(np.asarray(predictions))
    n_classes = predictions_df.shape[1]
    predictions_df.columns = ['class_' + str(x) for x in range(1, n_classes + 1)]

    if len(test_ids) != len(predictions_df):
        raise ValueError(
            f'got {len(test_ids)} ids but {len(predictions_df)} prediction rows'
        )

    # concat(axis=1) aligns on index labels; reset the ids to a positional
    # index so a filtered or shuffled id Series cannot produce NaN-padded rows.
    positional_ids = test_ids.reset_index(drop=True)
    submission_df = pd.concat([positional_ids, predictions_df], axis=1)
    return submission_df

In [None]:
# Class probabilities from the baseline model for every test row, packaged
# together with the test ids.
test_pred = xgb_baseline.predict_proba(test_df)
submission_df = create_submission_df(test_ids=test_ids, predictions=test_pred)
submission_df.head(5)

In [None]:
# Write the current submission frame to disk without the row index.
submission_df.to_csv('submission.csv', index=False)

### Feature importances

In [None]:
# Pair each training column with the baseline model's importance score for it.
feature_importance_dict = dict(
    columns=train_df.columns,
    score=xgb_baseline.feature_importances_,
)
feature_importance_df = pd.DataFrame(feature_importance_dict)[['columns', 'score']]
feature_importance_df.head(3)

In [None]:
# Horizontal bars of the importance scores, sorted by score. Passing
# x='columns' labels each bar with its feature name; without it the bars are
# labelled with the frame's integer row index.
feature_importance_df.sort_values('score').plot(kind='barh', x='columns', y='score', figsize=(16, 18));

In [None]:
# Keep the names of the 30 features with the HIGHEST importance scores.
# (An ascending sort followed by [:30] would select the 30 lowest-scoring
# features instead, so the largest scores are requested explicitly.)
top_columns = feature_importance_df.nlargest(30, 'score')['columns']

## Grid search
### Grid search on max_depth & min_child_weight

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Candidate values for the first search stage.
param_grid = dict(
    # maximum depth of each tree: 3, 5, 7, 9
    max_depth=range(3, 10, 2),
    # minimum sum of instance weight (hessian) required in a child: 1, 3, 5
    min_child_weight=range(1, 6, 2),
)

In [None]:
# Exhaustive search over `param_grid` around the baseline estimator, using
# 3-fold cross-validation ranked by one-vs-rest ROC AUC.
# NOTE(review): the models in this notebook are fitted with
# eval_metric='mlogloss' while the search ranks by ROC AUC — confirm intended.
grid_cv = GridSearchCV(
    estimator=xgb_baseline,
    param_grid=param_grid,
    scoring='roc_auc_ovr',
    cv=3,
    verbose=3,
)
grid_cv.fit(X_train, y_train)

In [None]:
# Parameter combination selected by the first grid search.
grid_cv.best_params_

In [None]:
# Model for the next tuning stage, early-stopped on the hold-out split.
# NOTE(review): max_depth / min_child_weight are hard-coded — presumably
# copied from grid_cv.best_params_; verify after re-running the search.
grid_1_params = dict(
    n_estimators=1000,
    max_depth=3,
    min_child_weight=5,
    objective='multi:softprob',
    eval_metric='mlogloss',
    num_class=9,
    use_label_encoder=False,
    tree_method='gpu_hist',
    n_jobs=-1,
)
xgb_grid_1 = XGBClassifier(**grid_1_params)
xgb_grid_1.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    early_stopping_rounds=10,
    verbose=0,
)

In [None]:
# Best boosting iteration found by early stopping, and the eval-set score at that iteration.
xgb_grid_1.best_iteration, xgb_grid_1.best_score

### Grid search on gamma

In [None]:
# Candidate gamma values: 0.0, 0.1, 0.2, 0.3, 0.4.
param_grid = dict(gamma=[step / 10.0 for step in range(5)])

In [None]:
# Search `param_grid` around xgb_grid_1 with 5-fold CV, ranked by one-vs-rest ROC AUC.
grid_cv_2 = GridSearchCV(
    estimator=xgb_grid_1,
    param_grid=param_grid,
    scoring='roc_auc_ovr',
    cv=5,
    verbose=3,
)
grid_cv_2.fit(X_train, y_train)

In [None]:
# Parameter combination selected by the second grid search.
grid_cv_2.best_params_

In [None]:
# Model for the next tuning stage, early-stopped on the hold-out split.
# NOTE(review): gamma is hard-coded — presumably copied from
# grid_cv_2.best_params_; verify after re-running the search.
grid_2_params = dict(
    n_estimators=999,
    max_depth=3,
    min_child_weight=5,
    gamma=0.4,
    objective='multi:softprob',
    eval_metric='mlogloss',
    num_class=9,
    use_label_encoder=False,
    tree_method='gpu_hist',
    n_jobs=-1,
)
xgb_grid_2 = XGBClassifier(**grid_2_params)

xgb_grid_2.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    early_stopping_rounds=10,
    verbose=0,
)

In [None]:
# Best boosting iteration found by early stopping, and the eval-set score at that iteration.
xgb_grid_2.best_iteration, xgb_grid_2.best_score

### Grid search on subsample & colsample_bytree

In [None]:
# Candidate sampling fractions, 0.6 through 0.9 in steps of 0.1.
param_grid = dict(
    # fraction of training rows sampled for each tree
    subsample=[tenth / 10.0 for tenth in range(6, 10)],
    # fraction of features sampled for each tree
    colsample_bytree=[tenth / 10.0 for tenth in range(6, 10)],
)

In [None]:
# Search `param_grid` around xgb_grid_2 with 5-fold CV, ranked by one-vs-rest ROC AUC.
grid_cv_3 = GridSearchCV(
    estimator=xgb_grid_2,
    param_grid=param_grid,
    scoring='roc_auc_ovr',
    cv=5,
    verbose=3,
)
grid_cv_3.fit(X_train, y_train)

In [None]:
# Parameter combination selected by the third grid search.
grid_cv_3.best_params_

In [None]:
# Model for the next tuning stage, early-stopped on the hold-out split.
# NOTE(review): colsample_bytree / subsample are hard-coded — presumably
# copied from grid_cv_3.best_params_; verify after re-running the search.
grid_3_params = dict(
    n_estimators=999,
    max_depth=3,
    min_child_weight=5,
    gamma=0.4,
    colsample_bytree=0.6,
    subsample=.9,
    objective='multi:softprob',
    eval_metric='mlogloss',
    num_class=9,
    use_label_encoder=False,
    tree_method='gpu_hist',
    n_jobs=-1,
)
xgb_grid_3 = XGBClassifier(**grid_3_params)

xgb_grid_3.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    early_stopping_rounds=10,
    verbose=0,
)

In [None]:
# Best boosting iteration found by early stopping, and the eval-set score at that iteration.
xgb_grid_3.best_iteration, xgb_grid_3.best_score

### Grid search on reg_alpha (L1 regularization)

In [None]:
# Candidate reg_alpha values, from very weak (1e-5) to strong (100).
param_grid = dict(reg_alpha=[1e-5, 1e-2, 0.1, 1, 100])

In [None]:
# Search `param_grid` around xgb_grid_3 with 5-fold CV, ranked by one-vs-rest ROC AUC.
grid_cv_4 = GridSearchCV(
    estimator=xgb_grid_3,
    param_grid=param_grid,
    scoring='roc_auc_ovr',
    cv=5,
    verbose=3,
)
grid_cv_4.fit(X_train, y_train)

In [None]:
# Parameter combination selected by the fourth grid search.
grid_cv_4.best_params_

In [None]:
# Model carrying every value chosen so far, early-stopped on the hold-out split.
# NOTE(review): reg_alpha is hard-coded — presumably copied from
# grid_cv_4.best_params_; verify after re-running the search.
grid_4_params = dict(
    n_estimators=1000,
    max_depth=3,
    min_child_weight=5,
    eta=0.1,
    gamma=0.4,
    subsample=.9,
    colsample_bytree=0.6,
    reg_alpha=100,
    num_class=9,
    use_label_encoder=False,
    tree_method='gpu_hist',
    objective='multi:softprob',
    eval_metric='mlogloss',
)
xgb_grid_4 = XGBClassifier(**grid_4_params)
xgb_grid_4.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    early_stopping_rounds=10,
    verbose=False,
)

In [None]:
# Best boosting iteration found by early stopping, and the eval-set score at that iteration.
xgb_grid_4.best_iteration, xgb_grid_4.best_score

## Final XGBoost with best parameters

In [None]:
# Shapes of the full training features, the test features and the encoded target.
train_df.shape, test_df.shape, target_labels.shape

In [None]:
# Final model: fitted on ALL training rows with a fixed number of boosting
# rounds, with no eval set and no early stopping.
# NOTE(review): n_estimators=550 is hard-coded — presumably derived from an
# earlier best_iteration; verify.
final_params = dict(
    n_estimators=550,
    max_depth=3,
    min_child_weight=5,
    eta=0.1,
    subsample=.9,
    colsample_bytree=0.6,
    gamma=0.4,
    reg_alpha=100,
    num_class=9,
    use_label_encoder=False,
    tree_method='gpu_hist',
    objective='multi:softprob',
    eval_metric='mlogloss',
)
xgb_final = XGBClassifier(**final_params)
xgb_final.fit(train_df, target_labels)

In [None]:
# Score the test set with `xgb_final`, the model fitted on the full training
# set. Predicting with the tuning-stage `xgb_grid_4` here would leave
# `xgb_final` unused and mislabel the resulting submission.
test_pred = xgb_final.predict_proba(test_df)
submission_df = create_submission_df(test_ids, test_pred)
submission_df.head()

In [None]:
# Write the current submission frame to disk without the row index.
submission_df.to_csv('xgb_final.csv', index=False)