> https://www.kaggle.com/c/tabular-playground-series-may-2021/data

In [None]:
import os
import warnings
from pathlib import Path

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

In [None]:
# Notebook-wide settings: render up to 100 columns of a frame and silence
# all warnings for the rest of the session.
pd.set_option('display.max_columns', 100)
warnings.simplefilter('ignore')

## Data loading

In [None]:
# Folder holding the competition CSV files inside the Kaggle input mount.
data_path = Path('/kaggle/input') / 'tabular-playground-series-may-2021'

In [None]:
# Read the train and test splits from the competition folder.
train_df, test_df = (
    pd.read_csv(data_path / f'{split}.csv') for split in ('train', 'test')
)

train_df.shape, test_df.shape

In [None]:
# Stack the feature columns of train and test (ids and target removed) into a
# single frame that is used for the EDA study.

df = pd.concat(
    [
        train_df.drop(columns=['id', 'target']),
        test_df.drop(columns='id'),
    ],
    axis='index',
)
df.shape[0]

In [None]:
# Load the sample submission to see the expected output layout.
sample_submission_df = pd.read_csv(data_path.joinpath('sample_submission.csv'))
sample_submission_df.head(5)

In [None]:
# Pull the label column out of the training frame (pop removes it in place),
# then discard the row identifier so only feature columns remain.
target = train_df.pop('target')
train_df.drop(columns='id', inplace=True)

In [None]:
# Keep the test ids for the submission file; pop removes the column from
# test_df in place so only feature columns remain.
test_ids = test_df.pop('id')

## Feature engineering

In [None]:
# List the feature names and report how many there are.
print(df.columns)
print('Total no. of features: {}'.format(df.shape[1]))

### Duplicate rows removal

In [None]:
# Row count before and after removing exact duplicate feature rows
# (the first occurrence of each duplicate is the one kept, pandas' default).
print(train_df.shape[0])
train_df.drop_duplicates(inplace=True)
train_df.shape[0]

In [None]:
# Keep only the labels whose rows survived de-duplication (matched by index label).
target = target.loc[train_df.index]
target.shape[0]

*4 rows are duplicates*

### Imputation

In [None]:
# Total number of missing cells across the combined train+test features.
df.isna().to_numpy().sum()

> *No missing values in the training & testing datasets*

### Identifying Categorical vs Numeric

In [None]:
# Preview the first rows of the combined train+test feature frame.
df.head()

> *All features are numeric*<br>

Let's find out more from the histogram plots.

### Histogram plots of all features

In [None]:
# Plot a histogram per feature with the y-axis on a log scale, as most of the values are zeros

In [None]:
# One histogram per feature on a 10 x 5 grid of panels.
figure, ax = plt.subplots(nrows=10, ncols=5, figsize=(20, 30))
ax = list(ax.flat)  # flatten the 2-D array of axes row by row
for axis, feature in zip(ax, df.columns):
    axis.hist(df[feature], bins=20, range=(df[feature].min(), df[feature].max()))
    # Log-scaled counts keep the non-zero bins visible next to the zero bin.
    axis.set(yscale='log', title=feature)
figure.tight_layout()
plt.show()

### Insights from histogram plots
1. All the features are right-skewed
2. Most values in every feature column are zeros (>90% in all columns)
3. Features 2, 13, 22 and 36 look like categorical features
4. Many features look correlated with each other. For example: feature_0 ~= feature_6, feature_11 ~= feature_12, etc.

> Should we consider all the features with max value < 10 as **Categorical**?

In [None]:
def get_min_max(x):
    """Summarise one column as ``(min, max, number of distinct values)``."""
    lowest = x.min()
    highest = x.max()
    n_distinct = x.unique().size
    return lowest, highest, n_distinct

In [None]:
# One row per feature with its min, max and distinct-value count; show the
# 15 features with the fewest distinct values.
min_max_df = (
    train_df
    .apply(get_min_max, axis=0)
    .set_axis(['min', 'max', 'num unique'], axis='index')
    .T
)
min_max_df.sort_values(by='num unique').head(15)

1. Not much difference between features with max value >= 10.
2. Considering only 13, 36, 22 and 2 as categorical

In [None]:
# Features treated as categorical.
cat_features = [f'feature_{idx}' for idx in (13, 36, 22, 2)]

In [None]:
# One-hot encode train and test together. Encoding each frame on its own
# derives the dummy columns (and the level removed by drop_first) from whatever
# values happen to occur in that frame, so a level missing from one split would
# give the two frames different or differently-coded columns.
# The `keys` add an outer index level used to split the frames apart again;
# each frame gets its original row index back.
_encoded = pd.get_dummies(
    pd.concat([train_df, test_df], keys=['train', 'test']),
    columns=cat_features,
    drop_first=True,
)
train_df = _encoded.loc['train']
test_df = _encoded.loc['test']
del _encoded

train_df.shape, test_df.shape

### Feature selection using Lasso

In [None]:
from sklearn.preprocessing import LabelEncoder

# Map the class labels in `target` to integer codes; the fitted encoder is kept
# so the mapping stays available.
label_encoder = LabelEncoder().fit(target)
target_labels = label_encoder.transform(target)
target_labels[:10]

In [None]:
from sklearn.linear_model import LassoCV

# Cross-validated Lasso on the integer-encoded labels, default settings.
lasso = LassoCV()
lasso.fit(X=train_df, y=target_labels)

In [None]:
# Selected regularisation strength and the fit quality on the training data itself.
print('best alpha: {}'.format(lasso.alpha_))
print('best score: {}'.format(lasso.score(train_df, target_labels)))

## Baseline model building - XGBoost

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the training rows, stratified by class, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    train_df,
    target_labels,
    test_size=0.2,
    shuffle=True,
    stratify=target_labels,
    random_state=13,
)

X_train.shape, X_test.shape

In [None]:
from xgboost import XGBClassifier

# Baseline: depth-3 boosted trees with up to 1500 rounds.
xgb_model = XGBClassifier(
    objective='multi:softprob',
    num_class=4,
    n_estimators=1500,
    max_depth=3,
    eta=0.2,
    random_state=13,
    use_label_encoder=False,
)
# mlogloss is tracked on both the training and the hold-out split, with early
# stopping after 10 rounds.
xgb_model.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    eval_metric=['mlogloss'],
    early_stopping_rounds=10,
    verbose=False,
)

In [None]:
# Best mlogloss recorded by early stopping in the fit above
# (presumably on the last eval_set entry, (X_test, y_test) -- TODO confirm).
xgb_model.best_score

In [None]:
# test results

# Class probabilities for the competition test set, one column per class.
predictions = xgb_model.predict_proba(test_df)
predictions_df = pd.DataFrame(predictions, columns=[f'class_{k}' for k in range(1, 5)])

In [None]:
# Put the test ids next to the predicted class probabilities.
submission_df = pd.concat([test_ids, predictions_df], axis='columns')
submission_df.head(5)

In [None]:
# Write the baseline submission to CSV without the index column.
submission_df.to_csv('xgb_baseline.csv', index=False)

In [None]:
# utility function to create submission csv for further predictions
def create_submission_df(test_ids, predictions):
    """Build a submission frame: the test ids followed by class probabilities.

    Parameters
    ----------
    test_ids : pandas.Series or array-like
        One id per test row, in the same order as ``predictions``.
    predictions : array-like of shape (n_rows, 4)
        Per-class probabilities; columns are labelled ``class_1`` .. ``class_4``.

    Returns
    -------
    pandas.DataFrame
        ``n_rows`` rows with a fresh ``0 .. n_rows - 1`` index.

    Raises
    ------
    ValueError
        If the number of ids differs from the number of prediction rows.
    """
    predictions_df = pd.DataFrame(predictions, columns=['class_1', 'class_2', 'class_3', 'class_4'])
    # Pair rows by position, not by index label: concat(axis=1) aligns on the
    # index, so ids carrying a non-default index would otherwise be padded
    # with NaN instead of lining up with their predictions.
    ids = pd.Series(test_ids).reset_index(drop=True)
    predictions_df = predictions_df.reset_index(drop=True)
    if len(ids) != len(predictions_df):
        raise ValueError(
            f'got {len(ids)} ids but {len(predictions_df)} prediction rows'
        )
    submission_df = pd.concat([ids, predictions_df], axis=1)
    return submission_df

### Randomized grid search with XGBoost

In [None]:
# Candidate values for each hyper-parameter of the randomized search.
param_grid = dict(
    min_child_weight=[1, 5, 10],
    gamma=[0.5, 1, 1.5, 2, 5],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    max_depth=[2, 3, 4, 5],
)

# Size of the full grid: the product of the number of options per parameter.
print(f'Total number of combinations: {np.prod([len(options) for options in param_grid.values()])}')

In [None]:
# Let's run XGBoost on the GPU for faster computation; show the GPU status first.
!nvidia-smi

In [None]:
# Number of cross-validation folds and of sampled parameter settings.
folds, param_combination = 5, 10

In [None]:
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold

# Seeded, shuffled stratified splitter used for cross-validation.
skf = StratifiedKFold(folds, shuffle=True, random_state=13)

In [None]:
# Base estimator for the randomized search.
# - No early_stopping_rounds: RandomizedSearchCV.fit() passes no eval_set, so
#   early stopping has nothing to monitor (older XGBoost releases ignore the
#   argument with a warning, newer ones raise at fit time).
# - `verbosity` is XGBoost's logging parameter; `verbose` is not an
#   XGBClassifier argument.
xgb2 = XGBClassifier(learning_rate=0.1, n_estimators=500,
                     objective='multi:softprob', eval_metric='mlogloss', verbosity=1,
                     tree_method='gpu_hist')  # uses gpu !*important
# Pass the splitter itself instead of skf.split(...): a generator is consumed
# by the first fit, while the seeded splitter produces the same folds on every
# fit, so the search can be re-run.
# NOTE(review): candidates are ranked by ROC AUC (one-vs-rest) while the
# hold-out check in this notebook uses log loss -- consider
# scoring='neg_log_loss' so selection and evaluation agree.
random_search = RandomizedSearchCV(xgb2, param_distributions=param_grid,
                                   n_iter=param_combination, scoring='roc_auc_ovr',
                                   n_jobs=-1, cv=skf,
                                   verbose=1, random_state=13)

In [None]:
%time
random_search.fit(X_train, y_train)

In [None]:
# Best mean cross-validated score (scoring='roc_auc_ovr') and the parameter
# setting that achieved it.
random_search.best_score_, random_search.best_params_

In [None]:
from sklearn.metrics import log_loss

def eval_X_test(X_test, y_test, model):
    """Return the log loss of ``model`` on ``(X_test, y_test)``, rounded to 6 decimals.

    ``model`` must expose ``predict_proba``.
    """
    probabilities = model.predict_proba(X_test)
    loss = log_loss(y_test, probabilities)
    return round(loss, 6)

eval_X_test(X_test, y_test, random_search)

In [None]:
# Predict the competition test set with the tuned search and build the submission frame.
test_pred = random_search.predict_proba(test_df)
submission_df2 = create_submission_df(test_ids=test_ids, predictions=test_pred)
submission_df2.head(5)

In [None]:
# Write the randomized-search submission to CSV without the index column.
submission_df2.to_csv('xgb_random_search-2.csv', index=False)

#### Let's increase the param combination and search for more points in grid space

In [None]:
# Sample 100 parameter settings instead of 10.
param_combination = 100

# cv receives the splitter itself rather than skf.split(...): a generator is
# used up by the first fit, while the seeded splitter yields the same folds on
# every fit, so this search can be re-run.
random_search_100 = RandomizedSearchCV(xgb2, param_distributions=param_grid,
                                   n_iter=param_combination, scoring='roc_auc_ovr',
                                   n_jobs=-1, cv=skf,
                                   verbose=10, random_state=13)

In [None]:
# Run the 100-candidate randomized search on the training split.
random_search_100.fit(X_train, y_train)

In [None]:
# Best mean cross-validated score and the matching parameters, both taken from
# the 100-candidate search (not from the earlier 10-candidate one).
random_search_100.best_score_, random_search_100.best_params_

In [None]:
# Log loss of the 100-candidate search on the hold-out split.
eval_X_test(X_test, y_test, random_search_100)

In [None]:
# Predict the competition test set with the 100-candidate search and build the
# submission frame (this rebinds `submission_df`).
subm_pred = random_search_100.predict_proba(test_df)
submission_df = create_submission_df(test_ids=test_ids, predictions=subm_pred)
submission_df.head(5)

In [None]:
# Write the 100-candidate search submission to CSV without the index column.
submission_df.to_csv('xgb_random_search-100.csv', index=False)