In [None]:
import numpy as np
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import gc

from colorama import Fore, Back, Style

# Short aliases for colorama foreground colours, plus the reset code that
# switches terminal styling back to normal.
y_, r_, g_, b_, m_, c_ = (
    Fore.YELLOW,
    Fore.RED,
    Fore.GREEN,
    Fore.BLUE,
    Fore.MAGENTA,
    Fore.CYAN,
)
rs_ = Style.RESET_ALL

import warnings
# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')

# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

In [None]:
# Load the training set, the test set and the sample submission, in that order.
train, test, submission = (
    pd.read_csv(csv_path, index_col=None)
    for csv_path in (
        '/kaggle/input/tabular-playground-series-may-2021/train.csv',
        '/kaggle/input/tabular-playground-series-may-2021/test.csv',
        '/kaggle/input/tabular-playground-series-may-2021/sample_submission.csv',
    )
)

In [None]:
# (rows, columns) of the training frame followed by the test frame.
(train.shape, test.shape)

In [None]:
# Show every column whenever a frame is displayed (global pandas option,
# so it also affects the cells that follow).
pd.set_option('display.max_columns', None)
# Preview the first rows instead of rendering the whole training frame.
train.head()

In [None]:
# Feature columns: every column except the first and the last
# (assumed to be `id` and `target` -- confirm against the CSV header).
all_cols = list(train.columns[1:-1])

# Fixed colour for each target class, used by the distribution plot.
colors = {
    'Class_1': '#0722ab',
    'Class_2': '#fdb913',
    'Class_3': '#3d2256',
    'Class_4': '#ef4022',
}

# Number of rows per target class, plus each class's share of the total in percent.
target_count = (
    train.groupby('target')['id']
    .count()
    .reset_index()
    .rename(columns={'id': 'count'})
)
target_count['pct'] = target_count['count'] / target_count['count'].sum() * 100
def autopct_format(values):
    """Build an ``autopct`` callback for ``Axes.pie``.

    The returned function receives a wedge's percentage, rescales it by the
    sum of ``values`` (``pct * sum(values) / 100``), rounds to the nearest
    integer and renders it followed by a ``%`` sign.
    """
    def _wedge_label(pct):
        # The sum is taken on every call, so later changes to `values` are seen.
        scaled = round(pct * sum(values) / 100.0)
        return '{v:d}%'.format(v=int(scaled))

    return _wedge_label

# Donut chart showing the share of each target class.
sizes = target_count['pct']
labels = target_count['target']
# One offset per wedge, so the tuple always matches the number of classes plotted.
explode = (0.05,) * len(labels)

fig1, ax1 = plt.subplots(1, 1, figsize=(6, 6), facecolor='w', edgecolor='b')
patches, texts, autotexts = ax1.pie(
    sizes,
    colors=[colors[key] for key in labels],
    labels=labels,
    autopct=autopct_format(sizes),
    startangle=90,
    pctdistance=0.85,
    explode=explode,
    # NOTE(review): 'Computer Modern' has to be installed for matplotlib to
    # use it; otherwise a fallback font is presumably used -- confirm.
    textprops={'fontsize': 14, 'fontfamily': 'Computer Modern'},
)

# Plain loops instead of list comprehensions that were only run for side effects.
for text in texts:
    text.set_color('#4a4b52')
for autotext in autotexts:
    autotext.set_color('white')
    autotext.set_weight('bold')

# A white disc over the centre turns the pie into a donut.
centre_circle = plt.Circle((0, 0), 0.70, fc='white')
ax1.add_artist(centre_circle)

# Equal aspect ratio ensures that pie is drawn as a circle.
# The return value is deliberately not assigned: `ax1` must stay bound to the Axes.
ax1.axis('equal')
plt.tight_layout(pad=3.0)
plt.subplots_adjust(top=0.91)
plt.suptitle('Target class distribution', fontsize=20)
plt.show()
# Summary statistics of the training features, one row per feature, with
# the mean / std / min / max columns colour-graded.
all_cols = train.columns[1:-1]
(
    train[all_cols]
    .describe()
    .T
    .style
    .background_gradient(subset=['mean'], cmap='viridis_r')
    .background_gradient(subset=['std'], cmap='viridis_r')
    .background_gradient(subset=['min'], cmap='nipy_spectral')
    .background_gradient(subset=['max'], cmap='binary')
)

In [None]:
# Same colour-graded summary table, computed on the test features.
(
    test[all_cols]
    .describe()
    .T
    .style
    .background_gradient(subset=['mean'], cmap='viridis_r')
    .background_gradient(subset=['std'], cmap='viridis_r')
    .background_gradient(subset=['min'], cmap='nipy_spectral')
    .background_gradient(subset=['max'], cmap='binary')
)

In [None]:
from sklearn.preprocessing import LabelEncoder
# Replace the target labels with the integer codes assigned by LabelEncoder.
le = LabelEncoder()
enc = le.fit_transform(train['target'])
# assign() builds a new frame; `train` is rebound to the encoded copy.
train = train.assign(target=enc)
train.head(n=5)

In [None]:
# Feature matrix and encoded target vector.
X, y = train[all_cols], train['target']

# K-Means and SMOTE

In [None]:
from imblearn.over_sampling import KMeansSMOTE
from sklearn.cluster import MiniBatchKMeans
# Class balance before oversampling.
labels_before, counts_before = np.unique(train['target'], return_counts=True)
for label, count in zip(labels_before, counts_before):
    print('Class {} has {} samples'.format(label, count))

# KMeans-SMOTE with the 'not majority' strategy; clustering is done by a
# seeded MiniBatchKMeans with 100 clusters.
kmeans_smote = KMeansSMOTE(
    sampling_strategy='not majority',
    random_state=42,
    k_neighbors=10,
    cluster_balance_threshold=0.1,
    kmeans_estimator=MiniBatchKMeans(n_clusters=100, random_state=42),
)
X_resampled, y_resampled = kmeans_smote.fit_resample(train[all_cols], train['target'])

# Class balance after oversampling.
labels_after, counts_after = np.unique(y_resampled, return_counts=True)
for label, count in zip(labels_after, counts_after):
    print('Class {} has {} samples after oversampling'.format(label, count))

In [None]:
# Wrap the resampled features in a DataFrame restricted to the feature columns.
X_new = pd.DataFrame(data=X_resampled, index=None, columns=all_cols)

# Optuna + XGBoost 

In [None]:
import optuna
from functools import partial
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedShuffleSplit
from tqdm import tqdm

def callback(study, trial):
    """Optuna callback that keeps the model of the current best trial.

    When the trial that just finished is the study's best trial, its
    ``"best_model"`` user attribute is copied onto the study under the same key.
    """
    if trial.number != study.best_trial.number:
        return
    study.set_user_attr(key="best_model", value=trial.user_attrs["best_model"])

def optimize(trial, X, y, n_splits):
    """Optuna objective: mean log-loss of an XGBoost classifier over shuffle splits.

    Samples one hyper-parameter configuration from ``trial``, fits an
    ``XGBClassifier`` on each of ``n_splits`` stratified 80/20 shuffle splits
    and scores the held-out part with ``log_loss``.

    Args:
        trial: Optuna trial used to sample the hyper-parameters. The model
            fitted on the last split is attached to it as the
            ``"best_model"`` user attribute.
        X: pandas DataFrame of features.
        y: pandas Series of class labels, row-aligned with ``X``.
        n_splits: Number of stratified shuffle splits to average over.

    Returns:
        The mean log-loss across the splits (the study minimises it).

    NOTE(review): the notebook passes the KMeans-SMOTE-resampled data into this
    function, so the validation folds contain synthetic rows; the score is
    probably optimistic compared with the original class distribution.
    """
    # Hyper-parameters are suggested in a fixed order so `trial.params` keeps
    # the same key order. `suggest_float` replaces the deprecated
    # `suggest_uniform` and samples the same uniform range.
    n_estimators = trial.suggest_int("n_estimators", 500, 2500)
    max_depth = trial.suggest_int("max_depth", 10, 25)
    learning_rate = trial.suggest_float("learning_rate", 0.01, 0.5)
    gamma = trial.suggest_float("gamma", 0.05, 0.8)
    subsample = trial.suggest_float("subsample", 0.5, 0.8)
    min_child_weight = trial.suggest_float("min_child_weight", 0.5, 3)
    reg_lambda = trial.suggest_float("reg_lambda", 1.3, 2.3)
    reg_alpha = trial.suggest_float("reg_alpha", 1.5, 2.2)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.25, 0.8)

    params = dict(use_label_encoder=False,
                  # Multi-class log-loss; 'logloss' is the binary metric.
                  eval_metric='mlogloss',
                  # predict_proba() is scored below, so train with the objective
                  # that outputs class probabilities rather than class labels.
                  objective='multi:softprob',
                  n_estimators=n_estimators,
                  max_depth=max_depth,
                  learning_rate=learning_rate,
                  gamma=gamma,
                  subsample=subsample,
                  min_child_weight=min_child_weight,
                  reg_lambda=reg_lambda,
                  reg_alpha=reg_alpha,
                  colsample_bytree=colsample_bytree,
                  random_state=42)

    gpu_params = dict(tree_method='gpu_hist', gpu_id=0)
    params.update(gpu_params)

    model = XGBClassifier(**params)

    strat_split = StratifiedShuffleSplit(n_splits=n_splits,
                                         test_size=0.2,
                                         random_state=42)
    lg_loss = []
    # `total` lets tqdm render a real progress bar for the generator.
    for train_idx, test_idx in tqdm(strat_split.split(X=X, y=y), total=n_splits):
        # split() yields positional indices, so select rows by position
        # (.iloc) rather than by index label (.loc).
        X_train = X.iloc[train_idx]
        y_train = y.iloc[train_idx]
        X_val = X.iloc[test_idx]
        y_val = y.iloc[test_idx]

        model.fit(X_train, y_train)
        preds = model.predict_proba(X_val)
        lg_loss.append(log_loss(y_val, preds))

    mean_loss = np.mean(lg_loss)
    print(f"{y_}Mean log_logss : {mean_loss}{rs_}")
    # Expose the model (as fitted on the last split) to the study callback.
    trial.set_user_attr(key="best_model", value=model)
    return mean_loss

In [None]:
# Number of Optuna trials to run.
num_trails = 25
# Seed the sampler so the hyper-parameter search proposes the same trial
# sequence on every run, matching the seed 42 used elsewhere in the notebook.
study = optuna.create_study(
    direction='minimize',
    study_name='tps-may2021-xgboost-optuna',
    sampler=optuna.samplers.TPESampler(seed=42),
)
# Bind the resampled data and split count; Optuna supplies only `trial`.
optimization_function = partial(optimize, X=X_new, y=y_resampled, n_splits=5)
study.optimize(optimization_function, n_trials=num_trails, callbacks=[callback])

In [None]:
# Report the score and hyper-parameters of the best trial found by the study.
best_trial = study.best_trial
print('Best trial: score {}, params {}'.format(best_trial.value, best_trial.params))


In [None]:
# Hyper-parameters of the best trial (shortcut for study.best_trial.params).
study.best_params

# Train XGBoost with best parameters

In [None]:
# Fixed settings for the final model.
params = dict(use_label_encoder=False,
              # Multi-class log-loss; 'logloss' is the binary metric.
              eval_metric='mlogloss',
              # predict_proba() is called on this model for the submission, so
              # use the objective that outputs class probabilities.
              objective='multi:softprob',
              verbosity=1,
              random_state=42)
# Overlay the hyper-parameters of the best Optuna trial.
params.update(study.best_trial.params)
# Train on the first GPU with the histogram tree method.
gpu_params = dict(tree_method='gpu_hist', gpu_id=0)
params.update(gpu_params)
params

In [None]:
# Build a fresh classifier from the assembled parameters and fit it on the
# full resampled training data.
best_model = XGBClassifier(**params)
best_model.fit(X=X_new, y=y_resampled)

# Predict

In [None]:
# Class probabilities for every test row, using the feature columns only.
predictions = best_model.predict_proba(test.loc[:, all_cols])

In [None]:
# One probability column per class.
# NOTE(review): assumes predict_proba's column order matches Class_1..Class_4
# (the sorted order LabelEncoder assigns) -- confirm.
submit = pd.DataFrame(predictions, columns=["Class_1", "Class_2", "Class_3", "Class_4"])

# `id` is assigned by index alignment; a row-count mismatch would otherwise
# produce missing ids without any error.
assert len(submit) == len(submission), "prediction count does not match sample submission"
submit['id'] = submission['id']

# Write the columns in the same order as the sample submission instead of
# leaving `id` last. Raises KeyError if the sample file has a column that
# was not produced here.
submit = submit[list(submission.columns)]

In [None]:
# Save the submission file without the DataFrame index.
submit.to_csv(path_or_buf="xgboost_baseline.csv", index=False)