# Tabular Playground Series Jun 2021

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import log_loss
import xgboost as xgb
import lightgbm as lgb

In [None]:
# Location of the competition files; all three CSVs live in this directory.
DAT_DIR = '../input/tabular-playground-series-jun-2021'

# Load training data, competition test data and the sample submission in one
# pass; the tuple order matches the unpacking targets on the left.
train_df, test_df, sample_submit = (
    pd.read_csv(os.path.join(DAT_DIR, csv_name))
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [None]:
# Preview the first few rows of the training data.
train_df.head()

In [None]:
# (number of rows, number of columns) of the training frame.
train_df.shape

In [None]:
# Count how many columns there are of each dtype.
train_df.dtypes.value_counts()

The output above shows that the training data consists of numerical columns plus one categorical target column.

In [None]:
# Numeric columns that are identifiers rather than features and therefore must
# not be fed to the models.
# NOTE(review): assumes the row identifier is called 'id', as in
# `sample_submit`; the filter is a no-op if no such column exists.
non_feature_cols = {'id'}

# Feature columns: every non-object column except the identifier(s) above.
num_cols = [c for c in train_df.columns
            if train_df[c].dtype != 'object' and c not in non_feature_cols]
print('num_cols = \n', num_cols)

# Target column: the first object-dtype column. It is derived from the dtype
# directly (not as "whatever is not in num_cols") so that excluding the
# identifier above cannot make it resolve to the identifier.
target_col = [c for c in train_df.columns if train_df[c].dtype == 'object'][0]
print('target_col = ', target_col)

In [None]:
# Min / mean / max of every column in `num_cols`, for a quick look at value ranges.
train_df[num_cols].agg(['min', 'mean', 'max'])

The variable distributions are similar and have similar long tails on the right end.

In [None]:
# Number of training rows per target class.
train_df[target_col].value_counts()

The target classes are distributed quite unevenly.

## Prototype models

In this section, we will use several methods to build a few prototype models. Before this, we will divide the training data into train, validation and test subsets.

In [None]:
# Fixed seed so both splits are reproducible.
seed = 123

# First split: 70% for training, 30% held out. Stratify on the target column
# via `target_col` (not a hard-coded name) so it stays consistent with the
# column actually passed as `y`.
tr_X, te_X, tr_y, te_y = train_test_split(
    train_df[num_cols],
    train_df[target_col],
    test_size=0.3,
    stratify=train_df[target_col],
    random_state=seed,
)

# Second split: cut the held-out 30% in half into validation and test subsets,
# again stratified on the labels.
va_X, te_X, va_y, te_y = train_test_split(
    te_X,
    te_y,
    test_size=0.5,
    stratify=te_y,
    random_state=seed,
)

In [None]:
# Report the sizes of the three subsets, one line per subset.
split_shapes = [
    f'tr_X.shape = {tr_X.shape}; tr_y.shape = {tr_y.shape}',
    f'va_X.shape = {va_X.shape}; va_y.shape = {va_y.shape}',
    f'te_X.shape = {te_X.shape}; te_y.shape = {te_y.shape}',
]
print('\n'.join(split_shapes))

In [None]:
# Learn the label -> integer mapping from the training labels only, then apply
# the same mapping to all three subsets.
le = LabelEncoder().fit(tr_y)
tr_y_tr, va_y_tr, te_y_tr = (le.transform(labels) for labels in (tr_y, va_y, te_y))

In [None]:
# Number of distinct classes seen by the encoder, and how often each encoded
# class occurs in the training subset.
num_classes = le.classes_.shape[0]
print('\n'.join([
    f'num_classes = {num_classes}',
    f'Frequency of the classes:\n{pd.Series(tr_y_tr).value_counts()}',
]))

### XGB

In this section, we will use XGBoost to build a quick model.

In [None]:
# Multi-class XGBoost classifier.
# `multi:softprob` is used instead of `multi:softmax`: the model is scored with
# log-loss on `predict_proba` output, which needs per-class probabilities,
# whereas `multi:softmax` is the hard-label objective.
xgb_clf = xgb.XGBClassifier(objective='multi:softprob',
                            use_label_encoder=False,  # labels are encoded with `le` before fitting
                            booster='gbtree',
                            n_jobs=10,
                            random_state=123,
                            learning_rate=0.3,
                            max_depth=5,
                            eval_metric='mlogloss')

In [None]:
# Evaluation sets monitored during boosting: training subset first, validation
# subset second.
xgb_eval_sets = [(tr_X, tr_y_tr), (va_X, va_y_tr)]

# Fit with early stopping (patience 30 rounds), logging every 10 rounds.
xgb_clf.fit(
    tr_X,
    tr_y_tr,
    eval_set=xgb_eval_sets,
    early_stopping_rounds=30,
    verbose=10,
)

In [None]:
# Predicted classes for the held-out test subset (in the encoding produced by `le`).
te_y_pred = xgb_clf.predict(te_X)

In [None]:
# How often each class is predicted on the held-out test subset.
pd.Series(te_y_pred).value_counts()

In [None]:
# Per-class probabilities for the held-out test subset; these feed the log-loss below.
te_yprob_pred = xgb_clf.predict_proba(te_X)

In [None]:
# Log-loss of the XGBoost probabilities on the held-out test subset, rounded to 3 decimals.
# NOTE(review): `te_y` holds the original (un-encoded) labels while the model
# was fit on `le`-encoded ones; this relies on log_loss ordering the labels the
# same way as the encoder — confirm.
print(f'logloss = {round(log_loss(te_y, te_yprob_pred),3)}')

### LightGBM

In this section, we will use LightGBM to build a quick model.

In [None]:
# Hyper-parameters for the prototype LightGBM multi-class model.
lgb_params = {
    'max_depth': 20,
    'num_leaves': 30,
    'learning_rate': 0.05,
    'objective': 'multiclass',
    'random_state': 123,
    'n_jobs': 10,
}
lgb_clf = lgb.LGBMClassifier(**lgb_params)

In [None]:
# Evaluation sets and their display names, in matching order.
lgb_eval_sets = [(tr_X, tr_y_tr), (va_X, va_y_tr)]
lgb_eval_names = ['Training', 'Validation']

# Fit with early stopping (patience 30 rounds) on multi-class log-loss,
# logging every 10 rounds.
lgb_clf.fit(
    tr_X,
    tr_y_tr,
    eval_set=lgb_eval_sets,
    eval_names=lgb_eval_names,
    eval_metric='multi_logloss',
    early_stopping_rounds=30,
    verbose=10,
)

In [None]:
# Predicted classes for the held-out test subset (in the encoding produced by `le`).
te_y_pred = lgb_clf.predict(te_X)

In [None]:
# How often each class is predicted on the held-out test subset.
pd.Series(te_y_pred).value_counts()

In [None]:
# Per-class probabilities for the held-out test subset; these feed the log-loss below.
te_yprob_pred = lgb_clf.predict_proba(te_X)

In [None]:
# Log-loss of the LightGBM probabilities on the held-out test subset, rounded to 3 decimals.
# NOTE(review): `te_y` holds the original (un-encoded) labels while the model
# was fit on `le`-encoded ones; this relies on log_loss ordering the labels the
# same way as the encoder — confirm.
print(f'logloss = {round(log_loss(te_y, te_yprob_pred),3)}')

### Pooling Model

We will try a few meta-estimators to improve the model performance.

#### Voting Classifier

In [None]:
# Soft-voting ensemble over the two gradient-boosting models defined above.
base_models = [('xgb', xgb_clf), ('lgb', lgb_clf)]
vote_clf = VotingClassifier(estimators=base_models, voting='soft')

In [None]:
# Fit the soft-voting ensemble on the training subset.
# NOTE(review): no eval_set / early stopping is passed here, so the member
# models are presumably refit without it — confirm against the
# VotingClassifier documentation.
vote_clf.fit(tr_X, tr_y_tr)

In [None]:
# Ensemble per-class probabilities for the held-out test subset.
te_yprob_pred = vote_clf.predict_proba(te_X)

In [None]:
# Log-loss of the ensemble probabilities on the held-out test subset, rounded to 3 decimals.
# NOTE(review): `te_y` holds the original (un-encoded) labels while the model
# was fit on `le`-encoded ones; this relies on log_loss ordering the labels the
# same way as the encoder — confirm.
print(f'logloss = {round(log_loss(te_y, te_yprob_pred),3)}')

### Output

Let's retrain the model with all training data and predict on the test set.

In [None]:
# Refit XGBoost on the complete training data.
xgb_full_X = train_df[num_cols]
xgb_full_y = le.transform(train_df[target_col])

# NOTE(review): the only evaluation set here is the training data itself, so
# early stopping is monitored on the data being fitted — confirm this is intended.
xgb_clf.fit(
    xgb_full_X,
    xgb_full_y,
    eval_set=[(xgb_full_X, xgb_full_y)],
    early_stopping_rounds=30,
    verbose=10,
)

In [None]:
# Refit LightGBM on the complete training data.
lgb_full_X = train_df[num_cols]
lgb_full_y = le.transform(train_df[target_col])

# NOTE(review): the only evaluation set here is the training data itself, so
# early stopping is monitored on the data being fitted — confirm this is intended.
lgb_clf.fit(
    lgb_full_X,
    lgb_full_y,
    eval_set=[(lgb_full_X, lgb_full_y)],
    eval_names=['Training'],
    eval_metric='multi_logloss',
    early_stopping_rounds=30,
    verbose=10,
)

In [None]:
# Fresh soft-voting ensemble over the two models, to be fitted on all training data.
final_models = [('xgb', xgb_clf), ('lgb', lgb_clf)]
vote_clf = VotingClassifier(estimators=final_models, voting='soft')

In [None]:
# Fit the ensemble on every training row, with the labels encoded by `le`.
vote_full_X = train_df[num_cols]
vote_full_y = le.transform(train_df[target_col])
vote_clf.fit(vote_full_X, vote_full_y)

In [None]:
# Ensemble per-class probabilities for the competition test data, using the
# same feature columns as in training.
test_yprob_pred = vote_clf.predict_proba(test_df[num_cols])

In [None]:
# Original class labels for the encoded ids 0 .. num_classes-1; these become
# the probability column names of the submission.
test_yprob_classes = le.inverse_transform(np.arange(num_classes))

In [None]:
# Display the class labels that will be used as submission column headers.
test_yprob_classes

In [None]:
# One probability column per class, named with the original class labels.
out_yprob_df = pd.DataFrame(test_yprob_pred, columns=test_yprob_classes)

In [None]:
# Put the id column in front of the class probabilities.
# NOTE(review): the ids come from `sample_submit`, not `test_df`; this assumes
# both files list the rows in the same order — confirm.
out_df = pd.concat([sample_submit['id'], out_yprob_df], axis=1)

In [None]:
# Write the submission file without the DataFrame index, with probabilities
# formatted to 4 decimal places. `index` is a boolean flag, so pass `False`
# explicitly instead of relying on `None` being falsy.
out_df.to_csv('submission.csv', index=False, float_format='%.4f')