# TPS May 2021 EDA

In [None]:
# Import the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('whitegrid')
import plotly.express as px

from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.metrics import log_loss

from lightgbm import LGBMClassifier
import lightgbm as lgbm
from xgboost import XGBClassifier
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Import the data
# The competition directory is kept in one place so the three reads cannot
# drift apart if the input location changes.
DATA_DIR = "../input/tabular-playground-series-may-2021"
train = pd.read_csv(f"{DATA_DIR}/train.csv")
test = pd.read_csv(f"{DATA_DIR}/test.csv")
submission = pd.read_csv(f"{DATA_DIR}/sample_submission.csv")

In [None]:
train.head()

In [None]:
train.info()

In [None]:
train.describe()

## Data Visualization

In [None]:
# Histogram of the `target` column to show how the classes are distributed.
fig = px.histogram(
    train['target'],
    height=400,
    width=700,
    template='plotly_dark+presentation',
)
fig.show()

In [None]:
# Pick feature columns by name instead of by position, so the list stays
# correct even if 'id' is not the first column or 'target' is not the last.
# This mirrors how the model matrix is built (dropping 'id' and 'target').
features = [col for col in train.columns if col not in ('id', 'target')]

In [None]:
def visualize_features(features, seaborn_plot, num_rows, num_cols, fig_size, **kwargs):
    """
    Draw one subplot per feature of the training data.

    Every name in `features` is looked up in the global `train` frame and
    passed to `seaborn_plot` as `x`; extra keyword arguments are forwarded
    unchanged. Panels fill a `num_rows` x `num_cols` grid, in order, on a
    single figure of size `fig_size`, which is displayed at the end.
    """
    plt.figure(figsize=fig_size)
    # Subplot positions are 1-based, hence start=1.
    for position, column in enumerate(features, start=1):
        plt.subplot(num_rows, num_cols, position)
        seaborn_plot(x=train[column], **kwargs)

    plt.show()

In [None]:
visualize_features(features, sns.kdeplot, 10, 5, (20, 35), hue=train['target'])

* Most feature values are zero, so the distributions are heavily concentrated at 0.
* Class 2 is the most frequent class in the target.

## UMAP visualization

In [None]:
umap_df = train.drop(['id', 'target'], axis=1)

In [None]:
from umap import UMAP

In [None]:
# Fix the seed so the embedding (random initialisation plus stochastic
# optimisation) is reproducible on Restart & Run All; 42 matches the seed
# used for the cross-validation splits.
# NOTE(review): umap-learn documents that a fixed random_state limits
# parallelism, so this may run slower — confirm that is acceptable.
umap_2d = UMAP(n_components=2, init='random', random_state=42)
projections = umap_2d.fit_transform(umap_df)

In [None]:
# Visualize UMAP: scatter the two embedding components, coloured by target.
# x=0 / y=1 select the first and second columns of `projections`.
axis_labels = {'color':'Target', '0':'x_component', '1':'y_component'}
fig = px.scatter(
    projections,
    x=0,
    y=1,
    color=train.target,
    labels=axis_labels,
)
fig.show()

# Preprocessing

In [None]:
def preprocess(df):
    """
    Return a copy of `df` with the `target` column label-encoded.

    The input frame is left untouched, so the caller's raw labels are not
    overwritten as a hidden side effect of calling this function.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a `target` column.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` whose `target` column holds the codes produced by
        `LabelEncoder.fit_transform`.
    """
    # Work on a copy: assigning into `df` directly would mutate the caller's
    # object even if the caller ignored the return value.
    encoded = df.copy()

    le = LabelEncoder()
    encoded['target'] = le.fit_transform(encoded['target'])

    return encoded

In [None]:
train = preprocess(train)

## Build Model

In [None]:
# Split the training frame into the target vector and the feature matrix
# (every column except the target and the row identifier).
y = train['target']
X = train.drop(columns=['target', 'id'])

In [None]:
def cross_validate(X, y, model, params, folds=5):
    """
    Run stratified K-fold cross-validation and print the log loss of each
    fold followed by the mean across folds.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with `.iloc`.
    y : pandas.Series
        Target labels aligned with `X`.
    model : callable
        Estimator class, instantiated as `model(**params)` once per fold.
        Its `fit` must accept `eval_set`, `early_stopping_rounds` and
        `verbose`, and the fitted estimator must expose `predict_proba`
        and `classes_`.
    params : dict
        Keyword arguments used to construct the estimator.
    folds : int, default 5
        Number of stratified splits (shuffled with a fixed seed of 42).

    Returns
    -------
    estimator
        The estimator fitted on the LAST fold only; it is not refitted on
        the full data.

    Notes
    -----
    The held-out fold is also passed as the early-stopping `eval_set`, so
    the printed losses are measured on data that influenced the stopping
    point and are therefore somewhat optimistic.
    """
    skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)
    fold_losses = []
    for fold, (tr_idx, ts_idx) in enumerate(skf.split(X, y)):
        print(f"Fold: {fold}")
        x_tr, y_tr = X.iloc[tr_idx], y.iloc[tr_idx]
        x_ts, y_ts = X.iloc[ts_idx], y.iloc[ts_idx]

        clf = model(**params)
        clf.fit(x_tr, y_tr,
                eval_set=[(x_ts, y_ts)],
                early_stopping_rounds=100,
                verbose=False)

        pred = clf.predict_proba(x_ts)
        # Pin the label order to the fitted classes so the probability
        # columns stay aligned even if a held-out fold lacks a class.
        loss = log_loss(y_ts, pred, labels=clf.classes_)
        fold_losses.append(loss)
        print(f" Log loss: {loss}")
        print("-"*50)

    # The per-fold numbers alone make models hard to compare; report the mean.
    print(f"Mean log loss: {sum(fold_losses) / len(fold_losses)}")

    return clf

## LGBMClassifier

In [None]:
# Constructor arguments for LGBMClassifier.
# NOTE(review): no n_estimators is given, so the library default applies —
# confirm that is enough rounds for the 100-round early stopping used in
# cross_validate to have any effect.
lgbm_params = {
    # Tree shape and step size
    "learning_rate": 0.05,
    "max_depth": 10,
    "num_leaves": 63,
    # Multiclass objective, evaluated with multiclass log loss
    "objective": "multiclass",
    "metric": "multi_logloss",
    # NOTE(review): presumably only matters when bagging is enabled via
    # bagging_fraction / bagging_freq — confirm.
    "bagging_seed": 42,
    "boosting_type": "gbdt",
    # NOTE(review): LightGBM documents is_unbalance for binary and
    # multiclassova objectives — confirm it has any effect with 'multiclass'.
    "is_unbalance": True,
}

In [None]:
lgbm_model = cross_validate(X, y, LGBMClassifier, lgbm_params)

In [None]:
lgbm.plot_importance(lgbm_model, figsize=(10, 10))

## XGBClassifier

In [None]:
# Constructor arguments for XGBClassifier.
xgb_params = {
    "seed": 42,
    # Upper bound on boosting rounds: cross_validate stops early, but any fit
    # without early stopping will train all 10000 rounds.
    "n_estimators": 10000,
    "verbosity": 1,
    "eval_metric": "mlogloss",
    # Regularisation and sampling
    "alpha": 7.105038963844129,
    "colsample_bytree": 0.25505629740052566,
    "gamma": 0.4999381950212869,
    "reg_lambda": 1.7256912198205319,
    "learning_rate": 0.011823142071967673,
    "max_bin": 338,
    "max_depth": 8,
    "min_child_weight": 2.286836198630466,
    "subsample": 0.618417952155855,
    # NOTE(review): these two settings require a GPU runtime; newer XGBoost
    # releases reportedly replace them with device='cuda' — confirm against
    # the installed version.
    "tree_method": "gpu_hist",
    "gpu_id": 0,
}

In [None]:
xgb_model = cross_validate(X, y, XGBClassifier, xgb_params)

In [None]:
xgb.plot_importance(xgb_model, max_num_features=25)

## Catboost

In [None]:
from catboost import CatBoostClassifier

In [None]:
# Constructor arguments for CatBoostClassifier.
cb_params = {
    "verbose": 0,
    "eval_metric": "MultiClass",
    "random_state": 2021,
    # Upper bound on boosting rounds; cross_validate relies on early stopping.
    "num_boost_round": 20000,
    # Requires a GPU runtime; device "0" selects the first GPU.
    "task_type": "GPU",
    "devices": "0",
}

In [None]:
cb_model = cross_validate(X, y, CatBoostClassifier, cb_params)

## Calibration

In [None]:
from sklearn.calibration import CalibratedClassifierCV

In [None]:
# Calibrating XGB
# The wrapped estimator is passed positionally: scikit-learn renamed the
# keyword from `base_estimator` to `estimator`, but the first positional
# parameter is the wrapped estimator under both names.
# NOTE(review): xgb_params sets n_estimators=10000 and no early stopping is
# applied here, so each of the 5 calibration folds trains every round.
base_xgb = XGBClassifier(**xgb_params)
calibrated_xgb = CalibratedClassifierCV(base_xgb, cv=5)
calibrated_xgb.fit(X, y)

In [None]:
# Calibrating LGBM
# The wrapped estimator is passed positionally: scikit-learn renamed the
# keyword from `base_estimator` to `estimator`, but the first positional
# parameter is the wrapped estimator under both names.
base_lgbm = LGBMClassifier(**lgbm_params)
calibrated_lgbm = CalibratedClassifierCV(base_lgbm, cv=5)
calibrated_lgbm.fit(X, y)

## XGB with feature interaction constraints

* XGBoost documentation for feature interaction constraints:
https://xgboost.readthedocs.io/en/latest/tutorials/feature_interaction_constraint.html

In [None]:
xgb_param_fe = xgb_params.copy()

In [None]:
# Constraint groups are given in XGBoost's string form: one inner list per group.
# NOTE(review): the integers are presumably positional indices into X's
# columns — confirm they point at the intended features.
xgb_param_fe['interaction_constraints'] = '[[38, 14], [34, 14, 31], [15, 19]]'

In [None]:
model_xgb1 = cross_validate(X, y, XGBClassifier, xgb_param_fe)

## Submission

In [None]:
# Uncalibrated predictions from the cross-validated models (disabled):
# pred_lgbm = lgbm_model.predict_proba(test[X.columns])
# pred_xgb = xgb_model.predict_proba(test[X.columns])
# pred_xgb1 = model_xgb1.predict_proba(test[X.columns])
# pred_cb = cb_model.predict_proba(test[X.columns])

# Calibration predictions
# Select the test columns once, in the same order as the training matrix.
test_features = test[X.columns]
pred_lgbm_cal = calibrated_lgbm.predict_proba(test_features)
pred_xgb_cal = calibrated_xgb.predict_proba(test_features)

In [None]:
# Write one CSV per calibrated model. Every column after the first one of the
# sample submission is overwritten with that model's class probabilities.
calibrated_outputs = [
    (pred_lgbm_cal, "LGBM(Calibration).csv"),  # LGBM
    (pred_xgb_cal, "XGB(Calibration).csv"),    # XGB
]
for class_probabilities, csv_name in calibrated_outputs:
    submission.iloc[:, 1:] = class_probabilities
    submission.to_csv(csv_name, index=False)

# Disabled: submissions from the uncalibrated models.
# XGb 1
# submission.iloc[:, 1:] = pred_xgb1
# submission.to_csv("XGB_FE.csv", index=False)

# CatBoost
# submission.iloc[:, 1:] = pred_cb
# submission.to_csv("CatBoost.csv", index=False)