<br><br>

<img
     src="https://storage.googleapis.com/kaggle-competitions/kaggle/25225/logos/header.png?t=2021-01-27-17-34-26">
     

<br>
<p style="background-color:white;font-family:Tahoma;color:darkgray;font-size:100%;text-align:left;border-radius:10px 0px;">#Kaggle  #TabularPlayGround  #LightGBM #XGBoost #Ensemble #Multiclass</p>

## Import Libraries

In [None]:
import numpy as np
import pandas as pd

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import StratifiedKFold, train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
# from sklearn.impute import SimpleImputer     # verified that there're no null values accordingly we don't need SimpleImputer

from sklearn.metrics import classification_report, accuracy_score, log_loss, balanced_accuracy_score

import matplotlib.pyplot as plt
import seaborn as sns

import warnings

import optuna
from optuna.samplers import TPESampler
from tqdm import tqdm


# Notebook-wide plotting / display configuration.

# Set matplotlib's 'axes.unicode_minus' rcParam to False
# (presumably so minus signs render with every font — confirm).
plt.rcParams['axes.unicode_minus'] = False
# Apply the 'fivethirtyeight' matplotlib style sheet.
plt.style.use('fivethirtyeight')
# NOTE(review): sns.set() runs after plt.style.use(); it presumably overrides
# parts of the style sheet above — confirm this order is intended.
sns.set(font_scale = 1)  
# Do not truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')

print("Let's start!")

<br>

## Data Load

<br>

In [None]:
# Read the competition train / test tables; the `Id` column becomes the index.
trn_df = pd.read_csv(
    '../input/tabular-playground-series-dec-2021/train.csv',
    index_col='Id',
)
tst_df = pd.read_csv(
    '../input/tabular-playground-series-dec-2021/test.csv',
    index_col='Id',
)

# Quick look: shape and first row of each table (train first, then test).
display(
    trn_df.shape,
    trn_df.head(1),
    tst_df.shape,
    tst_df.head(1),
)

<br>

For this competition, you will be predicting a categorical target based on a number of feature columns given in the data. The data is synthetically generated by a GAN that was trained on the data from the Forest Cover Type Prediction competition. This dataset is (a) much larger, and (b) may or may not have the same relationship to the target as the original data.

<br>

<br>

## EDA

<br>

This data is too large to explore conveniently with the `.info()` method.

In this case it is helpful to build a data frame that summarizes the information for each column.

### Data Information

In [None]:
# Per-column summary of the training data: column name, number of nulls,
# dtype and number of distinct values.
# Built from column-wise aggregations instead of growing the frame one row at
# a time with `.loc`, which re-indexes `trn_df` several times per column.
trn_info = pd.DataFrame({
    'Name of Col': trn_df.columns,
    'Num of Null': trn_df.isnull().sum().values,
    'Dtype': trn_df.dtypes.values,
    'N_unique': trn_df.nunique().values,
})

trn_info

<br>

* Number of Null data: None


* Data types: All variables have the same data type, int64.


* Number of Unique value:  
    * From `Elevation` to `Horizontal_Distance_To_Fire_Points`: each column has a large number of unique values, which means these columns are numerical variables.
    * From `Wilderness_Area1` to `Soil_Type40`: each column has only two distinct values, so these are boolean variables. However, `Soil_Type7` and `Soil_Type15` contain only one value, so we need to check them and decide whether to drop them.

<br>

<br>

### Target Variable

* `Cover_Type` consists of 7 unique values. Let's look at its distribution.


* If the version of your matplotlib is up to date, you can use `ax.bar_label(ax.containers[0])` to show the counted values within the countplot.


* By the way, this data is quite unbalanced.
    * Most of Cover_Type are distributed on 1, 2
    * Classes 4 and 5 are rare: in particular, we can drop the data labeled 5 because there is only one sample of it.

In [None]:
# Target values distribution: one bar per `Cover_Type` class, annotated with
# its count.

plt.figure(figsize=(8, 5))
# Pass the series as `x=`: newer seaborn releases take `data` as the first
# positional argument, so a positional Series is no longer read as the x
# variable.
ax = sns.countplot(x=trn_df['Cover_Type'])
ax.set_title('Distribution of Cover_Type')
# Write the count on top of every bar (needs a matplotlib with `bar_label`).
ax.bar_label(ax.containers[0])
plt.show()

<br>

### Numeric Variables

* The form of distribution is quite similar between train and test dataset.


* As noted above, `Soil_Type7` and `Soil_Type15` contain only a single value, so dropping them should not matter.

In [None]:
# Split the column index: everything but the last column is a feature, the
# last column is the target (assumes `Cover_Type` is the last column — confirm).
features, target = trn_df.columns[:-1], trn_df.columns[-1]

In [None]:
# Thanks to https://www.kaggle.com/maximkazantsev/tps-11-21-eda-xgboost-optuna#Data-preprocessing
#
# Overlaid train / test histograms for every feature, drawn on a grid with
# `cols` plots per row. Both histograms of a feature share one bin range (the
# min / max over train and test together) so that they are directly comparable.

df = pd.concat([trn_df[features], tst_df[features]], axis=0)
columns = df.columns.values
cols = 5
rows = len(columns) // cols + 1

# Compute the shared bin ranges once per feature instead of re-scanning the
# concatenated frame for every histogram call.
col_min = df.min()
col_max = df.max()

# squeeze=False keeps `axs` two-dimensional even when the grid has one row.
fig, axs = plt.subplots(ncols=cols, nrows=rows, figsize=(16,62),
                        sharex=False, squeeze=False)
plt.subplots_adjust(hspace = 0.3)

# `axs.flat` walks the grid row by row, i.e. in the same order as the features.
for i, ax in enumerate(axs.flat):
    if i >= len(columns):
        # Unused cells at the end of the grid are hidden.
        ax.set_visible(False)
        continue

    name = columns[i]
    bin_range = (col_min[name], col_max[name])

    ax.hist(trn_df[name].values,
            range=bin_range,
            bins=40,
            color="deepskyblue",
            edgecolor="black",
            alpha=0.7,
            label="Train Dataset")
    ax.hist(tst_df[name].values,
            range=bin_range,
            bins=40,
            color="palevioletred",
            edgecolor="black",
            alpha=0.7,
            label="Test Dataset")

    ax.set_title(name, fontsize=12, pad=5)
    # Freeze the tick positions first, then relabel them in thousands ("12k").
    ax.set_yticks(ax.get_yticks())
    ax.set_yticklabels([f"{int(tick / 1000)}k" for tick in ax.get_yticks()])
    ax.tick_params(axis="y", labelsize=10)
    ax.tick_params(axis="x", labelsize=10)
    ax.grid(axis="y")

    # One legend (on the first plot) is enough for the whole grid.
    if i == 0:
        ax.legend(fontsize=10)

plt.show()

In [None]:
# trn['sum'] = trn[features].sum(axis=1)
# tst['sum']=tst[features].sum(axis=1)

# trn['mean']=trn[features].mean(axis=1)
# tst['mean']=tst[features].mean(axis=1)

# trn['median']=trn[features].median(axis=1)
# tst['median']=tst[features].median(axis=1)

# trn['std'] = trn[features].std(axis=1)
# tst['std'] = tst[features].std(axis=1)

# trn['max'] = trn[features].max(axis=1)
# tst['max'] = tst[features].max(axis=1)

# trn['min'] = trn[features].min(axis=1)
# tst['min'] = tst[features].min(axis=1)

# trn['kurt'] = trn[features].kurtosis(axis=1)
# tst['kurt'] = tst[features].kurtosis(axis=1)

# agg_features= ['sum','mean', 'median', 'std','max','min','kurt']

In [None]:
# features = list(features)
# features.extend(agg_features)

In [None]:
# Remove the two soil-type columns judged useless from both tables, then
# discard the training rows whose Cover_Type equals 5 and renumber the
# remaining rows from 0.

dropped_cols = ['Soil_Type7', 'Soil_Type15']
trn_df = trn_df.drop(columns=dropped_cols)
tst_df = tst_df.drop(columns=dropped_cols)

keep_rows = trn_df['Cover_Type'] != 5
trn_df = trn_df[keep_rows].reset_index(drop=True)

<br>

## Feature Engineering

* Create `num_features`, the list of numerical features, and standardize those features.

In [None]:
# Recompute the feature / target names from the current columns of `trn_df`:
# all but the last column are features, the last one is the target
# (assumes `Cover_Type` is still the last column — confirm).
features, target = trn_df.columns[:-1], trn_df.columns[-1]

In [None]:
# Work on copies so the frames loaded above stay untouched by the
# transformations applied to `trn` / `tst`.
trn, tst = trn_df.copy(), tst_df.copy()

In [None]:
# Names of the numerical columns (the ones that get standardized).
num_features = [
    'Elevation',
    'Aspect',
    'Slope',
    'Horizontal_Distance_To_Hydrology',
    'Vertical_Distance_To_Hydrology',
    'Horizontal_Distance_To_Roadways',
    'Hillshade_9am',
    'Hillshade_Noon',
    'Hillshade_3pm',
    'Horizontal_Distance_To_Fire_Points',
]



In [None]:
# Standardize the numerical features.
#
# The scaler is fitted on the training data only and the learned mean / std
# are then applied to the test data. Re-fitting on the test set would scale
# train and test with different statistics, so the same raw value would map
# to different model inputs in the two sets.

ss = StandardScaler()

trn[num_features] = ss.fit_transform(trn[num_features])
tst[num_features] = ss.transform(tst[num_features])

display(trn.head(3), tst.head(3))

<br>

## Model

* Generated base LGBM classifier to check the performance of the given features.

In [None]:
# Assemble the model inputs: feature matrix and target of the training set,
# and the feature matrix of the test set (no random split happens here).

X_trn, y_trn = trn[features], trn[target]
X_tst = tst[features]

display(
    X_trn.shape,
    y_trn.shape,
    X_tst.shape,
)

In [None]:
# Stratified k-fold cross-validation of LightGBM with default parameters.
#
# For every fold the model is re-fitted on the training part, scored with
# accuracy on the held-out part, and used to predict the test set. The
# per-fold test predictions (class labels) are collected in `preds_lgb`.


RANDOM_SEED = 42
n_splits = 5
skf = StratifiedKFold(n_splits = n_splits, shuffle = True, random_state = RANDOM_SEED)

preds_lgb = []
mean_acc = 0    # running sum of the fold accuracies

model_lgb = LGBMClassifier(objective='multiclass', random_state = RANDOM_SEED)

for fold, (trn_idx, val_idx) in enumerate(skf.split(X_trn, y_trn)):
    # `split` yields positional row numbers, so rows are selected with .iloc;
    # .loc would only work while the index happens to be 0..n-1.
    X_train, X_val = X_trn.iloc[trn_idx], X_trn.iloc[val_idx]
    y_train, y_val = y_trn.iloc[trn_idx], y_trn.iloc[val_idx]

    # NOTE(review): recent LightGBM releases reportedly expect early stopping
    # and log verbosity via callbacks rather than fit() keywords — confirm
    # against the installed version.
    model_lgb.fit(X_train, y_train,
                  verbose = False,
                  eval_set = [(X_train, y_train), (X_val, y_val)],
                  eval_metric = 'multi_logloss',
                  early_stopping_rounds = 100)

    y_pred = model_lgb.predict(X_val)
    score = accuracy_score(y_val, y_pred)
    mean_acc += score

    print(f"Fold {fold}'s score: {score:.4f}")

    preds_lgb.append(model_lgb.predict(X_tst))

print("==========================================")
# The metric is accuracy (accuracy_score), so it is labelled as such.
print(f"Mean accuracy of all folds: {mean_acc / n_splits}")

In [None]:
# Stratified k-fold cross-validation of XGBoost with default parameters.
#
# For every fold the model is re-fitted on the training part, scored with
# accuracy on the held-out part, and used to predict the test set. The
# per-fold test predictions (original class labels) are collected in
# `preds_xgb`.


RANDOM_SEED = 42
n_splits = 5
skf = StratifiedKFold(n_splits = n_splits, shuffle = True, random_state = RANDOM_SEED)

preds_xgb = []
mean_acc = 0    # running sum of the fold accuracies

model_xgb = XGBClassifier(random_state = RANDOM_SEED)

# The target labels are not the contiguous range 0..n_classes-1 (they start
# at 1 and class 5 was removed), which newer XGBoost releases reject. Train on
# encoded labels and map the predictions back to the original labels.
label_enc = LabelEncoder()
y_enc = pd.Series(label_enc.fit_transform(y_trn), index=y_trn.index)

for fold, (trn_idx, val_idx) in enumerate(skf.split(X_trn, y_enc)):
    # `split` yields positional row numbers, so rows are selected with .iloc;
    # .loc would only work while the index happens to be 0..n-1.
    X_train, X_val = X_trn.iloc[trn_idx], X_trn.iloc[val_idx]
    y_train, y_val = y_enc.iloc[trn_idx], y_enc.iloc[val_idx]

    # NOTE(review): recent XGBoost releases reportedly take `eval_metric` and
    # `early_stopping_rounds` in the constructor rather than fit() — confirm
    # against the installed version.
    model_xgb.fit(X_train, y_train,
                  verbose = False,
                  eval_set = [(X_train, y_train), (X_val, y_val)],
                  eval_metric = 'mlogloss',
                  early_stopping_rounds = 100)

    # Both sides are in the encoded label space, so accuracy is unaffected.
    y_pred = model_xgb.predict(X_val)
    score = accuracy_score(y_val, y_pred)
    mean_acc += score

    print(f"Fold {fold}'s score: {score:.4f}")

    # Store test predictions as original Cover_Type labels.
    preds_xgb.append(label_enc.inverse_transform(model_xgb.predict(X_tst)))

print("==========================================")
# The metric is accuracy (accuracy_score), so it is labelled as such.
print(f"Mean accuracy of all folds: {mean_acc / n_splits}")

In [None]:
# Stratified k-fold cross-validation of a random forest with default
# parameters.
#
# For every fold the model is re-fitted on the training part, scored with
# accuracy on the held-out part, and used to predict the test set. The
# per-fold test predictions (class labels) are collected in `preds_rf`.


RANDOM_SEED = 42
n_splits = 5
skf = StratifiedKFold(n_splits = n_splits, shuffle = True, random_state = RANDOM_SEED)

preds_rf = []
mean_acc = 0    # running sum of the fold accuracies

model_rf = RandomForestClassifier(random_state = RANDOM_SEED)

for fold, (trn_idx, val_idx) in enumerate(skf.split(X_trn, y_trn)):
    # `split` yields positional row numbers, so rows are selected with .iloc;
    # .loc would only work while the index happens to be 0..n-1.
    X_train, X_val = X_trn.iloc[trn_idx], X_trn.iloc[val_idx]
    y_train, y_val = y_trn.iloc[trn_idx], y_trn.iloc[val_idx]

    model_rf.fit(X_train, y_train)

    y_pred = model_rf.predict(X_val)
    score = accuracy_score(y_val, y_pred)
    mean_acc += score

    print(f"Fold {fold}'s score: {score:.4f}")

    preds_rf.append(model_rf.predict(X_tst))

print("==========================================")
# The metric is accuracy (accuracy_score), so it is labelled as such.
print(f"Mean accuracy of all folds: {mean_acc / n_splits}")

In [None]:
# Stratified k-fold cross-validation of an extra-trees classifier with
# default parameters.
#
# For every fold the model is re-fitted on the training part, scored with
# accuracy on the held-out part, and used to predict the test set. The
# per-fold test predictions (class labels) are collected in `preds_et`.


RANDOM_SEED = 42
n_splits = 5
skf = StratifiedKFold(n_splits = n_splits, shuffle = True, random_state = RANDOM_SEED)

preds_et = []
mean_acc = 0    # running sum of the fold accuracies

model_et = ExtraTreesClassifier(random_state = RANDOM_SEED)

for fold, (trn_idx, val_idx) in enumerate(skf.split(X_trn, y_trn)):
    # `split` yields positional row numbers, so rows are selected with .iloc;
    # .loc would only work while the index happens to be 0..n-1.
    X_train, X_val = X_trn.iloc[trn_idx], X_trn.iloc[val_idx]
    y_train, y_val = y_trn.iloc[trn_idx], y_trn.iloc[val_idx]

    model_et.fit(X_train, y_train)

    y_pred = model_et.predict(X_val)
    score = accuracy_score(y_val, y_pred)
    mean_acc += score

    print(f"Fold {fold}'s score: {score:.4f}")

    preds_et.append(model_et.predict(X_tst))

print("==========================================")
# The metric is accuracy (accuracy_score), so it is labelled as such.
print(f"Mean accuracy of all folds: {mean_acc / n_splits}")

In [None]:
# # HPO using optuna

# def lgb_objective(trial):
#     params = {
#         'boosting_type': 'gbdt',
#         'objective': 'multiclass',
#         'n_estimators': trial.suggest_int("n_estimators", 64, 8192, 100),
#         'learning_rate': trial.suggest_float("learning_rate", 0.01, 0.3),
#         'num_leaves': trial.suggest_int("num_leaves", 20, 3000, 20),
#         'max_depth': trial.suggest_int("max_depth", 3, 12),
#         'feature_fraction': trial.suggest_float("feature_fraction", 0.2, 0.95, step = 0.1),
#         'min_gain_to_split' : trial.suggest_int('min_gain_to_split', 0, 15),
#         'min_data_in_leaf' : trial.suggest_int("min_data_in_leaf", 200, 10000, 100),
#         'lambda_l1': trial.suggest_int("lambda_l1", 0, 100, 5),
#         'lambda_l2': trial.suggest_int("lambda_l2", 0, 100, 5),
#         'bagging_fraction' : trial.suggest_float("bagging_fraction", 0.2, 0.95, step=0.1),
#         'bagging_freq' : trial.suggest_categorical("bagging_freq", [1]),
#         'seed': 42,
#         'metric' : 'multi_logloss',
#         'verbose':-1
#     }
    
#     X_train, X_val, y_train, y_val = train_test_split(X_trn, y_trn, stratify = y_trn, test_size = 0.3, random_state = 42)
    
#     model_lgb = LGBMClassifier(**params)
#     model_lgb.fit(X_train, y_train,
#              eval_set = [(X_train, y_train), (X_val, y_val)],
#              early_stopping_rounds = 100,
#              eval_metric = 'multi_logloss',
#              verbose = False
#              )
#     pred_val = model_lgb.predict(X_val)
    
#     return accuracy_score(y_val, pred_val)

In [None]:
# sampler = TPESampler(seed = 42)
# study = optuna.create_study(study_name = 'lgbm_hpo',
#                            direction = 'maximize',
#                            sampler = sampler)
# study.optimize(lgb_objective, n_trials = 10, show_progress_bar=True)

# print("Best ACC:", study.best_value)
# print("Best params:", study.best_params)

In [None]:
# params = study.best_params

In [None]:
# params_lgb = {'boosting_type': 'gbdt',
#           'objective': 'multiclass',
#           'n_estimators': 7276, 
#           'learning_rate': 0.013562603384785458,
#           'num_leaves': 376,
#           'max_depth': 10,
#           'feature_fraction': 0.7847065437552077,
#           'min_gain_to_split': 8,
#           'min_data_in_leaf': 794,
#           'lambda_l1': 0.0008668739724852811,
#           'lambda_l2': 0.0016878284140548435,
#           'bagging_fraction': 0.3420328146868397,
#           'bagging_freq': 3,
#           'seed': 42,
#           'metric' : 'multi_logloss',
#           'verbose':-1
#          }

In [None]:
# # k-fold cross validation, trial with the tuned parameter.


# RANDOM_SEED = 42
# n_splits = 5
# skf = StratifiedKFold(n_splits = n_splits, shuffle = True, random_state = RANDOM_SEED)

# preds_lgb = []
# mean_acc = 0

# model_lgb = LGBMClassifier(objective='multiclass', **params_lgb)

# for fold, (trn_idx, val_idx) in enumerate(skf.split(X_trn, y_trn)):
#     X_train, X_val = X_trn.loc[trn_idx], X_trn.loc[val_idx]
#     y_train, y_val = y_trn.loc[trn_idx], y_trn.loc[val_idx]
    
#     model_lgb.fit(X_train, y_train,
#                   verbose = False,
#                   eval_set = [(X_train, y_train), (X_val, y_val)],
#                   eval_metric = 'multi_logloss',
#                   early_stopping_rounds = 100)
    
#     y_pred = model_lgb.predict(X_val)
#     score = accuracy_score(y_val, y_pred)
#     mean_acc += score
    
#     print(f"Fold {fold}'s score: {score:.4f}")
        
#     preds_lgb.append(model_lgb.predict(X_tst))

# print("==========================================")
# print(f"Mean auc of all folds: {mean_acc / n_splits}")

In [None]:
# Combine the per-fold test predictions in `preds_lgb` into one prediction per
# test row and write it into the sample-submission frame.
#
# `preds_lgb` holds class labels, so the folds are combined by majority vote.
# Averaging labels and truncating to int can produce a class no fold
# predicted (e.g. votes 1 and 3 average to 2).
fold_preds = np.asarray(preds_lgb)        # one row of labels per fold
classes = np.unique(fold_preds)           # sorted distinct labels
# votes[k, j] = number of folds that predicted classes[k] for test row j.
votes = np.stack([(fold_preds == label).sum(axis=0) for label in classes])
# argmax returns the first maximum, so ties go to the smallest class label.
final_preds = classes[votes.argmax(axis=0)].astype('int64')

submission = pd.read_csv('../input/tabular-playground-series-dec-2021/sample_submission.csv')
submission['Cover_Type'] = final_preds
submission

In [None]:
# Save the submission frame as CSV, leaving out the DataFrame index.
submission.to_csv(
    'submission.csv',
    index=False,
)