**Table of contents**
1. Training and testing datasets with their properties.
2. Create dummies from features with low cardinality.
3. Calculate logarithm for every instance in features.
4. Base model prediction.
5. Hyperparameters search using Optuna.
6. Final model prediction and submission.
7. Prediction and submission of a 5-fold model with the best Optuna hyperparameters.
8. SMOTE technique.
9. Finding classes weights.

In [None]:
import numpy as np
import pandas as pd
import missingno as no

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
sns.set_theme(style="whitegrid")

## Training and testing datasets with their properties

In [None]:
train_df = pd.read_csv("/kaggle/input/tabular-playground-series-may-2021/train.csv")
test_df = pd.read_csv("/kaggle/input/tabular-playground-series-may-2021/test.csv")
sub_sample_df = pd.read_csv("/kaggle/input/tabular-playground-series-may-2021/sample_submission.csv")

In [None]:
feat_cols = [col for col in train_df.columns if col.startswith("feature")]

In [None]:
train_df.head()

In [None]:
no.matrix(train_df, figsize=(18,4))

In [None]:
train_df.info()

In [None]:
train_df.shape, test_df.shape

The dataset has no missing values.

In [None]:
train_df.drop('id', axis=1, inplace=True)
test_df.drop('id', axis=1, inplace=True)

In [None]:
train_df.describe().T.style.bar(subset=['mean'], color=px.colors.qualitative.Pastel1[1])\
                                .background_gradient(subset=['std'], cmap='Greens')

In [None]:
# Map class names to integer ids in sorted (alphabetical) order so the mapping
# does not depend on which class happens to appear first in the file; the keys
# of `label_dict` are later reused as the submission column names.
label_dict = {val: idx for idx, val in enumerate(sorted(train_df['target'].unique()))}
train_df['target_num'] = train_df['target'].map(label_dict)

In [None]:
fig, ax = plt.subplots(figsize=(12,12))

# Correlate numeric columns only: the string `target` column cannot take part
# in a correlation matrix, so it is excluded explicitly.
corr_mat = train_df.select_dtypes(include="number").corr()

# Boolean mask hiding the upper triangle (including the diagonal) so each pair
# is drawn once. The builtin `bool` replaces the removed `np.bool` alias.
mask = np.zeros_like(corr_mat, dtype=bool)
mask[np.triu_indices_from(mask)] = True

sns.heatmap(corr_mat, mask=mask, square=True, ax=ax, linewidths=0.1,cmap='coolwarm', center=0)

In [None]:
plt.rcParams['axes.facecolor'] = 'black'

In [None]:
target_order = sorted(train_df['target'].unique())

plt.figure(figsize=(8, 4))
sns.countplot(x=train_df["target"], order=target_order, palette='coolwarm');

The target column is imbalanced: class_2 is the majority (57.5%), followed by class_3 (21.5%), class_4 (12.5%) and class_1 (8.5%). We should therefore use StratifiedKFold to split the dataset.

In [None]:
# Sorted correlation between features and the target
# corr_target = corr_mat["target_num"][:-1].sort_values(ascending=False)
# fig = plt.figure(figsize=(16,5))
# sns.barplot(x=corr_target.index, y=corr_target.values, palette="RdYlGn")
# plt.title("Features correlation to the target column")
# plt.xticks(rotation=45);

In [None]:
fig = plt.figure(figsize=(16,5))
sns.barplot(x=corr_mat["target_num"][:-1].index, y=corr_mat["target_num"][:-1].values, palette="RdYlGn")
plt.title("Features correlation to the target column")
plt.xticks(rotation=90);

In [None]:
fig = plt.figure(figsize=(20,30))

for i, col in enumerate(train_df.drop(['target', 'target_num'], axis=1)):
    df = train_df[[col, 'target']].groupby('target').mean()
    plt.subplot(17,3, i+1)
    sns.barplot(x=df.index, y=df[col], palette="RdYlGn")
    plt.tight_layout()

The distribution of feature values across the classes seems to be balanced.

In [None]:
fig = plt.figure(figsize=(20,30))
fig.patch.set_facecolor('black')
sns.set_theme(style="dark")

for i, col in enumerate(train_df.drop(['target', 'target_num'], axis=1)):
    plt.subplot(17,3, i+1)
    sns.kdeplot(train_df[col], fill=True, color='red')
    sns.kdeplot(test_df[col], fill=True, color='blue')

The train and test sets have very similar distributions, and all of them are right-skewed. We may have to deal with the skew and check whether doing so improves the model's metrics.

In [None]:
# Number of distinct values per feature, computed separately for the train and
# test sets and gathered into one table indexed by feature name.
feat_cols = train_df.drop(["target", "target_num"], axis=1).columns
train_unique_list = [train_df[name].nunique() for name in feat_cols]
test_nunique_list = [test_df[name].nunique() for name in feat_cols]

unique_df = pd.DataFrame(
    {"train_nunique": train_unique_list, "test_nunique": test_nunique_list},
    index=feat_cols,
)

### Features cardinality.

In [None]:
print("Features cardinality")
unique_df.style.background_gradient(cmap="Blues")

There are some differences in the number of unique values between the train and test sets. If we want to apply techniques such as label encoding to the features, we have to take that into consideration.

### Variance Inflation Factor

The Variance Inflation Factor (VIF) is used to detect the presence of multicollinearity.

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Build the feature matrices once instead of re-slicing the frames for every
# column index inside the comprehensions.
train_matrix = train_df[feat_cols].values
test_matrix = test_df[feat_cols].values
n_features = train_matrix.shape[1]

# One VIF per feature, computed independently on the train and test matrices.
vif = pd.DataFrame()
vif["variables"] = feat_cols
vif["VIF_train"] = [variance_inflation_factor(train_matrix, i) for i in range(n_features)]
vif["VIF_test"] = [variance_inflation_factor(test_matrix, i) for i in range(n_features)]
vif.style.background_gradient(cmap="magma")

A rule of thumb for interpreting the variance inflation factor:
1 = not correlated.
Between 1 and 5 = moderately correlated.
Greater than 5 = highly correlated.

In [None]:
sns.set_theme(style="whitegrid")

In [None]:
fig = plt.figure(figsize=(20,35))
fig.patch.set_facecolor('white')
plt.rcParams['axes.facecolor'] = 'white'


for i, col in enumerate(feat_cols):
    plt.subplot(17, 3, i+1)
    # One violin per target class for this feature. The column name comes from
    # `feat_cols` directly rather than being rebuilt from the loop index.
    sns.violinplot(data=train_df, x="target", y=col,
                   split=True, inner="quart", linewidth=1)
    sns.despine(left=True)

# Lay the grid out once, after every subplot exists, instead of per subplot.
plt.tight_layout()

In [None]:
fig = plt.figure(figsize=(20,30))

for i, col in enumerate(train_df.drop(['target', 'target_num'], axis=1)):
    plt.subplot(17,3, i+1)
    sns.histplot(data=train_df[col], color='blue',bins=50)
    sns.histplot(data=test_df[col], color='red', bins=50)
    plt.xlim(0, 10)
    plt.tight_layout()

### Dummy variables from features with low cardinality

In [None]:
low_card_cols = ["feature_0","feature_2","feature_5","feature_13",
                 "feature_22","feature_36","feature_44"]

In [None]:
train_df['train'] = 1
test_df['train'] = 0

In [None]:
ys = train_df[['target_num','target']]

In [None]:
def create_dummies(dftrain, dftest, cols):
    """One-hot encode ``cols`` on the stacked train and test frames.

    The two frames are encoded together so that both end up with the same
    dummy columns, even when a category value occurs in only one of them.

    Parameters
    ----------
    dftrain : pandas.DataFrame
        Training frame; must contain ``target`` and ``target_num``, which are
        dropped before stacking.
    dftest : pandas.DataFrame
        Test frame with the same feature columns as ``dftrain``.
    cols : list of str
        Columns to replace with dummy columns.

    Returns
    -------
    pandas.DataFrame
        ``dftrain`` (without the target columns) stacked on top of ``dftest``,
        with ``cols`` one-hot encoded. The first level of each encoded column
        is dropped (``drop_first=True``). Original index labels are kept, so
        labels may repeat between the train and test parts.
    """
    features_train = dftrain.drop(['target', 'target_num'], axis=1)
    full_df = pd.concat([features_train, dftest], axis=0)
    return pd.get_dummies(full_df, columns=cols, drop_first=True)

In [None]:
new_full_df = create_dummies(train_df, test_df, low_card_cols)

In [None]:
new_train_df = new_full_df[new_full_df['train'] == 1].drop('train', axis=1)
new_test_df = new_full_df[new_full_df['train'] == 0].drop('train', axis=1)

In [None]:
new_train_df = pd.concat([new_train_df,ys], axis=1)

In [None]:
#for col in train_df.columns:
    #print(f"{col} unique values: {train_df[col].unique()}")

In [None]:
train_df.drop("train", axis=1, inplace=True)
test_df.drop("train", axis=1, inplace=True)

### Calculate log

In [None]:
# The low-cardinality features were one-hot encoded above, so remove their
# names from the list of columns that will be log-transformed.
feat_cols = feat_cols.drop(low_card_cols)

In [None]:
def calculate_log(df, cols):
    """Return a copy of ``df`` with the natural log applied to positive values.

    For every column in ``cols``, strictly positive entries are replaced by
    ``np.log(x)``; zero, negative and NaN entries are left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    cols : iterable of str
        Names of the numeric columns to transform.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which the transformed columns hold floats.

    Notes
    -----
    Both 0 and 1 map to 0 under this transform (``log(1) == 0``), so the two
    values become indistinguishable afterwards.
    """
    df = df.copy()
    for col in cols:
        # Work on a float copy so integer columns can hold the log values.
        values = df[col].to_numpy(dtype=float, copy=True)
        positive = values > 0
        # Only positive entries are passed to np.log; the rest stay as they are.
        values[positive] = np.log(values[positive])
        # Replace the whole column rather than writing through `.loc`, which
        # would try to fit the floats into the column's existing dtype.
        df[col] = values
    return df

In [None]:
train_transformed = calculate_log(new_train_df, feat_cols)
test_transformed = calculate_log(new_test_df, feat_cols)

In [None]:
train_transformed.describe()

## Base model

In [None]:
import lightgbm as lgbm
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score, log_loss, classification_report, confusion_matrix

In [None]:
#seed = 1945
#train_transformed = train_transformed.sample(frac=1, random_state=seed).reset_index(drop=True)

#train_transformed['kfold'] = -1
#skf = StratifiedKFold(n_splits=5)

#for fold, (train_idx, valid_idx) in enumerate(skf.split(X=train_transformed, y=train_transformed['target_num'])):
    #train_transformed.loc[valid_idx, "kfold"] = fold

In [None]:
seed = 1945
train_df = train_df.sample(frac=1, random_state=seed).reset_index(drop=True)

train_df['kfold'] = -1
skf = StratifiedKFold(n_splits=5)

for fold, (train_idx, valid_idx) in enumerate(skf.split(X=train_df, y=train_df['target_num'])):
    train_df.loc[valid_idx, "kfold"] = fold

In [None]:
seed = 1945
# Shuffling the training rows is safe: the `kfold` and target columns travel
# with each row.
train_df_shuffled = train_df.sample(frac=1, random_state=seed).reset_index(drop=True)
# The test rows must stay in file order, because predictions are later paired
# positionally with `sub_sample_df["id"]`. The variable name is kept so that
# later cells keep working, but no shuffling is applied.
test_df_shuffled = test_df.reset_index(drop=True)

In [None]:
def run_training(algo, df, test, fold, oof):
    """Train the baseline LightGBM multiclass model on one cross-validation fold.

    Parameters
    ----------
    algo : module
        Object exposing ``train`` (the caller passes the ``lightgbm`` module).
    df : pandas.DataFrame
        Training frame holding the features plus the ``kfold``, ``target`` and
        ``target_num`` columns.
    test : pandas.DataFrame
        Test features with the same feature columns as ``df``.
    fold : int
        Value of ``kfold`` that is held out for validation.
    oof : numpy.ndarray
        Out-of-fold prediction buffer with one row per row of ``df``, aligned
        by position; the rows of this fold are filled in place.

    Returns
    -------
    tuple
        ``(model, oof, test_ypred)``: the fitted booster, the updated buffer
        and the class probabilities predicted for ``test``.
    """
    # Locate this fold's validation rows inside `df` itself, so the function
    # no longer depends on a leftover global index array from another cell.
    valid_mask = (df.kfold == fold).to_numpy()
    valid_pos = np.flatnonzero(valid_mask)

    t_df = df[~valid_mask].reset_index(drop=True)
    v_df = df[valid_mask].reset_index(drop=True)

    xtrain = t_df.drop(["kfold", "target", "target_num"], axis=1)
    xvalid = v_df.drop(["kfold", "target", "target_num"], axis=1)

    ytrain = t_df['target_num'].values
    yvalid = v_df['target_num'].values

    #sc = MinMaxScaler()
    #xtrain = sc.fit_transform(xtrain)
    #xvalid = sc.transform(xvalid)
    #test = sc.transform(test)

    dtrain = lgbm.Dataset(xtrain, label=ytrain)
    dvalid = lgbm.Dataset(xvalid, label=yvalid)

    # NOTE(review): `early_stopping_rounds` / `verbose_eval` are keyword
    # arguments of older LightGBM releases; newer releases expect callbacks
    # instead. Confirm against the installed version.
    model = algo.train(
            params={
                "objective":"multiclass",
                "metrics": "multi_logloss",
                "num_class":4
            },
            train_set=dtrain,
            num_boost_round=1000,
            valid_sets=(dtrain,dvalid),
            valid_names=('train','valid'),
            early_stopping_rounds=100,
            verbose_eval=100,
    )

    ypred = model.predict(xvalid)
    # v_df keeps the row order of df[valid_mask], so prediction k belongs to
    # position valid_pos[k] of `df`.
    oof[valid_pos] = ypred

    test_ypred = model.predict(test)

    print()
    print(f"Valid's logloss: {log_loss(yvalid, ypred):.5f}")
    print(f"Valid's ROC AUC: {roc_auc_score(yvalid, ypred, multi_class='ovo'):.5f}")

    return model, oof, test_ypred

In [None]:
NUM_CLASS=4
N_FOLDS = 5  # matches StratifiedKFold(n_splits=5) used to fill the `kfold` column
oof = np.zeros((len(train_df_shuffled), NUM_CLASS))
test_oof = np.zeros((len(test_df_shuffled), NUM_CLASS))

for fold in range(N_FOLDS):
    model, oof, test_preds1 = run_training(lgbm, train_df_shuffled, test_df_shuffled, fold, oof)
    # Average the test predictions over the folds. Dividing by the number of
    # classes would leave rows that do not sum to 1.
    test_oof += test_preds1 / N_FOLDS

In [None]:
sub_df = pd.DataFrame(np.clip(test_oof, 0.025, 0.975))
sub_df.columns = label_dict.keys()
sub_df["id"] = sub_sample_df["id"].values

In [None]:
sub_df.to_csv("sub_base_first.csv", index=False)

In [None]:
# Read the true labels from the frame that was passed to run_training
# (`train_df_shuffled`), not from the un-shuffled `train_transformed`, so the
# labels and the out-of-fold predictions refer to the same rows.
y_true = train_df_shuffled["target_num"].values
y_pred_ser1 = pd.Series(np.argmax(oof, axis=1))
print(classification_report(y_true, y_pred_ser1))

In [None]:
class_labels = list(label_dict.keys())

In [None]:
plt.figure(figsize=(8,6))
sns.heatmap(confusion_matrix(y_true, y_pred_ser1), 
            cmap='viridis', annot=True, fmt=".0f",
            xticklabels=class_labels,yticklabels=class_labels);

## Optuna

In [None]:
#import optuna
#from optuna.pruners import SuccessiveHalvingPruner
#from sklearn.model_selection import train_test_split

#X = train_transformed.drop(["kfold","target", "target_num"], axis=1)
#y = train_transformed["target_num"].values

#def objective(trial):
    
    #X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2,
                                                          #random_state=1945, stratify=y)
    
    #sc = MinMaxScaler()
    #X_train = sc.fit_transform(X_train)
    #X_valid = sc.transform(X_valid)
    
    #dtrain = lgbm.Dataset(X_train, label=y_train)
    #dvalid = lgbm.Dataset(X_valid, label=y_valid)
    
    #params = {
        #"objective":"multiclass",
        #"metrics":"multi_logloss",
        #"num_class": 4,
        #"learning_rate": trial.suggest_loguniform("learning_rate", 1e-4, 1e-1),
        #"num_leaves": trial.suggest_int("num_leaves", 2, 255),
        #"max_depth": trial.suggest_int("max_depth", 3, 30),
        #"num_iterations": trial.suggest_int("num_iterations", 100, 1000),
        #"bagging_fraction": trial.suggest_uniform("bagging_fraction", 0.4, 1.0),
        #"bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        #"lambda_l1": trial.suggest_loguniform("lambda_l1", 1e-8, 10.0),
        #"lambda_l2": trial.suggest_loguniform("lambda_l2", 1e-8, 10.0),
        #"max_bin": trial.suggest_int("max_bin", 2, 255),
    #}
    
    #model = lgbm.train(params,dtrain)
    #preds = model.predict(X_valid)
    #logloss = log_loss(y_valid, preds)
    #return logloss

#study1 = optuna.create_study(direction="minimize",pruner=SuccessiveHalvingPruner())
#study1.optimize(objective, n_trials=100)

#print("Number of finished trials: ", len(study1.trials))
#print("Best trial: ", study1.best_trial.params)

In [None]:
# Hyperparameters found with train dataset without any transformation

best_params = {'learning_rate': 0.025845340173733224,
 'num_leaves': 8,
 'max_depth': 28,
 'num_iterations': 754,
 'boosting': 'gbdt',
 'bagging_fraction': 0.8681224214865364,
 'bagging_freq': 2,
 'lambda_l1': 8.538579636440432e-08,
 'lambda_l2': 3.164760963598122e-07,
 'max_bin': 149,
 'objective': 'multiclass',
 'metrics': 'multi_logloss',
 'num_class': 4}

In [None]:
# hyperparams found by optuna with transformed dataset
#best_params={'learning_rate': 0.01559063647642801,
 #'num_leaves': 11,
 #'max_depth': 9,
 #'num_iterations': 830,
 #'bagging_fraction': 0.6535376526131755,
 #'bagging_freq': 1,
 #'lambda_l1': 3.5657610302806274e-08,
 #'lambda_l2': 0.47634999839812464,
 #'max_bin': 248}

In [None]:
# best_params = study1.best_params
best_params["objective"] = "multiclass"
best_params["metrics"] = "multi_logloss"
best_params["num_class"] = 4

The best hyperparameters found with optuna:

{'learning_rate': 0.025845340173733224,
 'num_leaves': 8,
 'max_depth': 28,
 'num_iterations': 754,
 'boosting': 'gbdt',
 'bagging_fraction': 0.8681224214865364,
 'bagging_freq': 2,
 'lambda_l1': 8.538579636440432e-08,
 'lambda_l2': 3.164760963598122e-07,
 'max_bin': 149,
 'objective': 'multiclass',
 'metrics': 'multi_logloss',
 'num_class': 4}

## Final model prediction and submission

In [None]:
lgbm_final_model = lgbm.LGBMClassifier(**best_params)

In [None]:
train_df_t = train_transformed.copy()
test_df_t = test_transformed.copy()

In [None]:
#X = train_df_t.drop(["kfold","target","target_num"], axis=1)
#y = train_df_t["target_num"].values

In [None]:
X = train_df_shuffled.drop(["kfold","target","target_num"], axis=1)
y = train_df_shuffled["target_num"].values

In [None]:
sc = MinMaxScaler()
X_scaled = sc.fit_transform(X)
test_scaled = sc.transform(test_df)

In [None]:
lgbm_final_model.fit(X_scaled, y)
final_preds = lgbm_final_model.predict_proba(test_scaled)

In [None]:
sub_df = pd.DataFrame(final_preds) # np.clip(final_preds, 0.025, 0.975)
sub_df.columns = label_dict.keys()
sub_df["id"] = sub_sample_df["id"].values

In [None]:
sub_df.to_csv("optuna_sub.csv", index=False)

## Optuna best params with 5 kfolds submission

In [None]:
def run_training2(algo, df, test, fold, oof):
    """Train LightGBM with the global ``best_params`` on one fold, with scaling.

    Parameters
    ----------
    algo : module
        Object exposing ``train`` (the caller passes the ``lightgbm`` module).
    df : pandas.DataFrame
        Training frame holding the features plus the ``kfold``, ``target`` and
        ``target_num`` columns.
    test : pandas.DataFrame
        Test features with the same feature columns as ``df``.
    fold : int
        Value of ``kfold`` that is held out for validation.
    oof : numpy.ndarray
        Out-of-fold prediction buffer with one row per row of ``df``, aligned
        by position; the rows of this fold are filled in place.

    Returns
    -------
    tuple
        ``(model, oof, test_ypred, logloss)``, where ``logloss`` is the
        validation log-loss of this fold.
    """
    # Locate this fold's validation rows inside `df` itself, so the function
    # no longer depends on a leftover global index array from another cell.
    valid_mask = (df.kfold == fold).to_numpy()
    valid_pos = np.flatnonzero(valid_mask)

    t_df = df[~valid_mask].reset_index(drop=True)
    v_df = df[valid_mask].reset_index(drop=True)

    xtrain = t_df.drop(["kfold", "target", "target_num"], axis=1)
    xvalid = v_df.drop(["kfold", "target", "target_num"], axis=1)

    ytrain = t_df['target_num'].values
    yvalid = v_df['target_num'].values

    # The scaler is fitted on the training part only and then applied to the
    # validation and test features.
    sc = MinMaxScaler()
    xtrain = sc.fit_transform(xtrain)
    xvalid = sc.transform(xvalid)
    test = sc.transform(test)

    dtrain = lgbm.Dataset(xtrain, label=ytrain)
    dvalid = lgbm.Dataset(xvalid, label=yvalid)

    # NOTE(review): `early_stopping_rounds` / `verbose_eval` are keyword
    # arguments of older LightGBM releases; newer releases expect callbacks
    # instead. Confirm against the installed version.
    model = algo.train(best_params,
            train_set=dtrain,
            num_boost_round=1000,
            valid_sets=[dtrain,dvalid],
            valid_names=['train','valid'],
            early_stopping_rounds=100,
            verbose_eval=100
    )

    ypred = model.predict(xvalid)
    # v_df keeps the row order of df[valid_mask], so prediction k belongs to
    # position valid_pos[k] of `df`.
    oof[valid_pos] = ypred

    test_ypred = model.predict(test)
    logloss = log_loss(yvalid, ypred)

    print()
    print(f"Fold={fold+1}")
    print(f"Valid's ROC AUC: {roc_auc_score(yvalid, ypred, multi_class='ovo'):.5f}")
    print(f"Valid's logloss: {logloss:.5f}")

    return model, oof, test_ypred, logloss


N_FOLDS = 5  # matches StratifiedKFold(n_splits=5) used to fill the `kfold` column
oof = np.zeros((len(train_df_shuffled), NUM_CLASS))
test_oof2 = np.zeros((len(test_df), NUM_CLASS))
logloss_list = []

for fold in range(N_FOLDS):
    lgbm_model, oof, test_preds, logloss = run_training2(lgbm, train_df_shuffled, test_df, fold, oof)
    # Average the test predictions over the folds. Dividing by the number of
    # classes would leave rows that do not sum to 1.
    test_oof2 += test_preds / N_FOLDS
    logloss_list.append(logloss)
print(f"Mean log_loss after {fold+1} folds: {np.mean(logloss_list)}")

Metrics with train dataset without any transformation:
- Valid's ROC AUC: 0.57390
- Valid's logloss: 1.09211
- Mean log_loss after 5 folds: 1.09298154529417

In [None]:
sub_df = pd.DataFrame(np.clip(test_oof2, 0.025, 0.975)) # np.clip(final_preds, 0.025, 0.975)
sub_df.columns = label_dict.keys()
sub_df["id"] = sub_sample_df["id"].values
sub_df.to_csv("sub_5kfold_optuna_hyperparams.csv", index=False)

In [None]:
# Read the true labels from the frame that was passed to run_training2
# (`train_df_shuffled`); `train_df` holds the same rows in a different order.
y_true = train_df_shuffled["target_num"].values
y_pred_ser2 = pd.Series(np.argmax(oof, axis=1))
print(classification_report(y_true, y_pred_ser2))

In [None]:
plt.figure(figsize=(8,6))
sns.heatmap(confusion_matrix(y_true, y_pred_ser2),
            cmap='viridis', annot=True, fmt=".0f",
            xticklabels=class_labels,yticklabels=class_labels);

The model trained on the dataset without any transformation received 1.08741 on the LB, but it is clear that the model, with or without transformation, doesn't predict class_1 and class_4 at all. Maybe the SMOTE technique could help with that.

## SMOTE Technique

In [None]:
label_dict

In [None]:
from imblearn.over_sampling import SMOTE
from collections import Counter

In [None]:
X = train_df_t.drop(["target","target_num"], axis=1).copy()
y = train_df_t['target_num'].values

In [None]:
counter = Counter(train_df_t['target_num'])
print(counter)

In [None]:
#def objective(trial):
    
    #X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2,
                                                          #random_state=1945, stratify=y)
    
    #sm = SMOTE(random_state=1945)
    #X_train_sm, y_train_sm = sm.fit_resample(X_train, y_train.ravel())
    
    #sc = MinMaxScaler()
    #X_train_sm = sc.fit_transform(X_train_sm)
    #X_valid = sc.transform(X_valid)
    
    #dtrain = lgbm.Dataset(X_train_sm, label=y_train_sm)
    
    #params = {
        #"boosting": "gbdt",
        #"objective":"multiclass",
        #"metrics":"multi_logloss",
        #"num_class": 4,
        #"learning_rate": trial.suggest_loguniform("learning_rate", 1e-4, 1e-1),
        #"num_leaves": trial.suggest_int("num_leaves", 2, 255),
        #"max_depth": trial.suggest_int("max_depth", 3, 30),
        #"num_iterations": trial.suggest_int("num_iterations", 100, 1000),
        #"bagging_fraction": trial.suggest_uniform("bagging_fraction", 0.4, 1.0),
        #"bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        #"lambda_l1": trial.suggest_loguniform("lambda_l1", 1e-8, 10.0),
        #"lambda_l2": trial.suggest_loguniform("lambda_l2", 1e-8, 10.0),
        #"max_bin": trial.suggest_int("max_bin", 2, 255),
    #}
    
    #model = lgbm.train(params,dtrain)
    #preds = model.predict(X_valid)
    #logloss = log_loss(y_valid, preds)
    #return logloss

#study2 = optuna.create_study(direction="minimize",pruner=SuccessiveHalvingPruner())
#study2.optimize(objective, n_trials=100)

#print("Number of finished trials: ", len(study2.trials))
#print("Best trial: ", study2.best_trial.params)

In [None]:
#study2.best_params

The best hyperparameters for SMOTE dataset:

{'learning_rate': 0.09816514507767324,
 'num_leaves': 206,
 'max_depth': 30,
 'num_iterations': 1000,
 'bagging_fraction': 0.6918190371064439,
 'bagging_freq': 6,
 'lambda_l1': 0.0007670842709700264,
 'lambda_l2': 0.019842881946747774,
 'max_bin': 127}


In [None]:
# Hyperparameters found with training dataset without any transformation

#best_params = {
        #"objective": "multiclass",
        #"metrics": "multi_logloss",
        #"num_class": 4,
        #"learning_rate": 0.09816514507767324,
        #"num_leaves": 206,
        #"max_depth": 30,
        #"num_iterations":1000,
        #"bagging_fraction": 0.6918190371064439,
        #"bagging_freq": 6,
        #"lambda_l1": 0.0007670842709700264,
        #"lambda_l2": 0.019842881946747774,
        #"max_bin": 127
#}

In [None]:
# Hyperparams for transformed train dataset
best_params = {'learning_rate': 0.01382837447462176,
 'num_leaves': 117,
 'max_depth': 29,
 'num_iterations': 399,
 'bagging_fraction': 0.4885473849475542,
 'bagging_freq': 1,
 'lambda_l1': 0.00039613672631532814,
 'lambda_l2': 0.0005240426920058911,
 'max_bin': 181}

In [None]:
#best_params = study2.best_params
best_params["objective"] = "multiclass"
best_params["metrics"] = "multi_logloss"
best_params["num_class"] = 4

In [None]:
X['kfold'] = -1
skf = StratifiedKFold(n_splits=5)
valid_idx_list = []

for fold, (train_idx, valid_idx) in enumerate(skf.split(X=X, y=y)):
    X.loc[valid_idx, "kfold"] = fold
    valid_idx_list.append(valid_idx)

In [None]:
X["target_num"] = y

In [None]:
def run_training3(algo, df, test, fold, oof):
    """Train LightGBM on one fold after SMOTE-oversampling the training part.

    Parameters
    ----------
    algo : module
        Object exposing ``train`` (the caller passes the ``lightgbm`` module).
    df : pandas.DataFrame
        Training frame holding the features plus the ``kfold`` and
        ``target_num`` columns.
    test : pandas.DataFrame
        Test features with the same feature columns as ``df``.
    fold : int
        Value of ``kfold`` that is held out for validation.
    oof : numpy.ndarray
        Out-of-fold prediction buffer with one row per row of ``df``, aligned
        by position; the rows of this fold are filled in place.

    Returns
    -------
    tuple
        ``(model, oof, test_ypred, logloss)``, where ``logloss`` is the
        validation log-loss of this fold.
    """
    # Derive the validation positions from `df` rather than from the global
    # `valid_idx_list`, so the function only depends on its arguments.
    valid_mask = (df.kfold == fold).to_numpy()
    valid_pos = np.flatnonzero(valid_mask)

    t_df = df[~valid_mask].reset_index(drop=True)
    v_df = df[valid_mask].reset_index(drop=True)

    xtrain = t_df.drop(["kfold","target_num"], axis=1)
    xvalid = v_df.drop(["kfold","target_num"], axis=1)

    ytrain = t_df['target_num'].values
    yvalid = v_df['target_num'].values

    # Oversample the training part only; the validation rows stay untouched,
    # so the reported metrics are computed on the original rows.
    sm = SMOTE(random_state=1945)
    xtrain_sm, ytrain_sm = sm.fit_resample(xtrain, ytrain)

    sc = MinMaxScaler()
    xtrain_sm = sc.fit_transform(xtrain_sm)
    xvalid = sc.transform(xvalid)
    test = sc.transform(test)

    dtrain = lgbm.Dataset(xtrain_sm, label=ytrain_sm)
    dvalid = lgbm.Dataset(xvalid, label=yvalid)

    # NOTE(review): `early_stopping_rounds` / `verbose_eval` are keyword
    # arguments of older LightGBM releases; newer releases expect callbacks
    # instead. Confirm against the installed version.
    model = algo.train(best_params,
            train_set=dtrain,
            num_boost_round=5000,
            valid_sets=[dtrain,dvalid],
            valid_names=['train','valid'],
            early_stopping_rounds=100,
            verbose_eval=100
    )

    ypred = model.predict(xvalid)
    # v_df keeps the row order of df[valid_mask], so prediction k belongs to
    # position valid_pos[k] of `df`.
    oof[valid_pos] = ypred

    test_ypred = model.predict(test)
    logloss = log_loss(yvalid, ypred)

    print()
    print(f"Fold={fold+1}")
    print(f"Valid's ROC AUC: {roc_auc_score(yvalid, ypred, multi_class='ovo'):.5f}")
    print(f"Valid's logloss: {logloss:.5f}")

    return model, oof, test_ypred, logloss


N_FOLDS = 5  # matches StratifiedKFold(n_splits=5) used to fill the `kfold` column
oof = np.zeros((len(X),NUM_CLASS))
test_oof3 = np.zeros((len(test_df), NUM_CLASS))
logloss_list = []

for fold in range(N_FOLDS):
    lgbm_model, oof, test_preds, logloss = run_training3(lgbm, X, test_transformed, fold, oof)
    # Average the test predictions over the folds. Dividing by the number of
    # classes would leave rows that do not sum to 1.
    test_oof3 += test_preds / N_FOLDS
    logloss_list.append(logloss)
print(f"Mean log_loss after {fold+1} folds: {np.mean(logloss_list)}")

In [None]:
#seed = 1945
#train_transformed = train_transformed.sample(frac=1, random_state=seed).reset_index(drop=True)

#train_transformed['kfold'] = -1
#skf = StratifiedKFold(n_splits=5)

#for fold, (train_idx, valid_idx) in enumerate(skf.split(X=train_transformed, y=train_transformed['target_num'])):
    #train_transformed.loc[valid_idx, "kfold"] = fold

# This assignment had been fused onto the end of the commented-out line above,
# so it never ran and the report used `y_true` from an earlier section.
# The labels are taken from `X`, the frame the SMOTE models were trained on.
y_true = X["target_num"].values
y_pred_ser3 = pd.Series([np.argmax(line) for line in oof])
print(classification_report(y_true, y_pred_ser3))

In [None]:
plt.figure(figsize=(8,6))
sns.heatmap(confusion_matrix(y_true, y_pred_ser3),
            cmap='viridis', annot=True, fmt=".0f",
            xticklabels=class_labels, yticklabels=class_labels);

In [None]:
sub_df = pd.DataFrame(test_oof3) # np.clip(final_preds, 0.025, 0.975)
sub_df.columns = label_dict.keys()
sub_df["id"] = sub_sample_df["id"].values
sub_df.to_csv("sub_SMOTE_transform_5folds.csv", index=False)

The classification report reveals one important advantage of this technique: it improves prediction for the other classes. As a reminder, the model trained without SMOTE predicted mostly class 2.

## Class Weights

In [None]:
from collections import Counter

In [None]:
def get_class_weights(y):
    """Return one weight per class, inversely proportional to its frequency.

    Each weight is ``majority_count / class_count`` rounded to two decimals,
    so the most frequent class gets 1.0 and rarer classes get larger values.

    Parameters
    ----------
    y : iterable
        Class labels, one per sample.

    Returns
    -------
    dict
        Mapping from class label to its weight.
    """
    frequencies = Counter(y)
    largest = float(max(frequencies.values()))
    weights = {}
    for label, occurrences in frequencies.items():
        weights[label] = round(largest / float(occurrences), 2)
    return weights

class_weights = get_class_weights(X["target_num"].values)
print(class_weights)

### Optuna

In [None]:
pd.Series(y_true).unique()

In [None]:
pd.Series(y).unique()

In [None]:
X = train_df.drop(["target","target_num"],axis=1)
y = train_df["target_num"]

In [None]:
#import optuna
#from optuna.pruners import SuccessiveHalvingPruner
#from sklearn.model_selection import train_test_split


#def objective(trial):
    
    #X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2,
                                                          #random_state=1945, stratify=y)
    
    #sc = MinMaxScaler()
    #X_train = sc.fit_transform(X_train)
    #X_valid = sc.transform(X_valid)
    
    #params = {
        #"objective":"multiclass",
        #"metrics":"multi_logloss",
        #"num_class": 4,
        #"class_weight": class_weights,
        #"learning_rate": trial.suggest_loguniform("learning_rate", 1e-4, 1e-1),
        #"num_leaves": trial.suggest_int("num_leaves", 2, 255),
        #"max_depth": trial.suggest_int("max_depth", 3, 30),
        #"num_iterations": trial.suggest_int("num_iterations", 100, 1000),
        #"bagging_fraction": trial.suggest_uniform("bagging_fraction", 0.4, 1.0),
        #"bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        #"lambda_l1": trial.suggest_loguniform("lambda_l1", 1e-8, 10.0),
        #"lambda_l2": trial.suggest_loguniform("lambda_l2", 1e-8, 10.0),
        #"max_bin": trial.suggest_int("max_bin", 2, 255),
    #}
    
    #model = lgbm.LGBMClassifier(**params)
    #model.fit(X_train, y_train)
    #preds = model.predict_proba(X_valid)
    #logloss = log_loss(y_valid, preds)
    #return logloss

#study3 = optuna.create_study(direction="minimize",pruner=SuccessiveHalvingPruner())
#study3.optimize(objective, n_trials=100)

#print("Number of finished trials: ", len(study3.trials))
#print("Best trial: ", study3.best_trial.params)

The best hyperparameters for the model with weighted classes:
{'learning_rate': 0.03491633418671592,
 'num_leaves': 248,
 'max_depth': 29,
 'num_iterations': 852,
 'bagging_fraction': 0.9713021482558285,
 'bagging_freq': 7,
 'lambda_l1': 0.013629154943194912,
 'lambda_l2': 0.6536148007804358,
 'max_bin': 6}

In [None]:
#best_params = study3.best_params

In [None]:
# hyperparams from optuna
best_params = {'learning_rate': 0.03491633418671592,
               'num_leaves': 248,
               'max_depth': 29,
               'num_iterations': 852,
               'bagging_fraction': 0.9713021482558285,
               'bagging_freq': 7,
               'lambda_l1': 0.013629154943194912,
               'lambda_l2': 0.6536148007804358,
                'max_bin': 6}

best_params["objective"] = "multiclass"
best_params["metrics"] = "multi_logloss"
best_params["num_class"] = 4
best_params["class_weight"] = class_weights

In [None]:
train_df['kfold'] = -1
skf = StratifiedKFold(n_splits=5)
valid_idx_list = []

for fold, (train_idx, valid_idx) in enumerate(skf.split(X=train_df, y=train_df['target_num'])):
    train_df.loc[valid_idx, "kfold"] = fold
    valid_idx_list.append(valid_idx)

In [None]:
def run_training4(algo, df, test, fold, oof):
    """Fit a class-weighted ``LGBMClassifier`` on one cross-validation fold.

    The classifier is built from the global ``best_params``, which the caller
    has extended with a ``class_weight`` entry.

    Parameters
    ----------
    algo : module
        Object exposing ``LGBMClassifier`` (the caller passes ``lightgbm``).
    df : pandas.DataFrame
        Training frame holding the features plus the ``kfold``, ``target`` and
        ``target_num`` columns.
    test : pandas.DataFrame
        Test features with the same feature columns as ``df``.
    fold : int
        Value of ``kfold`` that is held out for validation.
    oof : numpy.ndarray
        Out-of-fold prediction buffer with one row per row of ``df``, aligned
        by position; the rows of this fold are filled in place.

    Returns
    -------
    tuple
        ``(model, oof, test_ypred, logloss)``, where ``logloss`` is the
        validation log-loss of this fold.
    """
    # Derive the validation positions from `df` rather than from the global
    # `valid_idx_list`, so the function only depends on its arguments.
    valid_mask = (df.kfold == fold).to_numpy()
    valid_pos = np.flatnonzero(valid_mask)

    t_df = df[~valid_mask].reset_index(drop=True)
    v_df = df[valid_mask].reset_index(drop=True)

    xtrain = t_df.drop(["kfold","target","target_num"], axis=1)
    xvalid = v_df.drop(["kfold","target","target_num"], axis=1)

    ytrain = t_df['target_num'].values
    yvalid = v_df['target_num'].values

    # The scaler is fitted on the training part only and then applied to the
    # validation and test features.
    sc = MinMaxScaler()
    xtrain = sc.fit_transform(xtrain)
    xvalid = sc.transform(xvalid)
    test = sc.transform(test)

    # Use the module passed in as `algo` instead of the global `lgbm` name.
    model = algo.LGBMClassifier(**best_params)
    model.fit(xtrain, ytrain)

    ypred = model.predict_proba(xvalid)
    # v_df keeps the row order of df[valid_mask], so prediction k belongs to
    # position valid_pos[k] of `df`.
    oof[valid_pos] = ypred

    test_ypred = model.predict_proba(test)
    logloss = log_loss(yvalid, ypred)

    print()
    print(f"Fold={fold+1}")
    print(f"Valid's ROC AUC: {roc_auc_score(yvalid, ypred, multi_class='ovo'):.5f}")
    print(f"Valid's logloss: {logloss:.5f}")

    return model, oof, test_ypred, logloss


N_FOLDS = 5  # matches StratifiedKFold(n_splits=5) used to fill the `kfold` column
oof = np.zeros((len(train_df),NUM_CLASS))
test_oof4 = np.zeros((len(test_df), NUM_CLASS))
logloss_list = []

for fold in range(N_FOLDS):
    lgbm_model, oof, test_preds, logloss = run_training4(lgbm, train_df, test_df, fold, oof)
    # Average the test predictions over the folds. Dividing by the number of
    # classes would leave rows that do not sum to 1.
    test_oof4 += test_preds / N_FOLDS
    logloss_list.append(logloss)
print(f"Mean log_loss after {fold+1} folds: {np.mean(logloss_list)}")

In [None]:
y_true = train_df["target_num"].values
y_pred_ser4 = pd.Series([np.argmax(line) for line in oof])
print(classification_report(y_true, y_pred_ser4))

In [None]:
plt.figure(figsize=(8,6))
sns.heatmap(confusion_matrix(y_true, y_pred_ser4), 
            cmap='viridis', annot=True, fmt=".0f",
            xticklabels=class_labels, yticklabels=class_labels);

In [None]:
sub_df = pd.DataFrame(np.clip(test_oof4, 0.025, 0.975))
sub_df.columns = label_dict.keys()
sub_df["id"] = sub_sample_df["id"].values
sub_df.to_csv("sub_class_weight.csv", index=False)