In [None]:
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import confusion_matrix, mean_squared_error, mean_squared_log_error, classification_report, balanced_accuracy_score
from sklearn.metrics import log_loss

from sklearn.utils import class_weight
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from imblearn.over_sampling import SMOTE

import optuna
from optuna.samplers import TPESampler

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the competition train and test tables from the relative ../input directory.
train = pd.read_csv('../input/tabular-playground-series-may-2021/train.csv')
test = pd.read_csv('../input/tabular-playground-series-may-2021/test.csv')
# Preview the first rows of the test table.
test.head()

In [None]:
# Drop the row identifier from both tables.
# errors='ignore' keeps this cell re-runnable: `train` / `test` are rebound to the
# id-less frames, so a second execution would otherwise raise KeyError on 'id'.
train = train.drop(columns='id', errors='ignore')
test = test.drop(columns='id', errors='ignore')

# Per-class row counts, returned as an ascending list (class labels are not kept).
sorted(train['target'].value_counts())

In [None]:
# Encode the string class labels as integers and detach them from the features.
# `pop` removes the 'target' column from `train` in place and hands it to the encoder,
# so after this cell `train` holds features only and `target` is a one-column frame.
lencoder = LabelEncoder()
target = pd.DataFrame({'target': lencoder.fit_transform(train.pop('target'))})

In [None]:
# Bar chart of how many training rows fall into each label-encoded class.
sns.countplot(x = 'target', data = target)

In [None]:
# Disabled experiment: resample `train` / `target` with SMOTE, then show the
# column sum of the resampled target. Uncomment all three lines to enable it.
#oversample = SMOTE()
#train, target = oversample.fit_resample(train, target)
#np.sum(target, axis = 0)

In [None]:
# Hold out 20% of the rows as a validation set. The split is stratified on the
# encoded target and uses a fixed random_state so it is repeatable.
X_train, X_val, y_train, y_val = train_test_split(train, target, test_size = 0.20, 
                                                  stratify = target, random_state = 2021)

In [None]:
# Taking from https://www.kaggle.com/remekkinas/tps-5-weighted-training-xgb-rf-lr-smote

def _rmse(y_true, y_pred):
    """Return the root-mean-squared error between true and predicted labels.

    Computed as sqrt(MSE) instead of ``mean_squared_error(..., squared=False)``
    because the ``squared`` flag was deprecated in scikit-learn 1.4 and removed in 1.6.
    """
    return np.sqrt(mean_squared_error(y_true, y_pred))


def training(model, X_train_oof, y_train_oof, weighted = False, b_type = True,
             n_splits = 2, X_test = None, X_holdout = None, sample_weights = None):
    """Refit `model` on stratified folds and average its class probabilities.

    For every fold the model is fitted on the training part, RMSE between the
    predicted and true (encoded) class labels is printed for the training and
    validation parts, and ``predict_proba`` is evaluated on the test set and on
    the hold-out set. The probabilities are averaged over the folds.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit``, ``predict`` and ``predict_proba``. It is refitted
        in place on every fold.
    X_train_oof, y_train_oof : pandas objects
        Features and labels to split; they are accessed through ``.values`` and ``.iloc``.
    weighted : bool, default False
        Pass per-row sample weights to ``fit``. Only honoured when `b_type` is True.
    b_type : bool, default True
        True: call ``fit`` with ``eval_set`` and ``verbose = 500``.
        False: call plain ``fit(X, y)``.
    n_splits : int, default 2
        Number of stratified folds.
    X_test : array-like, optional
        Data to predict for the submission. Defaults to the notebook global ``test``.
    X_holdout : array-like, optional
        Hold-out data to predict. Defaults to the notebook global ``X_val``.
    sample_weights : array-like, optional
        One weight per row of `X_train_oof`, in the same row order. Defaults to the
        notebook global ``weights_df`` when `weighted` is True.

    Returns
    -------
    tuple
        ``(test_preds, holdout_preds)``: fold-averaged ``predict_proba`` outputs for
        `X_test` and `X_holdout`. The second element is a prediction on the fixed
        hold-out set, not an out-of-fold prediction.

    Raises
    ------
    ValueError
        If `weighted` is True and the number of weights differs from the number of rows.
    """
    # Fall back to the notebook-level globals so existing calls behave as before.
    if X_test is None:
        X_test = test
    if X_holdout is None:
        X_holdout = X_val
    if weighted and sample_weights is None:
        sample_weights = weights_df

    X_test_values = np.asarray(X_test)
    X_holdout_values = np.asarray(X_holdout)

    weight_values = None
    if weighted:
        # Flatten so that a single-column DataFrame is handed over as a 1-D vector.
        weight_values = np.asarray(sample_weights).ravel()
        if len(weight_values) != len(X_train_oof):
            raise ValueError("sample_weights must contain exactly one weight per row of X_train_oof")

    test_preds = None
    holdout_preds = None
    train_rmse_sum = 0
    val_rmse_sum = 0

    skf = StratifiedKFold(n_splits = n_splits, shuffle = True, random_state = 4042)
    for fold, (tr_index, val_index) in enumerate(skf.split(X_train_oof.values, y_train_oof.values)):

        print(f"\nFold {fold + 1}")
        x_train_o, x_val_o = X_train_oof.iloc[tr_index], X_train_oof.iloc[val_index]
        y_train_o, y_val_o = y_train_oof.iloc[tr_index], y_train_oof.iloc[val_index]

        if b_type:
            fit_kwargs = {'eval_set': [(x_val_o, y_val_o)], 'verbose': 500}
            if weighted:
                fit_kwargs['sample_weight'] = weight_values[tr_index]
            model.fit(x_train_o, y_train_o, **fit_kwargs)
        else:
            model.fit(x_train_o, y_train_o)

        # Each RMSE is computed once and reused for both the running sum and the log line.
        fold_train_rmse = _rmse(y_train_o, model.predict(x_train_o))
        train_rmse_sum += fold_train_rmse
        print("\n- Training RMSE : ", fold_train_rmse)

        fold_val_rmse = _rmse(y_val_o, model.predict(x_val_o))
        val_rmse_sum += fold_val_rmse
        print("- Validation RMSE : ", fold_val_rmse)
        print('---------------')

        fold_test_proba = model.predict_proba(X_test_values)
        fold_holdout_proba = model.predict_proba(X_holdout_values)
        if test_preds is None:
            test_preds = fold_test_proba
            holdout_preds = fold_holdout_proba
        else:
            test_preds += fold_test_proba
            holdout_preds += fold_holdout_proba

    print("\nAverage Training RMSE : " , train_rmse_sum / n_splits)
    print("Average Validation RMSE : " , val_rmse_sum / n_splits)

    test_preds /= n_splits
    holdout_preds /= n_splits

    return test_preds, holdout_preds

In [None]:
from sklearn.utils.class_weight import compute_sample_weight
# One "balanced" weight per row of y_train. training() only reads the global
# `weights_df` when it is called with weighted=True.
weights_df = pd.DataFrame(compute_sample_weight("balanced", y_train.target), columns = ['weight'])

xgb_model_weighted = xgb.XGBClassifier(eval_metric='mlogloss')
# NOTE(review): despite the model's name this run passes weighted = False, so
# `weights_df` is not used here — confirm that this is intended.
# Returns fold-averaged class probabilities for `test` and for `X_val`.
test_preds, y_pred = training(xgb_model_weighted, X_train, y_train, weighted = False)

In [None]:
# Turn the averaged class probabilities for X_val into hard class predictions.
y_preds = np.argmax(y_pred, axis=1)
# Note: this MSE is taken over the encoded class labels.
print(f'MSE Score: {mean_squared_error(y_val,y_preds)}\n')
print(classification_report(y_val, y_preds))

# Confusion matrix of validation labels against predicted classes, as an annotated heatmap.
sns.heatmap(pd.DataFrame(confusion_matrix(y_val, y_preds)), annot=True, linewidths=.5, fmt="d")

In [None]:
# Distribution of the classes predicted for the validation set by the XGBoost run.
sns.countplot(x = 'target', data= pd.DataFrame(y_preds, columns=['target']))

In [None]:
# Fixed LightGBM settings shared by the Optuna objective and the final model;
# the tuned hyperparameters are passed alongside these as separate keyword arguments.
params = {'objective': 'multiclass', 'num_class' : 4,  'metric': 'multi_logloss', 
              'verbosity' : -1, 'boosting_type' : 'gbdt', 'bagging_freq' : 1}

# Other boosting_type values to try: 'gbdt', 'goss', 'dart' (and possibly 'rf'?)

# Class-imbalance options currently left out of `params`:
#   'class_weight' : 'balanced'
#   'is_unbalance' : False

In [None]:
def objective(trial):
    """Optuna objective: mean 5-fold log loss of an LGBMClassifier (lower is better).

    Samples one hyperparameter set from `trial`, combines it with the fixed
    notebook-level `params`, and scores the model on the notebook globals
    `X_train` / `y_train` with ``cross_val_score(scoring='neg_log_loss', cv=5)``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to draw the hyperparameters.

    Returns
    -------
    float
        Mean log loss over the five folds (sign flipped so the study can minimise it).
    """
    # Every Optuna parameter name equals the LGBMClassifier keyword it feeds, so
    # `study.best_params` can be passed straight back as `LGBMClassifier(**params, **best_params)`.
    # (The original registered 'feature fraction' with a space, which broke that round trip.)
    # The sampling order is kept unchanged so seeded studies draw the same values.
    search_space = {
        'num_iterations': trial.suggest_int('num_iterations', 50, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'num_leaves': trial.suggest_int('num_leaves', 10, 30),
        # suggest_float replaces the deprecated suggest_uniform (same uniform range).
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'subsample': trial.suggest_float('subsample', 0.5, 0.9),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.5, 0.9),
        # Search dimensions that are currently disabled:
        # 'min_child_samples': trial.suggest_int('min_child_samples', 1, 110),
        # 'min_child_weight': trial.suggest_float('min_child_weight', 1e-5, 1, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-5, 20),
    }

    model = LGBMClassifier(**params, **search_space)

    # 'neg_log_loss' is the negated log loss, so flip the sign before returning.
    neg_log_loss = cross_val_score(model, X_train, y_train, scoring = 'neg_log_loss', cv = 5).mean()
    return -1 * neg_log_loss

In [None]:
# Seeded TPE sampler so the hyperparameter suggestions are repeatable.
sampler = TPESampler(seed=1111)
study = optuna.create_study(direction = 'minimize', sampler = sampler)
# Only a single trial is run here; raise n_trials for an actual search.
study.optimize(objective,n_trials = 1)
print('numbers of the finished trials:' , len(study.trials))
print(study.best_value)
print(study.best_params)

In [None]:
# Final LightGBM model: the fixed `params` plus hard-coded hyperparameter values.
lgbm = LGBMClassifier(**params, 
                    num_iterations = 490,
                    max_depth = 5,
                    num_leaves = 22,
                    learning_rate = 0.026798877915977834,
                    subsample = 0.6615232298649514,
                    feature_fraction = 0.5881079099486431,
                    #min_child_samples = 27,
                    #min_child_weight = 0.04781667419116532,
                    lambda_l2 = 10.543869110101163)

# The `verbose` argument of fit() was removed in LightGBM 4.0 and raises TypeError there.
# It is not needed: logging is already silenced through 'verbosity': -1 in `params`.
lgbm.fit(X_train, y_train)
preds = lgbm.predict(X_val)

print('Classification report:\n')
print(classification_report(y_val,preds))
# Confusion matrix of validation labels against predicted classes, as an annotated heatmap.
sns.heatmap(pd.DataFrame(confusion_matrix(y_val, preds)), annot=True, linewidths=.5, fmt="d")

In [None]:
# Distribution of the classes predicted for the validation set by the LightGBM model.
sns.countplot(x = 'target', data= pd.DataFrame(preds, columns=['target']))

In [None]:
# Start from the competition's sample submission file and overwrite its class columns.
sample_submission = pd.read_csv('../input/tabular-playground-series-may-2021/sample_submission.csv')

# Fill the four class columns with the LightGBM class probabilities for the test set.
# NOTE(review): assumes the predict_proba column order matches Class_1..Class_4 — confirm
# against lencoder.classes_.
sample_submission[['Class_1','Class_2', 'Class_3', 'Class_4']] = lgbm.predict_proba(test.values)

# Write the submission file without the index column and preview it.
sample_submission.to_csv("my_submissionOPT.csv",index = False)
sample_submission.head()

In [None]:
# Shows the test features as a NumPy array.
# NOTE(review): looks like a leftover inspection cell — consider removing it or showing test.shape instead.
test.values