In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file in it.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, recall_score, roc_auc_score, precision_score, log_loss


from lightgbm import LGBMClassifier
import lightgbm as lgb
import optuna
import tqdm
from optuna.visualization import plot_optimization_history, plot_param_importances

In [None]:
# Read the three competition CSV files into DataFrames in one pass.
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/tabular-playground-series-jun-2021/train.csv',
        '/kaggle/input/tabular-playground-series-jun-2021/test.csv',
        '/kaggle/input/tabular-playground-series-jun-2021/sample_submission.csv',
    )
)

In [None]:
# Remove the row identifier from both frames. errors='ignore' keeps this cell
# re-runnable: on a second execution 'id' is already gone and a plain drop
# would raise KeyError.
train = train.drop(columns='id', errors='ignore')
test = test.drop(columns='id', errors='ignore')

# Stack the train features (without the label) on top of the test features so
# that both can be transformed together. ignore_index=True gives the combined
# frame a unique 0..n-1 index instead of repeating the labels of both inputs.
all_df = pd.concat([train.drop(columns='target'), test], ignore_index=True)
all_df.head()

In [None]:
# Replace the target labels with their integer codes; the fitted encoder
# stays available as `le`.
le = LabelEncoder()
train['target'] = le.fit_transform(train['target'])

In [None]:
# Number of training rows; used to split the combined frame back apart.
train_len = train.shape[0]

In [None]:
# Standardize every `feature_*` column to zero mean / unit standard deviation,
# using statistics computed over the combined train+test frame.
# The arithmetic is vectorized (one pass per column inside pandas) instead of
# calling a Python lambda once per cell, and the column list is derived from
# the frame rather than a hard-coded count.
feature_cols = [col for col in all_df.columns if col.startswith('feature_')]
col_means = all_df[feature_cols].mean()
# A constant column has std == 0; dividing by 1 instead leaves it at all
# zeros rather than turning the whole column into NaN.
col_stds = all_df[feature_cols].std().replace(0, 1)
all_df[feature_cols] = (all_df[feature_cols] - col_means) / col_stds

In [None]:
# Names of all model input columns, in frame order.
features = list(all_df.columns)

In [None]:
# Split the combined frame back into its parts by position: the first
# `train_len` rows are the training rows, the remainder the test rows.
train_df, test_df = all_df.iloc[:train_len], all_df.iloc[train_len:]

**Hyperparameter search with Optuna**

In [None]:
# --- Search / validation configuration ---------------------------------------
OPTUNA_OPTIMIZATION = True   # switch for running the Optuna study
N_SPLITS = 5                 # number of folds for validation
N_TRIALS = 5                 # number of trials to find best hyperparameters
TIME = 6 * 3600              # passed to the study as its timeout (6 h in seconds)
FOLD_RANDOM_SEED = 3         # random_state of the CV splitter
REPEATED_FOLD = True         # RepeatedStratifiedKFold instead of StratifiedKFold

# LightGBM settings that are merged into every parameter set and never tuned.
FIXED_PARAMS = dict(
    random_state=3,
    num_classes=9,
    verbosity=-1,
    n_jobs=-1,
)

In [None]:
# Hand-picked parameter set; it is queued as the first trial of the study.
start_params = dict(
    cat_smooth=10,
    boosting_type='gbdt',
    subsample=0.1,
    colsample_bytree=0.3,
    reg_lambda=5,
    reg_alpha=5,
    max_depth=10,
    num_leaves=50,
    learning_rate=0.01,
    n_estimators=1000,
    random_state=3,
    objective="multiclass",
    num_classes=9,
    verbosity=-1,
    n_jobs=-1,
)

In [None]:
# Cross-validation splitter shared by the hyperparameter search and the final
# evaluation. Only the splitter that is actually used gets constructed.
# `n_splits` is passed by keyword: RepeatedStratifiedKFold accepts keyword
# arguments only in current scikit-learn, so a positional value raises
# TypeError there.
if REPEATED_FOLD:
    skfold = RepeatedStratifiedKFold(n_splits=N_SPLITS, n_repeats=2,
                                     random_state=FOLD_RANDOM_SEED)
else:
    skfold = StratifiedKFold(n_splits=N_SPLITS, shuffle=True,
                             random_state=FOLD_RANDOM_SEED)

In [None]:
def objective(trial, cv=skfold):
    """Optuna objective: mean validation log loss of LightGBM over the CV folds.

    For each split produced by ``cv`` a booster is trained on the training
    part with early stopping (100 rounds) on the validation part, and the
    multiclass log loss of its validation predictions is recorded.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters and to report intermediate
        values for pruning.
    cv : cross-validation splitter, optional
        Object providing ``split(X, y)`` and ``get_n_splits()``; defaults to
        the notebook-level ``skfold``.

    Returns
    -------
    float
        Average of the per-fold validation log losses.

    Notes
    -----
    Uses the LightGBM callback API (``lgb.early_stopping``); the
    ``early_stopping_rounds`` / ``verbose_eval`` keyword arguments of
    ``lgb.train`` no longer exist in LightGBM 4.
    """
    param_lgb = {
        "objective": trial.suggest_categorical('objective', ['multiclass', 'multiclassova']),
        "boosting_type": trial.suggest_categorical('boosting_type', ['gbdt', 'goss']),
        # With early stopping this acts as an upper bound on the boosting rounds.
        "n_estimators": trial.suggest_categorical('n_estimators', [500, 1000, 2500, 5000, 10000]),
        "learning_rate": trial.suggest_categorical('learning_rate', [0.001, 0.005, 0.01, 0.05, 0.1]),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024),
        'max_depth': trial.suggest_int('max_depth', -1, 32),
        'reg_alpha': trial.suggest_float('reg_alpha', 1E-16, 25),
        'reg_lambda': trial.suggest_float('reg_lambda', 1E-16, 25),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 1E-16, 1.0),
        # NOTE(review): per the LightGBM docs bagging is only active when
        # bagging_freq/subsample_freq is non-zero, which is never set here,
        # so this dimension presumably has no effect -- confirm.
        'subsample': trial.suggest_float('subsample', 1E-16, 1.0),
        'cat_smooth': trial.suggest_float('cat_smooth', 1.0, 50.0),
    }
    param_lgb.update(FIXED_PARAMS)

    X = train_df[features]
    y = train['target']

    fold_losses = []
    fold_iter = tqdm.tqdm(cv.split(X.values, y.values), total=cv.get_n_splits())
    for fold_idx, (train_idx, val_idx) in enumerate(fold_iter):
        # cv.split yields positional indices, so select rows by position.
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_valid, y_valid = X.iloc[val_idx], y.iloc[val_idx]

        d_train = lgb.Dataset(X_train, label=y_train)
        d_valid = lgb.Dataset(X_valid, label=y_valid)

        callbacks = [lgb.early_stopping(stopping_rounds=100, verbose=False)]
        if fold_idx == 0:
            # Intermediate values are keyed by boosting iteration and Optuna
            # keeps only the first value reported for a step, so reports from
            # later folds would be ignored or mixed with fold-0 values.
            # Pruning is therefore driven by the first fold only.
            callbacks.append(
                optuna.integration.LightGBMPruningCallback(
                    trial, 'multi_logloss', valid_name='valid_1'))

        # d_train is listed first, so the validation set is named 'valid_1',
        # which is the name the pruning callback monitors.
        model = lgb.train(param_lgb,
                          train_set=d_train,
                          valid_sets=[d_train, d_valid],
                          callbacks=callbacks)

        fold_losses.append(log_loss(y_valid, model.predict(X_valid)))

    return np.mean(fold_losses)

In [None]:
# Run the hyperparameter search (or fall back to the hand-picked parameters)
# and expose the result as `best_params` for the final model.
if OPTUNA_OPTIMIZATION:
    study = optuna.create_study(
        study_name='lgbm_parameter_opt',
        direction='minimize',
        # Seeded sampler so the suggested hyperparameters are repeatable.
        sampler=optuna.samplers.TPESampler(seed=FIXED_PARAMS["random_state"]),
        pruner=optuna.pruners.MedianPruner(n_warmup_steps=25),
    )

    # Evaluate the hand-picked starting point before any sampled trial.
    study.enqueue_trial(start_params)
    study.optimize(objective, n_trials=N_TRIALS, timeout=TIME, show_progress_bar=True)

    trial = study.best_trial

    print("  Value: {}".format(trial.value))

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

    best_params = FIXED_PARAMS.copy()
    best_params.update(trial.params)
else:
    # Search disabled: use the hand-picked parameters so that the cells that
    # build and fit the final model still have `best_params` to work with.
    best_params = {**FIXED_PARAMS, **start_params}

In [None]:
# The study only exists when the search was run; skip the plot otherwise
# instead of failing with a NameError.
if OPTUNA_OPTIMIZATION:
    display(plot_optimization_history(study))

In [None]:
# Table of all trials; only available when the search was run.
if OPTUNA_OPTIMIZATION:
    display(study.trials_dataframe())

In [None]:
# Classifier configured with `best_params` (the fixed settings merged with the
# hyperparameters of the best Optuna trial).
final_model = LGBMClassifier(**best_params)

In [None]:
# Refit the final model on every CV split, score it on the held-out part and
# collect its class probabilities for the test set (averaged afterwards).
test_preds = []
accuracies = []
loglosses = []

X_all = train_df[features]
y_all = train['target']
X_test = test_df[features]

for kfold, (train_idx, val_idx) in enumerate(skfold.split(X_all.values, y_all.values)):
    # The splitter yields positional indices, so select rows by position.
    X_val = X_all.iloc[val_idx]
    val_true = y_all.iloc[val_idx].values

    # The hyperparameters were tuned with early stopping on the validation
    # fold, so `n_estimators` is only an upper bound. Fit the same way here;
    # otherwise the model would always train the full number of rounds and
    # no longer match what the search evaluated.
    final_model.fit(X_all.iloc[train_idx],
                    y_all.iloc[train_idx],
                    eval_set=[(X_val, val_true)],
                    callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=False)])
    print('Fitted {}'.format(type(final_model).__name__))

    # Predict once and derive the hard labels from the probabilities instead
    # of running inference on the validation fold twice.
    probs = final_model.predict_proba(X_val)
    preds = final_model.classes_[probs.argmax(axis=1)]

    accuracy = accuracy_score(val_true, preds)
    accuracies.append(accuracy)
    print('Fold: {}\t Validation Accuracy: {}\n'.format(kfold, accuracy))

    logloss = log_loss(val_true, probs)
    loglosses.append(logloss)
    print('Fold: {}\t Validation logloss: {}\n'.format(kfold, logloss))

    test_preds.append(final_model.predict_proba(X_test))

print("Best Parameters mean Accuracy: {}".format(np.mean(accuracies)))
print("Best Parameters mean logloss: {}".format(np.mean(loglosses)))

In [None]:
# Average the per-fold test probabilities element-wise across folds.
test_predictions = np.asarray(test_preds).mean(axis=0)

In [None]:
# Build the submission frame: one probability column per class, with the
# `id` column first.
class_columns = [f"Class_{k}" for k in range(1, 10)]
sub = pd.DataFrame(test_predictions, columns=class_columns)
# Insert the ids by position (.values) as the leading column: a row-count
# mismatch then raises immediately instead of silently producing NaN ids
# through index alignment.
sub.insert(0, 'id', sample_submission['id'].values)

In [None]:
# Write the submission to the working directory, without the DataFrame index.
sub.to_csv("sub5.csv", index = False)