## Tabular Playground Series March 2021

<img src="https://i.imgur.com/uHVJtv0.png">



<br><br>

### Notebook Contents:

0. [**Imports, Data Loading and Preprocessing**](#loading)

1. [**Optuna Hyperparameter Optimization**](#optuna)

2. [**Submission**](#submission)


<a id="loading"></a>

### 0. Imports, Data Loading and Preprocessing

In [None]:
import torch
# 'gpu' when torch can see a CUDA device, otherwise 'cpu'; this string is later
# passed as the "device" entry of the LightGBM parameters in objective().
device = 'gpu' if torch.cuda.is_available() else 'cpu'
import numpy as np
import pandas as pd
# Let pandas display up to 100 columns before truncating.
pd.options.display.max_columns = 100
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import QuantileTransformer, StandardScaler, PolynomialFeatures, LabelEncoder
from sklearn.feature_selection import VarianceThreshold, SelectKBest
import warnings
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings('ignore')
import optuna
import gc
import os
# Directory that train.csv, test.csv and sample_submission.csv are read from.
root_path = '/kaggle/input/tabular-playground-series-mar-2021'

In [None]:
# Load the competition files from root_path.
train = pd.read_csv(os.path.join(root_path, 'train.csv'))
test = pd.read_csv(os.path.join(root_path, 'test.csv'))
sample_submission = pd.read_csv(os.path.join(root_path, 'sample_submission.csv'))

# Stack train and test so each categorical column is encoded with a single
# mapping shared by both splits; train_len marks where to split them again.
dataset = pd.concat([train, test], axis = 0, ignore_index = True)
train_len = len(train)

label = LabelEncoder()
# Every column other than `id` whose dtype is not float64 gets label-encoded.
# `columns=` is used instead of a positional axis, which pandas >= 2.0 rejects.
categorical_feature_columns = dataset.drop(columns='id').select_dtypes(exclude=['float64']).columns

for column in categorical_feature_columns:
    # The encoder is re-fitted per column, so codes are independent across columns.
    dataset[column] = label.fit_transform(dataset[column])

categorical_features = list(range(len(categorical_feature_columns)))

# Undo the concatenation: the first train_len rows are train, the rest are test.
train_preprocessed = dataset[:train_len]
test_preprocessed = dataset[train_len:]

# Model inputs are all columns except the row identifier and the label.
features = train_preprocessed.drop(columns=['id', 'target']).columns.tolist()

assert train_preprocessed.shape[1] == test_preprocessed.shape[1]

#del train, test
gc.collect()

<a id="optuna"></a>

### 1. Optuna Hyperparameter Optimization

See the [Optuna tutorial](https://optuna.readthedocs.io/en/stable/tutorial/) for an introduction to the Optuna library.

See the [LGBMClassifier API reference](https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMClassifier.html) for the full list of LightGBM classifier hyperparameters.


To skip the search, jump [here](#hyperparams) for the best parameters I found.

In [None]:
# Run configuration for the hyperparameter search.

# Switch for the Optuna search; set to False if you want to skip it.
OPTUNA_OPTIMIZATION = True

# Number of folds for validation.
N_SPLITS = 3

# Number of trials to find best hyperparameters.
N_TRIALS = 50

# Time budget handed to study.optimize(timeout=...): two hours, in seconds.
TIMEOUT = 2 * 60 * 60

In [None]:
import tqdm
def objective(trial, cv=StratifiedKFold(N_SPLITS, shuffle = True, random_state = 7)):
    """Optuna objective: mean cross-validated ROC AUC of an LGBMClassifier.

    Samples one set of LightGBM hyperparameters from `trial`, fits the model on
    each training fold of `train_preprocessed[features]`, and scores the
    held-out fold with ROC AUC computed from predicted probabilities.

    Args:
        trial: Optuna trial used to sample hyperparameters.
        cv: Splitter exposing `split(X, y)`. The default StratifiedKFold is
            created once, when this function is defined.

    Returns:
        The average validation AUC across all folds (to be maximised).
    """
    param_lgb = {
        "random_state": trial.suggest_int("random_state", 1, 100),
        "objective": "binary",
        "metric": "binary_logloss",
        "verbosity": -1,
        "device" : device,
        "boosting_type": "gbdt",
        "gpu_use_dp": True,
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
        "max_depth": trial.suggest_int("max_depth", -1, 10),
        "n_estimators": trial.suggest_int("n_estimators", 50, 10000),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.4, 1.0),
        "subsample": trial.suggest_float("subsample", 0.3, 1.0),
        "subsample_freq": trial.suggest_int("subsample_freq", 1, 10),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
    }

    model = LGBMClassifier(**param_lgb)

    aucs = []

    for kfold, (train_idx, val_idx) in tqdm.tqdm(enumerate(cv.split(train_preprocessed[features].values,
                                                                    train_preprocessed['target'].values))):

        model.fit(train_preprocessed.loc[train_idx, features], train_preprocessed.loc[train_idx, 'target'])
        print('Fitted {}'.format(type(model).__name__))
        val_true = train_preprocessed.loc[val_idx, 'target'].values

        # ROC AUC ranks samples by score, so feed it the positive-class
        # probability rather than the thresholded 0/1 labels from predict().
        preds = model.predict_proba(train_preprocessed.loc[val_idx, features])[:, 1]

        auc = roc_auc_score(val_true, preds)

        print('Fold: {}\t AUC: {}\n'.format(kfold, auc))
        aucs.append(auc)

    # Average over every fold (not just the last one) for both the log line
    # and the value reported back to Optuna.
    mean_auc = np.average(aucs)
    print('Average AUC: {}'.format(mean_auc))
    return mean_auc

In [None]:
if not OPTUNA_OPTIMIZATION:
    # Search skipped: `trial` is a plain dict of hard-coded LightGBM parameters.
    trial = dict(
        reg_alpha=28.07671346302542,
        reg_lambda=3.7286097228210145e-05,
        max_depth=21,
        num_leaves=37,
        colsample_bytree=0.13251356691552435,
        subsample=0.36239658705576194,
        subsample_freq=43,
        min_child_samples=288,
    )
else:
    # Maximise the value returned by objective(); the run is bounded by
    # TIMEOUT only (no n_trials limit is passed).
    study = optuna.create_study(study_name='lgbm_parameter_opt', direction="maximize")
    study.optimize(objective, timeout=TIMEOUT, show_progress_bar=True)

    # Here `trial` is the best Optuna trial object; its parameters live in .params.
    trial = study.best_trial

    print("  Value: {}".format(trial.value))
    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))


<a id="hyperparams"></a>

Best Params:
    
    'reg_alpha': 48.144730345953434 
    'reg_lambda': 1.2350451395477777e-06 
    'max_depth': 11 
    'num_leaves': 147 
    'colsample_bytree': 0.32482261861770284 
    'subsample': 0.601096026343747 
    'subsample_freq': 178 
    'min_child_samples': 291

In [None]:
if OPTUNA_OPTIMIZATION:
    # NOTE(review): objective() as written never calls trial.report(), so this
    # figure may contain no intermediate values — confirm it is still wanted.
    intermediate_values_fig = optuna.visualization.plot_intermediate_values(study)
    display(intermediate_values_fig)

In [None]:
if OPTUNA_OPTIMIZATION:
    # The study maximises the mean validation AUC returned by objective(),
    # so the axis is labelled as AUC rather than log loss.
    display(optuna.visualization.plot_optimization_history(study, target_name = 'Average Validation AUC'))

In [None]:
if OPTUNA_OPTIMIZATION:
    # One panel per hyperparameter against the objective value, which is the
    # mean validation AUC returned by objective() (not log loss).
    display(optuna.visualization.plot_slice(study, target_name = 'Average Validation AUC'))

In [None]:
if OPTUNA_OPTIMIZATION:
    # Tabular view of the study's trials.
    trials_df = study.trials_dataframe()
    display(trials_df)

In [None]:
# `trial` is an Optuna trial object (parameters in .params) when the search ran,
# and a plain parameter dict when it was skipped.
final_model = LGBMClassifier(**(trial.params if OPTUNA_OPTIMIZATION else trial))

In [None]:
# Per-fold positive-class probabilities for the test set; averaged later.
test_preds = []

skf = StratifiedKFold(N_SPLITS, shuffle = True, random_state = 7)
aucs = []
for kfold, (train_idx, val_idx) in enumerate(skf.split(train_preprocessed[features].values,
                                                      train_preprocessed['target'].values)):

    final_model.fit(train_preprocessed.loc[train_idx, features],
                    train_preprocessed.loc[train_idx, 'target'])
    print('Fitted {}'.format(type(final_model).__name__))
    # Read labels from the same frame the features come from, so this cell does
    # not depend on the raw `train` frame still being in memory.
    val_true = train_preprocessed.loc[val_idx, 'target'].values

    # ROC AUC needs ranking scores: use the positive-class probability, not the
    # thresholded labels returned by predict().
    preds = final_model.predict_proba(train_preprocessed.loc[val_idx, features])[:, 1]

    auc = roc_auc_score(val_true, preds)
    aucs.append(auc)
    print('Fold: {}\t Validation AUC: {}\n'.format(kfold, auc))

    test_preds.append(final_model.predict_proba(test_preprocessed[features])[:, 1])

print("Best Parameters mean AUC: {}".format(np.mean(aucs)))

<a id = "submission"></a>

### 2. Submission

In [None]:
# Collapse the per-fold test probabilities into one averaged prediction per row.
test_predictions = np.asarray(test_preds).mean(axis = 0)

# One prediction is expected for every row of the test set.
assert len(test_predictions) == len(test)

# Fill the submission template and write it without the index column.
sample_submission['target'] = test_predictions
sample_submission.to_csv("submission.csv", index = False)