## Tabular Playground Series April 2021

<img src="https://i.imgur.com/uHVJtv0.png">

<br><br>

### Notebook Contents:

<div id="toc_container" style="background: #f9f9f9; border: 1px solid #aaa; display: table; font-size: 95%;
                               margin-bottom: 1em; padding: 20px; width: auto;">
<p class="toc_title" style="font-weight: 700; text-align: center">Notebook Contents</p>
<ul class="toc_list">
  <li><a href="#loading">0. Imports, Data Loading and Preprocessing</a></li>
  <li><a href="#optuna">1. Optuna Hyperparameter Optimization</a>
      <br>
      <ul>
    <li><a href="#optuna_objective">1.0 Define Objective</a></li>
    <li><a href="#optuna_study">1.1 Start Optimization</a></li>
    <li><a href="#optuna_plots">1.2 Check Optimization Plots</a></li>
  </ul>
</li>
<li><a href="#submission">2. Submission</a></li>
</ul>
</div>

##### Props

Props to [corochann](https://www.kaggle.com/corochann/optuna-tutorial-for-hyperparameter-optimization): I believe their notebook is the best introduction to Optuna you can find.


##### Versioning

Version 17 gives the best score (0.79611).

<a id="loading"></a>

##### 0. Imports, Data Loading and Preprocessing

In [None]:
# Imports and session-wide settings used by the rest of the notebook.
import torch
# Device label derived from CUDA availability; it is not referenced again in the visible notebook.
device = 'gpu' if torch.cuda.is_available() else 'cpu'
import numpy as np
import pandas as pd
# Show up to 100 columns when a DataFrame is displayed.
pd.options.display.max_columns = 100
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.preprocessing import QuantileTransformer, StandardScaler, PolynomialFeatures, LabelEncoder
from sklearn.feature_selection import VarianceThreshold, SelectKBest
import warnings
# Silence every Python warning for the rest of the session (this also hides deprecation notices).
warnings.filterwarnings('ignore')
import optuna
import tqdm
import gc
import os
# Directory the competition CSV files are read from.
root_path = '/kaggle/input/tabular-playground-series-apr-2021'

In [None]:
# Read the three competition files from root_path in one pass.
train, test, sample_submission = (
    pd.read_csv(os.path.join(root_path, file_name))
    for file_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [None]:
# Stack train and test so every categorical column is encoded with one shared mapping.
dataset = pd.concat([train, test], axis=0, ignore_index=True)
train_len = len(train)

label = LabelEncoder()
# Every non-float64 column except the id and the target is treated as categorical.
# `columns=` is used instead of a positional axis argument, which pandas 2.x rejects.
categorical_feature_columns = (
    dataset.drop(columns=['PassengerId', 'Survived'])
    .select_dtypes(exclude=['float64'])
    .columns
)

for column in categorical_feature_columns:
    # The single encoder is refitted per column, so each column gets its own integer codes.
    dataset[column] = label.fit_transform(dataset[column])

# NOTE(review): this is just 0..n-1, not the positions of the categorical columns inside
# `features`; `cat_indices` below holds the real positions. Kept for backward compatibility.
categorical_features = list(range(len(categorical_feature_columns)))

# Split the encoded frame back into its train and test parts (train rows come first).
train_preprocessed = dataset[:train_len]
test_preprocessed = dataset[train_len:]

# Model inputs: every column except the id and the target.
features = train_preprocessed.drop(columns=['PassengerId', 'Survived']).columns.tolist()

assert train_preprocessed.shape[1] == test_preprocessed.shape[1]

gc.collect()
# Positions of the categorical columns within `features`.
cat_indices = [features.index(i) for i in categorical_feature_columns]

<a id="optuna"></a>

### Optuna


Check Version 2 for the actual Optimization.

Look [here](https://optuna.readthedocs.io/en/stable/tutorial/) for a reference on the Optuna library.

Look [here](https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMClassifier.html) for the set of LightGBM classifier hyperparameters.


Skip and go [here](#hyperparams) to find my best parameters.

In [None]:
# Set OPTUNA_OPTIMIZATION to False to skip the search and use the stored parameters instead.
OPTUNA_OPTIMIZATION = True

N_SPLITS = 5             # number of cross-validation folds
N_TRIALS = 200           # number of trials used to search for the best hyperparameters
TIME = 3 * 3600          # time budget for the optimization (alternative to N_TRIALS); presumably seconds
FOLD_RANDOM_SEED = 42    # seed for the shuffled fold split

<a id = "optuna_objective"></a>
<h6> Define Objective </h6>

In [None]:
def objective(trial, cv=StratifiedKFold(N_SPLITS, shuffle = True, random_state = FOLD_RANDOM_SEED)):
    """Optuna objective: mean cross-validated accuracy of a LightGBM booster.

    For every fold of ``cv`` a booster is trained on the training part of
    ``train_preprocessed`` with early stopping on the held-out part, and the
    held-out accuracy (probabilities rounded to 0/1) is recorded.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters and to report intermediate
        validation AUC values to the pruner.
    cv : cross-validation splitter
        Object exposing ``split(X, y)`` and ``get_n_splits``; defaults to a
        shuffled, seeded ``StratifiedKFold`` with ``N_SPLITS`` folds.

    Returns
    -------
    float
        Average validation accuracy over all folds (the value to maximize).
    """
    # Upper bound on boosting rounds. The original passed n_estimators=2000 in the params
    # *and* num_boost_round=1500; LightGBM lets the params alias win, so 2000 was the
    # effective limit. It is now stated once.
    max_boost_rounds = 2000

    param_lgb = {
        "random_state": 42,
        "metric": "auc",
        "categorical_feature": cat_indices,
        "verbosity": -1,
        "learning_rate": trial.suggest_categorical('learning_rate', [0.001, 0.005, 0.01, 0.05, 0.1]),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024),
        'max_depth': trial.suggest_int('max_depth', -1, 32),
        'reg_alpha': trial.suggest_float('reg_alpha', 1E-16, 25),
        'reg_lambda': trial.suggest_float('reg_lambda', 1E-16, 25),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 1E-16, 1.0),
        # The Optuna name must be exactly 'subsample' (it used to carry a trailing space):
        # trial.params is later unpacked into LGBMClassifier, which needs the real name.
        # NOTE(review): LightGBM only applies subsample when subsample_freq/bagging_freq > 0,
        # which is not set here - confirm whether bagging is meant to be active.
        'subsample': trial.suggest_float('subsample', 1E-16, 1.0),
        'cat_smooth': trial.suggest_float('cat_smooth', 1.0, 50.0),
    }

    X_all = train_preprocessed[features]
    y_all = train_preprocessed['Survived']

    accuracies = []
    # 'valid_1' is the name LightGBM gives to the second entry of valid_sets (the held-out fold).
    # NOTE(review): one callback is shared by all folds, so later folds report on step numbers
    # the first fold already used - confirm this is the intended pruning signal.
    pruning_callback = optuna.integration.LightGBMPruningCallback(trial, 'auc', valid_name='valid_1')

    fold_splits = cv.split(X_all.values, y_all.values)
    for train_idx, val_idx in tqdm.tqdm(fold_splits, total=cv.get_n_splits(X_all, y_all)):
        # cv.split yields positional indices, so select rows by position.
        X_train, y_train = X_all.iloc[train_idx], y_all.iloc[train_idx]
        X_valid, y_valid = X_all.iloc[val_idx], y_all.iloc[val_idx]

        d_train = lgb.Dataset(X_train, label=y_train)
        d_valid = lgb.Dataset(X_valid, label=y_valid)
        watchlist = [d_train, d_valid]

        model = lgb.train(param_lgb,
                          train_set=d_train,
                          num_boost_round=max_boost_rounds,
                          valid_sets=watchlist,
                          verbose_eval=0,
                          early_stopping_rounds=100,
                          callbacks=[pruning_callback])

        # The booster predicts probabilities; round them to hard 0/1 labels for accuracy.
        preds = np.round(model.predict(X_valid)).astype(int)
        accuracies.append(accuracy_score(y_valid, preds))

    return np.average(accuracies)

<a id = "optuna_study"></a>
<h6> Start Optimization </h6>

In [None]:
# Either run the Optuna search or fall back to the stored hyperparameters.
# In both branches `best_params` ends up as a plain dict.
if OPTUNA_OPTIMIZATION:
    # Maximize the objective; the median pruner waits 25 reported steps before it may prune.
    study = optuna.create_study(study_name = 'lgbm_parameter_opt', direction="maximize",
                                pruner=optuna.pruners.MedianPruner(n_warmup_steps=25))

    study.optimize(objective, n_trials=N_TRIALS)

    trial = study.best_trial

    print("  Value: {}".format(trial.value))

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))
    # Store a real dict (previously a dict_items view) so it matches the type used
    # by the else-branch below.
    best_params = dict(trial.params)
else:
    # Stored hyperparameters, used when the search is skipped.
    trial = {
            "random_state": 42,
            "metric": "auc",
            "categorical_feature": cat_indices,
            "verbosity": -1,
            "n_estimators": 20000,
             'learning_rate': 0.05,
             'num_leaves': 39,
             'max_depth': 28,
             'reg_alpha': 13.0124692806962,
             'reg_lambda': 17.429087848443793,
             'colsample_bytree': 0.6993443635848076,
             'subsample': 0.7146065596315723,
             'cat_smooth': 8.61671087256764}
    best_params=trial

<a id ="hyperparams"></a>

best_params are: <br>
>     {"random_state": 42,
>      "metric": "auc",
>      "categorical_feature": cat_indices,
>      "verbosity": -1,
>      "n_estimators": 20000,
>      "learning_rate": 0.05,
>      "num_leaves": 39,
>      "max_depth": 28,
>      "reg_alpha": 13.0124692806962,
>      "reg_lambda": 17.429087848443793,
>      "colsample_bytree": 0.6993443635848076,
>      "subsample": 0.7146065596315723,
>      "cat_smooth": 8.61671087256764}

<a id = "optuna_plots"></a>
<h6> Check Optimization plots </h6>

In [None]:
# Show the intermediate values reported by each trial (only when the search was run).
if OPTUNA_OPTIMIZATION:
    intermediate_values_fig = optuna.visualization.plot_intermediate_values(study)
    display(intermediate_values_fig)

In [None]:
if OPTUNA_OPTIMIZATION:
    # NOTE(review): display() is called without an argument, so this cell shows nothing;
    # presumably a plot call was removed here - confirm whether a figure should be shown.
    display()

In [None]:
# Objective value of every trial in run order. The objective returns the mean validation
# accuracy, so the axis is labelled as accuracy (it was mislabelled as AUC).
if OPTUNA_OPTIMIZATION:
    display(optuna.visualization.plot_optimization_history(study, target_name = 'Average Validation Accuracy'))

In [None]:
# Objective value against each hyperparameter. The objective returns the mean validation
# accuracy, so the axis is labelled as accuracy (it was mislabelled as AUC).
if OPTUNA_OPTIMIZATION:
    display(optuna.visualization.plot_slice(study, target_name = 'Average Validation Accuracy'))

In [None]:
# Parallel-coordinate view of the trials. The objective returns the mean validation
# accuracy, so the axis is labelled as accuracy (it was mislabelled as AUC).
if OPTUNA_OPTIMIZATION:
    display(optuna.visualization.plot_parallel_coordinate(study, target_name = 'Average Validation Accuracy'))

In [None]:
# Tabular view of all trials (only when the search was run).
if OPTUNA_OPTIMIZATION:
    trials_table = study.trials_dataframe()
    display(trials_table)

<a id = "submission"></a>

### Submission

In [None]:
# Build the final classifier from the best hyperparameters.
if OPTUNA_OPTIMIZATION:
    # trial.params only contains the values Optuna *suggested*. The fixed settings the
    # objective used for every trial must be added back, otherwise the final model silently
    # loses the categorical columns, the metric, the seed and the boosting-round limit.
    # NOTE(review): the objective trained with early stopping and the final fit does not,
    # so 2000 rounds is an upper bound taken from the search - confirm it is appropriate.
    final_params = {
        "random_state": 42,
        "metric": "auc",
        "categorical_feature": cat_indices,
        "verbosity": -1,
        "n_estimators": 2000,
    }
    # strip() guards against a suggested name carrying stray whitespace (e.g. 'subsample '),
    # which would not match the real LightGBM parameter name.
    final_params.update({name.strip(): value for name, value in trial.params.items()})
else:
    # `trial` is already the full stored parameter dict in this branch.
    final_params = dict(trial)

final_model = LGBMClassifier(**final_params)

In [None]:
# Refit the final model on every fold, score it on the held-out part and collect one set
# of test-set class predictions per fold.
test_preds = []

skf = StratifiedKFold(N_SPLITS, shuffle = True, random_state = FOLD_RANDOM_SEED)
aucs = []
accuracies = []

X_full = train_preprocessed[features]
y_full = train_preprocessed['Survived']

for kfold, (train_idx, val_idx) in enumerate(skf.split(X_full.values, y_full.values)):
    # skf.split yields positional indices, so select rows by position.
    final_model.fit(X_full.iloc[train_idx], y_full.iloc[train_idx])
    print('Fitted {}'.format(type(final_model).__name__))

    X_valid = X_full.iloc[val_idx]
    # Read the labels from the same frame the model was fitted on (previously raw `train`).
    val_true = y_full.iloc[val_idx].values

    preds = final_model.predict(X_valid)
    # AUC is a ranking metric: score it on the probability of the positive class rather
    # than on hard labels.
    val_proba = final_model.predict_proba(X_valid)[:, 1]

    auc = roc_auc_score(val_true, val_proba)
    accuracy = accuracy_score(val_true, preds)
    aucs.append(auc)
    accuracies.append(accuracy)
    print('Fold: {}\t Validation AUC: {}\n'.format(kfold, auc))
    print('Fold: {}\t Validation ACCURACY: {}\n'.format(kfold, accuracy))

    # Hard class predictions for the test set; they are averaged across folds afterwards.
    test_preds.append(final_model.predict(test_preprocessed[features]))

print("Best Parameters mean AUC: {}".format(np.mean(aucs)))
print("Best Parameters mean ACCURACY: {}".format(np.mean(accuracies)))

In [None]:
# Average the per-fold test predictions element-wise, then round to the final 0/1 label.
fold_predictions = np.asarray(test_preds)
test_predictions = fold_predictions.mean(axis=0)
assert len(test_predictions) == len(test)
rounded_labels = np.round(test_predictions).astype(int)
sample_submission['Survived'] = rounded_labels

In [None]:
# Write the submission frame to submission.csv without the DataFrame index column.
sample_submission.to_csv("submission.csv", index = False)