In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found anywhere under the Kaggle
# input directory, then print them one per line.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
import xgboost as xgb 
from tqdm import tqdm
import optuna
from  sklearn.metrics import accuracy_score
tqdm.pandas()

In [None]:
# Load the three competition CSVs: labelled training rows, unlabelled test
# rows, and the submission template that is filled in at the end.
train_df = pd.read_csv('/kaggle/input/tabular-playground-series-mar-2021/train.csv')
test_df = pd.read_csv('/kaggle/input/tabular-playground-series-mar-2021/test.csv')
sample_submission = pd.read_csv('/kaggle/input/tabular-playground-series-mar-2021/sample_submission.csv')

In [None]:
# Split the training columns into two groups by substring match on the
# column name: names containing 'cont' and names containing 'cat'.
train_column_names = list(train_df.columns)
cont_cols = [name for name in train_column_names if 'cont' in name]
cat_cols = [name for name in train_column_names if 'cat' in name]

In [None]:
# Label-encode every categorical column, overwriting it in both frames.
# Each encoder is fitted on the train and test values together so that a
# category appearing only in the test set still receives a code and
# `transform` does not fail on unseen labels.
for col in cat_cols:
    le = preprocessing.LabelEncoder()
    # pd.concat replaces Series.append, which was deprecated in pandas 1.4
    # and removed in pandas 2.0.
    full = pd.concat([train_df[col], test_df[col]])
    le.fit(full)
    train_df[col] = le.transform(train_df[col])
    test_df[col] = le.transform(test_df[col])

In [None]:
# Labels are the 'target' column; features are every remaining column
# except the 'id' identifier.
target = train_df['target']
data = train_df.drop(columns=['id', 'target'])

In [None]:
# To use Optuna, we have to declare an objective function, which Optuna tries to optimize.
# The input to the objective is a trial, which is a single execution of the objective function.
# The output of the objective is the score we are trying to optimize.
def objective(trial):
    """Fit one XGBoost model with trial-sampled hyperparameters and score it.

    A fresh stratified 70/30 split of the module-level ``data`` / ``target``
    is drawn on every call (no random_state is fixed), the booster is trained
    on the 70% part with ``xgb.train``'s default number of boosting rounds,
    and the held-out 30% is used for scoring.

    Parameters
    ----------
    trial : optuna trial object
        Used to sample the booster type, regularisation strengths and the
        booster-specific settings.

    Returns
    -------
    float
        Accuracy of the rounded predictions on the held-out split.
    """
    features_fit, features_holdout, labels_fit, labels_holdout = train_test_split(
        data, target, test_size=0.3, stratify=target
    )
    fit_matrix = xgb.DMatrix(features_fit, label=labels_fit, enable_categorical=True)
    holdout_matrix = xgb.DMatrix(features_holdout, label=labels_holdout, enable_categorical=True)

    param = {
        "verbosity": 1,
        # Outputs a probability; "binary:hinge" could be used to get 0/1 predictions directly.
        "objective": "binary:logistic",
        # gbtree and dart are tree-based boosters, gblinear uses linear functions.
        "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
        # L2 regularization on weights.
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        # L1 regularization on weights.
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
    }
    booster = param["booster"]

    # Settings shared by both tree-based boosters.
    if booster in ("gbtree", "dart"):
        param.update({
            "max_depth": trial.suggest_int("max_depth", 1, 9),
            "eta": trial.suggest_float("eta", 1e-9, 1.0, log=True),
            "gamma": trial.suggest_float("gamma", 1e-8, 1.0, log=True),
            "grow_policy": trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"]),
        })

    # Dropout-specific settings, only sampled for the dart booster.
    if booster == "dart":
        param.update({
            "sample_type": trial.suggest_categorical("sample_type", ["uniform", "weighted"]),
            "normalize_type": trial.suggest_categorical("normalize_type", ["tree", "forest"]),
            "rate_drop": trial.suggest_float("rate_drop", 1e-8, 1.0, log=True),
            "skip_drop": trial.suggest_float("skip_drop", 1e-8, 1.0, log=True),
        })

    model = xgb.train(param, fit_matrix)
    holdout_scores = model.predict(holdout_matrix)
    # Rounding to the nearest integer turns the scores into 0/1 labels.
    holdout_labels = np.rint(holdout_scores)
    # The metric can be swapped for another one if required.
    return accuracy_score(labels_holdout, holdout_labels)
    

In [None]:
# Now we create a "study" (Optuna's term for an optimization run), in which
# the objective is executed repeatedly to maximize the accuracy it returns.
search_budget = {"n_trials": 100, "timeout": 600}
study = optuna.create_study(direction="maximize")
study.optimize(objective, **search_budget)

In [None]:
# Show the Python type of the object returned by optuna.create_study.
type(study)

In [None]:
# Exploration aid: list the attribute and method names available on the study.
dir(study)

In [None]:
# Number of trials recorded by the study.
# NOTE(review): optimize() was given both n_trials and a timeout, so this may
# be lower than the requested n_trials — confirm against the displayed value.
len(study.trials)

In [None]:
# Keep a handle on the study's best trial; its value and sampled parameters
# are read in the following cells.
trial = study.best_trial

In [None]:
# Objective value of the best trial, i.e. the accuracy that objective()
# returned for it.
trial.value

In [None]:
# Print each hyperparameter sampled by the best trial as "name: value".
for param_pair in trial.params.items():
    print("{}: {}".format(*param_pair))

In [None]:
# trial.params only contains the values Optuna sampled. The fixed settings
# that objective() also passed to xgb.train ("verbosity" and "objective") are
# not part of it, so they are added back here. Without "objective", XGBoost
# would fall back to its default regression objective and the final model
# would not match the binary:logistic models that were tuned.
final_param = {
    "verbosity": 1,
    "objective": "binary:logistic",
    **trial.params,
}

In [None]:
# Test features: every test column except the 'id' identifier.
test_data = test_df.drop(columns=['id'])

In [None]:
# Wrap the full training features and their labels in a DMatrix for the final fit.
dtrain = xgb.DMatrix(data, label=target, enable_categorical=True)
# Test features only: no label is passed, so this matrix is used purely for prediction.
dtest = xgb.DMatrix(test_data, enable_categorical=True)

In [None]:
# Train the final booster on all training rows with the selected parameters.
# No num_boost_round is passed, so xgb.train uses its default number of rounds.
bst = xgb.train(final_param, dtrain)

In [None]:
# Raw model outputs for the rows of the test matrix.
pred = bst.predict(dtest)

In [None]:
# Round each prediction to the nearest integer and store the result as int labels.
# NOTE(review): if the competition is scored on ranked probabilities (e.g. AUC),
# submitting the unrounded predictions would be preferable — confirm the metric.
pred = np.rint(pred).astype(int)

In [None]:
# Put the predictions into the template's 'target' column and write the
# submission file (without the index) to the working directory.
sample_submission = sample_submission.assign(target=pred)
sample_submission.to_csv('submission_optuna_xgb.csv', index=False)