In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import optuna
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(root_dir, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Loading the data:
# The dataset directory is declared once so that pointing the notebook at another
# location only requires changing this single constant.
DATA_DIR = '/kaggle/input/tabular-playground-series-apr-2021'

train = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
test = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))
submsn = pd.read_csv(os.path.join(DATA_DIR, 'sample_submission.csv'))

### Basic Exploration of Data:

In [None]:
# Display the first rows of the training frame as a quick sanity check of the load.
train.head()

In [None]:
# Report the (rows, columns) shape of both frames.
print(f"The shape of train data is {train.shape}")
print(f"The shape of test data is {test.shape}")

In [None]:
# Print pandas' summary of the training frame.
train.info()

### Getting the missing values count:

In [None]:
# Per-column count of missing entries in the training frame.
train.isna().sum()

In [None]:
# Per-column count of missing entries in the test frame.
test.isna().sum()

### Substituting missing values:

In [None]:
# Fill missing 'Age' and 'Fare' values in BOTH frames with the mean of the
# training column, so the test set is imputed with a statistic learned from
# training data only.
#
# Column assignment is used instead of `frame[col].fillna(..., inplace=True)`:
# that form mutates a selected column in place, which recent pandas versions
# deprecate and which leaves the parent frame untouched once copy-on-write is on.
for num_col in ['Age', 'Fare']:
    # Series.mean() skips NaN, so this is the mean of the observed values.
    train_mean = train[num_col].mean()
    test[num_col] = test[num_col].fillna(train_mean)
    train[num_col] = train[num_col].fillna(train_mean)

In [None]:
# Dropping some columns:
# The frames are rebound to the result of `drop` (no in-place mutation) and
# `errors='ignore'` skips columns that are already gone, so re-running this cell
# does not raise a KeyError.
DROPPED_COLS = ['Name', 'Ticket', 'Cabin']
train = train.drop(columns=DROPPED_COLS, errors='ignore')
test = test.drop(columns=DROPPED_COLS, errors='ignore')

In [None]:
# Applying Label Encoder:
# Every object-typed column of `train` is replaced by integer codes, and the same
# mapping is applied to `test`.
obj_cols = train.select_dtypes(include=['object']).columns.tolist()
for col in obj_cols:
    le = LabelEncoder()
    # Fit on the values of both frames: a category that appears only in `test`
    # would otherwise make `le.transform(test[col])` raise ValueError. When test
    # holds no unseen category, the learned classes (and therefore the codes)
    # are the same as when fitting on train alone.
    le.fit(pd.concat([train[col], test[col]], ignore_index=True))
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])

In [None]:
# Feature columns passed to the model, and the 'Survived' label as a NumPy array:
cols = [
    'Pclass',
    'Sex',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'Embarked',
]
target = train['Survived'].values

# Fitting the model
#model = xgb.XGBClassifier()
#model.fit(train[cols], target)

In [None]:
# Getting the score for train data:
#roc_auc_score(target, model.predict_proba(train[cols])[:,1])

In [None]:
# Getting the predictions for test data:
# submsn['Survived'] = model.predict(test[cols])
# submsn.to_csv('submission.csv', index=False)

### Using Optuna + XGB:

In [None]:
# Shuffled, seeded stratified splitter shared by the hyper-parameter search.
NUM_FOLDS = 5
skf = StratifiedKFold(
    n_splits=NUM_FOLDS,
    random_state=0,
    shuffle=True,
)

In [None]:
def _suggest_xgb_params(trial):
    """Sample one XGBClassifier configuration from the Optuna `trial`."""
    return {
        'eval_metric': 'auc',
        'booster': 'gbtree',
        'tree_method': 'gpu_hist',
        'use_label_encoder': False,
        'lambda': trial.suggest_float('lambda', 1e-5, 1.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-5, 1.0, log=True),
        # XGBoost documents both sampling ratios as lying in (0, 1]; a lower
        # bound of 0 would let the sampler propose an invalid configuration.
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0),
        'subsample': trial.suggest_float('subsample', 0.1, 1.0),
        # A learning rate of exactly 0 can never improve the model.
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.02),
        'n_estimators': trial.suggest_int('n_estimators', 1, 9999),
        'max_depth': trial.suggest_int('max_depth', 1, 20),
        'random_state': trial.suggest_categorical('random_state', [0, 42, 2021]),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
        'gamma': trial.suggest_float('gamma', 1e-5, 1.0, log=True),
    }


def objective(trial , data = train , target = target):
    """Optuna objective: mean validation ROC-AUC of one sampled configuration.

    Parameters
    ----------
    trial : optuna trial used to sample the hyper-parameters.
    data : frame holding at least the feature columns listed in `cols`.
    target : array of labels, positionally aligned with the rows of `data`.

    Returns
    -------
    float
        ROC-AUC averaged over all folds produced by the module-level `skf`.
    """
    # Sample once per trial: the configuration is the same for every fold.
    params = _suggest_xgb_params(trial)

    fold_aucs = []
    for train_ind, val_ind in skf.split(data, target):
        train_df, val_df = data.iloc[train_ind][cols], data.iloc[val_ind][cols]
        train_target, val_target = target[train_ind], target[val_ind]

        model = xgb.XGBClassifier(**params)
        model.fit(train_df , train_target , eval_set = [(val_df , val_target)] ,
                  early_stopping_rounds = 200 , verbose = False)

        # ROC-AUC is a ranking metric, so score the positive-class probability
        # rather than thresholded 0/1 labels.
        val_proba = model.predict_proba(val_df)[:, 1]
        fold_aucs.append(roc_auc_score(val_target , val_proba))

    # Average over every fold instead of reporting only the last one.
    return float(np.mean(fold_aucs))

In [None]:
# Running the study:
# The sampler is seeded so that the sequence of proposed configurations is the
# same on every "Restart & Run All".
study = optuna.create_study(
    direction = 'maximize',
    study_name = 'xgbclassifier',
    sampler = optuna.samplers.TPESampler(seed = 0),
)
study.optimize(objective , n_trials = 10)
print('number of the finished trials:' , len(study.trials))
print('the parametors of best trial:' , study.best_trial.params)
print('best value:' , study.best_value)

In [None]:
# Getting the best params:
# Work on a copy of the tuned values, then add back the fixed settings that the
# objective passed to XGBClassifier but that are not part of the search space,
# so the final models are configured exactly like the tuned ones.
params = dict(study.best_trial.params)
params.update({
    'eval_metric': 'auc',
    'booster': 'gbtree',
    'tree_method': 'gpu_hist',
    'use_label_encoder': False,
})

In [None]:
# Rerunning on the data using only the best params:
# A separate splitter name is used so the 5-fold `skf` read by `objective` is not
# silently replaced if the study cell is executed again afterwards.
FINAL_FOLDS = 10
final_skf = StratifiedKFold(n_splits = FINAL_FOLDS , random_state = 0 , shuffle = True)

# `preds` accumulates the fold-averaged probability of the positive class for
# each test row; `oof_predictions` holds, for each training row, the probability
# predicted by the one model that did not train on it.
preds = np.zeros(len(test))
oof_predictions = np.zeros(len(train))

for trn_idx , val_idx in final_skf.split(train , target):
    train_x , train_y = train.iloc[trn_idx][cols] , target[trn_idx]
    val_x , val_y = train.iloc[val_idx][cols] , target[val_idx]

    model = xgb.XGBClassifier(**params)
    model.fit(train_x , train_y , eval_set = [(val_x , val_y)] ,
              early_stopping_rounds = 100 , verbose = False)

    # Average probabilities, not hard labels: averaging 0/1 votes discards each
    # model's confidence and can produce exact ties between the folds.
    preds += model.predict_proba(test[cols])[:, 1] / FINAL_FOLDS
    oof_predictions[val_idx] = model.predict_proba(val_x)[:, 1]

# Out-of-fold score of the final configuration.
print('OOF AUC:' , roc_auc_score(target , oof_predictions))

In [None]:
# Round the accumulated test predictions to integer labels and write the
# submission file without the index column.
survived_labels = np.round(preds).astype(int)
submsn['Survived'] = survived_labels
submsn.to_csv('submission.csv', index=False)