In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import string
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn import preprocessing
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier
import optuna

In [None]:
# Load the competition's training and test tables.
train_csv = "/kaggle/input/tabular-playground-series-may-2022/train.csv"
test_csv = "/kaggle/input/tabular-playground-series-may-2022/test.csv"
train, test = pd.read_csv(train_csv), pd.read_csv(test_csv)

In [None]:
# Show (rows, columns) for the training table, then the test table.
for frame in (train, test):
    print(frame.shape)

In [None]:
# Tag every training row with the id (0-9) of the shuffled fold it is held out in.
from sklearn import model_selection

# -1 marks "not yet assigned"; every row is overwritten by the loop below.
train["kfold"] = -1

splitter = model_selection.KFold(n_splits=10, shuffle=True, random_state=102)
for fold_id, (_fit_rows, holdout_rows) in enumerate(splitter.split(X=train)):
    train.loc[holdout_rows, "kfold"] = fold_id

In [None]:
# Only f_27 is used for prediction; the training frame also keeps the label
# and the fold id.
# .copy() makes both frames independent of train/test, so adding columns to
# them later does not raise pandas' SettingWithCopyWarning.
df_tr = train[['f_27', 'target', 'kfold']].copy()
df_te = test[['f_27']].copy()

print(df_tr.shape); print(df_te.shape)

In [None]:
# Count of rows per (target value, fold id) pair.
pd.crosstab(df_tr['target'], df_tr['kfold'])

In [None]:
# function to turn the f_27 string into per-letter count features

def count_alpha(df):
    """Replace the ``f_27`` column with one count column per letter A-T.

    For each of the first 20 uppercase ASCII letters, a ``count_<letter>``
    column holds how many times that letter occurs in the row's ``f_27``
    string. All other columns are kept, in their original order, followed by
    the count columns in alphabetical order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a string column named ``f_27``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``f_27`` and with the 20 count columns appended.

    Raises
    ------
    KeyError
        If ``df`` has no ``f_27`` column.
    """
    letter_counts = {
        f'count_{x}': df['f_27'].str.count(x)
        for x in string.ascii_uppercase[:20]
    }

    # `columns=` instead of a positional axis: pandas 2.x made every drop()
    # argument after `labels` keyword-only, so drop('f_27', 1) raises TypeError.
    return df.drop(columns='f_27').assign(**letter_counts)

# Swap f_27 for its letter-count features in both the training and test frames.
df_tr, df_te = count_alpha(df_tr), count_alpha(df_te)

In [None]:
# Show (rows, columns) after feature construction: training frame, then test frame.
for frame in (df_tr, df_te):
    print(frame.shape)

In [None]:
 # Model inputs: every column of df_tr except the label and the fold id.
 non_feature_cols = ("target", "kfold")
 use_feature = [col for col in df_tr.columns if col not in non_feature_cols]

In [None]:
# tuning
def run(trial):
    """Optuna objective: validation ROC AUC of an XGBoost model on fold 0.

    Samples one set of hyperparameters from ``trial``, fits an
    ``XGBClassifier`` on the rows of the module-level ``df_tr`` whose
    ``kfold`` is not 0 (early stopping is evaluated on the fold-0 rows), and
    scores the fold-0 rows. Only the columns listed in the module-level
    ``use_feature`` are fed to the model.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        ROC AUC on the fold-0 rows, computed from column 1 of
        ``predict_proba``.
    """
    fold = 0
    learning_rate = trial.suggest_float("learning_rate", 1e-2, 0.25, log=True)
    # suggest_float(..., log=True) is the non-deprecated spelling of
    # suggest_loguniform and matches how learning_rate is sampled above.
    reg_lambda = trial.suggest_float("reg_lambda", 1e-8, 100.0, log=True)
    reg_alpha = trial.suggest_float("reg_alpha", 1e-8, 100.0, log=True)
    subsample = trial.suggest_float("subsample", 0.1, 1.0)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.1, 1.0)
    max_depth = trial.suggest_int("max_depth", 1, 7)

    xtrain = df_tr[df_tr.kfold != fold].reset_index(drop=True)
    xvalid = df_tr[df_tr.kfold == fold].reset_index(drop=True)

    ytrain = xtrain.target
    yvalid = xvalid.target

    xtrain = xtrain[use_feature]
    xvalid = xvalid[use_feature]

    model = XGBClassifier(
        random_state=42,
        tree_method="gpu_hist",
        # NOTE(review): gpu_id=1 selects the *second* GPU, while the
        # final-training cell uses the default device — confirm the session
        # really exposes two GPUs.
        gpu_id=1,
        predictor="gpu_predictor",
        n_estimators=7000,
        # Early-stop on the same metric the study maximizes (and that the
        # final-training cell uses).
        eval_metric="auc",
        learning_rate=learning_rate,
        reg_lambda=reg_lambda,
        reg_alpha=reg_alpha,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        max_depth=max_depth,
    )
    model.fit(xtrain, ytrain, early_stopping_rounds=300, eval_set=[(xvalid, yvalid)], verbose=1000)
    # AUC is a ranking metric: score it on probabilities, not on the hard 0/1
    # labels that predict() returns.
    preds_valid = model.predict_proba(xvalid)[:, 1]
    AUC = roc_auc_score(yvalid, preds_valid)
    return AUC


In [None]:
# Seed the sampler so the sequence of suggested hyperparameters is repeatable
# across "Restart & Run All" (given identical objective values).
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(run, n_trials=5)

In [None]:
# final model for xgboost
# For every fold: fit on the other folds with the tuned hyperparameters,
# early-stop and score on the held-out fold, and predict the test set.
final_predictions = []  # one predict_proba array (n_test, 2) per fold
scores = []             # held-out ROC AUC per fold

# Loop invariants: the tuned hyperparameters and the test feature matrix.
params = study.best_params
xtest = df_te[use_feature]

# Iterate over every fold id present in df_tr rather than a hard-coded
# range(5), which silently skipped half of the 10 folds.
for fold in sorted(df_tr.kfold.unique()):
    xtrain = df_tr[df_tr.kfold != fold].reset_index(drop=True)
    xvalid = df_tr[df_tr.kfold == fold].reset_index(drop=True)

    ytrain = xtrain.target
    yvalid = xvalid.target

    xtrain = xtrain[use_feature]
    xvalid = xvalid[use_feature]

    model = XGBClassifier(
        random_state=0,
        tree_method='gpu_hist',
        #gpu_id=0,
        predictor="gpu_predictor",
        n_estimators=5000,
        eval_metric='auc',
        **params
    )

    model.fit(xtrain, ytrain, early_stopping_rounds=300, eval_set=[(xvalid, yvalid)], verbose=1000)
    preds_valid = model.predict_proba(xvalid)
    test_preds = model.predict_proba(xtest)
    # Keep the full two-column array; the submission cell selects column 1.
    final_predictions.append(test_preds)
    ROC = roc_auc_score(yvalid, preds_valid[:,1])
    print(fold, ROC)
    scores.append(ROC)

print(np.mean(scores), np.std(scores))

In [None]:
# submission for xgboost
# Average column 1 of every fold's predict_proba output, row by row, and write
# the result into the sample submission's target column.
sample_submission = pd.read_csv("/kaggle/input/tabular-playground-series-may-2022/sample_submission.csv")
per_fold_probs = [fold_preds[:, 1] for fold_preds in final_predictions]
preds = np.column_stack(per_fold_probs).mean(axis=1)
sample_submission.target = preds
sample_submission.to_csv('submission_f27.csv', index=False)