In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, KFold
import optuna
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the competition train and test tables (same relative Kaggle input paths as before).
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/tabular-playground-series-may-2022/train.csv",
        "../input/tabular-playground-series-may-2022/test.csv",
    )
)

In [None]:
# Feature engineering, taking reference from the wonderful notebook created by Ambrose
# Link of the notebook : https://www.kaggle.com/code/ambrosm/tpsmay22-eda-which-makes-sense
#
# This cell reads the raw f_27 string column of both frames, so it has to run
# while f_27 is still present in train_data and test_data.
for df in [train_data, test_data]:
    # Extract the 10 letters of f_27 into individual features,
    # each encoded as its offset from 'A'.
    for i in range(10):
        df[f'ch{i}'] = df.f_27.str.get(i).apply(ord) - ord('A')

    # unique_characters feature is from https://www.kaggle.com/code/cabaxiom/tps-may-22-eda-lgbm-model
    df["unique_characters"] = df.f_27.apply(lambda s: len(set(s)))

    # Feature interactions: create three ternary features
    # Every ternary feature can have the values -1, 0 and +1
    df['i_02_21'] = (df.f_21 + df.f_02 > 5.2).astype(int) - (df.f_21 + df.f_02 < -5.3).astype(int)
    df['i_05_22'] = (df.f_22 + df.f_05 > 5.1).astype(int) - (df.f_22 + df.f_05 < -5.4).astype(int)
    i_00_01_26 = df.f_00 + df.f_01 + df.f_26
    df['i_00_01_26'] = (i_00_01_26 > 5.0).astype(int) - (i_00_01_26 < -5.0).astype(int)

# Feature-name groups, built once after the engineered columns exist.
# The dtype helpers are used instead of `dtype == int` / `dtype == float`
# because those comparisons depend on the platform's default integer width.
features = [f for f in test_data.columns if f != 'id' and f != 'f_27']
float_features = [f for f in features if pd.api.types.is_float_dtype(test_data[f])]
int_features = [
    f for f in features
    if pd.api.types.is_integer_dtype(test_data[f]) and f.startswith('f')
]
ch_features = [f for f in features if f.startswith('ch')]

In [None]:
# Preview the first rows of the training frame, including the engineered columns.
train_data.head()

In [None]:
# Preview the first rows of the test frame, including the engineered columns.
test_data.head()

In [None]:
# Remove the raw f_27 string column from the test frame; the ch0..ch9 and
# unique_characters columns were derived from it earlier.
# Reassigning with errors="ignore" (instead of inplace=True) keeps the cell
# idempotent: running it a second time is a no-op rather than a KeyError.
test_data = test_data.drop(columns="f_27", errors="ignore")

In [None]:
# Model inputs: every training column except the row id, the label and the raw f_27 string.
useful_features = [
    column
    for column in train_data.columns
    if column not in {"id", "target", "f_27"}
]

In [None]:
# Baseline: 5-fold cross-validation of an untuned XGBoost classifier
# (only n_estimators is overridden). Collects the validation ROC AUC of every
# fold and the test-set probabilities predicted by every fold's model.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

roc_score = []          # validation ROC AUC, one entry per fold
final_predictions = []  # test-set positive-class probabilities, one array per fold

# Loop-invariant: pick the test columns once, by name and in the same order as
# the training features, instead of copying the frame and dropping "id" per fold.
test_features = test_data[useful_features]

for fold, (trn_idx, val_idx) in enumerate(kf.split(X=train_data)):

    X_train, X_val = train_data.iloc[trn_idx][useful_features], train_data.iloc[val_idx][useful_features]
    y_train, y_val = train_data.iloc[trn_idx]["target"], train_data.iloc[val_idx]["target"]

    # The scaler is fitted on the training part of the fold only and then
    # applied to the validation and test features.
    scaler = StandardScaler()
    x_train = scaler.fit_transform(X_train)
    x_valid = scaler.transform(X_val)
    x_test = scaler.transform(test_features)

    model = XGBClassifier(
        n_estimators=10000,
        random_state=0,
        use_label_encoder=False,
        objective='binary:logistic',
        tree_method='gpu_hist',
        gpu_id=0,
        predictor='gpu_predictor',
        n_jobs=-1,
    )

    # NOTE(review): newer XGBoost releases expect eval_metric on the constructor
    # rather than in fit() -- confirm against the installed version.
    model.fit(x_train, y_train,
              eval_metric='auc',
              eval_set=[(x_valid, y_val)],
              verbose=0)

    preds_valid = model.predict_proba(x_valid)[:, 1]
    # Score the fold once and reuse the value for both the log line and the list.
    fold_auc = roc_auc_score(y_val, preds_valid)
    print("The ROC score after {} fold is {}".format(fold, fold_auc))
    roc_score.append(fold_auc)

    test_preds = model.predict_proba(x_test)[:, 1]
    final_predictions.append(test_preds)

print(np.mean(roc_score))

In [None]:
def objective(trial):
    """Optuna objective: mean 5-fold validation ROC AUC of an XGBoost classifier.

    Reads the notebook-level ``train_data`` frame and ``useful_features`` list.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The mean of the per-fold validation ROC AUC scores.
    """
    # Sample every hyperparameter once per trial, before the fold loop. The
    # parameter names are unchanged, so study.best_params keeps the same keys.
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
    learning_rate = trial.suggest_float("learning_rate", 1e-2, 0.25, log=True)
    reg_lambda = trial.suggest_float("reg_lambda", 1e-8, 100.0, log=True)
    reg_alpha = trial.suggest_float("reg_alpha", 1e-8, 100.0, log=True)
    subsample = trial.suggest_float("subsample", 0.1, 1.0)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.1, 1.0)
    max_depth = trial.suggest_int("max_depth", 1, 7)
    n_estimators = trial.suggest_int("n_estimators", 1000, 20000)

    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    roc_score = []  # validation ROC AUC, one entry per fold

    for fold, (trn_idx, val_idx) in enumerate(kf.split(X=train_data)):

        X_train, X_val = train_data.iloc[trn_idx][useful_features], train_data.iloc[val_idx][useful_features]
        y_train, y_val = train_data.iloc[trn_idx]["target"], train_data.iloc[val_idx]["target"]

        # Only the validation score is needed here, so the test set is not
        # copied or scaled inside the tuning loop.
        scaler = StandardScaler()
        x_train = scaler.fit_transform(X_train)
        x_valid = scaler.transform(X_val)

        model = XGBClassifier(
            n_estimators=n_estimators,
            random_state=fold,
            use_label_encoder=False,
            objective='binary:logistic',
            tree_method='gpu_hist',
            gpu_id=0,
            predictor='gpu_predictor',
            n_jobs=-1,
            learning_rate=learning_rate,
            reg_lambda=reg_lambda,
            reg_alpha=reg_alpha,
            subsample=subsample,
            colsample_bytree=colsample_bytree,
            max_depth=max_depth,
        )

        # NOTE(review): newer XGBoost releases expect eval_metric on the
        # constructor rather than in fit() -- confirm against the installed version.
        model.fit(x_train, y_train,
                  eval_metric='auc',
                  eval_set=[(x_valid, y_val)],
                  verbose=0)

        preds_valid = model.predict_proba(x_valid)[:, 1]
        # Score the fold once and reuse the value for the log line and the list.
        fold_auc = roc_auc_score(y_val, preds_valid)
        print("The ROC score after {} fold is {}".format(fold, fold_auc))
        roc_score.append(fold_auc)

    return np.mean(roc_score)

In [None]:
# Run the hyperparameter search, maximising the mean validation ROC AUC
# returned by `objective`.
# The sampler is seeded so that the sequence of suggested hyperparameters is
# repeatable across kernel restarts.
# NOTE(review): the scores themselves may still vary slightly between runs if
# GPU training is not bit-for-bit deterministic -- confirm.
study = optuna.create_study(
    direction="maximize",
    study_name="XGBoost Hyperparameter Tuning",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=25)

In [None]:
# Hyperparameters of the best trial of the study; displayed here and later
# unpacked into the XGBClassifier constructor for the final training run.
xgboost_best_params = study.best_params
xgboost_best_params

In [None]:
# Final run: 5-fold cross-validation with the tuned hyperparameters from
# `xgboost_best_params`. Collects the validation ROC AUC of every fold and the
# test-set probabilities predicted by every fold's model.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

roc_score = []          # validation ROC AUC, one entry per fold
final_predictions = []  # test-set positive-class probabilities, one array per fold

# Loop-invariant: pick the test columns once, by name and in the same order as
# the training features, instead of copying the frame and dropping "id" per fold.
test_features = test_data[useful_features]

for fold, (trn_idx, val_idx) in enumerate(kf.split(X=train_data)):

    X_train, X_val = train_data.iloc[trn_idx][useful_features], train_data.iloc[val_idx][useful_features]
    y_train, y_val = train_data.iloc[trn_idx]["target"], train_data.iloc[val_idx]["target"]

    # The scaler is fitted on the training part of the fold only and then
    # applied to the validation and test features.
    scaler = StandardScaler()
    x_train = scaler.fit_transform(X_train)
    x_valid = scaler.transform(X_val)
    x_test = scaler.transform(test_features)

    model = XGBClassifier(
        random_state=fold,
        use_label_encoder=False,
        objective='binary:logistic',
        tree_method='gpu_hist',
        gpu_id=0,
        predictor='gpu_predictor',
        n_jobs=-1,
        **xgboost_best_params
    )

    # NOTE(review): newer XGBoost releases expect eval_metric on the constructor
    # rather than in fit() -- confirm against the installed version.
    model.fit(x_train, y_train,
              eval_metric='auc',
              eval_set=[(x_valid, y_val)],
              verbose=0)

    preds_valid = model.predict_proba(x_valid)[:, 1]
    # Score the fold once and reuse the value for both the log line and the list.
    fold_auc = roc_auc_score(y_val, preds_valid)
    print("The ROC score after {} fold is {}".format(fold, fold_auc))
    roc_score.append(fold_auc)

    test_preds = model.predict_proba(x_test)[:, 1]
    final_predictions.append(test_preds)

print(np.mean(roc_score))

In [None]:
# Blend the folds: put each fold's test probabilities in its own column and
# take the row-wise mean, giving one averaged prediction per test row.
target = np.column_stack(final_predictions).mean(axis=1)

# Row identifiers for the submission file.
ids = test_data["id"]

In [None]:
# Submission table: the test ids next to the fold-averaged predicted probability.
output_data = pd.DataFrame({"id": ids}).assign(target=target)

In [None]:
# Display the submission table as a final sanity check before writing it out.
output_data

In [None]:
# Write the submission file; index=False keeps the row index out of the CSV.
output_data.to_csv("submission3.csv",index=False)