In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
import optuna

In [None]:
# Install a blanket 'ignore' filter so library warnings do not clutter the
# cell outputs below.
# NOTE(review): this also hides deprecation/convergence warnings that may
# point at real problems — consider narrowing the filter to specific categories.
from warnings import filterwarnings
filterwarnings('ignore')

In [None]:
# Read the three competition files in one pass: training data, test data and
# the sample-submission template (`ss`) that is filled in at the very end.
df_train, df_test, ss = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-sep-2021/train.csv',
        '../input/tabular-playground-series-sep-2021/test.csv',
        '../input/tabular-playground-series-sep-2021/sample_solution.csv',
    )
)

In [None]:
# Separate the target ('claim') from the predictors. The 'id' column is
# dropped from both frames so it is never used as a feature.
y = df_train['claim'].copy()
X = df_train.drop(['id', 'claim'], axis=1).copy()
X_test = df_test.drop('id', axis=1).copy()

In [None]:
# Extra feature: how many values are missing in each row. It has to be
# computed here, before the imputation cell replaces the NaNs.
X['NaN_row'] = X.isnull().sum(axis='columns')
X_test['NaN_row'] = X_test.isnull().sum(axis='columns')

In [None]:
# Preview the first rows of the training features, including the new
# 'NaN_row' column.
X.head()

In [None]:
# Fill every NaN with a constant. No `fill_value` is passed, so the
# imputer's default is used (scikit-learn documents it as 0 for numeric data).
# Both transforms return plain NumPy arrays, so column names are lost here.
imputer = SimpleImputer(strategy='constant', missing_values=np.nan)
imputer.fit(X)
X = imputer.transform(X)
X_test = imputer.transform(X_test)

In [None]:
# Standardise the features to zero mean / unit variance.
# The scaler learns its mean and standard deviation from the training data
# only; the test set is then transformed with those same statistics so both
# sets live on an identical scale. Re-fitting on the test set would shift and
# rescale it differently from the data the model was trained on.
scaling = StandardScaler()
X = scaling.fit_transform(X)
X_test = scaling.transform(X_test)

In [None]:
# Wrap the NumPy arrays back into DataFrames (columns become 0..n-1) so the
# later cross-validation cell can index rows with `.iloc`.
X, X_test = pd.DataFrame(X), pd.DataFrame(X_test)

In [None]:
def objective(trial, data=X, target=y):
    """Optuna objective: fit one XGBoost classifier and return its hold-out AUC.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyper-parameters.
    data : array-like or DataFrame, optional
        Feature matrix to tune on. Defaults to the notebook-level ``X``.
    target : array-like or Series, optional
        Labels aligned with ``data``. Defaults to the notebook-level ``y``.

    Returns
    -------
    float
        ROC AUC on the 20 % hold-out split. The same split is also used for
        early stopping, so the score is slightly optimistic.
    """
    # Split the arguments (not the globals) so callers can tune on other data.
    # The fixed seed makes every trial score against the same hold-out rows.
    train_x, test_x, train_y, test_y = train_test_split(
        data, target, test_size=0.2, random_state=42
    )

    # `suggest_float(step=...)` / `suggest_float(log=True)` are the supported
    # equivalents of the deprecated `suggest_discrete_uniform` /
    # `suggest_loguniform`; `step` is passed by keyword for the same reason.
    params = {
        'max_depth': trial.suggest_int('max_depth', 6, 10),
        'n_estimators': trial.suggest_int('n_estimators', 400, 4000, step=400),
        'eta': trial.suggest_float('eta', 0.007, 0.013),
        'subsample': trial.suggest_float('subsample', 0.2, 0.9, step=0.1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 0.9, step=0.1),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.2, 0.9, step=0.1),
        'min_child_weight': trial.suggest_float('min_child_weight', 1e-4, 1e4, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-4, 1e4, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-4, 1e4, log=True),
        'gamma': trial.suggest_float('gamma', 1e-4, 1e4, log=True),
        'predictor': "gpu_predictor",
        'eval_metric': 'auc',
        'objective': 'binary:logistic',
    }

    model = XGBClassifier(**params, tree_method='gpu_hist', random_state=2021, use_label_encoder=False)
    # Stop adding trees once the hold-out AUC has not improved for 100 rounds.
    model.fit(train_x, train_y, eval_set=[(test_x, test_y)], early_stopping_rounds=100, verbose=False)

    # Probability of the positive class -> ROC curve -> area under it.
    preds = model.predict_proba(test_x)[:, 1]
    fpr, tpr, _ = roc_curve(test_y, preds)
    score = auc(fpr, tpr)

    return score

In [None]:
# Run 100 Optuna trials, maximising the value returned by `objective`.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

finished_trials = study.trials
best_trial = study.best_trial
print('Number of finished trials:', len(finished_trials))
print('Best trial:', best_trial.params)

In [None]:
# Plot the optimisation history of the study to see how the search progressed.
optuna.visualization.plot_optimization_history(study)

In [None]:
# Keep the best sampled hyper-parameters for the final models. The
# cross-validation cell passes booster, eval_metric, tree_method and
# predictor itself.
params = study.best_params
print(params)

In [None]:
%%time
# 10-fold stratified cross-validation with the tuned hyper-parameters.
# Each fold trains a fresh model, records its validation AUC and predicts the
# test set; the per-fold test predictions are averaged in the next cell.
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

scores = []  # validation AUC of every fold
preds = []   # test-set probabilities of every fold

# Settings shared by all folds, passed on top of the tuned `params`.
fixed_xgb_kwargs = dict(
    booster='gbtree',
    eval_metric='auc',
    tree_method='gpu_hist',
    predictor="gpu_predictor",
    use_label_encoder=False,
)

for fold, (idx_train, idx_valid) in enumerate(kf.split(X, y)):
    X_train, X_valid = X.iloc[idx_train], X.iloc[idx_valid]
    y_train, y_valid = y.iloc[idx_train], y.iloc[idx_valid]

    model = XGBClassifier(**params, **fixed_xgb_kwargs)
    # The validation fold doubles as the early-stopping set (100 rounds of patience).
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_valid, y_valid)],
        early_stopping_rounds=100,
        verbose=False,
    )

    # Fold AUC from the positive-class probabilities.
    pred_valid = model.predict_proba(X_valid)[:,1]
    fpr, tpr, _ = roc_curve(y_valid, pred_valid)
    score = auc(fpr, tpr)
    scores.append(score)

    print(f"Fold: {fold + 1} Score: {score}")
    print('-'*25)

    # Positive-class probabilities for the test set from this fold's model.
    test_preds = model.predict_proba(X_test)[:,1]
    preds.append(test_preds)

print(f"Overall Validation Score: {np.mean(scores)}")

In [None]:
# Blend the folds: one column per fold, then average across the columns to
# get a single probability per test row.
predictions = np.column_stack(preds).mean(axis=1)

# Fill the sample-submission template and write the submission file.
ss['claim'] = predictions
ss.to_csv('./xgb.csv', index=False)
ss.head()