In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import StratifiedKFold
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
import optuna

In [None]:
# Install a blanket 'ignore' filter so warning messages do not show up in
# the cell outputs for the rest of the session.
import warnings

warnings.filterwarnings(action='ignore')

In [None]:
# Read the competition files in one pass: training table, test table and the
# sample-solution file (`ss`) that is later filled in and written out.
df_train, df_test, ss = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-sep-2021/train.csv',
        '../input/tabular-playground-series-sep-2021/test.csv',
        '../input/tabular-playground-series-sep-2021/sample_solution.csv',
    )
)

In [None]:
# Target is the 'claim' column; features are every remaining column except
# the 'id' identifier. Copies keep the loaded frames untouched.
y = df_train['claim'].copy()
X = df_train.drop(['id', 'claim'], axis=1).copy()
X_test = df_test.drop(['id'], axis=1).copy()

In [None]:
# Extra feature: number of missing entries in each row, computed the same
# way for the training and the test features.
for frame in (X, X_test):
    frame['NaN_row'] = frame.isna().sum(axis=1)

In [None]:
# Preprocessing: replace missing values with a constant (no fill_value is
# given, so the imputer's default is used), then standardize every column.
imputer = SimpleImputer(strategy='constant')
scaler = StandardScaler()
pipeline = Pipeline(steps=[('impute', imputer), ('scale', scaler)])

# Fit on the training features only; the test features reuse the fitted
# statistics. The transformers return arrays, so wrap them back into
# DataFrames carrying the original column names.
train_array = pipeline.fit_transform(X)
test_array = pipeline.transform(X_test)

X = pd.DataFrame(train_array, columns=X.columns)
X_test = pd.DataFrame(test_array, columns=X_test.columns)

In [None]:
# Preview the first rows of the imputed and scaled training features.
X.head()

In [None]:
def objective(trial, data=X, target=y):
    """Optuna objective: hold-out ROC AUC of a HistGradientBoostingClassifier.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.
    data : feature table, optional
        Features to split into train/hold-out parts. Defaults to the
        notebook-level ``X``.
    target : labels, optional
        Binary labels aligned with ``data``. Defaults to the notebook-level
        ``y``.

    Returns
    -------
    float
        Area under the ROC curve on the 20% hold-out split (to be maximized).
    """
    # Split the arguments that were actually passed in rather than the
    # notebook globals, so callers can evaluate on other data.
    train_x, test_x, train_y, test_y = train_test_split(
        data, target, test_size=0.2, random_state=42
    )

    # NOTE: two trial names ('min_sample_leaf', 'reg_lambda') intentionally
    # differ from the estimator keyword names; the cell that reads
    # study.best_params renames them, so they must stay as they are here.
    #
    # `loss` is left at the estimator default, which is the binary log loss
    # for a two-class target; the explicit 'binary_crossentropy' alias is not
    # accepted by newer scikit-learn releases.
    params = {
        'learning_rate': trial.suggest_float('learning_rate', 0.02, 1),
        'max_iter': trial.suggest_categorical('max_iter', [20, 30, 50, 100, 150, 200, 500, 1000]),
        'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 2, 256),
        'max_depth': trial.suggest_categorical('max_depth', [1, 2, 4, 5, 7, 9, 11, 13, 15]),
        'min_samples_leaf': trial.suggest_int('min_sample_leaf', 1, 200),
        'l2_regularization': trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
        'random_state': 2021,
        'verbose': 0,
        # `early_stopping` is a switch ('auto' or bool), not a patience
        # count. The previous value 100 only acted as a truthy flag.
        'early_stopping': True,
    }

    model = HistGradientBoostingClassifier(**params)
    model.fit(train_x, train_y)

    # Score the positive-class probabilities on the hold-out part.
    preds = model.predict_proba(test_x)[:, 1]
    fpr, tpr, _ = roc_curve(test_y, preds)
    score = auc(fpr, tpr)

    return score

In [None]:
# Seed the sampler so that re-running the notebook proposes the same sequence
# of trials. TPESampler is the algorithm create_study uses by default, so
# only the seeding changes.
sampler = optuna.samplers.TPESampler(seed=2021)
study = optuna.create_study(direction='maximize', sampler=sampler)
study.optimize(objective, n_trials=30)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

# Visualization

**Note:-** If a parameter contains missing values, the trials with missing values will not be plotted by ***plot_parallel_coordinate*** and ***plot_slice***.

In [None]:
# Plot the optimization history of the finished study.
optuna.visualization.plot_optimization_history(study)

In [None]:
# Plot the empirical distribution function (EDF) of the study's objective values.
optuna.visualization.plot_edf(study)

In [None]:
# Plot the hyperparameter importances estimated from the study.
optuna.visualization.plot_param_importances(study)

In [None]:
# Slice plot of the study's parameters (see the note above about trials with
# missing values).
optuna.visualization.plot_slice(study)

In [None]:
# Parallel-coordinate plot of the study's parameters (see the note above
# about trials with missing values).
optuna.visualization.plot_parallel_coordinate(study)

In [None]:
# Contour plot of the study's parameter relationships.
optuna.visualization.plot_contour(study)

**Note:-** (*HistGradientBoosting*) is inspired by (*LightGBM*), and the objective above registers **min_samples_leaf** and **l2_regularization** with Optuna under the names **min_sample_leaf** and **reg_lambda**, so those are the keys that appear in `study.best_params`. Rename them back before passing the parameters to the classifier, otherwise it will throw an error.

In [None]:
# study.best_params is keyed by the names given to trial.suggest_* in the
# objective, two of which differ from the estimator's keyword names.
params = dict(study.best_params)
params['min_samples_leaf'] = params.pop('min_sample_leaf')
params['l2_regularization'] = params.pop('reg_lambda')

# The objective also fixes a random seed and enables early stopping. Neither
# is part of best_params, so carry them over: otherwise the final models are
# unseeded and are not trained with the configuration that was tuned.
params['random_state'] = 2021
params['early_stopping'] = True
print(params)

In [None]:
%%time
# 10-fold stratified cross-validation: fit one model per fold with the tuned
# parameters, score it on the held-out fold, and predict the test set.
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

preds = []   # positive-class test probabilities, one array per fold
scores = []  # validation ROC AUC, one value per fold

for fold_no, (train_idx, valid_idx) in enumerate(kf.split(X, y), start=1):
    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

    model = HistGradientBoostingClassifier(**params)
    model.fit(X_train, y_train)

    # Validation AUC from the positive-class probabilities.
    valid_proba = model.predict_proba(X_valid)[:, 1]
    fpr, tpr, _ = roc_curve(y_valid, valid_proba)
    fold_auc = auc(fpr, tpr)
    scores.append(fold_auc)

    print(f"Fold: {fold_no} Score: {fold_auc}")
    print('-'*25)

    # Keep this fold's test predictions for averaging afterwards.
    preds.append(model.predict_proba(X_test)[:, 1])

print(f"Overall Validation Score: {np.mean(scores)}")

In [None]:
# Blend the folds: one column per fold model, averaged row-wise.
fold_matrix = np.column_stack(preds)
predictions = fold_matrix.mean(axis=1)

# Fill the sample-solution frame with the blended probabilities and save it.
ss['claim'] = predictions
ss.to_csv('./gbhist.csv', index=False)
ss.head()