# Casual hack of h2o automl
H2O AutoML is used directly here, but it has good use cases in ensembling, especially stacking.

In [None]:
import numpy as np
import pandas as pd

import os

from sklearn.model_selection import train_test_split

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler  
from tqdm import tqdm
import numpy as np
import pickle
import optuna

# Metrics for models evaluation
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*" and
# later releases removed the old names, so prefer the new name when this
# installation provides it and fall back to the legacy name otherwise.
plt.style.use(
    "seaborn-v0_8-whitegrid"
    if "seaborn-v0_8-whitegrid" in plt.style.available
    else "seaborn-whitegrid"
)
import h2o
from h2o.automl import H2OAutoML


In [None]:
from pathlib import Path

# Directory holding the competition files; test.csv is read from here.
input_path = Path('../input/tabular-playground-series-sep-2021/')

# Housekeeping

In [None]:
# Load the test set, using the first CSV column as the DataFrame index.
# That index is reused later as the `id` column of the submission.
X_test = pd.read_csv(f"{input_path}/test.csv", index_col=0)
X_test.head()

## Declare Constants

In [None]:
# Name of the label column in the training frame.
TARGET_VAR = 'claim'
# Number of cross-validation folds.
FOLDS = 10
# Placeholder; the actual feature list is built in the feature-engineering cell.
useful_features = None

In [None]:
# Load the training data from a feather file. Later cells read the columns
# "index", "id", "kfold" and the target from this frame.
X= pd.read_feather("../input/10-folds-stratified-parquet-feather/train_stratfold.ft")

# Feature Engineering

In [None]:
# Collect every training column whose name begins with "f".
num_cols = [column for column in X.columns if column.startswith("f")]
len(num_cols)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# Record how many values are missing in each row, for both frames, before
# imputation fills them in.
X["nan_count"] = X.isna().sum(axis=1)
X_test["nan_count"] = X_test.isna().sum(axis=1)

# Model inputs: every training column except the identifiers, the target and
# the fold assignment.
useful_features = [
    column
    for column in X.columns
    if column not in ('id', 'claim', 'index', 'kfold')
]
def impute(df, columns=None):
    """Fill missing values with the column mean, column by column.

    Args:
        df: DataFrame to impute. It is modified in place; each mean is
            computed from ``df`` itself.
        columns: Names of the columns to fill. Defaults to the module-level
            ``useful_features`` list when omitted.

    Returns:
        The same ``df`` object, so calls can be written as ``X = impute(X)``.
    """
    if columns is None:
        columns = useful_features
    for name in columns:
        # Assign the result back instead of calling ``fillna(inplace=True)``
        # on ``df[name]``: that chained in-place call acts on an intermediate
        # Series, which recent pandas warns about and which leaves ``df``
        # unchanged once copy-on-write is enabled.
        df[name] = df[name].fillna(df[name].mean())
    return df

# Fill missing values; each frame is imputed with its own column means.
X = impute(X)
X_test = impute(X_test)

# Set the non-feature columns aside so they are not standardized.
claim = X[TARGET_VAR]
kfold = X["kfold"]
ids = X['id']
X = X.drop(["index", "id", TARGET_VAR, "kfold"], axis=1)

# Fit the scaler on the training features only and reuse it for the test set.
# The frames are rebuilt instead of assigned through ``X[:] = ...`` because the
# integer ``nan_count`` column cannot hold the scaled floats in place; newer
# pandas versions deprecate that silent upcast.
scalar = StandardScaler()
X = pd.DataFrame(scalar.fit_transform(X), columns=X.columns, index=X.index)
# Select the test columns in training order so they line up with the scaler.
X_test = pd.DataFrame(
    scalar.transform(X_test[X.columns]),
    columns=X.columns,
    index=X_test.index,
)

# Re-attach the columns that were set aside (aligned on the unchanged index).
X[TARGET_VAR] = claim
X["kfold"] = kfold
X['id'] = ids

# Row-wise summary statistics over the scaled numeric columns.
# NOTE(review): these columns are created after ``useful_features`` was built,
# so they are not among the predictors passed to AutoML — confirm whether that
# is intended.
X['min_row'] = X[num_cols].min(axis=1)
X['mean_row'] = X[num_cols].mean(axis=1)  # previously computed with .min()
X['max_row'] = X[num_cols].max(axis=1)
X['std_row'] = X[num_cols].std(axis=1)
X_test['min_row'] = X_test[num_cols].min(axis=1)
X_test['max_row'] = X_test[num_cols].max(axis=1)  # previously computed with .min()
X_test['std_row'] = X_test[num_cols].std(axis=1)
X_test['mean_row'] = X_test[num_cols].mean(axis=1)


In [None]:
# Sanity check that no missing values remain in the training frame.
# pandas documents `isnull` as an alias of `isna`, so both flags report the
# same thing.
print(f"Any NA :{X.isna().values.any()} Any Null :{X.isnull().values.any()}")

In [None]:
from h2o.transforms.preprocessing import H2OScaler
from h2o.transforms.decomposition import H2OPCA

# Check with default approach 

In [None]:
# Initialize h2o, then copy both pandas frames into H2OFrames.
h2o.init() # h2o initialization
train= h2o.H2OFrame(X) # convert to h2o frame
test = h2o.H2OFrame(X_test) # convert to h2o frame
# Drop the pandas training frame now that `train` holds the data.
# X_test is kept because its index is needed for the submission.
del X

In [None]:
# Convert the target to a factor (categorical) column so the task is treated
# as binary classification.
train[TARGET_VAR] = train[TARGET_VAR].asfactor() #binary classification

In [None]:
# Configure the AutoML run: restricted to XGBoost, GBM and stacked ensembles,
# capped at 20 models and 4 hours, ranked and early-stopped on AUCPR.
auto_ml = H2OAutoML( 
    #nfolds=5, # use 5 folds 
    seed = 1222,
    max_models = 20,
    include_algos = ["XGBoost" ,"StackedEnsemble","GBM"],# "DRF" ,"DeepLearning"
    max_runtime_secs=3600*4,  # time budget in seconds (4 hours)
    sort_metric='AUCPR',
    stopping_metric='AUCPR'
    )
# Unused experiment: scaler + PCA pipeline in front of AutoML.
#pipe = Pipeline([("standardize", H2OScaler()),
#                 ("pca", H2OPCA()),
#                 ("rf", auto_ml)])

# Train on the `useful_features` columns. The existing `kfold` column is
# passed as the fold column, so `nfolds` is left unset above.
auto_ml.train(x=useful_features, y=TARGET_VAR, training_frame=train,  fold_column='kfold')

In [None]:
# Display the AutoML leaderboard.
auto_ml.leaderboard
# Score notes recorded from earlier runs (kept verbatim):
# 0.77357 Public score on auc 0.806177
# 0.807013  0.77357
# last public score 0.81113 0.809487

In [None]:
auto_ml.leaderboard
# Keep a handle on the leaderboard frame; a later cell looks up a model id in it.
leader = auto_ml.leaderboard

In [None]:
# Plot the model correlation heatmap, evaluated on the training frame.
mc_plot = auto_ml.model_correlation_heatmap(train)

In [None]:
# Fetch the model listed in leaderboard row 3 and plot its learning curve.
# NOTE(review): the original comment assumes this row is a GBM; the row order
# depends on the run's ranking, so confirm before relying on it.
model = h2o.get_model(leader[3,"model_id"]) # get gbm model
learning_curve_plot = model.learning_curve_plot()

In [None]:
# Score the test frame with the top-ranked model from the AutoML run.
preds = auto_ml.leader.predict(test)

In [None]:
# Assemble the submission: ids come from the test frame's index, and the
# claim score is the `p1` column of the prediction frame.
submission = pd.DataFrame(
    {
        'id': X_test.index,
        'claim': preds.as_data_frame()["p1"],
    }
)
submission.head()

In [None]:
# Write the submission file without the DataFrame index.
submission.to_csv('h2o_submission.csv', index=False)