# 1. Load libraries and data

In [None]:
# Data handling and numerics
import pandas as pd
# Render floats with thousands separators and 12 decimal places.
pd.options.display.float_format = '{:,.12f}'.format
# Remove the row limit so that every feature of the dataset is visible when a frame is displayed.
# Be careful: with no limit, the output for a large frame can be huge!
pd.options.display.max_rows = None 
# Widen the per-cell text limit to 260 characters.
pd.set_option('max_colwidth', 260)
import numpy as np

# Data preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer

# Models, metrics and validation helpers
import optuna
from catboost import CatBoostRegressor
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

# Visualisation
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib import ticker as tkr
#import plotly.express as px

In [None]:
# Read the competition's train and test tables into pandas DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-nov-2021/train.csv',
        '../input/tabular-playground-series-nov-2021/test.csv',
    )
)

# 2. EDA

In [None]:
# Report the dimensions (rows, columns) of both datasets.
train_rows, train_cols = train.shape
test_rows, test_cols = test.shape
print('The train data has {} rows and {} columns'.format(train_rows, train_cols))
print('The test data has {} rows and {} columns'.format(test_rows, test_cols))

In [None]:
# Print pandas' concise summary of the training frame.
train.info()

In [None]:
# Print pandas' concise summary of the test frame.
test.info()

In [None]:
# Summary statistics for every column from 'f0' onwards, one row per column.
# Color scheme picked from https://www.kaggle.com/usharengaraju/tensorflow-decision-forests-w-b
feature_stats = train.loc[:, 'f0':].describe().T
(
    feature_stats.style
    .bar(subset=['mean'], color="#e9c46a")
    .background_gradient(subset=['std'], cmap='Reds')
    .background_gradient(subset=['50%'], cmap='Pastel1')
)

In [None]:
# Preview the first 15 rows of the training data.
train.head(15)

### 2.1. Correlation, first look

In [None]:
# Correlation of every column with the target, sorted from most positive to
# most negative, laid out as a single row (the target's self-correlation is dropped).
corr_matrix = (
    train.corr()[['target']]
    .sort_values(by=['target'], ascending=False)
    .drop(['target'])
    .T
)
# Styler.set_precision() was deprecated in pandas 1.3 and removed in 2.0;
# Styler.format(precision=...) is the supported way to round displayed values.
corr_matrix.style.background_gradient(cmap='coolwarm').format(precision=2)

### 2.2. Missing values

In [None]:
# Per-column count of missing values in the training set (largest first),
# shown next to each column's dtype.
train_na_counts = train.isna().sum().sort_values(ascending=False)
missing = pd.concat([train_na_counts, train.dtypes], axis=1, keys=['Total', 'Type'])
# Display only the columns that actually contain missing values.
missing.loc[missing['Total'] > 0]

In [None]:
# Per-column count of missing values in the test set (largest first),
# shown next to each column's dtype.
test_na_counts = test.isna().sum().sort_values(ascending=False)
missing = pd.concat([test_na_counts, test.dtypes], axis=1, keys=['Total', 'Type'])
# Display only the columns that actually contain missing values.
missing.loc[missing['Total'] > 0]

### 2.3. Data distribution

In [None]:
# Draw one filled KDE per feature on a 20 x 5 grid (panel i shows column f{i}).
nrows, ncols = 20, 5
fig, axes = plt.subplots(nrows, ncols, figsize=(25, 75))
axes = axes.flatten()

for idx in range(len(axes)):
    ax = axes[idx]
    feature_name = f'f{idx}'
    sns.kdeplot(data=train, x=feature_name, fill=True, ax=ax)
    # Strip ticks and axis labels: only the shape of each distribution matters here.
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.spines['left'].set_visible(False)
    # The feature name in the panel's top-right corner acts as its label.
    ax.set_title(feature_name, loc='right', weight='bold', fontsize=10)

fig.tight_layout()
plt.show()

### Some distributions look like "hats" and others like peaks. The latter are skewed, which may reduce the desired metric.

# 3. Models

In [None]:
# Feature matrices: drop the row identifier (and the label, for train).
X = train.drop(['id', 'target'], axis=1)
X_test = test.drop(['id'], axis=1)
# Label vector.
Y = train['target']

# Hold out 15% of the training rows for validation; the seed fixes the split.
X_train, X_validation, Y_train, Y_validation = train_test_split(
    X,
    Y,
    train_size=0.85,
    random_state=42,
)

### 3.1. H2O

In [None]:
import h2o
from h2o.automl import H2OAutoML

In [None]:
# Start (or connect to) an H2O cluster.
h2o.init()

# Upload both pandas frames to the cluster.
h2o_train = h2o.H2OFrame(train)
h2o_test = h2o.H2OFrame(test)

# Mark the target as categorical so that AutoML treats the task as classification.
# The column already exists in the uploaded frame, so convert it in place rather
# than uploading a second frame built from a Python list: that round trip is
# redundant, and how a flat list is laid out depends on the h2o version.
h2o_train['target'] = h2o_train['target'].asfactor()

In [None]:
target_column = 'target'
# Predictors: every column except the label and the 'id' row identifier.
# 'id' is only a row key; keeping it among the predictors would let the
# models fit on an arbitrary index instead of real features.
feature_columns = [
    column
    for column in h2o_train.columns
    if column not in ('id', target_column)
]

In [None]:
# AutoML budget: at most 50 models or 8 hours, with 5-fold cross-validation;
# the seed is fixed at 2021.
aml = H2OAutoML(
    nfolds=5,
    max_models=50,
    max_runtime_secs=8 * 3600,
    seed=2021,
)

# Fit on the uploaded training frame using the selected predictors and target.
aml.train(
    training_frame=h2o_train,
    x=feature_columns,
    y=target_column,
)

In [None]:
# Show the full AutoML leaderboard (every row, not just the default preview).
lb = aml.leaderboard
leaderboard_rows = lb.nrows
lb.head(rows=leaderboard_rows)

Further reading on H2O AutoML: https://www.h2o.ai/blog/a-deep-dive-into-h2os-automl/

In [None]:
# Plot the standardized coefficients of the best stacked ensemble's metalearner.
model_ids = aml.leaderboard['model_id'].as_data_frame().iloc[:, 0].tolist()

# The leaderboard's top model is not guaranteed to be a stacked ensemble, and
# only stacked ensembles have a metalearner, so pick the best-ranked one explicitly.
stacked_ensemble_ids = [
    model_id for model_id in model_ids if model_id.startswith('StackedEnsemble')
]

if stacked_ensemble_ids:
    se = h2o.get_model(stacked_ensemble_ids[0])
    metalearner_ref = se.metalearner()
    # Older h2o releases return a dict describing the metalearner (look it up by
    # name); newer releases return the metalearner model itself.
    if isinstance(metalearner_ref, dict):
        metalearner = h2o.get_model(metalearner_ref['name'])
    else:
        metalearner = metalearner_ref
    metalearner.std_coef_plot()
else:
    print('No stacked ensemble on the leaderboard; skipping the metalearner plot.')

# 4. Submission

In [None]:
# Upload the test predictors, score them with the AutoML object, and bring
# the predictions back from the cluster.
test_features_h2o = h2o.H2OFrame(test[feature_columns])
preds = aml.predict(test_features_h2o)
preds_df = h2o.as_list(preds)

In [None]:
# Attach the row identifiers positionally (prediction rows follow the test row order).
preds_df['id'] = test['id'].to_numpy()
# Use the 'p1' column as the submitted target. Select the columns explicitly:
# DataFrame.filter(items=...) silently ignores missing labels, so a missing 'p1'
# column would have produced an id-only file; indexing raises a KeyError instead.
# Columns are ordered id, target.
preds_df = preds_df.rename(columns={'p1': 'target'})[['id', 'target']]

In [None]:
# Write the submission file without the DataFrame index column.
preds_df.to_csv(path_or_buf='claim_prediction.csv', index=False)

# 5. Save model

In [None]:
# Persist the leading AutoML model under the given directory.
h2o.save_model(model=aml.leader, path='./suharkov_h2o_model_bin')