- reference : 
    - [Simple EDA + H2OAutoML](https://www.kaggle.com/mhslearner/simple-eda-h2oautoml)

- Most of this notebook is based on the article 'Simple EDA + H2OAutoML' referenced above.

In [None]:
## Importing Libraries
import os
import pandas as pd
import numpy as np
import datatable as dt  # pip install datatable
import matplotlib.pyplot as plt
from collections import Counter
import math 
%matplotlib inline
import seaborn as sns
sns.set_style('ticks')
import plotly.offline as py
py.init_notebook_mode(connected = True)
import plotly.graph_objs as go
import plotly.tools as tls
from sklearn.feature_selection import mutual_info_classif
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
import warnings
warnings.filterwarnings('ignore')
print("Packages Imported")

## Load the data

In [None]:
# Read the three competition CSVs (train, test, sample submission), in that order,
# into pandas DataFrames.
train_data, test_data, sample = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-oct-2021/train.csv',
        '../input/tabular-playground-series-oct-2021/test.csv',
        '../input/tabular-playground-series-oct-2021/sample_submission.csv',
    )
)

### Quick look at the Train data

In [None]:
# Preview the first five rows of the training frame.
train_data.head(n=5)

In [None]:
# Report the training frame's row/column counts and its total number of missing cells.
train_shape = train_data.shape
train_missing_total = sum(train_data.isna().sum())
print(f'Number of rows: {train_shape[0]}; \n  Number of columns: {train_shape[1]}; \n No of missing values: {train_missing_total};')

In [None]:
print('there is no missing values.')
# Per-column NaN counts for the training frame, smallest first.
train_missing_per_column = train_data.isna().sum(axis='index')
train_missing_per_column.sort_values()

In [None]:
print("Info about the train data: ")
# Tally how many training columns carry each dtype.
Counter(column_dtype for column_dtype in train_data.dtypes)

### Basic summary statistics

In [None]:
# Summary statistics of the training columns, colour-coded with the 'coolwarm' map.
train_summary = train_data.describe()
train_summary.style.background_gradient(cmap='coolwarm')

- min-max scaling is already employed.

In [None]:
## Correlation matrix
# DataFrame.corr defaults to method='pearson' and min_periods=1, which is what
# this cell relies on.
corrMatrix = train_data.corr()
corrMatrix

In [None]:
# Diverging palette centred on zero so positive and negative correlations get
# opposite hues; the colour scale is pinned to the full [-1, 1] range.
heatmap_palette = sns.diverging_palette(20, 220, n=200)
ax = sns.heatmap(corrMatrix, cmap=heatmap_palette, center=0, vmin=-1, vmax=1, square=True)
# Re-apply the existing x tick labels, rotated 45 degrees and right-aligned.
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right');

- It seems that there is no significant relationship between the target and the explanatory variables.

### Correlation with target

In [None]:
# Correlation of every column with the target column.
corr_targ = train_data.corrwith(train_data['target'])

# Rank by absolute correlation; position 0 is skipped (expected to be 'target'
# itself), so positions 1..10 are plotted.
top_abs_corr = corr_targ.abs().sort_values(ascending=False).iloc[1:11]
top_abs_corr.plot.bar(title='Top 10 abs corr features')

In [None]:
# Names of the ten columns ranked 1..10 by absolute correlation with the target
# (rank 0 is skipped, as in the plot).
top_abs_corr_index = corr_targ.abs().sort_values(ascending=False).iloc[1:11].index
print("Top 10 abs corr features : {}".format(top_abs_corr_index))

- Top 10 features : 'f22', 'f179', 'f69', 'f156', 'f58', 'f136', 'f214', 'f78', 'f8'

### Target column

In [None]:
print('percentage of target values: ')
# Share of rows per target class (class count divided by the number of rows),
# shown as a single-row table.
target_counts = train_data['target'].value_counts()
percent_value = pd.DataFrame(target_counts / len(train_data))
percent_value.transpose()

In [None]:
# visualization
countplt, ax = plt.subplots(figsize = (8, 5))
ax = sns.countplot(train_data['target'], palette = 'husl')

## Quick look at the Test dataset

In [None]:
# Preview the first five rows of the test frame.
test_data.head(n=5)

In [None]:
# Report the test frame's row/column counts and its total number of missing cells.
test_shape = test_data.shape
test_missing_total = sum(test_data.isna().sum())
print(f'Number of rows: {test_shape[0]}; \n  Number of columns: {test_shape[1]}; \n No of missing values: {test_missing_total};')

In [None]:
print('there is no missing values.')
# Per-column NaN counts, smallest first. This cell belongs to the test-data
# section, so it must inspect `test_data` (it previously re-checked `train_data`).
test_data.isna().sum(axis = 0).sort_values(ascending = True)

### Basic summary statistics for test data

In [None]:
# Summary statistics of the test columns, colour-coded with the 'coolwarm' map.
test_summary = test_data.describe()
test_summary.style.background_gradient(cmap='coolwarm')

In [None]:
# Number of columns in positions 1..285 of the training frame, divided by 5.
len(train_data.columns[1:286]) / 5

- It seems that the train data is similar to the test data.

In [None]:

fig, ax = plt.subplots(2, 2, figsize=(12, 8))

# One entry per panel: source frame, per-column statistic, target axes, title.
# Top row is the train data, bottom row the test data; the left column shows
# means and the right column standard deviations over columns 1..285.
panels = [
    (train_data, 'mean', ax[0, 0], 'the train data distribution in a view of feature means'),
    (train_data, 'std', ax[0, 1], 'the train data distribution in a view of of feature stds'),
    (test_data, 'mean', ax[1, 0], 'the test data distribution in a view of feature means'),
    (test_data, 'std', ax[1, 1], 'the test data  distribution in a view of feature stds'),
]
for frame, stat_name, panel_ax, panel_title in panels:
    per_column_stat = getattr(frame.iloc[:, 1:286], stat_name)(axis=0)
    per_column_stat.plot(ax=panel_ax, title=panel_title)

plt.tight_layout()

In [None]:
# Overlay the train (blue) and test (orange) distribution of every feature,
# one panel per feature.
features = train_data.iloc[:, 1: 286]
n_cols = 5
n_rows = math.ceil(features.shape[1] / n_cols)
# Build ONE figure whose grid matches the panels actually drawn. The previous
# version opened a stray empty figure, created a 9x6 grid and then drew on a
# conflicting 57x5 grid via plt.subplot, leaving overlapping axes.
# The height scales with the row count so each panel stays readable.
fig, axes = plt.subplots(n_rows, n_cols, figsize = (28, 2 * n_rows), squeeze = False)
panel_axes = axes.ravel()
for panel_ax, feature in zip(panel_axes, features):  # iterating a DataFrame yields its column names
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
    # consider migrating to sns.histplot(..., kde=True) once the version is confirmed.
    sns.distplot(train_data[feature], color='blue', kde=True, bins = 120, label = 'train', ax = panel_ax)
    sns.distplot(test_data[feature], color='orange', kde=True, bins = 120, label = 'test', ax = panel_ax)
# The label= arguments only become visible once a legend is drawn; one legend
# on the first panel is enough because the colours are the same everywhere.
panel_axes[0].legend()
# Hide grid cells that have no feature assigned (when the count is not a multiple of n_cols).
for unused_ax in panel_axes[features.shape[1]:]:
    unused_ax.set_visible(False)
plt.tight_layout()
plt.show()

## Feature Selection
### Train a basic RF classifier

In [None]:
# train a basic RF classifier
rf = RandomForestClassifier(n_estimators = 100, max_depth = 5, min_samples_leaf = 4, max_features = 0.2, n_jobs = -1, random_state = 1234)
rf.fit(train_data.drop(['id', 'target'], axis = 1), train_data.target)
print("Training Done")

### Random Forest feature importance

In [None]:
# This snippet is adapted from this notebook:
# https://www.kaggle.com/arthurtok/interactive-porto-insights-a-plot-ly-tutorial
# Column names of every training column except 'id' and 'target', in frame order.
features = train_data.drop(columns=['id', 'target']).columns.values
importances = rf.feature_importances_

# One marker per feature: x is the feature name, y (and the marker colour) its
# random-forest importance; hovering shows the feature name.
trace = go.Scatter(
    x=features,
    y=importances,
    text=features,
    mode='markers',
    marker={
        'sizemode': 'diameter',
        'sizeref': 1,
        'size': 13,
        'color': importances,
        'colorscale': 'Portland',
        'showscale': True,
    },
)
data = [trace]

layout = go.Layout(
    title='Random Forest Feature Importance',
    autosize=True,
    hovermode='closest',
    showlegend=False,
    xaxis={
        'ticklen': 5,
        'showgrid': False,
        'zeroline': False,
        'showline': False,
    },
    yaxis={
        'title': 'Feature Importance',
        'ticklen': 5,
        'gridwidth': 2,
        'showgrid': False,
        'zeroline': False,
    },
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='scatter2010')

- The result is similar to the earlier correlation analysis.
- f22 is by far the most important feature for predicting the target. f179 is the second most important, but its importance is not significant, and neither is that of the remaining features.


## Data reduction


## AutoML and Submission

- The data set is too large, so I sample the training data at roughly 1:10 for efficiency.

In [None]:
# Stratified ~1:10 down-sample of the training data.
# The previous version drew `sample_size` rows from EACH target class, which
# returns about twice the intended amount for a two-class target and raises if
# a class has fewer than `sample_size` rows. Sampling the same fraction from
# every class keeps the class balance and yields roughly `sample_size` rows in total.
SAMPLE_RATIO = 10
sample_size = int(train_data.shape[0]/SAMPLE_RATIO)  # approximate total number of rows kept
reduced_train = train_data.groupby('target', group_keys=False).apply(
    lambda x: x.sample(frac = 1 / SAMPLE_RATIO, random_state = 1234)
)

In [None]:
# install packages
import h2o
from h2o.automl import H2OAutoML
h2o.init() # h2o initialization

In [None]:
# Convert the down-sampled pandas training frame into an H2OFrame for AutoML.
train = h2o.H2OFrame(reduced_train)


In [None]:
# Convert the pandas test frame into an H2OFrame so the AutoML leader can score it.
test = h2o.H2OFrame(test_data)

In [None]:
y = "target"  # response column
# Predictors: every column except the response and the row identifier.
# `id` was previously left among the predictors; the random-forest cell above
# already drops it, so it is excluded here too for consistency.
x = [column for column in train.columns if column not in (y, "id")]
train[y] = train[y].asfactor()  # treat the response as categorical -> binary classification

In [None]:
%%time

auto_ml = H2OAutoML( 
    nfolds=5, # use 5 folds 
    seed = 1234,
    max_models = 10,
    include_algos = ["XGBoost" ,"StackedEnsemble","GBM"],
    max_runtime_secs=3600*2,  #time in sec , if set to much high value may give high score 
    stopping_metric='AUC'
    )
auto_ml.train(x=x, y=y, training_frame=train)

In [None]:
# check leaderboard
leader = auto_ml.leaderboard
leader

### Feature Importance

In [None]:
# Pick the best-ranked GBM from the leaderboard by model id instead of a
# hard-coded row number: the leaderboard order differs between runs, so row 7
# is not guaranteed to be a GBM (or to exist at all).
# H2O AutoML model ids start with the algorithm name (e.g. "GBM_1_AutoML_...").
leaderboard_model_ids = leader.as_data_frame()["model_id"]
gbm_model_ids = [model_id for model_id in leaderboard_model_ids if model_id.startswith("GBM")]
if not gbm_model_ids:
    raise ValueError("No GBM model found on the AutoML leaderboard")
model = h2o.get_model(gbm_model_ids[0])  # leaderboard rows are ranked best-first
model.varimp_plot()

### Model Correlation Heatmap

In [None]:
# Model correlation heatmap for the AutoML run.
# NOTE(review): the frame passed here is the training frame `train` — confirm
# that a held-out frame is not preferred.
mc_plot = auto_ml.model_correlation_heatmap(train)

### Learning Curve Plot


In [None]:
# Learning curve of `model`, the single model selected in the feature-importance cell.
learning_curve_plot = model.learning_curve_plot()

### Generate Prediction

In [None]:
# Score the test H2OFrame with the AutoML leader model.
preds = auto_ml.leader.predict(test)

In [None]:
# Peek at the first rows of the prediction frame.
print(preds.head())

In [None]:
## create submission
submission = pd.DataFrame({
    'id': test['id'].as_data_frame().id,
    'target': preds.as_data_frame().p1
})
submission.head()

In [None]:
# save submission
submission.to_csv('h2o_submission.csv', index=False)