In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, filename) for filename in filenames]
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<h1 style="background-color:magenta; font-family:newtimeroman; font-size:220%; text-align:center; border-radius: 15px 50px;"> Auto-ML (Automated Machine Learning) </h1>

## Table of Contents:

#### 1. Introduction
#### 2. Reference
#### 3. Load Required Libraries
#### 4. Import Data
#### 5. EDA (Exploratory Data Analysis)
#### 6. H2O AutoML
 - **a) Importing AutoML**
 - **b) Initialize H2O**
 - **c) Loading Data**
 - **d) Preparing Dataset**
 - **e) Applying AutoML**
 - **f) Printing the Leaderboard**
 - **g) Predicting on Test Data**
 - **h) Printing Result**
 - **i) Printing the Ranking for All**
 - **j) Submission**

<h1 style="background-color:magenta; font-family:newtimeroman; font-size:220%; text-align:center; border-radius: 15px 50px;">1. Introduction </h1>

![H20](https://imgur.com/6pCgC8e.png)

## Automatic machine learning broadly includes the following steps:

- **Data preparation and Ingestion:** Real-world data can be raw or in any format. In this step, the data is converted into a format that can be processed easily. This also requires deciding the data type of each column in the dataset, and having a clear understanding of the task to be performed on the data (e.g. classification, regression, etc.).

- **Feature Engineering:** This includes various steps that are required for cleaning the dataset such as dealing with NULL /missing values, selecting the most important features of the dataset, and removing the low-correlational features, dealing with the skewed dataset.

- **Hyperparameter Optimization:** To obtain the best results from any model, AutoML needs to carefully tune the hyperparameter values.

- **Model Selection:** H2O AutoML trains a large number of models in order to produce the best results. It also trains stacked ensembles of those models to get the best performance out of the training data.

<h1 style="background-color:magenta; font-family:newtimeroman; font-size:220%; text-align:center; border-radius: 15px 50px;"> 2. Reference </h1>

 - https://www.kaggle.com/saurabhshahane/h2oautoml-template
 
 - https://www.kaggle.com/tunguz/apr-21-tps-h2o-automl
    
 - https://www.h2o.ai/products/h2o-automl/
    
 - https://www.kaggle.com/general/232139
    
 - https://www.analyticsvidhya.com/blog/2020/11/exploring-linear-regression-with-h20-automlautomated-machine-learning/
    
 - [Datacamp](https://www.datacamp.com/community/tutorials/h2o-automl?utm_source=adwords_ppc&utm_campaignid=1455363063&utm_adgroupid=65083631748&utm_device=c&utm_keyword=&utm_matchtype=b&utm_network=g&utm_adpostion=&utm_creative=278443377095&utm_targetid=aud-299261629574:dsa-429603003980&utm_loc_interest_ms=&utm_loc_physical_ms=9061992&gclid=Cj0KCQjw38-DBhDpARIsADJ3kjkBfUqGMOh6PhnfNl3Zz9gImsOb8LeECsnqP3RNl5n1CyaCpZ_aEvsaAkZnEALw_wcB)

<h1 style="background-color:magenta; font-family:newtimeroman; font-size:220%; text-align:center; border-radius: 15px 50px;"> 3. Load Required Libraries </h1>

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

<h1 style="background-color:magenta; font-family:newtimeroman; font-size:220%; text-align:center; border-radius: 15px 50px;"> 4. Import Data </h1>

In [None]:
# All competition files live in the same read-only input folder.
DATA_DIR = '/kaggle/input/tabular-playground-series-apr-2021'

train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
submission = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

In [None]:
# Preview the first three rows of each frame (train, test, sample submission).
display(
    train.head(3),
    test.head(3),
    submission.head(3),
)

In [None]:
# List the column labels of both frames so their schemas can be compared
# side by side.
print('Feature Names in Train:\n\n', train.columns)
print('\n\nFeature Names in Test:\n\n', test.columns)

In [None]:
# (rows, columns) of train, test and the sample submission, in that order.
display(
    train.shape,
    test.shape,
    submission.shape,
)

In [None]:
# EDA copies without the row identifier; the original frames are left untouched.
train_EDA = train.drop(columns='PassengerId')
test_EDA = test.drop(columns='PassengerId')

<h1 style="background-color:magenta; font-family:newtimeroman; font-size:220%; text-align:center; border-radius: 15px 50px;"> 5. EDA (Exploratory Data Analysis) </h1>

In [None]:
# Summary statistics of the numeric columns, train first and test second.
display(
    train_EDA.describe(),
    test_EDA.describe(),
)

In [None]:
# Class balance of the target: tabulate the counts, then plot them.
print('Number of Survivors and Non-Survivors:\n\n', train['Survived'].value_counts())

# Draw the plot directly instead of passing it to print(), which only emitted
# the Axes repr as noise above the figure.
ax = sns.countplot(x='Survived', data=train)
ax.set_title('Count plot - Survived')
plt.show()

In [None]:
# Passenger counts by sex: tabulate the counts, then plot them.
print('Number of Male and Female Passengers:\n\n', train['Sex'].value_counts())

# Draw the plot directly instead of passing it to print(), which only emitted
# the Axes repr as noise above the figure.
ax = sns.countplot(x='Sex', data=train)
ax.set_title('Count plot - Sex')
plt.show()

In [None]:
# Passenger counts by port of embarkation: tabulate the counts, then plot them.
print('Number of Embarked:\n\n', train['Embarked'].value_counts())

# Draw the plot directly instead of passing it to print(), which only emitted
# the Axes repr as noise above the figure.
ax = sns.countplot(x='Embarked', data=train)
ax.set_title('Count plot - Embarked')
plt.show()

In [None]:
# Two views of the target distribution side by side: share (pie) and count (bars).
f, ax = plt.subplots(1, 2, figsize=(18, 8))

# Left panel: percentage split; the second wedge is pulled out for emphasis.
train_EDA['Survived'].value_counts().plot.pie(explode=[0, 0.1], autopct='%1.1f%%', ax=ax[0], shadow=True)
ax[0].set_title('Pie plot - Survived')
ax[0].set_ylabel('')  # hide the default y-label that pandas adds to pie plots

# Right panel: absolute counts. The column must be passed as `x=`; recent
# seaborn releases no longer accept it as a positional argument.
sns.countplot(x='Survived', data=train_EDA, ax=ax[1])
ax[1].set_title('Count plot - Survived')
plt.show()

In [None]:
# Number of survivors in each passenger class (Survived is 0/1, so the sum is a count).
train_EDA.groupby('Pclass')[['Survived']].sum()

In [None]:
# Passenger class vs. survival, with row/column totals, shaded by magnitude.
(
    pd.crosstab(
        index=train_EDA['Pclass'],
        columns=train_EDA['Survived'],
        margins=True,
    )
    .style
    .background_gradient(cmap='cool')
)

In [None]:
# Sex vs. survival, with row/column totals, shaded by magnitude.
(
    pd.crosstab(
        index=train_EDA['Sex'],
        columns=train_EDA['Survived'],
        margins=True,
    )
    .style
    .background_gradient(cmap='summer_r')
)

In [None]:
# 2x2 grid of count plots showing how the port of embarkation relates to
# sex, survival and passenger class.
f, ax = plt.subplots(2, 2, figsize=(20, 15))

# The column is passed as `x=` throughout; recent seaborn releases no longer
# accept it as a positional argument.
sns.countplot(x='Embarked', data=train_EDA, ax=ax[0, 0])
ax[0, 0].set_title('(1) No. Of Passengers Boarded')

sns.countplot(x='Embarked', hue='Sex', data=train_EDA, ax=ax[0, 1])
ax[0, 1].set_title('(2) Male-Female Split for Embarked')

sns.countplot(x='Embarked', hue='Survived', data=train_EDA, ax=ax[1, 0])
ax[1, 0].set_title('(3) Embarked vs Survived')

sns.countplot(x='Embarked', hue='Pclass', data=train_EDA, ax=ax[1, 1])
ax[1, 1].set_title('(4) Embarked vs Pclass')

# Extra spacing so the panel titles do not collide with neighbouring axes.
plt.subplots_adjust(wspace=0.2, hspace=0.5)
plt.show()

In [None]:
# Pearson Correlation
# Select the numeric/boolean columns explicitly: calling .corr() on a frame
# that still contains string columns raises on pandas >= 2.0, whereas older
# versions silently dropped them.
numeric_features = train_EDA.select_dtypes(include=['number', 'bool'])

plt.figure(figsize=(8,6))
sns.heatmap(numeric_features.corr(method='pearson'), annot=True, cbar=False, linewidth=0.2, fmt='0.2f');

In [None]:
# Spearman Correlation
# Select the numeric/boolean columns explicitly: calling .corr() on a frame
# that still contains string columns raises on pandas >= 2.0, whereas older
# versions silently dropped them.
numeric_features = train_EDA.select_dtypes(include=['number', 'bool'])

plt.figure(figsize=(8,6))
sns.heatmap(numeric_features.corr(method='spearman'), annot=True, cbar=False, linewidth=0.2, fmt='0.2f');

In [None]:
# Pearson, Kendall and Spearman correlations of the numeric features,
# one heatmap per method, side by side.
fig, ax = plt.subplots(1, 3, figsize=(17 , 5))

feature_lst = ['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

# Only the shape of this matrix is used, to build the mask below.
corr = train_EDA[feature_lst].corr()

# Hide the upper triangle (diagonal included) because the matrix is symmetric.
# The builtin `bool` replaces `np.bool`, which was removed in NumPy 1.24.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True


for idx, method in enumerate(['pearson', 'kendall', 'spearman']):
    sns.heatmap(train_EDA[feature_lst].corr(method=method), ax=ax[idx],
            square=True, annot=True, fmt='.2f', center=0, linewidth=2,
            cbar=False, cmap=sns.diverging_palette(240, 10, as_cmap=True),
            mask=mask
           )
    ax[idx].set_title(f'{method.capitalize()} Correlation', loc='left', fontweight='bold')

plt.show()

<h1 style="background-color:magenta; font-family:newtimeroman; font-size:220%; text-align:center; border-radius: 15px 50px;"> 6. H2O AutoML </h1>

<h1 style="background-color:LimeGreen; font-family:newtimeroman; font-size:200%; text-align:left; "> a) Importing AutoML </h1>

#### First import H2O and AutoML package into the project

In [None]:
# import H2O and AutoML package
import h2o
from h2o.automl import H2OAutoML

<h1 style="background-color:LimeGreen; font-family:newtimeroman; font-size:200%; text-align:left; "> b) Initialize H2O </h1>

In [None]:
# Initialize h2o
# Every h2o call in the following cells relies on this having been run first.
# NOTE(review): nthreads=-1 presumably means "use all available cores" — confirm
# against the h2o.init documentation.
# NOTE(review): '16G' may exceed the RAM available to the session — verify
# against the notebook's resource limits.
h2o.init(
    nthreads=-1,     # number of threads when launching a new H2O server
    max_mem_size='16G'  # in gigabytes
)

<h1 style="background-color:LimeGreen; font-family:newtimeroman; font-size:200%; text-align:left; "> c) Loading Data </h1>

In [None]:
# Load the same competition CSVs again, this time as H2OFrames for AutoML.
DATA_DIR = '/kaggle/input/tabular-playground-series-apr-2021'

train_data = h2o.import_file(f'{DATA_DIR}/train.csv')
test_data = h2o.import_file(f'{DATA_DIR}/test.csv')

<h1 style="background-color:LimeGreen; font-family:newtimeroman; font-size:200%; text-align:left; "> d) Preparing Dataset </h1>

#### We need to decide on the feature columns and the prediction (response) column. Here the passenger attributes are the features and `Survived` is the column to predict.

In [None]:
# Identify predictors and response
x = ['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Ticket', 'Fare', 'Cabin', 'Embarked']
y = 'Survived'

In [None]:
# Split the data in 80:20 ratio for training and testing
# Split the H2O training frame 80:20 into a training part and a hold-out part.
# The results get their own names so the pandas `train` / `test` frames used
# by the EDA cells are not overwritten with H2OFrames, and the split is seeded
# so it is reproducible across runs.
# NOTE(review): the AutoML cell trains on the full `train_data`, so these two
# frames are currently unused — pass them to AutoML or drop this cell.
train_h2o, valid_h2o = train_data.split_frame(ratios=[0.8], seed=47)

In [None]:
# Preview the first rows of the H2O training frame.
train_data.head()

<h1 style="background-color:LimeGreen; font-family:newtimeroman; font-size:200%; text-align:left; "> e) Applying AutoML </h1>

#### Now, we are all set for applying AutoML on our dataset. The AutoML will run for a fixed amount of time set by us and give us the optimized model. We set up the AutoML using the following statement.

#### `max_models` specifies the maximum number of models that we want to train and compare

#### `max_runtime_secs` specifies the maximum time (in seconds) for which AutoML runs; `seed` makes the run reproducible

In [None]:
# H2O parses the 0/1 target as a numeric column, which would make AutoML fit a
# regression. Convert it to a categorical (factor) column so that binary
# classifiers are trained and `predict` returns class labels.
train_data[y] = train_data[y].asfactor()

# Run AutoML until either limit is reached: up to 10000 models or 30 minutes
# (1800 s) of total runtime.
aml = H2OAutoML(max_models=10000, seed=47, max_runtime_secs=1800)
aml.train(x=x, y=y, training_frame=train_data)

# Alternative, much longer configuration (8 hours, 10-fold CV, ranked by AUC):
#aml = H2OAutoML(nfolds=10, sort_metric='auc', stopping_metric='auc', max_runtime_secs=3600*8, seed=0)

<h1 style="background-color:LimeGreen; font-family:newtimeroman; font-size:200%; text-align:left; "> f) Printing the Leaderboard </h1>

#### When the AutoML processing completes, it creates a leaderboard ranking all the models that it has evaluated. To see the first 10 rows of the leaderboard, use the following code

In [None]:
# Binary classification, the default ranking metric is Area Under the ROC Curve (AUC).
# View the AutoML Leaderboard
# Keep a handle on the leaderboard frame and show its first rows.
lb = aml.leaderboard
lb.head()

In [None]:
# Get leaderboard with `extra_columns` = 'ALL'
# Rebinds `lb` to the extended leaderboard; later cells that use `lb` see this version.
lb = h2o.automl.get_leaderboard(aml, extra_columns = 'ALL')
lb

<h1 style="background-color:LimeGreen; font-family:newtimeroman; font-size:200%; text-align:left; "> g) Predicting on Test Data </h1>

#### Now that you have the models ranked, you can use the top-rated model to generate predictions on your test data. To do so, run the following code statement

In [None]:
# To generate predictions on a test set, you can make predictions
# directly on the `"H2OAutoML"` object or on the leader model
# object directly
# Here the call is made on the AutoML object itself; the result is kept in
# `preds` for the printing and submission cells.
preds = aml.predict(test_data)

<h1 style="background-color:LimeGreen; font-family:newtimeroman; font-size:200%; text-align:left; "> h) Printing Result </h1>

In [None]:
# Show the prediction frame returned by AutoML.
print(preds)

<h1 style="background-color:LimeGreen; font-family:newtimeroman; font-size:200%; text-align:left; "> i) Printing the Ranking for All </h1>

#### If you want to see the ranks of all the tested algorithms, run the following code statement

In [None]:
# Print all rows instead of default (10 rows)
# Entire leaderboard
# Ask head() for as many rows as the leaderboard has, so every entry is shown.
lb.head(rows=lb.nrows)

In [None]:
# The leader model is stored here
# Display the leader model as the cell output.
aml.leader

In [None]:
# Get the top model of leaderboard
# Get the top model of the leaderboard
se = aml.leader

# Get the metalearner of the top model. Only stacked ensembles have one, so
# guard against a leader of another model type instead of failing with an
# AttributeError.
metalearner = se.metalearner() if hasattr(se, 'metalearner') else None

# Older H2O releases return a dict holding the metalearner's model id, newer
# ones return the model object itself; resolve the dict form to a model.
if isinstance(metalearner, dict):
    metalearner = h2o.get_model(metalearner['name'])

if metalearner is None:
    print('The leader model is not a stacked ensemble, so it has no metalearner to inspect.')

# Contribution of each base learner to the ensemble (None when not applicable)
baselearner_importance = metalearner.varimp() if metalearner is not None else None
baselearner_importance

<h1 style="background-color:magenta; font-family:newtimeroman; font-size:220%; text-align:center; border-radius: 15px 50px;">7. Submission </h1>

# `preds` is an H2OFrame held by the H2O cluster, so it cannot be assigned to a
# pandas column directly. Pull the predicted column into pandas first.
predicted = preds['predict'].as_data_frame()['predict'].to_numpy()

# The sample submission and the test set must line up row for row.
assert len(predicted) == len(submission), 'prediction count does not match the submission template'

submission['Survived'] = predicted
submission.to_csv('submission.csv', index=False)
submission.head()