# Hey Everyone !
### In this notebook we'll learn about AutoML using H2O.ai
### We'll start with basic EDA and visualization, followed by Feature Engineering and finally model building !!


## Lets get started !!

In [None]:
# importing some basic libraries
import pandas as pd
import numpy as np
import os
import math

In [None]:
## Read the train and test CSV files of the 30-days-of-ml dataset into DataFrames
train_data = pd.read_csv(filepath_or_buffer='../input/30-days-of-ml/train.csv')
test_data = pd.read_csv(filepath_or_buffer='../input/30-days-of-ml/test.csv')

# EDA

### For this problem we don't have much to explore, but let's see what we're dealing with

In [None]:
# Display summary statistics of the training data
train_data.describe()

In [None]:
# Print the column overview (dtypes and non-null counts) of the training data
train_data.info()

In [None]:
# Number of missing values per column in the training data
train_data.isnull().sum()

In [None]:
# Number of missing values per column in the test data
test_data.isnull().sum()

### It's nice that we don't need to deal with null values !

In [None]:
# Plotting setup: register pandas' matplotlib converters, then import the plotting libraries
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
# IPython magic: render matplotlib figures inline in the notebook output
%matplotlib inline
import seaborn as sns

### Plotting continuous features

In [None]:
# Scatter every continuous feature (cont0 .. cont13) against the target, one panel each.
# Panels are filled row by row in feature order and carry a title and axis labels,
# so each plot can be identified; grid slots left without a feature are hidden.
cont_cols = ['cont' + str(i) for i in range(14)]
n_grid_cols = 3
n_grid_rows = math.ceil(len(cont_cols) / n_grid_cols)

figure, axis = plt.subplots(n_grid_rows, n_grid_cols)
figure.set_figheight(25)
figure.set_figwidth(20)

# Flatten the 2-D grid of axes so it can be walked in row-major order
panels = axis.ravel()
for ax, col in zip(panels, cont_cols):
    ax.scatter(x=train_data[col], y=train_data['target'], s=.5)
    ax.set_title(col)
    ax.set_xlabel(col)
    ax.set_ylabel('target')

# Hide the grid slots that have no feature to show
for ax in panels[len(cont_cols):]:
    ax.set_visible(False)

# Keep titles and axis labels of neighbouring panels from overlapping
figure.tight_layout()


### Lets see how target is distributed

In [None]:
# Histogram of the 'target' column of the training data
sns.histplot(x='target', data=train_data)

### Plotting categorical features

In [None]:
# Scatter every categorical feature (cat0 .. cat9) against the target, one titled panel each.
# The 3 x 4 grid has more slots than features, so the unused slots are hidden.
cat_cols = ['cat' + str(i) for i in range(10)]

figure, axis = plt.subplots(3, 4)
figure.set_figheight(9)
figure.set_figwidth(12)

# Flatten the 2-D grid of axes so it can be walked in row-major order
panels = axis.ravel()
for ax, col in zip(panels, cat_cols):
    ax.scatter(x=train_data[col], y=train_data['target'], s=5)
    ax.set_title(col)

# Hide the grid slots that have no feature to show
for ax in panels[len(cat_cols):]:
    ax.set_visible(False)

# Keep the panel titles from overlapping the tick labels of the row above
figure.tight_layout()


# Feature Engineering

### Because we don't have labels for the columns, we can't really do much other than shoot in the dark. We could create new features by combining the available features, but for now let's just convert the categorical variables to numerical ones and move forward!

In [None]:
# Ordinal-encode the categorical columns cat0 .. cat9 on copies of the raw frames.
# For each column the codes are the positions of the sorted unique *training* values,
# and the same mapping is applied to the test frame so both frames agree.
traindf = train_data.copy()
testdf = test_data.copy()

for i in range(10):
    col = 'cat' + str(i)
    # level -> integer code, in sorted order of the levels seen in the training data
    codes = {level: code for code, level in enumerate(sorted(train_data[col].unique()))}

    # Fail loudly (and readably) if the test data holds a level the training data never
    # showed; silently mapping it would put NaN into the encoded column.
    unseen = set(testdf[col].unique()).difference(codes)
    if unseen:
        raise ValueError(
            "column %r of the test data contains categories not present in the "
            "training data: %r" % (col, sorted(unseen, key=str))
        )

    # Dictionary lookup per row instead of a linear list.index() search
    traindf[col] = traindf[col].map(codes)
    testdf[col] = testdf[col].map(codes)

In [None]:
# Annotated heatmap of the pairwise correlations between the columns of traindf
corr_matrix = traindf.corr()
sns.heatmap(
    corr_matrix,
    annot=True,
    cmap='magma',
    linewidths=0.2,
    annot_kws={'size': 5},
)

# Enlarge the current figure and its tick labels so the matrix stays readable
fig = plt.gcf()
fig.set_size_inches(14, 10)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.show()

# Model Training 

### Alright, lets get into AutoML

## What is AutoML ? 
## AutoML automates the tedious manual task of exploring different models for our training data.
## AutoML can search over a huge model space to find the model that best fits our task

In [None]:
# Import H2O and its AutoML class, initialise H2O, and convert the pandas frame to an H2OFrame
import h2o
from h2o.automl import H2OAutoML

h2o.init()

# 'id' is dropped so it is not offered to the models as a predictor
hf = h2o.H2OFrame(traindf.drop(columns=['id']))

# Response column, and every remaining column as a predictor
y_col = "target"
x_col = [name for name in hf.columns if name != y_col]

### Commenting out this part as it takes a while to run

In [None]:
# Starting the search for best model 
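# We want to cap the search at 30 minutes and 50 models, and rank the models by RMSE.
# NOTE: the call below only sets seed and sort_metric; pass max_runtime_secs=30*60 and max_models=50 to actually enforce those limits.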

# aml = H2OAutoML(seed=1,sort_metric='rmse')
# aml.train(x=x_col, y=y_col, training_frame=hf)

In [None]:
# aml.leaderboard

In [None]:
# leaderboard=aml.leaderboard.as_data_frame(use_pandas=True)
# leaderboard.to_csv('h2o_leaderboard.csv')

In [None]:
# Load a previously saved leaderboard CSV and display it
# (presumably the one written by the disabled `leaderboard.to_csv` cell above - TODO confirm).
leaderboard=pd.read_csv('../input/aml-leaderboard/h2o_leaderboard (3).csv')
leaderboard
# Disabled: build the blend frames by adding the predictions of the first 10
# leaderboard models as columns pred0 .. pred9 to copies of traindf / testdf.
# NOTE(review): h2o.get_model looks models up by id, so this presumably requires the
# AutoML run to have happened in the current H2O session - confirm before re-enabling.
# NOTE(review): the train-side predictions are made on `hf`, the same frame the models
# were trained on, so the pred* columns in `blend` are in-sample predictions.
# model_list=leaderboard['model_id'].values
# blend=traindf.copy()
# blend_test=testdf.copy()
# thf = h2o.H2OFrame(testdf.drop(columns=['id']))
# for i in range(10):
#     model=h2o.get_model(model_list[i])
#     preds=model.predict(hf)
#     preds=preds.as_data_frame(use_pandas=True)
#     preds_test=model.predict(thf)
#     preds_test=preds_test.as_data_frame(use_pandas=True)
#     blend["pred"+str(i)]=preds['predict']
#     blend_test["pred"+str(i)]=preds_test['predict']

In [None]:
# blend.to_csv('blend.csv')
# blend_test.to_csv('blend_test.csv')

### Reading the saved blend data

In [None]:
# Read the saved blend frames for the train and test sets
blend = pd.read_csv(filepath_or_buffer='../input/30-days-of-ml-blend/blend.csv')
blend_test = pd.read_csv(filepath_or_buffer='../input/30-days-of-ml-blend/blend_test.csv')

In [None]:
# Columns for the second-level model: 'target' itself plus every column whose name contains "pred"
pred_cols = [name for name in blend.columns if name == 'target' or "pred" in name]

### We could use Linear Regression,Xgboost or some other regressor. But just for fun I again used the H2O AutoML to find best model !

In [None]:
# Disabled alternative: fit a plain regressor on the blend predictions instead of AutoML.
# from sklearn.linear_model import LinearRegression
# from xgboost import XGBRegressor
# lr=XGBRegressor()

# NOTE: pred_cols also contains 'target', so it must be removed from the feature list;
# otherwise the label is used as a feature, and blend_test (built from the test set,
# which presumably has no 'target' column - TODO confirm) cannot be indexed with it.
# feature_cols=[col for col in pred_cols if col!='target']
# lr.fit(blend[feature_cols],blend['target'])
# preds=lr.predict(blend_test[feature_cols])
# submission = pd.DataFrame({'id':blend_test['id'],'target':preds})
# submission.to_csv('submission_auto.csv',index = False)

In [None]:
# hf = h2o.H2OFrame(blend[pred_cols])
# y_col = "target"
# x_col =[ col for col in hf.columns if col!="target" ]

In [None]:
# aml = H2OAutoML(seed=1,sort_metric='rmse')
# aml.train(x=x_col, y=y_col, training_frame=hf)

In [None]:
# preds=aml.leader.predict(h2o.H2OFrame(blend_test[x_col]))
# preds=preds.as_data_frame(use_pandas=True)
# submission = pd.DataFrame({'id':blend_test['id'],'target':preds['predict']})
# submission.to_csv('submission_auto.csv',index = False)

# Conclusion

### The aim of this notebook is to expose newbies like me to AutoML. I got a better score using tuned XGBoost and LGBM models. Comment below if you have any doubts or questions!!
### I Hope you guys had fun reading this one !