# Tabular Playground Series - Jul 2021




<img src="https://storage.googleapis.com/kaggle-competitions/kaggle/25225/logos/header.png?t=2021-01-27-17-34-26" width="800px">


### Data Description
In this competition you are predicting the values of air pollution measurements over time, based on basic weather information (temperature and humidity) and the input values of 5 sensors.

The three target values for you to predict are: target_carbon_monoxide, target_benzene, and target_nitrogen_oxides.

### Files
* train.csv - the training data, including the weather data, sensor data, and values for the 3 targets
* test.csv - the same format as train.csv, but without the target value; your task is to predict the value for each of these targets.
* sample_submission.csv - a sample submission file in the correct format.


#### Link 

[Here](https://www.kaggle.com/c/tabular-playground-series-jul-2021/overview)

In [None]:
!pip install sweetviz

In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Read the competition's train table, test table and sample submission, in that order,
# from the Kaggle input directory.
df_train, df_test, df_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/tabular-playground-series-jul-2021/train.csv',
        '/kaggle/input/tabular-playground-series-jul-2021/test.csv',
        '/kaggle/input/tabular-playground-series-jul-2021/sample_submission.csv',
    )
)

In [None]:
# Preview the first rows of the training table.
df_train.head()

In [None]:
# Preview the first rows of the test table.
df_test.head()

In [None]:
# Preview the first rows of the sample submission file.
df_submission.head()

In [None]:
# Report the (rows, columns) shape of each loaded table.
print('Size of train dataset:', df_train.shape)
# The label previously read 'est dataset'; corrected to 'test dataset'.
print('\nSize of test dataset:', df_test.shape)
print('\n\nSize submission:\n\n', df_submission.shape)


In [None]:
# Count missing entries per column in each table (isna is the pandas alias of isnull).
print(
    'Missing Values in train:\n\n',
    df_train.isna().sum(),
)
print(
    '\n\nMissing Values in test:\n\n',
    df_test.isna().sum(),
)
print(
    '\n\nMissing Values submission:\n\n',
    df_submission.isna().sum(),
)

In [None]:
# DataFrame.info() writes its report straight to stdout and returns None, so wrapping it in
# print() emitted the report *before* the heading and then printed a stray "None".
# Print each heading first, then call info() as its own statement.
print('Info train ')
df_train.info()
print('\n\nInfo test:\n\n')
df_test.info()
print('\n\nInfo submission:\n\n')
df_submission.info()

In [None]:
# Show the dtype of every column in each table.
print(
    'Data Type train ',
    df_train.dtypes,
)
print(
    '\n\n Data Type test:\n\n',
    df_test.dtypes,
)
print(
    '\n\n Data Type submission:\n\n',
    df_submission.dtypes,
)

In [None]:
# DataFrame.size is the total number of cells (rows * columns) in each table.
print(
    'Data Size  train ',
    df_train.size,
)
print(
    '\n\n Data Size test :\n\n',
    df_test.size,
)
print(
    '\n\n Data Size submission:\n\n',
    df_submission.size,
)

In [None]:
 # Summary statistics of the training table, transposed so each described column is a row.
 df_train.describe().T

In [None]:
 # Summary statistics of the test table, transposed so each described column is a row.
 df_test.describe().T

# Exploratory Data Analysis with Sweetviz

Exploratory Data Analysis refers to the critical process of performing initial investigations on data to discover patterns, to spot anomalies, to test hypotheses, and to check assumptions with the help of summary statistics and graphical representations. When you want to build any model in Machine Learning you first need to understand the dataset. You need to get a sense of the data before getting your hands dirty.

<img src="https://miro.medium.com/max/2000/0*qnjfh5ioEMlMWgUZ.png" width="800px">



##### Link
[Here](https://pypi.org/project/sweetviz/)

In [None]:
import sweetviz as sv

In [None]:
# Build a Sweetviz report for the training table, render it inline, and also save it as HTML.
Tr_report1 = sv.analyze(source=df_train)
Tr_report1.show_notebook(h="full", w="80%")
Tr_report1.show_html(filepath='Tr_report1.html')

In [None]:
# Build a Sweetviz report for the test table, render it inline, and also save it as HTML.
Te_report = sv.analyze(source=df_test)
Te_report.show_notebook(h="full", w="80%")
Te_report.show_html(filepath='Te_report.html')

In [None]:
# Side-by-side Sweetviz comparison of the train and test tables, labelled 'TRAIN' and 'TEST'.
# data1/data2 are plain aliases of df_train/df_test (assignment only, no copy is made).
data1=df_train
data2=df_test
report_comp=sv.compare([data1,'TRAIN'],[data2,'TEST'])
# Render inline in the notebook and also write the report to an HTML file.
report_comp.show_notebook(w="80%", h="full")
report_comp.show_html('Compare_Results.html')

In [None]:
# Build a Sweetviz report for the sample submission, render it inline, and also save it as HTML.
Sub_report = sv.analyze(source=df_submission)
Sub_report.show_notebook(h="full", w="80%")
Sub_report.show_html(filepath='Sub_report.html')

In [None]:
# Pairwise scatter matrix of the training table with KDE curves on the diagonal;
# the lower triangle is then overlaid with 4-level KDE contours in dark grey.
G = sns.pairplot(data=df_train, diag_kind="kde")
G.map_lower(sns.kdeplot, color=".2", levels=4)

In [None]:
# Distribution of deg_C in the training data; hist_kws styles the histogram bars (solid green).
sns.distplot(
    df_train['deg_C'],
    hist_kws=dict(linewidth=3, alpha=1, color="g"),
);

In [None]:
# Distribution of relative_humidity in the training data; hist_kws styles the histogram bars (solid green).
sns.distplot(
    df_train['relative_humidity'],
    hist_kws=dict(linewidth=3, alpha=1, color="g"),
);

In [None]:
# Distribution of absolute_humidity in the training data; hist_kws styles the histogram bars (solid green).
sns.distplot(
    df_train['absolute_humidity'],
    hist_kws=dict(linewidth=3, alpha=1, color="g"),
);

In [None]:
# Distribution of sensor_1 in the training data; hist_kws styles the histogram bars (solid green).
sns.distplot(
    df_train['sensor_1'],
    hist_kws=dict(linewidth=3, alpha=1, color="g"),
);

In [None]:
# Distribution of sensor_2 in the training data; hist_kws styles the histogram bars (solid green).
sns.distplot(
    df_train['sensor_2'],
    hist_kws=dict(linewidth=3, alpha=1, color="g"),
);

In [None]:
# Distribution of sensor_3 in the training data; hist_kws styles the histogram bars (solid green).
sns.distplot(
    df_train['sensor_3'],
    hist_kws=dict(linewidth=3, alpha=1, color="g"),
);

In [None]:
# Distribution of sensor_4 in the training data; hist_kws styles the histogram bars (solid green).
sns.distplot(
    df_train['sensor_4'],
    hist_kws=dict(linewidth=3, alpha=1, color="g"),
);

In [None]:
# Distribution of sensor_5 in the training data; hist_kws styles the histogram bars (solid green).
sns.distplot(
    df_train['sensor_5'],
    hist_kws=dict(linewidth=3, alpha=1, color="g"),
);

In [None]:
# Distribution of the target_carbon_monoxide target; hist_kws styles the histogram bars (solid green).
sns.distplot(
    df_train['target_carbon_monoxide'],
    hist_kws=dict(linewidth=3, alpha=1, color="g"),
);

In [None]:
# Distribution of the target_benzene target; hist_kws styles the histogram bars (solid green).
sns.distplot(
    df_train['target_benzene'],
    hist_kws=dict(linewidth=3, alpha=1, color="g"),
);

In [None]:
# Distribution of the target_nitrogen_oxides target; hist_kws styles the histogram bars (solid green).
# The trailing semicolon suppresses the Axes repr in the cell output, matching the other
# distribution cells in this notebook.
sns.distplot(df_train['target_nitrogen_oxides'],hist_kws={ "linewidth": 3,"alpha": 1, "color": "g"});

In [None]:
# One stacked line subplot per plottable column of the training table, drawn in green.
df_train.plot(
    subplots=True,
    figsize=(11, 25),
    color="g",
    linewidth=0.8,
)
# Clear the x-axis label on the current (last) axes, then render the figure.
plt.xlabel('')
plt.show()

In [None]:
# One stacked line subplot per plottable column of the test table, drawn in green.
df_test.plot(
    subplots=True,
    figsize=(11, 25),
    color="g",
    linewidth=0.8,
)
# Clear the x-axis label on the current (last) axes, then render the figure.
plt.xlabel('')
plt.show()

In [None]:
# List the column labels of the training table.
df_train.columns

In [None]:
# Pairwise correlations between deg_C and the three target columns.
df_train.loc[
    :,
    ['deg_C', 'target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides'],
].corr()

In [None]:
# Pairwise correlations between relative_humidity and the three target columns.
df_train.loc[
    :,
    ['relative_humidity', 'target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides'],
].corr()

In [None]:
# Pairwise correlations between sensor_1 and the three target columns.
df_train.loc[
    :,
    ['sensor_1', 'target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides'],
].corr()

In [None]:
# Pairwise correlations between sensor_2 and the three target columns.
df_train.loc[
    :,
    ['sensor_2', 'target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides'],
].corr()

In [None]:
# Pairwise correlations between sensor_3 and the three target columns.
df_train.loc[
    :,
    ['sensor_3', 'target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides'],
].corr()

In [None]:
# Pairwise correlations between sensor_4 and the three target columns.
df_train.loc[
    :,
    ['sensor_4', 'target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides'],
].corr()

In [None]:
# Pairwise correlations between sensor_5 and the three target columns.
df_train.loc[
    :,
    ['sensor_5', 'target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides'],
].corr()

In [None]:
# Full correlation matrix of the training table.
# At this point the frame still holds the string date_time column (it is only dropped in a
# later cell), and corr() on non-numeric data raises on pandas >= 2.0, so restrict the input
# to numeric columns explicitly. select_dtypes works on old and new pandas alike.
df_train.select_dtypes(include='number').corr()

In [None]:
# Full correlation matrix of the test table.
# At this point the frame still holds the string date_time column (it is only dropped in a
# later cell), and corr() on non-numeric data raises on pandas >= 2.0, so restrict the input
# to numeric columns explicitly. select_dtypes works on old and new pandas alike.
df_test.select_dtypes(include='number').corr()

In [None]:
# Annotated heatmap of the training-table correlation matrix.
plt.figure(figsize=(15,8))
# Only numeric columns are correlated: the string date_time column is still present here and
# would make corr() raise on pandas >= 2.0.
sns.heatmap(df_train.select_dtypes(include='number').corr(), annot=True)

In [None]:
# Annotated heatmap of the test-table correlation matrix.
plt.figure(figsize=(15,8))
# Only numeric columns are correlated: the string date_time column is still present here and
# would make corr() raise on pandas >= 2.0.
sns.heatmap(df_test.select_dtypes(include='number').corr(), annot=True)

In [None]:
# Remove the date_time column from both tables, in place, so it is not used as a model feature.
# errors='ignore' makes the cell safe to re-run: without it a second execution raises KeyError
# because the column is already gone.
df_train.drop('date_time', axis=1, inplace=True, errors='ignore')
df_test.drop('date_time', axis=1, inplace=True, errors='ignore')

In [None]:
# Persist both tables (without the pandas index) to the working directory;
# these files are read back later via h2o.import_file.
df_train.to_csv(path_or_buf='df_train.csv', index=False)
df_test.to_csv(path_or_buf='df_test.csv', index=False)

In [None]:
# Display the training table as it stands at this point (when the notebook is run in order,
# date_time has already been dropped by an earlier cell).
df_train

# H2O’s AutoML
H2O’s AutoML can be used for automating the machine learning workflow, which includes automatic training and tuning of many models within a user-specified time limit.


We used 3 models.
* Automatic Machine Learning (AutoML)
* Gradient Boosting Machine (GBM)
* XGBoost 

<img src="https://docs.h2o.ai/h2o/latest-stable/h2o-docs/_images/h2o-automl-logo.jpg" width="600px">



#### Link 
[Here](https://h2o-release.s3.amazonaws.com/h2o/master/3888/docs-website/h2o-docs/automl.html)

### AutoML: Automatic Machine Learning

In [None]:
import h2o
from h2o.automl import H2OAutoML

In [None]:
# Start (or connect to) an H2O server.
#   nthreads=-1     -> number of threads when launching a new H2O server
#   max_mem_size=12 -> memory limit, in gigabytes
h2o.init(nthreads=-1, max_mem_size=12)

In [None]:
# Load the CSVs written earlier in this notebook, plus the sample submission, as H2O frames.
train = h2o.import_file(path="./df_train.csv")
test = h2o.import_file(path="./df_test.csv")
submission = h2o.import_file(
    path="../input/tabular-playground-series-jul-2021/sample_submission.csv"
)

In [None]:
# Preview the first rows of the H2O training frame.
train.head()

In [None]:
# Preview the first rows of the H2O test frame.
test.head()

In [None]:
# Preview the first rows of the H2O sample-submission frame.
submission.head()

In [None]:
# Pull the H2O training frame back into pandas, then build one filtered copy per target
# that keeps only the rows where that target is present.
train_as_df = h2o.as_list(train, use_pandas=True)
train_as_df_1 = train_as_df.dropna(subset=['target_carbon_monoxide'])
train_as_df_2 = train_as_df.dropna(subset=['target_benzene'])
train_as_df_3 = train_as_df.dropna(subset=['target_nitrogen_oxides'])

In [None]:
# Percentage of missing values per column (2 decimals, largest first) for the
# carbon-monoxide-filtered frame. Author's note: no missing values needed treatment here.

(train_as_df_1.isna().sum() * 100 / len(train_as_df_1)).round(2).sort_values(ascending=False)

In [None]:
# Percentage of missing values per column (2 decimals, largest first) for the benzene-filtered frame.
(train_as_df_2.isna().sum() * 100 / len(train_as_df_2)).round(2).sort_values(ascending=False)

In [None]:
# Percentage of missing values per column (2 decimals, largest first) for the nitrogen-oxides-filtered frame.
(train_as_df_3.isna().sum() * 100 / len(train_as_df_3)).round(2).sort_values(ascending=False)

In [None]:
# Upload the filtered pandas frame to H2O and name the three response columns.
train_1 = h2o.H2OFrame(train_as_df_1)
y_1, y_2, y_3 = 'target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides'
# Predictors: every column of train_1 except the three targets, in the frame's column order.
x = [column for column in train_1.columns if column not in (y_1, y_2, y_3)]


In [None]:
# AutoML run for target_carbon_monoxide (y_1): up to 10 models, fixed seed.
aml1 = H2OAutoML(max_models=10, seed=1)
aml1.train(x=x, y=y_1, training_frame=train_1)
print(aml1)

# Predict on the H2O test frame; pre_aml1 is used later to build the submission file.
pre_aml1=aml1.predict(test)
print(pre_aml1)

# Leaderboard of the trained models.
# NOTE(review): the result of lb.head(...) is discarded (it is not the cell's last expression),
# and lb is immediately rebound to the extended leaderboard below.
lb = aml1.leaderboard
lb.head(rows=lb.nrows)
lb = h2o.automl.get_leaderboard(aml1, extra_columns = 'ALL')
print(lb)

# Fetch the first leaderboard model whose id contains "AllModels"
# (presumably the all-models stacked ensemble — raises IndexError if none exists).
model_ids = list(aml1.leaderboard['model_id'].as_data_frame().iloc[:,0])
se = h2o.get_model([mid for mid in model_ids if "AllModels" in mid][0])
print(se)

# Look up the ensemble's metalearner and show its normalized coefficients.
metalearner = h2o.get_model(se.metalearner()['name'])
print(metalearner)
metalearner.coef_norm()

In [None]:
# AutoML run for target_benzene (y_2): up to 10 models, fixed seed.
# NOTE(review): this trains on train_1, which was built from the carbon-monoxide-filtered
# frame; train_as_df_2 (filtered on target_benzene) is never used — confirm this is intended.
aml2 = H2OAutoML(max_models=10, seed=1)
aml2.train(x=x, y=y_2, training_frame=train_1)
print(aml2)

# Predict on the H2O test frame; pre_aml2 is used later to build the submission file.
pre_aml2=aml2.predict(test)
print(pre_aml2)

# Leaderboard of the trained models.
# NOTE(review): the result of lb.head(...) is discarded (it is not the cell's last expression),
# and lb is immediately rebound to the extended leaderboard below.
lb = aml2.leaderboard
lb.head(rows=lb.nrows)
lb = h2o.automl.get_leaderboard(aml2, extra_columns = 'ALL')
print(lb)

# Fetch the first leaderboard model whose id contains "AllModels"
# (presumably the all-models stacked ensemble — raises IndexError if none exists).
model_ids = list(aml2.leaderboard['model_id'].as_data_frame().iloc[:,0])
se = h2o.get_model([mid for mid in model_ids if "AllModels" in mid][0])
print(se)

# Look up the ensemble's metalearner and show its normalized coefficients.
metalearner = h2o.get_model(se.metalearner()['name'])
print(metalearner)
metalearner.coef_norm()

In [None]:
# AutoML run for target_nitrogen_oxides (y_3): up to 10 models, fixed seed.
# NOTE(review): this trains on train_1, which was built from the carbon-monoxide-filtered
# frame; train_as_df_3 (filtered on target_nitrogen_oxides) is never used — confirm this is intended.
aml3 = H2OAutoML(max_models=10, seed=1)
aml3.train(x=x, y=y_3, training_frame=train_1)
print(aml3)

# Predict on the H2O test frame; pre_aml3 is used later to build the submission file.
pre_aml3=aml3.predict(test)
print(pre_aml3)

# Leaderboard of the trained models.
# NOTE(review): the result of lb.head(...) is discarded (it is not the cell's last expression),
# and lb is immediately rebound to the extended leaderboard below.
lb = aml3.leaderboard
lb.head(rows=lb.nrows)
lb = h2o.automl.get_leaderboard(aml3, extra_columns = 'ALL')
print(lb)

# Fetch the first leaderboard model whose id contains "AllModels"
# (presumably the all-models stacked ensemble — raises IndexError if none exists).
model_ids = list(aml3.leaderboard['model_id'].as_data_frame().iloc[:,0])
se = h2o.get_model([mid for mid in model_ids if "AllModels" in mid][0])
print(se)

# Look up the ensemble's metalearner and show its normalized coefficients.
metalearner = h2o.get_model(se.metalearner()['name'])
print(metalearner)
metalearner.coef_norm()

In [None]:
# Write the H2O `submission` frame to Sub_Mission.csv, overwriting any existing file.
# NOTE(review): `submission` here is still the sample submission as imported; no predictions
# have been written into it in the visible cells — confirm this export is actually needed.
h2o.export_file(submission, path = "Sub_Mission.csv", force = True)

# Gradient Boosting Machine (GBM)


In [None]:
import h2o
from h2o.estimators import H2OGradientBoostingEstimator

In [None]:
# Re-create the H2O training frame and the response / predictor names for the GBM section.
train_1 = h2o.H2OFrame(train_as_df_1)
y_1, y_2, y_3 = 'target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides'
# Predictors: every column of train_1 except the three targets, in the frame's column order.
x = [feature for feature in train_1.columns if feature not in (y_1, y_2, y_3)]

In [None]:
# Gradient boosting model for target_carbon_monoxide (y_1) with 10-fold cross-validation, fixed seed.
GBM1 = H2OGradientBoostingEstimator(nfolds=10, seed=1)
GBM1.train(x=x, y=y_1, training_frame=train_1)
print(GBM1)
# Predict on the H2O test frame; pre_GBM1 is used later to build the submission file.
pre_GBM1=GBM1.predict(test)
print(pre_GBM1)

# Eval performance:
# NOTE(review): model_performance() is called without test data, so this presumably reports
# training metrics rather than held-out or cross-validation metrics — confirm against the H2O docs.
performance = GBM1.model_performance()

print(performance)

In [None]:
# Gradient boosting model for target_benzene (y_2) with 10-fold cross-validation, fixed seed.
GBM2 = H2OGradientBoostingEstimator(nfolds=10, seed=1)
GBM2.train(x=x, y=y_2, training_frame=train_1)
print(GBM2)
# Predict on the H2O test frame; pre_GBM2 is used later to build the submission file.
pre_GBM2=GBM2.predict(test)
print(pre_GBM2)

# Eval performance:
# NOTE(review): model_performance() is called without test data, so this presumably reports
# training metrics rather than held-out or cross-validation metrics — confirm against the H2O docs.
performance = GBM2.model_performance()

print(performance)

In [None]:
# Gradient boosting model for target_nitrogen_oxides (y_3) with 10-fold cross-validation, fixed seed.
GBM3 = H2OGradientBoostingEstimator(nfolds=10, seed=1)
GBM3.train(x=x, y=y_3, training_frame=train_1)
print(GBM3)
# Predict on the H2O test frame; pre_GBM3 is used later to build the submission file.
pre_GBM3=GBM3.predict(test)
print(pre_GBM3)

# Eval performance:
# NOTE(review): model_performance() is called without test data, so this presumably reports
# training metrics rather than held-out or cross-validation metrics — confirm against the H2O docs.
performance = GBM3.model_performance()

print(performance)

# XGBoost 

In [None]:
# Bring in H2O's XGBoost estimator for this section.
from h2o.estimators import H2OXGBoostEstimator
# NOTE(review): h2o.init() was already called earlier with explicit nthreads/max_mem_size;
# this second call presumably just re-attaches to the running server — confirm it is needed.
h2o.init()

In [None]:
# Re-create the H2O training frame and the response / predictor names for the XGBoost section.
train_1 = h2o.H2OFrame(train_as_df_1)
y_1, y_2, y_3 = 'target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides'
# Predictors: every column of train_1 except the three targets, in the frame's column order.
x = [name for name in train_1.columns if name not in (y_1, y_2, y_3)]

In [None]:
# XGBoost (DART booster, tree-level normalization, fixed seed) for target_carbon_monoxide (y_1).
xgb_1 = H2OXGBoostEstimator(booster='dart',normalize_type="tree",seed=1)
xgb_1.train(x=x, y=y_1, training_frame=train_1)
print(xgb_1)
# Predict on the H2O test frame; pre_xgb_1 is used later to build the submission file.
pre_xgb_1=xgb_1.predict(test)
print(pre_xgb_1)

# Eval performance.
# NOTE(review): no test data is passed to model_performance(), so despite the earlier
# "on a test set" wording this presumably reports training metrics — confirm against the H2O docs.
performance = xgb_1.model_performance()

print(performance)


# Extract feature interactions:
feature_interactions = xgb_1.feature_interaction()
print(feature_interactions)

In [None]:
# XGBoost (DART booster, tree-level normalization, fixed seed) for target_benzene (y_2).
xgb_2 = H2OXGBoostEstimator(booster='dart',normalize_type="tree",seed=1)
xgb_2.train(x=x, y=y_2, training_frame=train_1)
# Print the model trained in THIS cell (the original printed xgb_1, a copy-paste slip).
print(xgb_2)
# Predict on the H2O test frame; pre_xgb_2 is used later to build the submission file.
pre_xgb_2=xgb_2.predict(test)
print(pre_xgb_2)

# Eval performance.
# NOTE(review): no test data is passed to model_performance(), so this presumably reports
# training metrics rather than test-set metrics — confirm against the H2O docs.
performance = xgb_2.model_performance()

print(performance)


# Extract feature interactions:
feature_interactions = xgb_2.feature_interaction()
print(feature_interactions)

In [None]:
# XGBoost (DART booster, tree-level normalization, fixed seed) for target_nitrogen_oxides (y_3).
xgb_3 = H2OXGBoostEstimator(booster='dart',normalize_type="tree",seed=1)
xgb_3.train(x=x, y=y_3, training_frame=train_1)
print(xgb_3)
# Predict on the H2O test frame; pre_xgb_3 is used later to build the submission file.
pre_xgb_3=xgb_3.predict(test)
print(pre_xgb_3)

# Eval performance.
# NOTE(review): no test data is passed to model_performance(), so despite the earlier
# "on a test set" wording this presumably reports training metrics — confirm against the H2O docs.
performance = xgb_3.model_performance()

print(performance)


# Extract feature interactions:
feature_interactions = xgb_3.feature_interaction()
print(feature_interactions)

# Submission

In [None]:
# Re-read the raw test file to recover date_time (it was dropped from df_test earlier),
# then assemble the AutoML submission: one column per target, taken from each model's
# 'predict' column and aligned on the row index.
df_test_new = pd.read_csv('/kaggle/input/tabular-playground-series-jul-2021/test.csv')
submission = pd.DataFrame({
    'date_time': df_test_new['date_time'],
    'target_carbon_monoxide': pre_aml1.as_data_frame()['predict'],
    'target_benzene': pre_aml2.as_data_frame()['predict'],
    'target_nitrogen_oxides': pre_aml3.as_data_frame()['predict'],
})

submission.head()

In [None]:
# Write the AutoML submission to disk without the pandas index.
submission.to_csv(path_or_buf='Sample_submission_aml.csv', index=False)

In [None]:
# Re-read the raw test file to recover date_time (it was dropped from df_test earlier),
# then assemble the GBM submission: one column per target, taken from each model's
# 'predict' column and aligned on the row index.
df_test_new = pd.read_csv('/kaggle/input/tabular-playground-series-jul-2021/test.csv')
submission = pd.DataFrame({
    'date_time': df_test_new['date_time'],
    'target_carbon_monoxide': pre_GBM1.as_data_frame()['predict'],
    'target_benzene': pre_GBM2.as_data_frame()['predict'],
    'target_nitrogen_oxides': pre_GBM3.as_data_frame()['predict'],
})

submission.head()

In [None]:
# Write the GBM submission to disk without the pandas index.
submission.to_csv(path_or_buf='Sample_submission_GBM3.csv', index=False)

In [None]:
# Re-read the raw test file to recover date_time (it was dropped from df_test earlier),
# then assemble the XGBoost submission: one column per target, taken from each model's
# 'predict' column and aligned on the row index.
df_test_new = pd.read_csv('/kaggle/input/tabular-playground-series-jul-2021/test.csv')
submission = pd.DataFrame({
    'date_time': df_test_new['date_time'],
    'target_carbon_monoxide': pre_xgb_1.as_data_frame()['predict'],
    'target_benzene': pre_xgb_2.as_data_frame()['predict'],
    'target_nitrogen_oxides': pre_xgb_3.as_data_frame()['predict'],
})

submission.head()

In [None]:
# Write the XGBoost submission to disk without the pandas index.
submission.to_csv(path_or_buf='Sample_submission_xgb.csv', index=False)