In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<h1 style="background-color:LimeGreen; font-family:newtimeroman; font-size:200%; text-align:center; border-radius: 15px 50px;"> Table of Contents </h1>

* [1) Load Required Libraries](#1)

* [2) Read Data](#2)

* [3) EDA (Exploratory Data Analysis)](#3)

  * [3.1) DataPrep (AutoEDA)](#3.1)
  
  * [3.2) Outliers](#3.2)
  
  * [3.3) Relation between Features](#3.3)
  
    * [3.3.1) Relation between Features](#3.3.1)
    
    * [3.3.2) Relation between Features](#3.3.2)

* [4) Model Building and Evaluation](#4)

* [5) H2O AutoML](#5)

* [6) Light AutoML](#6)

<h1 style="background-color:LimeGreen; font-family:timenewroman; font-size:200%; text-align:center; border-radius: 10px 50px;"> 1) Load Required Libraries </h1>

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import warnings
# Silence every Python warning from here on (this also hides deprecation notices).
warnings.filterwarnings('ignore')

# Order matters: the matplotlib "fivethirtyeight" style sheet is applied first,
# then seaborn's "darkgrid" style is set on top of it.
plt.style.use("fivethirtyeight")
sns.set_style("darkgrid")

In [None]:
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.linear_model import Ridge
from xgboost import XGBRegressor

<h1 style="background-color:LimeGreen; font-family:newtimeroman; font-size:200%; text-align:center; border-radius: 15px 50px;"> 2) Read Data </h1>

In [None]:
# Load the competition CSVs (train, test and the sample submission) from one input folder.
DATA_DIR = "/kaggle/input/tabular-playground-series-jul-2021"

train = pd.read_csv(f"{DATA_DIR}/train.csv")
test = pd.read_csv(f"{DATA_DIR}/test.csv")
submission = pd.read_csv(f"{DATA_DIR}/sample_submission.csv")

In [None]:
# Preview the first rows of each frame; display() renders its arguments one after another.
display(train.head(), test.head(), submission.head())

In [None]:
# (rows, columns) of each frame, shown in the order train, test, submission.
display(train.shape, test.shape, submission.shape)

In [None]:
# DataFrame.info() prints its summary itself and returns None, so wrapping it in
# display() only appended a stray "None" after each report.
train.info()
test.info()

<h1 style="background-color:LimeGreen; font-family:newtimeroman; font-size:200%; text-align:center; border-radius: 15px 50px;"> 3) EDA (Exploratory Data Analysis) </h1>

<h1 style="background-color:skyblue; font-family:newtimeroman; font-size:180%; text-align:left; border-radius: 0px 0px;"> 3.1) DataPrep (AutoEDA) </h1>

In [None]:
!pip install dataprep

In [None]:
# Import the dataprep helpers used in this notebook by name instead of `import *`,
# so it is clear where each one (notably create_report) comes from.
from dataprep.eda import create_report, plot, plot_correlation, plot_missing

In [None]:
# Run dataprep's plot() on the whole training frame: it plots the distribution
# of each column and calculates dataset statistics.
plot(train)

In [None]:
from dataprep.eda import plot_correlation

In [None]:
# Run dataprep's correlation overview on the training frame.
plot_correlation(train)

In [None]:
from dataprep.eda import plot_missing

In [None]:
# Run dataprep's missing-value overview on the training frame.
plot_missing(train)

In [None]:
# Build dataprep's report for the training frame (create_report is provided by dataprep.eda).
create_report(train)

<h1 style="background-color:skyblue; font-family:newtimeroman; font-size:170%; text-align:left; border-radius: 0px 0px;"> 3.2) Outliers </h1>

In [None]:
# Horizontal box plots of the training columns on an 18x18-inch figure.
fig, ax = plt.subplots(figsize=(18, 18))
sns.boxplot(data=train, orient="h", ax=ax);

In [None]:
# Horizontal box plots of the test columns on an 18x18-inch figure.
fig, ax = plt.subplots(figsize=(18, 18))
sns.boxplot(data=test, orient="h", ax=ax);

<h1 style="background-color:orange; font-family:newtimeroman; font-size:180%; text-align:left; border-radius: 0px 0px;"> 3.3) Relation between Features </h1>

<h1 style="background-color:LimeGreen; font-family:newtimeroman; font-size:170%; text-align:left; border-radius: 0px 0px;"> 3.3.1) Relation between Features </h1>

a. Pearson Correlation

b. Spearman Correlation

c. Kendall Correlation

In [None]:
# Pearson Correlation
# Keep only numeric columns explicitly: train also holds a non-numeric column
# (date_time is read as plain text), and pandas >= 2.0 raises on such columns
# in corr() instead of silently dropping them.
pearson_corr = train.select_dtypes(include='number').corr(method='pearson')

plt.figure(figsize=(18,10))
sns.heatmap(pearson_corr, cbar=False, annot=True, fmt='.1f', linewidth=0.2, cmap='coolwarm');

In [None]:
# Spearman Correlation
# Keep only numeric columns explicitly: train also holds a non-numeric column
# (date_time is read as plain text), and pandas >= 2.0 raises on such columns
# in corr() instead of silently dropping them.
spearman_corr = train.select_dtypes(include='number').corr(method='spearman')

plt.figure(figsize=(24,15))
sns.heatmap(spearman_corr, cbar=False, annot=True, fmt='.1f', linewidth=0.2, cmap='coolwarm');

In [None]:
# Lower-triangle heatmap of the default (Pearson) correlation between numeric columns.
fig, ax = plt.subplots(figsize=(18, 12))

# Numeric columns only: pandas >= 2.0 raises on non-numeric columns in corr().
corr = train.select_dtypes(include='number').corr()

# Hide the upper triangle (diagonal included). The builtin `bool` is used because
# the `np.bool` alias was deprecated in NumPy 1.20 and removed in 1.24.
mask = np.triu(np.ones_like(corr, dtype=bool))

# Figure title, placed in data coordinates above the top-left corner of the heatmap.
ax.text(-1.1, -0.7, 'Correlation between the Features', fontsize=20, fontweight='bold', fontfamily='serif')
sns.heatmap(corr, mask=mask, annot=False, fmt='.2f', linewidth=0.2, cbar=True, cmap='coolwarm', ax=ax);

In [None]:
# Pearson, Kendall and Spearman correlation side by side for the humidity and sensor features
fig, ax = plt.subplots(1, 3, figsize=(17 , 5))

feature_lst = ['relative_humidity', 'absolute_humidity', 'sensor_1','sensor_2','sensor_3', 'sensor_4', 'sensor_5']

corr = train[feature_lst].corr()

# One upper-triangle mask (diagonal included) is shared by all three panels, since
# every correlation matrix over feature_lst has the same shape. The builtin `bool`
# replaces `np.bool`, which was deprecated in NumPy 1.20 and removed in 1.24.
mask = np.triu(np.ones_like(corr, dtype=bool))

# The colour map does not depend on the method, so build it once.
diverging_cmap = sns.diverging_palette(240, 10, as_cmap=True)

for idx, method in enumerate(['pearson', 'kendall', 'spearman']):
    sns.heatmap(train[feature_lst].corr(method=method), ax=ax[idx],
            square=True, annot=True, fmt='.1f', center=0, linewidth=2,
            cbar=False, cmap=diverging_cmap,
            mask=mask
           )
    ax[idx].set_title(f'{method.capitalize()} Correlation', loc='left', fontweight='bold')

plt.show()

<h1 style="background-color:LimeGreen; font-family:newtimeroman; font-size:180%; text-align:left; border-radius: 0px 0px;"> 3.3.2) Relation between Features </h1>

In [None]:
# Skewness of the numeric training columns. Non-numeric columns are excluded
# explicitly because pandas >= 2.0 raises on them instead of dropping them.
train.select_dtypes(include='number').skew()

In [None]:
# Skewness of the numeric test columns. Non-numeric columns are excluded
# explicitly because pandas >= 2.0 raises on them instead of dropping them.
test.select_dtypes(include='number').skew()

In [None]:
# Plot every feature as one long series: the train rows first, then the test rows
# in a second colour, one subplot per feature.
feature_cols = test.columns[1:]  # every test column except the first
n_train = train.shape[0]

# Stack train on top of test with a fresh 0..N-1 index; missing values
# (presumably the target columns that test lacks) are filled with 0.
all_df = pd.concat([train, test]).reset_index(drop=True).fillna(0)

plt.figure(figsize=(16, 16), tight_layout=True)
for i, col in enumerate(feature_cols):
    # One row per feature; the row count follows the data instead of a hard-coded 8.
    plt.subplot(len(feature_cols), 1, i+1)
    plt.title(col)

    plt.plot(all_df[col][:n_train])
    plt.plot(all_df[col][n_train:])

<h1 style="background-color:LimeGreen; font-family:newtimeroman; font-size:180%; text-align:center; border-radius: 10px 50px;"> 4) MODEL BUILDING AND EVALUATION </h1>

In [None]:
# Feature columns: every test column except the first.
# NOTE(review): presumably the first column is date_time; confirm against test.head().
columns = test.columns[1:]
columns

In [None]:
# Feature matrices for train and test, plus one (n_rows, 1) column vector per target.
X, X_test = train[columns].to_numpy(), test[columns].to_numpy()

target_1, target_2, target_3 = (
    train[target_name].to_numpy().reshape(-1, 1)
    for target_name in ('target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides')
)

In [None]:
# Fit one fresh Ridge model per pollutant and store its test predictions
# in the matching submission column.
ridge_targets = (
    ('target_carbon_monoxide', target_1),
    ('target_benzene', target_2),
    ('target_nitrogen_oxides', target_3),
)

for submission_col, y_train in ridge_targets:
    rr = Ridge()
    rr.fit(X, y_train)
    submission[submission_col] = rr.predict(X_test)

In [None]:
# Preview the submission frame after the Ridge predictions were written into it.
submission.head()

In [None]:
# Write the Ridge submission to submission.csv (relative path), omitting the index column.
submission.to_csv('submission.csv', index=False)

<h1 style="background-color:LimeGreen; font-family:newtimeroman; font-size:200%; text-align:center; border-radius: 10px 50px;"> 5) H2O AutoML </h1>

In [None]:
# import packages
import pandas as pd

import h2o
from h2o.automl import H2OAutoML

In [None]:
# Start (or connect to) the H2O instance, then load train and test as H2O frames.
h2o.init()

H2O_INPUT_DIR = '/kaggle/input/tabular-playground-series-jul-2021'
h2o_train = h2o.import_file(f'{H2O_INPUT_DIR}/train.csv')
h2o_test = h2o.import_file(f'{H2O_INPUT_DIR}/test.csv')

In [None]:
## AutoML run for carbon monoxide
# Features are all training columns except the timestamp and the three targets.
non_feature_cols = {'date_time', 'target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides'}
features = [col for col in h2o_train.columns if col not in non_feature_cols]

# 180-second budget; RMSLE is used both for early stopping and for ranking models.
automl_settings = dict(max_runtime_secs=180, stopping_metric='RMSLE', sort_metric='RMSLE')
h2oaml_carbon_monoxide = H2OAutoML(**automl_settings)

h2oaml_carbon_monoxide.train(x=features, y='target_carbon_monoxide', training_frame=h2o_train)

In [None]:
# Show the leaderboard of models produced by the carbon monoxide AutoML run.
h2oaml_carbon_monoxide.leaderboard

In [None]:
# AutoML run for benzene
# Features are all training columns except the timestamp and the three targets.
non_feature_cols = {'date_time', 'target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides'}
features = [col for col in h2o_train.columns if col not in non_feature_cols]

# 180-second budget; RMSLE is used both for early stopping and for ranking models.
automl_settings = dict(max_runtime_secs=180, stopping_metric='RMSLE', sort_metric='RMSLE')
h2oaml_benzene = H2OAutoML(**automl_settings)

h2oaml_benzene.train(x=features, y='target_benzene', training_frame=h2o_train)

In [None]:
# Show the leaderboard of models produced by the benzene AutoML run.
h2oaml_benzene.leaderboard

In [None]:
# AutoML run for nitrogen oxides
# Features are all training columns except the timestamp and the three targets.
non_feature_cols = {'date_time', 'target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides'}
features = [col for col in h2o_train.columns if col not in non_feature_cols]

# 180-second budget; RMSLE is used both for early stopping and for ranking models.
automl_settings = dict(max_runtime_secs=180, stopping_metric='RMSLE', sort_metric='RMSLE')
h2oaml_nitrogen_oxides = H2OAutoML(**automl_settings)

h2oaml_nitrogen_oxides.train(x=features, y='target_nitrogen_oxides', training_frame=h2o_train)

In [None]:
# Show the leaderboard of models produced by the nitrogen oxides AutoML run.
h2oaml_nitrogen_oxides.leaderboard

In [None]:
# Score the test frame with the leading model of each AutoML run
# (order: carbon monoxide, benzene, nitrogen oxides).
preds_h2oaml_carbon_monoxide, preds_h2oaml_benzene, preds_h2oaml_nitrogen_oxides = (
    automl.leader.predict(h2o_test)
    for automl in (h2oaml_carbon_monoxide, h2oaml_benzene, h2oaml_nitrogen_oxides)
)

In [None]:
# Assemble the H2O submission: the test timestamps plus the "predict" column
# of each prediction frame, converted to pandas.
h2o_prediction_frames = {
    'target_carbon_monoxide': preds_h2oaml_carbon_monoxide,
    'target_benzene': preds_h2oaml_benzene,
    'target_nitrogen_oxides': preds_h2oaml_nitrogen_oxides,
}

submission = pd.DataFrame({
    'date_time': test.date_time,
    **{
        target_name: frame.as_data_frame()['predict']
        for target_name, frame in h2o_prediction_frames.items()
    },
})

submission.head()

In [None]:
# Write the H2O AutoML submission to submission_H2OAutoML.csv, omitting the index column.
submission.to_csv('submission_H2OAutoML.csv', index=False)

<h1 style="background-color:LimeGreen; font-family:newtimeroman; font-size:180%; text-align:center; border-radius: 10px 50px;"> 6) Light AutoML </h1>

# install packages
!python3 -m pip install -q lightautoml

# import packages
from lightautoml.automl.presets.tabular_presets import TabularAutoML
from lightautoml.tasks import Task

def fit_laml_regressor(train_df, feature_cols, target_col, timeout=180):
    """Fit a LightAutoML regression preset for a single target column.

    The model is trained on a fresh frame holding only `feature_cols` plus a
    `target` column copied from `train_df[target_col]`. This keeps the other
    target columns out of the feature set (they are not present in the test
    data) and leaves `train_df` unmodified, so the cell can be re-run safely.

    Args:
        train_df: training DataFrame containing `feature_cols` and `target_col`.
        feature_cols: names of the columns to use as model inputs.
        target_col: name of the column to predict.
        timeout: time budget passed to TabularAutoML.

    Returns:
        The fitted TabularAutoML instance. The value returned by fit_predict
        is discarded.
    """
    fit_frame = train_df[list(feature_cols)].assign(target=train_df[target_col].values)
    automl = TabularAutoML(task=Task('reg'), timeout=timeout)
    automl.fit_predict(train_data=fit_frame, roles={'target': 'target'})
    return automl


# Use exactly the columns that exist in the test data as model inputs.
# Previously the remaining target columns stayed in the training frame as
# features, and test[train.columns] then raised a KeyError because the test
# data has no target columns.
laml_feature_cols = list(test.columns)

# run model for carbon monoxide
laml_carbon_monoxide = fit_laml_regressor(train, laml_feature_cols, 'target_carbon_monoxide')

# run model for benzene
laml_benzene = fit_laml_regressor(train, laml_feature_cols, 'target_benzene')

# run model for nitrogen oxides
laml_nitrogen_oxides = fit_laml_regressor(train, laml_feature_cols, 'target_nitrogen_oxides')

# generate predictions
preds_laml_carbon_monoxide = laml_carbon_monoxide.predict(test[laml_feature_cols]).data.ravel()
preds_laml_benzene = laml_benzene.predict(test[laml_feature_cols]).data.ravel()
preds_laml_nitrogen_oxides = laml_nitrogen_oxides.predict(test[laml_feature_cols]).data.ravel()

# Assemble the LightAutoML submission: the test timestamps followed by one
# prediction column per target.
laml_predictions = {
    'target_carbon_monoxide': preds_laml_carbon_monoxide,
    'target_benzene': preds_laml_benzene,
    'target_nitrogen_oxides': preds_laml_nitrogen_oxides,
}
submission = pd.DataFrame({'date_time': test.date_time, **laml_predictions})

submission.head()

## save submission
# PATH_LAML_SUBMISSION is not defined in any visible cell, so the original line
# raised a NameError. Define it here, following the naming of the H2O output file.
PATH_LAML_SUBMISSION = 'submission_LightAutoML.csv'
submission.to_csv(PATH_LAML_SUBMISSION, index=False)