In [None]:
# basic library import section
import warnings

import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
warnings.filterwarnings('ignore')

In [None]:
# Load the competition files: training data, test data and the sample submission.
# NOTE(review): the relative '../input/...' paths look like the Kaggle kernel layout — confirm before running elsewhere.
train = pd.read_csv('../input/tabular-playground-series-jul-2021/train.csv')
test = pd.read_csv('../input/tabular-playground-series-jul-2021/test.csv')
sample_sub = pd.read_csv('../input/tabular-playground-series-jul-2021/sample_submission.csv')

In [None]:
# (rows, columns) of the train set followed by the test set
train.shape, test.shape

In [None]:
# preview the first rows of the training data
train.head()

In [None]:
# concise summary of the train set: column dtypes, non-null counts and memory usage
train.info()

In [None]:
# concise summary of the test set: column dtypes, non-null counts and memory usage
test.info()

In [None]:
# descriptive statistics of the train set, one row per column, with
# the mean / std / max columns highlighted for quick scanning
train_stats = train.describe().T
(
    train_stats.style
    .bar(subset=['mean'], color='#606ff2')
    .background_gradient(subset=['std'], cmap='YlOrBr')
    .bar(subset=['max'], color='green')
)

In [None]:
# descriptive statistics of the test set, one row per column, with
# the mean / std / max columns highlighted for quick scanning
test_stats = test.describe().T
(
    test_stats.style
    .bar(subset=['mean'], color='#606ff2')
    .background_gradient(subset=['std'], cmap='YlOrBr')
    .bar(subset=['max'], color='green')
)

In [None]:
# missing-value count per column: train first, then test, separated by a ruler line
print(
    train.isna().sum(),
    "============================",
    test.isna().sum(),
    sep="\n",
)

In [None]:
# column groups used by the plots below
targets = ['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides']
features = [
    'deg_C', 'relative_humidity', 'absolute_humidity',
    'sensor_1', 'sensor_2', 'sensor_3', 'sensor_4', 'sensor_5',
]

# feature-only views: the timestamp (and, for train, the targets) are dropped
new_train = train.drop(columns=['date_time'] + targets)
new_test = test.drop(columns='date_time')

In [None]:
# These imports stay here because later cells reference `ticker`.
import matplotlib.ticker as ticker
from matplotlib.ticker import PercentFormatter

# Box plot of every feature in the train set, on a 4x2 grid, to spot outliers.
plt.rcParams['figure.dpi'] = 600
background_color = "#f6f5f5"
sns.set_palette('icefire')

fig = plt.figure(figsize=(10, 10), facecolor=background_color)
gs = fig.add_gridspec(4, 2)
gs.update(wspace=0.3, hspace=0.3)

# Keep the axes in a plain list instead of injecting ax0..ax7 through locals():
# writing to locals() only works by accident at module level and hides the
# variables from readers and linters.
axes = []
for row in range(4):
    for col in range(2):
        ax = fig.add_subplot(gs[row, col])
        ax.set_facecolor(background_color)
        for side in ("top", "right"):
            ax.spines[side].set_visible(False)
        axes.append(ax)

features = list(new_train.columns)

for ax, feature in zip(axes, features):
    sns.boxplot(ax=ax, x=feature, data=new_train)
    ax.grid(which='major', axis='both', zorder=0, color='#EEEEEE', linewidth=0.4)
    ax.set_ylabel('')
    ax.set_xlabel(feature, fontsize=4, fontweight='bold')
    ax.tick_params(labelsize=4, width=0.5, length=1.5)
    # No percent formatter on the y-axis: for a horizontal box plot that axis is
    # categorical, so a PercentFormatter would only print a meaningless "0%".

fig.text(0.5, 0.9, 'boxplot for every column to detect outliers in train set', fontsize=10, ha='center', va='top', weight='bold')
plt.show()

In [None]:
# Box plot of every feature in the test set, on a 4x2 grid, to spot outliers.
background_color = "#f6f5f5"
sns.set_palette('icefire')

fig = plt.figure(figsize=(10, 10), facecolor=background_color)
gs = fig.add_gridspec(4, 2)
gs.update(wspace=0.3, hspace=0.3)

# Keep the axes in a plain list instead of injecting ax0..ax7 through locals():
# writing to locals() only works by accident at module level and hides the
# variables from readers and linters.
axes = []
for row in range(4):
    for col in range(2):
        ax = fig.add_subplot(gs[row, col])
        ax.set_facecolor(background_color)
        for side in ("top", "right"):
            ax.spines[side].set_visible(False)
        axes.append(ax)

features = list(new_test.columns)

for ax, feature in zip(axes, features):
    sns.boxplot(ax=ax, x=feature, data=new_test)
    ax.grid(which='major', axis='both', zorder=0, color='#EEEEEE', linewidth=0.4)
    ax.set_ylabel('')
    ax.set_xlabel(feature, fontsize=4, fontweight='bold')
    ax.tick_params(labelsize=4, width=0.5, length=1.5)
    # No percent formatter on the y-axis: for a horizontal box plot that axis is
    # categorical, so a PercentFormatter would only print a meaningless "0%".

fig.text(0.5, 0.9, 'boxplot for every column to detect outliers in test set', fontsize=10, ha='center', va='top', weight='bold')
plt.show()

In [None]:
# Kernel density estimate of every feature, train and test overlaid, on a 4x2 grid.
plt.rcParams['figure.dpi'] = 600
background_color = "#f6f5f5"
sns.set_palette('icefire')

fig = plt.figure(figsize=(10, 10), facecolor=background_color)
gs = fig.add_gridspec(4, 2)
gs.update(wspace=0.3, hspace=0.3)

# Keep the axes in a plain list instead of injecting ax0..ax7 through locals().
axes = []
for row in range(4):
    for col in range(2):
        ax = fig.add_subplot(gs[row, col])
        ax.set_facecolor(background_color)
        for side in ("top", "right"):
            ax.spines[side].set_visible(False)
        axes.append(ax)

features = list(new_train.columns)

for ax, feature in zip(axes, features):
    sns.kdeplot(ax=ax, x=feature, data=new_train, color='palegreen', label='train')
    sns.kdeplot(ax=ax, x=feature, data=new_test, color='lightcoral', label='test')
    ax.grid(which='major', axis='both', zorder=0, color='#EEEEEE', linewidth=0.4)
    ax.set_ylabel('')
    ax.set_xlabel(feature, fontsize=4, fontweight='bold')
    ax.tick_params(labelsize=4, width=0.5, length=1.5)
    # The y-axis is a probability density, not a percentage, so it keeps the
    # default numeric tick labels.
    # Without a legend the two curves cannot be told apart.
    ax.legend(fontsize=4, frameon=False)

fig.text(0.5, 0.9, 'Kernel distribution plot(KDE) for every columns of both datasets', fontsize=10, ha='center', va='top', weight='bold')
plt.show()

In [None]:
# Scatter of each target against time, one panel per target.
plt.rcParams['figure.dpi'] = 600
background_color = "#f6f5f5"
sns.set_palette('icefire')

fig = plt.figure(figsize=(10, 10), facecolor=background_color)
gs = fig.add_gridspec(3, 1)
gs.update(wspace=0.3, hspace=0.3)

# Keep the axes in a plain list instead of injecting ax0..ax2 through locals().
axes = []
for row in range(3):
    ax = fig.add_subplot(gs[row, 0])
    ax.set_facecolor(background_color)
    for side in ("top", "right"):
        ax.spines[side].set_visible(False)
    axes.append(ax)

# date_time is read from CSV as text; plotting the raw strings would create a
# categorical axis with one category per timestamp. Parse a local copy so the
# x-axis is a real time axis, without mutating `train` at this stage.
train_time = pd.to_datetime(train['date_time'], errors='coerce')

for ax, target in zip(axes, targets):
    ax.scatter(x=train_time, y=train[target], s=3, color='salmon', marker='^', alpha=0.5)
    # label each axis with the quantity it shows
    ax.set_xlabel('date_time', fontsize=4, fontweight='bold')
    ax.set_ylabel(target, fontsize=4, fontweight='bold')
    # at most ~5 date ticks per panel, formatted compactly
    date_locator = mdates.AutoDateLocator(minticks=3, maxticks=5)
    ax.xaxis.set_major_locator(date_locator)
    ax.xaxis.set_major_formatter(mdates.ConciseDateFormatter(date_locator))

fig.text(0.5, 0.9, 'Target distribution over time', fontsize=10, ha='center', va='top', weight='bold')
plt.show()

In [None]:
# Scatter of each feature against time, train and test overlaid, one panel per feature.
plt.rcParams['figure.dpi'] = 600
background_color = "#f6f5f5"
sns.set_palette('icefire')

fig = plt.figure(figsize=(10, 10), facecolor=background_color)
gs = fig.add_gridspec(8, 1)
gs.update(wspace=0.3, hspace=0.3)

# Keep the axes in a plain list instead of injecting ax0..ax7 through locals().
axes = []
for row in range(8):
    ax = fig.add_subplot(gs[row, 0])
    ax.set_facecolor(background_color)
    for side in ("top", "right"):
        ax.spines[side].set_visible(False)
    axes.append(ax)

# date_time is read from CSV as text; plotting the raw strings would create a
# categorical axis with one category per timestamp. Parse local copies so both
# datasets share a real time axis, without mutating the frames at this stage.
train_time = pd.to_datetime(train['date_time'], errors='coerce')
test_time = pd.to_datetime(test['date_time'], errors='coerce')

for ax, feature in zip(axes, features):
    ax.scatter(x=train_time, y=train[feature], s=3, color='blue', marker='^', alpha=0.5, label='train')
    ax.scatter(x=test_time, y=test[feature], s=3, color='green', marker='>', alpha=0.5, label='test')
    # name the panel; otherwise the eight rows are indistinguishable
    ax.set_ylabel(feature, fontsize=4, fontweight='bold')
    # at most ~5 date ticks per panel, formatted compactly
    date_locator = mdates.AutoDateLocator(minticks=3, maxticks=5)
    ax.xaxis.set_major_locator(date_locator)
    ax.xaxis.set_major_formatter(mdates.ConciseDateFormatter(date_locator))

# one legend is enough: every panel uses the same train/test colours
axes[0].legend(fontsize=4, frameon=False)

fig.text(0.5, 0.9, 'features over time', fontsize=10, ha='center', va='top', weight='bold')
plt.show()

In [None]:
# Line plot of each target against deg_C, one panel per target.
background_color = "#f6f5f5"
sns.set_palette('icefire')

fig = plt.figure(figsize=(10, 10), facecolor=background_color)
gs = fig.add_gridspec(3, 1)
gs.update(wspace=0.3, hspace=0.3)

# Keep the axes in a plain list instead of injecting ax0..ax2 through locals().
axes = []
for row in range(3):
    ax = fig.add_subplot(gs[row, 0])
    ax.set_facecolor(background_color)
    for side in ("top", "right"):
        ax.spines[side].set_visible(False)
    axes.append(ax)

for ax, target in zip(axes, targets):
    sns.lineplot(ax=ax, x=train.deg_C, y=train[target], color='lightcoral')
    ax.grid(which='major', axis='both', zorder=0, color='#EEEEEE', linewidth=0.4)
    # x carries deg_C and y carries the target, so label them accordingly
    ax.set_xlabel('deg_C', fontsize=6, fontweight='bold')
    ax.set_ylabel(target, fontsize=6, fontweight='bold')
    ax.tick_params(labelsize=4, width=0.5, length=1.5)
    ax.xaxis.set_major_locator(plt.MaxNLocator(20))

fig.text(0.5, 0.9, 'Linear relationship between deg_c and target columns', fontsize=10, ha='center', va='top', weight='bold')
plt.show()

In [None]:
# Line plot of each target against relative_humidity, one panel per target.
background_color = "#f6f5f5"
sns.set_palette('icefire')

fig = plt.figure(figsize=(10, 10), facecolor=background_color)
gs = fig.add_gridspec(3, 1)
gs.update(wspace=0.3, hspace=0.3)

# Keep the axes in a plain list instead of injecting ax0..ax2 through locals().
axes = []
for row in range(3):
    ax = fig.add_subplot(gs[row, 0])
    ax.set_facecolor(background_color)
    for side in ("top", "right"):
        ax.spines[side].set_visible(False)
    axes.append(ax)

for ax, target in zip(axes, targets):
    sns.lineplot(ax=ax, x=train.relative_humidity, y=train[target], color='lightgreen')
    ax.grid(which='major', axis='both', zorder=0, color='#EEEEEE', linewidth=0.4)
    # x carries relative_humidity and y carries the target, so label them accordingly
    ax.set_xlabel('relative_humidity', fontsize=6, fontweight='bold')
    ax.set_ylabel(target, fontsize=6, fontweight='bold')
    ax.tick_params(labelsize=4, width=0.5, length=1.5)
    ax.xaxis.set_major_locator(plt.MaxNLocator(20))

fig.text(0.5, 0.9, 'Linear relationship between relative_humidity and target columns', fontsize=10, ha='center', va='top', weight='bold')
plt.show()

In [None]:
# Line plot of each target against sensor_1, one panel per target.
background_color = "#f6f5f5"
sns.set_palette('icefire')

fig = plt.figure(figsize=(10, 10), facecolor=background_color)
gs = fig.add_gridspec(3, 1)
gs.update(wspace=0.3, hspace=0.3)

# Keep the axes in a plain list instead of injecting ax0..ax2 through locals().
axes = []
for row in range(3):
    ax = fig.add_subplot(gs[row, 0])
    ax.set_facecolor(background_color)
    for side in ("top", "right"):
        ax.spines[side].set_visible(False)
    axes.append(ax)

for ax, target in zip(axes, targets):
    sns.lineplot(ax=ax, x=train.sensor_1, y=train[target], color='tomato')
    ax.grid(which='major', axis='both', zorder=0, color='#EEEEEE', linewidth=0.4)
    # x carries sensor_1 and y carries the target, so label them accordingly
    ax.set_xlabel('sensor_1', fontsize=6, fontweight='bold')
    ax.set_ylabel(target, fontsize=6, fontweight='bold')
    ax.tick_params(labelsize=4, width=0.5, length=1.5)
    ax.xaxis.set_major_locator(plt.MaxNLocator(20))

fig.text(0.5, 0.9, 'Linear relationship between sensor_1 and target columns', fontsize=10, ha='center', va='top', weight='bold')
plt.show()

In [None]:
# pairwise scatter / distribution grid of the three target columns
sns.pairplot(data=train[targets]);

In [None]:
# pairwise scatter / distribution grid of the feature columns in the train set
sns.pairplot(data=train[features]);

In [None]:
# pairwise scatter / distribution grid of the feature columns in the test set
sns.pairplot(data=test[features]);

## Heatmap of both datasets.

### Train set heatmap

In [None]:
# Correlation heatmap of the numeric train columns.
plt.figure(figsize=(12, 8))
# `train` still contains the non-numeric date_time column. Older pandas skipped
# it silently, but pandas >= 2.0 raises on it, so select numeric columns
# explicitly (works on every pandas version).
corr_train = train.select_dtypes(include='number').corr()
sns.heatmap(corr_train, annot=True);

### Test set heatmap

In [None]:
# Correlation heatmap of the numeric test columns.
plt.figure(figsize=(12, 8))
# `test` still contains the non-numeric date_time column. Older pandas skipped
# it silently, but pandas >= 2.0 raises on it, so select numeric columns
# explicitly (works on every pandas version).
corr_test = test.select_dtypes(include='number').corr()
sns.heatmap(corr_test, annot=True);

# Feature Engineering

In [None]:
# Parse the timestamp and derive calendar and temperature features on the train set.
train['date_time'] = pd.to_datetime(train['date_time'], errors='coerce')

# Series.dt.weekofyear is deprecated since pandas 1.1 and removed in 2.0;
# dt.isocalendar().week is its replacement. It returns a nullable UInt32, so
# cast back to the plain dtype weekofyear used to give (float only if a
# timestamp was coerced to NaT).
iso_week = train['date_time'].dt.isocalendar().week
train['weekofyear'] = iso_week.astype('float64' if iso_week.isna().any() else 'int64')
train['month'] = train['date_time'].dt.month
train['hour'] = train['date_time'].dt.hour

# temperature in Fahrenheit and Kelvin
train['fer_C'] = (train['deg_C']*(9/5)) + 32
train['kel_C'] = train['deg_C'] + 273.15
# dew-point approximation from temperature and relative humidity: T - (100 - RH) / 5
train['dew_point'] = train['deg_C'] - ((100 - train['relative_humidity'])/5)

In [None]:
# Parse the timestamp and derive the same calendar and temperature features on the test set.
test['date_time'] = pd.to_datetime(test['date_time'], errors='coerce')

# Series.dt.weekofyear is deprecated since pandas 1.1 and removed in 2.0;
# dt.isocalendar().week is its replacement. It returns a nullable UInt32, so
# cast back to the plain dtype weekofyear used to give (float only if a
# timestamp was coerced to NaT).
iso_week = test['date_time'].dt.isocalendar().week
test['weekofyear'] = iso_week.astype('float64' if iso_week.isna().any() else 'int64')
test['month'] = test['date_time'].dt.month
test['hour'] = test['date_time'].dt.hour

# temperature in Fahrenheit and Kelvin
test['fer_C'] = (test['deg_C']*(9/5)) + 32
test['kel_C'] = test['deg_C'] + 273.15
# dew-point approximation from temperature and relative humidity: T - (100 - RH) / 5
test['dew_point'] = test['deg_C'] - ((100 - test['relative_humidity'])/5)

# Modeling using H2O AutoML

In [None]:
import h2o
from h2o.automl import H2OAutoML
# Connect to an H2O cluster; per the h2o docs, init() starts a local one if none is reachable.
h2o.init()

In [None]:
# Copy the pandas frames (with whatever columns they hold at this point) into H2O frames
h2o_train = h2o.H2OFrame(train)
h2o_test = h2o.H2OFrame(test)

# Modeling

In [None]:
def h2oml(df, x, y, time = 60):
    """Run H2O AutoML for a single target and return the AutoML object.

    Parameters
    ----------
    df : frame containing the predictors and every target column.
    x : column name(s) dropped from ``df`` before training
        (the callers pass the two targets that are not being predicted).
    y : name of the response column.
    time : AutoML time budget in seconds, passed as ``max_runtime_secs``.

    Returns
    -------
    The ``H2OAutoML`` instance after ``train`` has been called on it.
    """
    modelling_frame = df.drop(x, axis=1)

    # Seeded split at ratio 0.8: the first part trains the models, the
    # remainder is only used to rank them on the leaderboard.
    fit_part, holdout_part = modelling_frame.split_frame(ratios=[0.8], seed=1)

    automl = H2OAutoML(max_runtime_secs=time, seed=1)
    automl.train(y=y, training_frame=fit_part, leaderboard_frame=holdout_part)
    return automl
    

In [None]:
# training: one AutoML run per target; the other two targets are dropped each time
aml1 = h2oml(
    h2o_train,
    x=['target_benzene', 'target_nitrogen_oxides'],
    y='target_carbon_monoxide',
    time=300,
)
aml2 = h2oml(
    h2o_train,
    x=['target_carbon_monoxide', 'target_nitrogen_oxides'],
    y='target_benzene',
    time=300,
)
aml3 = h2oml(
    h2o_train,
    x=['target_carbon_monoxide', 'target_benzene'],
    y='target_nitrogen_oxides',
    time=600,
)

In [None]:
# top of the leaderboard for the target_carbon_monoxide run
aml1.leaderboard.head()

In [None]:
# top of the leaderboard for the target_benzene run
aml2.leaderboard.head()

In [None]:
# top of the leaderboard for the target_nitrogen_oxides run
aml3.leaderboard.head()

In [None]:
# predict each target on the test frame with its own AutoML run
# (aml1: carbon monoxide, aml2: benzene, aml3: nitrogen oxides)
prediction1 = aml1.predict(h2o_test)
prediction2 = aml2.predict(h2o_test)
prediction3 = aml3.predict(h2o_test)

In [None]:
# rename the prediction column of each frame to the submission column it fills
# (one name is passed per frame, so each frame is expected to have exactly one column)
prediction1.set_names(['target_carbon_monoxide'])
prediction2.set_names(['target_benzene'])
prediction3.set_names(['target_nitrogen_oxides']);

In [None]:
# pull the H2O prediction frames back into local objects
# NOTE(review): the submission cell concatenates these with pandas, so h2o.as_list is assumed to return DataFrames here — confirm
pred1_data = h2o.as_list(prediction1)
pred2_data = h2o.as_list(prediction2)
pred3_data = h2o.as_list(prediction3)

### Creating submission file

In [None]:
# submission = timestamps from the sample file + the three prediction columns, side by side
submission_parts = [sample_sub[['date_time']], pred1_data, pred2_data, pred3_data]
submission = pd.concat(submission_parts, axis=1)
submission.to_csv('submission_file.csv', index=False)

#### If you find this notebook useful, please don't forget to upvote. Thanks!