In [None]:
import numpy as np
import seaborn as sns
import pandas as pd 
import matplotlib as mpl
import matplotlib.pyplot as plt
from scipy import stats
import scipy
from scipy.stats import norm, skew, zscore
import time

# First Exploration

In [None]:
# Load the competition's training table, test table and submission template.
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-jul-2021/train.csv',
        '../input/tabular-playground-series-jul-2021/test.csv',
        '../input/tabular-playground-series-jul-2021/sample_submission.csv',
    )
)

In [None]:
# Horizontal bars for the row counts of the two splits; each bar is labelled
# with its percentage of the combined row count.
f, ax = plt.subplots(figsize=(12, 2))
plt.xticks(size=14)
plt.yticks(size=14)

bar1 = ax.barh('train', len(train), color="indianred")
bar2 = ax.barh('test', len(test), color="green")
ax.set_title("Train and test datasets size comparison", fontsize=20, pad=5)

for bars, n_rows in ((bar1, len(train)), (bar2, len(test))):
    share_label = "{0:.2f}%".format((n_rows / (len(train) + len(test))) * 100)
    ax.bar_label(
        bars,
        [share_label],
        label_type="center",
        fontsize=20,
        color="white",
        weight="bold",
    )

plt.show()

In [None]:
# Names of the three pollutant columns that have to be predicted.
targets_list = [
    'target_carbon_monoxide',
    'target_benzene',
    'target_nitrogen_oxides',
]

## Train

In [None]:
# Preview the first rows of the raw training table.
train.head()

In [None]:
# Plain-text overview of the raw training table: shape, first rows, dtypes,
# missing values (overall percentage and per column) and duplicated rows.
# A single print with a newline separator emits the same lines in the same order.
print(
    " Shape ".center(100, '*'),
    'Rows: {}'.format(train.shape[0]),
    'Columns: {}'.format(train.shape[1]),
    " Head ".center(100, '*'),
    train.head(),
    " Types ".center(100, '*'),
    train.dtypes,
    " Missing values ".center(100, '*'),
    "Missing values %:   {}%".format(
        train.isna().sum().sum() / (train.shape[0] * train.shape[1]) * 100
    ),
    train.isna().sum(),
    ' Duplicated'.center(100, '*'),
    train.duplicated().sum(),
    sep='\n',
)

In [None]:
# Work on a copy so the raw `train` frame is not modified by later cells.
df_train = train.copy()

In [None]:
# Parse the timestamp strings, then derive calendar features from them.
df_train['date_time'] = pd.to_datetime(df_train['date_time'], format="%Y-%m-%d %H:%M:%S")
for part in ('month', 'day', 'hour', 'dayofweek'):
    # Series.dt exposes the same datetime components as DatetimeIndex.
    df_train[part] = getattr(df_train['date_time'].dt, part)

df_train.head()


In [None]:
# Descriptive statistics of the training features, drawn as an annotated
# heatmap with one row per column and one column per statistic.
# The timestamp and the three target columns are excluded.
to_desc = df_train.drop(columns = ['date_time','target_carbon_monoxide','target_benzene','target_nitrogen_oxides'])
desc = to_desc.describe().T.drop(columns = ['count'])

# `desc` already has the layout the heatmap needs. Wrapping it in another
# DataFrame with `columns=<list>.remove('count')` only ever passed None
# (list.remove returns None), so the table is used directly instead.
desc_df = desc.copy()

f,ax = plt.subplots(figsize=(10,14))
sns.heatmap(desc_df, annot=True,cmap = "coolwarm", fmt= '.0f',
            ax=ax,linewidths = 5, cbar = True,
            annot_kws={"size": 12})
ax.xaxis.tick_top()  # statistic names along the top edge
plt.xticks(size = 14)
plt.yticks(size = 14, rotation = 0)
plt.title("Descriptive Statistics", size = 16)
plt.show()

In [None]:
# Pearson and Spearman correlation matrices of the training frame, each drawn
# as an annotated heatmap on a fixed [-1, 1] colour scale.
#
# Only numeric columns are correlated. `date_time` is a datetime column by this
# point, and newer pandas versions no longer drop non-numeric columns silently
# in DataFrame.corr, so the selection is made explicit.
numeric_train = df_train.select_dtypes(include='number')
corr_target_pearson = numeric_train.corr(method='pearson')
corr_target_spearman = numeric_train.corr(method='spearman')

for corr_matrix, corr_title in (
    (corr_target_pearson, 'Pearson Correlation'),
    (corr_target_spearman, 'Spearman Correlation'),
):
    fig = plt.figure(figsize = (16,12))
    # `linewidths` is the documented seaborn.heatmap parameter for cell borders.
    sns.heatmap(corr_matrix, annot=True, cmap='YlGn', linewidths = 0.5, vmin=-1, vmax=+1, fmt = ".1f")
    plt.xticks(rotation=45)
    plt.yticks(rotation=0)  # keep row labels horizontal on both figures
    plt.title(corr_title)
    plt.show()

## Test

In [None]:
# Preview the first rows of the raw test table.
test.head()

In [None]:
# Plain-text overview of the raw test table: shape, first rows, dtypes,
# missing values (overall percentage and per column) and duplicated rows.
# A single print with a newline separator emits the same lines in the same order.
print(
    " Shape ".center(100, '*'),
    'Rows: {}'.format(test.shape[0]),
    'Columns: {}'.format(test.shape[1]),
    " Head ".center(100, '*'),
    test.head(),
    " Types ".center(100, '*'),
    test.dtypes,
    " Missing values ".center(100, '*'),
    "Missing values %:   {}%".format(
        test.isna().sum().sum() / (test.shape[0] * test.shape[1]) * 100
    ),
    test.isna().sum(),
    ' Duplicated'.center(100, '*'),
    test.duplicated().sum(),
    sep='\n',
)

In [None]:
# Copy the raw test table, parse its timestamps and derive calendar features.
df_test = test.copy()
df_test['date_time'] = pd.to_datetime(df_test['date_time'], format="%Y-%m-%d %H:%M:%S")
for part in ('year', 'month', 'day', 'hour', 'dayofweek'):
    # Series.dt exposes the same datetime components as DatetimeIndex.
    df_test[part] = getattr(df_test['date_time'].dt, part)
df_test.head()

In [None]:
# Descriptive statistics of the test features, drawn as an annotated heatmap
# with one row per column and one column per statistic.
# The timestamp column is excluded.
to_desc = df_test.drop(columns = ['date_time'])
desc = to_desc.describe().T.drop(columns = ['count'])

# `desc` already has the layout the heatmap needs. Wrapping it in another
# DataFrame with `columns=<list>.remove('count')` only ever passed None
# (list.remove returns None), so the table is used directly instead.
desc_df = desc.copy()

f,ax = plt.subplots(figsize=(10,14))
sns.heatmap(desc_df, annot=True,cmap = "coolwarm", fmt= '.0f',
            ax=ax,linewidths = 5, cbar = True,
            annot_kws={"size": 12})
ax.xaxis.tick_top()  # statistic names along the top edge
plt.xticks(size = 14)
plt.yticks(size = 14, rotation = 0)
plt.title("Descriptive Statistics", size = 16)
plt.show()

## Targets

In [None]:
# Timestamp plus the three pollutant targets, taken from the raw training table.
df_target = train.loc[
    :,
    [
        'date_time',
        'target_carbon_monoxide',
        'target_benzene',
        'target_nitrogen_oxides',
    ],
]

In [None]:
# Descriptive statistics of the three target columns, drawn as an annotated
# heatmap with one row per target and one column per statistic.
to_desc = train[targets_list]
desc = to_desc.describe().T.drop(columns = ['count'])

# `desc` already has the layout the heatmap needs. Wrapping it in another
# DataFrame with `columns=<list>.remove('count')` only ever passed None
# (list.remove returns None), so the table is used directly instead.
desc_df = desc.copy()

f,ax = plt.subplots(figsize=(12,4))
sns.heatmap(desc_df, annot=True,cmap = "coolwarm", fmt= '.0f',
            ax=ax,linewidths = 5, cbar = True,
            annot_kws={"size": 12})
ax.xaxis.tick_top()  # statistic names along the top edge
plt.xticks(size = 14)
plt.yticks(size = 14, rotation = 0)
plt.title("Descriptive Statistics", size = 16)
plt.show()

In [None]:
# One wide line chart per target showing its value over time.
color = ['forestgreen', 'royalblue','indianred']
for target_name, line_color in zip(targets_list, color):
    fig, ax = plt.subplots(figsize=(24, 6))
    plt.xticks(rotation=45)
    sns.lineplot(
        data=df_train,
        x='date_time',
        y=target_name,
        color=line_color,
        linewidth=1.5,
        ax=ax,
    )
    # Cap the number of x ticks so the date labels stay readable.
    ax.xaxis.set_major_locator(plt.MaxNLocator(20))
    ax.grid(axis="both")
    plt.show()

In [None]:
# Density histogram with a KDE overlay for each target.
color = ['forestgreen', 'royalblue','indianred']
for target_name, hist_color in zip(targets_list, color):
    sns.displot(
        data=df_train,
        x=target_name,
        color=hist_color,
        kde=True,
        stat='density',
        aspect=3,
    )
plt.show()


In [None]:
# Scatter-matrix of the three targets against each other.
sns.pairplot(
    train[targets_list],
    height=4,
    aspect=1.5,
    palette='Paired',
)
plt.show()

In [None]:
# Probability (Q-Q) plot of each target, one figure per target.
# `color` is assigned as in the sibling plotting cells but is not used here.
color = ['forestgreen', 'royalblue','indianred']
for target_name in targets_list[:3]:
    fig, ax = plt.subplots(figsize=(10, 6))
    stats.probplot(train[target_name], plot=ax)
    ax.set_title(target_name)
    plt.show()

# Preprocessing

Outlier-handling experiments (currently disabled; the code in the following cells is commented out)

In [None]:
# df_train_norm = df_train_norm.drop(df_train_norm[df_train_norm.target_carbon_monoxide > 2.3].index)
# df_train_norm = df_train_norm.drop(df_train_norm[df_train_norm.target_nitrogen_oxides < 2.55].index)
# df_train_norm = df_train_norm.drop(df_train_norm[df_train_norm.target_benzene < 0.3].index)

# fig, ax = plt.subplots(figsize=(8,2))
# sns.boxplot(x=df_train_norm['target_carbon_monoxide'])

# fig, ax = plt.subplots(figsize=(8,2))
# sns.boxplot(x=df_train_norm['target_benzene'])

# fig, ax = plt.subplots(figsize=(8,2))
# sns.boxplot(x=df_train_norm['target_nitrogen_oxides'])

# plt.show()

In [None]:
# sns.pairplot(df_train_norm[targets_list],  height=3, aspect=1, palette = 'Paired')
# plt.show()

In [None]:
# print(df_train_norm.shape)
# for i in targets_list:
#     z_scores = stats.zscore(df_train_norm[i])
#     abs_z_scores = np.abs(z_scores)
#     filtered_entries = (abs_z_scores < 3)
#     df_train_norm = df_train_norm[filtered_entries]
# print(df_train_norm.shape)

In [None]:
# Keep the three target columns aside before they are dropped from the features.
target = df_train.loc[
    :,
    [
        'target_carbon_monoxide',
        'target_benzene',
        'target_nitrogen_oxides',
    ],
]
target.head()

In [None]:
# Remove the timestamp and the targets so that only model inputs remain.
df_train = df_train.drop(
    [
        'date_time',
        'target_carbon_monoxide',
        'target_benzene',
        'target_nitrogen_oxides',
    ],
    axis=1,
)

In [None]:
# One-hot encode the calendar features of the training frame.
df_train = pd.get_dummies(df_train, columns = ['month'], dtype = 'int64')

# Guarantee a dummy column for every calendar month, including months that do
# not occur in the training data. The columns are added by name and placed in
# calendar order after the remaining columns, so this no longer depends on a
# hard-coded column position and does not fail if a month column already exists.
month_cols = ['month_{}'.format(m) for m in range(1, 13)]
other_cols = [col for col in df_train.columns if col not in month_cols]
df_train = df_train.reindex(columns = other_cols + month_cols, fill_value = 0)

df_train = pd.get_dummies(df_train, columns = ['day', 'hour', 'dayofweek'], dtype = 'int64')
df_train.head()

In [None]:
# One-hot encode the calendar features of the test frame, then give it exactly
# the same columns, in the same order, as the training frame.
df_test = pd.get_dummies(df_test, columns = ['month', 'day', 'hour', 'dayofweek'], dtype = 'int64')

# Dummy columns for categories that never occur in the test period are filled
# with zeros. Any other training column missing from the test frame is a real
# problem, so fail loudly instead of silently inventing a zero-filled feature.
dummy_prefixes = ('month_', 'day_', 'hour_', 'dayofweek_')
missing_cols = [col for col in df_train.columns if col not in df_test.columns]
unexpected_missing = [col for col in missing_cols if not col.startswith(dummy_prefixes)]
if unexpected_missing:
    raise KeyError("Columns missing from the test frame: {}".format(unexpected_missing))

# reindex also drops test-only columns that the training frame does not have.
df_test = df_test.reindex(columns = df_train.columns, fill_value = 0)
df_test.head()

# Models

**auto-sklearn**

In [None]:
# !pip install auto-sklearn
# !pip install --upgrade flake8

In [None]:
# import autosklearn.regression
# import sklearn.metrics
# from autosklearn.metrics import mean_squared_log_error

In [None]:
# X_train = df_train
# x_test = df_test
# target_carbon_monoxide = target.target_carbon_monoxide.values
# target_benzene = target.target_benzene.values
# target_nitrogen_oxides = target.target_nitrogen_oxides.values


In [None]:
# automl1 = autosklearn.regression.AutoSklearnRegressor(
# time_left_for_this_task=30*60,
# n_jobs = -1,
# metric=mean_squared_log_error,
# scoring_functions=[mean_squared_log_error]
# )

# automl1.fit(X = X_train, y = target_carbon_monoxide)
# print(automl1.sprint_statistics())
# sample_submission.target_carbon_monoxide = automl1.predict(x_test)

In [None]:
# automl2 = autosklearn.regression.AutoSklearnRegressor(
# time_left_for_this_task=30*60,
# n_jobs = -1,
# metric=mean_squared_log_error,
# scoring_functions=[mean_squared_log_error]
# )

# automl2.fit(X = X_train, y = target_benzene)
# print(automl2.sprint_statistics())
# sample_submission.target_benzene = automl2.predict(x_test)

In [None]:
# automl3 = autosklearn.regression.AutoSklearnRegressor(
# time_left_for_this_task=30*60,
# n_jobs = -1,
# metric=mean_squared_log_error,
# scoring_functions=[mean_squared_log_error]
# )

# automl3.fit(X = X_train, y = target_nitrogen_oxides)
# print(automl3.sprint_statistics())
# sample_submission.target_nitrogen_oxides = automl3.predict(x_test)



In [None]:
# sample_submission.to_csv('submission.csv',index=False)
# sample_submission.shape

**H2OAutoML**

In [None]:
# H2O is imported in this cell rather than with the imports at the top of the notebook.
import h2o
from h2o.automl import H2OAutoML
# Initialise the H2O session used by the H2OFrame / H2OAutoML calls that follow.
h2o.init()
# Separate copy of the submission template to hold the H2O predictions.
sample_submissionh2o = sample_submission.copy()

In [None]:
# Aliases for the encoded feature frames, plus each target as a NumPy array.
X_train, x_test = df_train, df_test
target_carbon_monoxide = target['target_carbon_monoxide'].to_numpy()
target_benzene = target['target_benzene'].to_numpy()
target_nitrogen_oxides = target['target_nitrogen_oxides'].to_numpy()


In [None]:
# H2O frames for the carbon-monoxide model: training features joined with that
# one target, and the test features. `x` lists the predictors, `y` the response.
y = "target_carbon_monoxide"
X_train1 = X_train.join(target[y])
htrain = h2o.H2OFrame(X_train1)
htest = h2o.H2OFrame(x_test)
x = [col for col in htrain.columns if col != y]

In [None]:
%%time 
aml1 = H2OAutoML(max_runtime_secs = 3600, 
                 seed = 1,
                 include_algos = ['GBM', 'StackedEnsemble'])
aml1.train(x=x, y =y, training_frame=htrain)

In [None]:
# Show the leaderboard of models produced by the first AutoML run.
lb = aml1.leaderboard
lb

In [None]:
# Predict carbon monoxide for the test frame and store it in the submission.
pred1 = aml1.predict(htest)
pred1 = pred1.as_data_frame()

# Take the prediction column explicitly and assign it by position. This avoids
# relying on pandas accepting a whole DataFrame as a column value and on index
# alignment between the two frames.
assert len(pred1) == len(sample_submissionh2o), "prediction/submission row count mismatch"
sample_submissionh2o['target_carbon_monoxide'] = pred1.iloc[:, 0].to_numpy()

**Target 2: benzene**

In [None]:
# H2O frames for the benzene model: training features joined with that one
# target, and the test features. `x` lists the predictors, `y` the response.
y = "target_benzene"
X_train2 = X_train.join(target[y])
htrain = h2o.H2OFrame(X_train2)
htest = h2o.H2OFrame(x_test)
x = [col for col in htrain.columns if col != y]

In [None]:
%%time 
aml2 = H2OAutoML(max_runtime_secs = 3600, 
                 seed = 1,
                 include_algos = ['GBM','StackedEnsemble'])
aml2.train(x=x, y =y, training_frame=htrain)

In [None]:
# Show the leaderboard of models produced by the second AutoML run.
lb = aml2.leaderboard
lb

In [None]:
# Predict benzene for the test frame and store it in the submission.
pred2 = aml2.predict(htest)
pred2 = pred2.as_data_frame()

# Take the prediction column explicitly and assign it by position. This avoids
# relying on pandas accepting a whole DataFrame as a column value and on index
# alignment between the two frames.
assert len(pred2) == len(sample_submissionh2o), "prediction/submission row count mismatch"
sample_submissionh2o['target_benzene'] = pred2.iloc[:, 0].to_numpy()

**Target 3: nitrogen oxides**

In [None]:
# H2O frames for the nitrogen-oxides model: training features joined with that
# one target, and the test features. `x` lists the predictors, `y` the response.
y = "target_nitrogen_oxides"
X_train3 = X_train.join(target[y])
htrain = h2o.H2OFrame(X_train3)
htest = h2o.H2OFrame(x_test)
x = [col for col in htrain.columns if col != y]

In [None]:
# AutoML search restricted to GBMs and stacked ensembles, capped at 3600 s,
# trained on `htrain` for the response currently named in `y`.
aml3 = H2OAutoML(
    max_runtime_secs=3600,
    seed=1,
    include_algos=['GBM', 'StackedEnsemble'],
)
aml3.train(x=x, y=y, training_frame=htrain)

In [None]:
# Show the leaderboard of models produced by the third AutoML run.
lb = aml3.leaderboard
lb

In [None]:
# Predict nitrogen oxides for the test frame and store it in the submission.
pred3 = aml3.predict(htest)
pred3 = pred3.as_data_frame()

# Take the prediction column explicitly and assign it by position. This avoids
# relying on pandas accepting a whole DataFrame as a column value and on index
# alignment between the two frames.
assert len(pred3) == len(sample_submissionh2o), "prediction/submission row count mismatch"
sample_submissionh2o['target_nitrogen_oxides'] = pred3.iloc[:, 0].to_numpy()

In [None]:
# Write the H2O predictions to the submission file, without the index column.
sample_submissionh2o.to_csv('submissionh2o.csv',index=False)