In [None]:
import numpy as np
import seaborn as sns
from scipy import stats
from scipy.stats import norm
import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import time

import warnings
warnings.filterwarnings(action='ignore')

## XGBoost
import xgboost as xgb
from xgboost.sklearn import XGBRegressor

<font size="5">Importing the Dataset</font>

In [None]:
# Read the competition's train and test splits from the Kaggle input directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/tabular-playground-series-jul-2021/train.csv',
        '/kaggle/input/tabular-playground-series-jul-2021/test.csv',
    )
)

In [None]:
# DataFrame.info() writes its summary to stdout and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line after each report.
train_df.info()
print("*********")
test_df.info()

<font size="5">The Dataset does not have any empty cells and all the variables are numerical. That is one less thing to worry about. :)</font>

In [None]:
# Show (rows, columns) for the train frame, then the test frame, one per line.
print(train_df.shape, test_df.shape, sep='\n')

<font size="5">Let us begin by exploring the target variables and observe their distribution in the training data.</font>

In [None]:
# Raw carbon-monoxide target: distribution with a fitted normal curve overlaid,
# then a probability plot drawn on a separate, fresh figure.
co_raw = train_df['target_carbon_monoxide']
sns.distplot(co_raw, fit=norm)
fig = plt.figure()
res = stats.probplot(co_raw, plot=plt)

<font size='5'>The distribution deviates from a normal distribution. Let's check the distribution after taking the log of this target variable.</font>

In [None]:
# Store the natural log of the carbon-monoxide target as a new column, then repeat
# the distribution plot and the probability plot on the transformed values.
train_df['log_target_carbon_monoxide'] = np.log(train_df['target_carbon_monoxide'])
co_log = train_df['log_target_carbon_monoxide']
sns.distplot(co_log, fit=norm)
fig = plt.figure()
res = stats.probplot(co_log, plot=plt)

<font size='5'>The log of the carbon monoxide target looks closer to a normal distribution.</font>

In [None]:
# Raw benzene target: distribution with a fitted normal curve overlaid,
# then a probability plot drawn on a separate, fresh figure.
benzene_raw = train_df['target_benzene']
sns.distplot(benzene_raw, fit=norm)
fig = plt.figure()
res = stats.probplot(benzene_raw, plot=plt)

In [None]:
# Store the natural log of the benzene target as a new column, then repeat
# the distribution plot and the probability plot on the transformed values.
train_df['log_target_benzene'] = np.log(train_df['target_benzene'])
benzene_log = train_df['log_target_benzene']
sns.distplot(benzene_log, fit=norm)
fig = plt.figure()
res = stats.probplot(benzene_log, plot=plt)

<font size="5">The distribution for benzene does not seem normal even for its Log values.</font>

In [None]:
# Raw nitrogen-oxides target: distribution with a fitted normal curve overlaid,
# then a probability plot drawn on a separate, fresh figure.
nox_raw = train_df['target_nitrogen_oxides']
sns.distplot(nox_raw, fit=norm)
fig = plt.figure()
res = stats.probplot(nox_raw, plot=plt)

In [None]:
# Store the natural log of the nitrogen-oxides target as a new column, then repeat
# the distribution plot and the probability plot on the transformed values.
train_df['log_target_nitrogen_oxides'] = np.log(train_df['target_nitrogen_oxides'])
nox_log = train_df['log_target_nitrogen_oxides']
sns.distplot(nox_log, fit=norm)
fig = plt.figure()
res = stats.probplot(nox_log, plot=plt)

<font size="5">For nitrogen oxides, the log values appear to follow a normal distribution.</font>

In [None]:
# Names of the input (feature) columns explored and plotted below.
independent_variables = [
    'deg_C',
    'relative_humidity',
    'absolute_humidity',
    'sensor_1',
    'sensor_2',
    'sensor_3',
    'sensor_4',
    'sensor_5',
]

In [None]:
# deg_C in the training data: distribution with a fitted normal curve overlaid.
sns.distplot(a=train_df["deg_C"], fit=norm)

deg_C is similar to a normal distribution

In [None]:
# relative_humidity in the training data: distribution with a fitted normal curve overlaid.
sns.distplot(a=train_df["relative_humidity"], fit=norm)

relative_humidity is similar to normal distribution

In [None]:
# absolute_humidity in the training data: distribution with a fitted normal curve overlaid.
sns.distplot(a=train_df["absolute_humidity"], fit=norm)

absolute_humidity is similar to normal distribution

In [None]:
# sensor_1 in the training data: distribution with a fitted normal curve overlaid.
sns.distplot(a=train_df["sensor_1"], fit=norm)

sensor_1 is not similar to normal distribution. It is a little left skewed.

In [None]:
# sensor_2 in the training data: distribution with a fitted normal curve overlaid.
sns.distplot(a=train_df["sensor_2"], fit=norm)

sensor_2 is similar to normal distribution

In [None]:
# sensor_3 in the training data: distribution with a fitted normal curve overlaid.
sns.distplot(a=train_df["sensor_3"], fit=norm)

sensor_3 is not similar to normal distribution. It is a little left skewed

In [None]:
# sensor_4 in the training data: distribution with a fitted normal curve overlaid.
sns.distplot(a=train_df["sensor_4"], fit=norm)

sensor_4 is similar to normal distribution

In [None]:
# sensor_5 in the training data: distribution with a fitted normal curve overlaid.
sns.distplot(a=train_df["sensor_5"], fit=norm)

sensor_5 is not similar to normal distribution. It is left skewed

<font size="5"> Now we will have a look at the distribution of independent variables in the train and test dataset.</font>

In [None]:
# For every feature, overlay the train and test histograms (semi-transparent)
# on one figure. The loop index was never used, so the list is iterated directly.
for feature in independent_variables:
    plt.hist(train_df[feature], bins=30, alpha=0.5, label='Train set')
    plt.hist(test_df[feature], bins=30, alpha=0.5, label='Test set')
    plt.title(feature + " Train/Test")
    plt.xlabel(feature)
    plt.ylabel('Frequency')

    plt.legend()
    # One figure per feature: show() flushes the current figure before the next iteration.
    plt.show()

In [None]:
# Correlate the numeric columns only. The frame is read from CSV without date
# parsing, so it presumably still holds a string `date_time` column (test_df has
# one) — TODO confirm. On pandas >= 2.0, DataFrame.corr() raises on such a column
# instead of silently dropping it; older versions give the same result either way.
corr_mat = train_df.select_dtypes(include='number').corr()
cols = corr_mat.index
f, ax = plt.subplots(figsize=(12, 9))
# Draw on the axes created above rather than relying on the implicit current axes.
hm = sns.heatmap(corr_mat, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10}, yticklabels=cols.values, xticklabels=cols.values, ax=ax)

### deg_C and relative_humidity both have negative correlation and both have low correlation with target variables
### absolute_humidity has low correlation with the targets and all other variables, except a moderate correlation
### with sensor_4 and deg_C
### sensor 1 through 5 have a significant correlation with each other and with the target variables
### sensor 1, 2, 4, 5 have a positive correlation with target
### sensor 3 has a negative correlation with target

In [None]:
# Joint plot of the carbon-monoxide target (x) against the benzene target (y).
sns.jointplot(x="target_carbon_monoxide", y="target_benzene", data=train_df)

### A strong correlation exists between benzene and carbon monoxide

In [None]:
# Joint plot of the carbon-monoxide target (x) against the nitrogen-oxides target (y).
sns.jointplot(x="target_carbon_monoxide", y="target_nitrogen_oxides", data=train_df)

### A strong correlation exists between nitrogen oxides and carbon monoxide

In [None]:
# Joint plot of the benzene target (x) against the nitrogen-oxides target (y).
sns.jointplot(x="target_benzene", y="target_nitrogen_oxides", data=train_df)

### Up to a benzene value of about 30, there is a strong correlation

In [None]:
# Pairwise plots across the training frame (including the log columns added above).
sns.pairplot(data=train_df)

## Let's train a model!!

In [None]:
# Feature columns are every test column after the first (presumably the first is
# `date_time` — verify); the same names select the matching training columns.
columns = test_df.columns[1:]
X_train, X_test = (frame[columns].values for frame in (train_df, test_df))

In [None]:
# One column vector (n_rows, 1) per pollutant target, in the order CO, benzene, NOx.
target_1, target_2, target_3 = (
    train_df[target_name].values.reshape(-1, 1)
    for target_name in (
        'target_carbon_monoxide',
        'target_benzene',
        'target_nitrogen_oxides',
    )
)

In [None]:
# Fit a default XGBRegressor on each target in turn and predict on the test features.
# The estimator gets its own name: binding it to `xgb` would overwrite the
# `import xgboost as xgb` module alias for the rest of the notebook.
regressor = XGBRegressor()

# Carbon monoxide
regressor.fit(X_train, target_1)
test_target_1 = regressor.predict(X_test)

# Benzene (the same estimator object is refit on the new target)
regressor.fit(X_train, target_2)
test_target_2 = regressor.predict(X_test)

# Nitrogen oxides
regressor.fit(X_train, target_3)
test_target_3 = regressor.predict(X_test)

In [None]:
# Assemble the submission frame: the test timestamps plus one prediction column
# per target, in the same column order as before.
final = pd.DataFrame({
    'date_time': test_df['date_time'],
    'target_carbon_monoxide': test_target_1,
    'target_benzene': test_target_2,
    'target_nitrogen_oxides': test_target_3,
})

final.head()

In [None]:
# Tag the output file with the current Unix time, keeping only the part before
# the decimal point, and write the submission without the index column.
now = str(time.time()).partition('.')[0]
submission_name = 'submit-' + now + '.csv'
final.to_csv(submission_name, index=False)