In [None]:
import pandas as pd
import numpy as np

# Read the competition's training and test tables from the Kaggle input mount.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/tabular-playground-series-jul-2021/train.csv',
        '/kaggle/input/tabular-playground-series-jul-2021/test.csv',
    )
)

In [None]:
# Preview the first five rows of the training table.
train.head(n=5)

In [None]:
# Preview the first five rows of the test table.
test.head(n=5)

In [None]:
# Load the sample submission; the model predictions are later written into
# this frame's target columns.
# The absolute Kaggle input mount is used (the same root as the train/test
# reads) so the load does not depend on the notebook's working directory.
sub = pd.read_csv('/kaggle/input/tabular-playground-series-jul-2021/sample_submission.csv')
sub.head()

In [None]:
# Row/column counts of the training and test tables, one per line.
print(train.shape, test.shape, sep='\n')

In [None]:
# Plotting setup: silence warnings, then import the static plotting libraries.
import warnings
# Installs a blanket "ignore" filter, so every warning raised after this line
# (including during the imports below) is suppressed.
# NOTE(review): this also hides deprecation warnings — consider narrowing it.
warnings.filterwarnings(action='ignore')
import seaborn as sns
import matplotlib.pyplot as plt
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

# Data exploration and relations with scatter matrix and jointplots

In [None]:
import plotly.express as px
# Carbon monoxide against absolute humidity; marker colour and the hover
# tooltip both repeat the carbon monoxide value.
fig = px.scatter(
    data_frame=train,
    x="absolute_humidity",
    y='target_carbon_monoxide',
    color="target_carbon_monoxide",
    hover_data=['target_carbon_monoxide'],
)
fig.show()

When absolute humidity is above 2, the carbon monoxide level is very low, but when absolute humidity is in the range 1 to 1.5, the carbon monoxide level reaches as high as 12.5.

Move the cursor around the diagram to get the details.

In [None]:
# Carbon monoxide against temperature, coloured by the carbon monoxide value;
# relative humidity is added to the hover tooltip.
fig = px.scatter(
    data_frame=train,
    x="deg_C",
    y='target_carbon_monoxide',
    color="target_carbon_monoxide",
    hover_data=['relative_humidity'],
)
fig.show()

As the temperature rises above 30 degrees, the level of carbon monoxide decreases. The level of carbon monoxide is highest between 10 and 20 degrees.

In [None]:
# Line chart of the carbon monoxide target along the date_time column.
fig = px.line(
    data_frame=train,
    x='date_time',
    y='target_carbon_monoxide',
)
fig.show()

Carbon monoxide reaches its highest level on Nov 23, 2010, with a value of 12.5.
For the majority of readings, the carbon monoxide level is 6 or below.

In [None]:
# Benzene against temperature, coloured by the benzene value; relative
# humidity is added to the hover tooltip.
fig = px.scatter(
    data_frame=train,
    x="deg_C",
    y='target_benzene',
    color="target_benzene",
    hover_data=['relative_humidity'],
)
fig.show()

Benzene is highest between 10 and 30 degrees Celsius. A maximum of 50.5 can be seen at temperatures 23.4 and 12.9, with relative humidity 60.3 and 74.1 respectively.

In [None]:
# Pairwise scatter matrix of the five sensor columns, coloured by benzene.
fig = px.scatter_matrix(
    data_frame=train,
    color="target_benzene",
    dimensions=["sensor_1", "sensor_2", "sensor_3", "sensor_4", "sensor_5"],
)
fig.show()

The higher the values of sensors 1, 2, 4 and 5, the higher the benzene level.
But the lower the value of sensor 3, the higher the benzene level.

In [None]:
# Pairwise scatter matrix of the five sensor columns, coloured by carbon monoxide.
fig = px.scatter_matrix(
    data_frame=train,
    color="target_carbon_monoxide",
    dimensions=["sensor_1", "sensor_2", "sensor_3", "sensor_4", "sensor_5"],
)
fig.show()

Move the cursor around the diagram to get the values.

In [None]:
# Pairwise scatter matrix of the five sensor columns, coloured by nitrogen oxides.
fig = px.scatter_matrix(
    data_frame=train,
    color="target_nitrogen_oxides",
    dimensions=["sensor_1", "sensor_2", "sensor_3", "sensor_4", "sensor_5"],
)
fig.show()

In [None]:
# Joint distribution of the carbon monoxide and benzene targets.
sns.jointplot(x="target_carbon_monoxide", y="target_benzene", data=train)

Carbon monoxide and benzene have nearly perfect correlation

In [None]:
# Joint distribution of the carbon monoxide and nitrogen oxides targets.
sns.jointplot(x="target_carbon_monoxide", y="target_nitrogen_oxides", data=train)

A strong correlation exists between carbon monoxide and nitrogen oxides

In [None]:
# Joint distribution of the benzene and nitrogen oxides targets.
sns.jointplot(x="target_benzene", y="target_nitrogen_oxides", data=train)

Up to a benzene level of 30, there is a very strong correlation between benzene and nitrogen oxides.

Different Carbon Monoxide level in different range.

# Violin plots and boxplots

In [None]:
# One violin per sensor column of the training set, drawn on a fresh axes.
f, ax = plt.subplots()
sns.violinplot(
    data=train[["sensor_1", "sensor_2", "sensor_3", "sensor_4", "sensor_5"]],
    ax=ax,
)
# Offset and trim the remaining spines; the trailing ';' hides the cell's repr.
sns.despine(ax=ax, offset=10, trim=True);

Violin plots of sensors in train set

In [None]:
# One violin per sensor column of the test set, drawn on a fresh axes.
f, ax = plt.subplots()
sns.violinplot(
    data=test[["sensor_1", "sensor_2", "sensor_3", "sensor_4", "sensor_5"]],
    ax=ax,
)
# Offset and trim the remaining spines; the trailing ';' hides the cell's repr.
sns.despine(ax=ax, offset=10, trim=True);

There is no considerable difference between the train and test sensor plots.

In [None]:
# Horizontal box plot of the carbon monoxide target with every point overlaid.
fig = px.box(
    data_frame=train,
    x="target_carbon_monoxide",
    points="all",
)
fig.show()

In [None]:
# Horizontal box plot of the benzene target with every point overlaid.
fig = px.box(
    data_frame=train,
    x="target_benzene",
    points="all",
)
fig.show()

In [None]:
# Vertical box plot (value on the y axis) of the nitrogen oxides target with
# every point overlaid.
fig = px.box(
    data_frame=train,
    y="target_nitrogen_oxides",
    points="all",
)
fig.show()

# Distribution and KDE plots of train

In [None]:
# Histogram with KDE overlay of sensor_1 in the training set.
sns.displot(data=train, x="sensor_1", color='r', kde=True)

In [None]:
# Histogram with KDE overlay of sensor_2 in the training set.
sns.displot(data=train, x="sensor_2", color='b', kde=True)

In [None]:
# Histogram with KDE overlay of sensor_3 in the training set.
sns.displot(data=train, x="sensor_3", color='y', kde=True)

In [None]:
# Histogram with KDE overlay of sensor_4 in the training set.
sns.displot(data=train, x="sensor_4", color='g', kde=True)

In [None]:
# Histogram with KDE overlay of sensor_5 in the training set.
sns.displot(data=train, x="sensor_5", color='k', kde=True)

In [None]:
# Histogram with KDE overlay of the carbon monoxide target.
sns.displot(data=train, x="target_carbon_monoxide", color='b', kde=True)

In [None]:
# Histogram with KDE overlay of the benzene target.
sns.displot(data=train, x="target_benzene", color='c', kde=True)

In [None]:
# Histogram with KDE overlay of the nitrogen oxides target.
sns.displot(data=train, x="target_nitrogen_oxides", color='r', kde=True)

In [None]:
# KDE-style joint plot of absolute vs. relative humidity in the training set.
sns.jointplot(
    kind='kde',
    data=train,
    x="absolute_humidity",
    y="relative_humidity",
)

Both absolute humidity and relative humidity form close to normal distributions.

In [None]:
# Default joint plot of absolute vs. relative humidity in the training set.
sns.jointplot(
    x="absolute_humidity",
    y="relative_humidity",
    data=train,
)

A complete relationship can be seen between relative humidity and absolute humidity with their proper kde plots

# Pairplot

In [None]:
# Pairwise plots over the columns of the training set.
sns.pairplot(data=train)

All the relations with their correlations and distributions can be seen here.

# Distribution plots and KDE of test

In [None]:
# Pairwise plots over the columns of the test set.
sns.pairplot(data=test)

In [None]:
# Histogram with KDE overlay of sensor_1 in the test set.
# This cell sits under the test-set section, so it must plot `test`, not `train`.
sns.displot(test, x="sensor_1", kde = True, color ='g')

In [None]:
# Histogram with KDE overlay of sensor_2 in the test set.
# This cell sits under the test-set section, so it must plot `test`, not `train`.
sns.displot(test, x="sensor_2", kde = True, color ='b')

In [None]:
# Histogram with KDE overlay of sensor_3 in the test set.
# This cell sits under the test-set section, so it must plot `test`, not `train`.
sns.displot(test, x="sensor_3", kde = True, color ='c')

In [None]:
# Histogram with KDE overlay of sensor_4 in the test set.
# This cell sits under the test-set section, so it must plot `test`, not `train`.
sns.displot(test, x="sensor_4", kde = True, color ='y')

In [None]:
# Histogram with KDE overlay of sensor_5 in the test set.
# This cell sits under the test-set section, so it must plot `test`, not `train`.
sns.displot(test, x="sensor_5", kde = True, color ='r')

In [None]:
# Histogram with KDE overlay of temperature (deg_C) in the test set.
# This cell sits under the test-set section, so it must plot `test`, not `train`.
# NOTE(review): assumes the test table has a deg_C column like train — confirm.
sns.displot(test, x="deg_C", kde = True, color ='k')

In [None]:
# Default joint plot of absolute vs. relative humidity in the test set.
sns.jointplot(
    x="absolute_humidity",
    y="relative_humidity",
    data=test,
)

In the test set too, we can see that there is a relationship between absolute humidity and relative humidity.

# Modelling

In [None]:
# Feature columns are every test column except the first one
# (presumably `date_time` — verify against the test table).
columns = test.columns[1:]
X, X_test = train[columns].to_numpy(), test[columns].to_numpy()

# Three targets are predicted separately; each one is pulled out of the
# training table as an (n_rows, 1) column vector.
target_1, target_2, target_3 = (
    train[target_name].to_numpy().reshape(-1, 1)
    for target_name in (
        'target_carbon_monoxide',
        'target_benzene',
        'target_nitrogen_oxides',
    )
)

In [None]:
from xgboost import XGBRegressor

In [None]:
#Helpful code snippet by Grandmaster Bojan Tunguz
xgb = XGBRegressor()
xgb.fit(X, target_1)
sub['target_carbon_monoxide'] = xgb.predict(X_test)

xgb.fit(X, target_2)
sub['target_benzene'] = xgb.predict(X_test)

xgb.fit(X, target_3)
sub['target_nitrogen_oxides'] = xgb.predict(X_test)

sub.head()

In [None]:
# Write the filled-in submission frame to CSV without the index column.
sub.to_csv(path_or_buf='submission with XGBoost.csv', index=False)

# Upvote if you like it or fork it :)