In [None]:
# Turn off Jedi-based tab completion for this IPython session.
%config IPCompleter.use_jedi = False
# List the files in the current working directory.
!ls

### Import relevant packages

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import *
from sklearn.ensemble import *
from sklearn.model_selection import *
from sklearn.metrics import *
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import matplotlib
# Reset matplotlib's rcParams to the library defaults.
matplotlib.rcdefaults()
# Last expression of the cell: displays the names of the available style sheets.
plt.style.available


### Inspection of Data

In [None]:
def _read_competition_csv(csv_path):
    """Read one competition CSV, asking pandas to parse `date_time` as dates."""
    return pd.read_csv(csv_path, parse_dates=['date_time'])


# Raw (unmodified) train and test frames; later cells only read from them.
train_ori = _read_competition_csv('../input/tabular-playground-series-jul-2021/train.csv')
test_ori = _read_competition_csv('../input/tabular-playground-series-jul-2021/test.csv')

# Rich display of both frames as the cell output.
display(train_ori, test_ori)

### Categorization of columns into feature and Target based on project description
Some columns are treated as discrete variables and some as continuous variables. The columns are split this way so that the relationship of each group with the targets can be explored via EDA.

In [None]:
# Display the column names of the training frame.
train_ori.columns

In [None]:
# Column groups referenced by the EDA and modelling cells.

# The three columns to predict.
target_col = [
    'target_carbon_monoxide',
    'target_benzene',
    'target_nitrogen_oxides',
]

# NOTE(review): temperature and humidity are grouped as "discrete" here although
# they look like continuous measurements -- the name is kept as-is because other
# cells reference it.
discrete_col = [
    'deg_C',
    'relative_humidity',
    'absolute_humidity',
]

# Sensor readings (identifier spelling kept unchanged for the cells that use it).
continous_col = [
    'sensor_1',
    'sensor_2',
    'sensor_3',
    'sensor_4',
    'sensor_5',
]

# Model inputs: the weather columns followed by the sensor columns (a new list).
feature_col = discrete_col + continous_col

In [None]:
# Plot each column of `discrete_col` (grid rows) against each target (grid
# columns) as a bivariate histogram.
# squeeze=False keeps `ax` two-dimensional even when one of the lists has a
# single entry, so the `ax[i, j]` indexing below always works.
fig, ax = plt.subplots(len(discrete_col), len(target_col), figsize=(14, 14), squeeze=False)

for i, feature_name in enumerate(discrete_col):
    for j, target_name in enumerate(target_col):
        sns.histplot(x=feature_name, y=target_name, data=train_ori, ax=ax[i, j])

# Lay the grid out once, after every panel has been drawn, instead of
# recomputing the layout on each loop iteration.
fig.tight_layout()

In [None]:
# Distribution of each column in `discrete_col`: one 20-bin seaborn displot per column.
for column_name in discrete_col:
    sns.displot(train_ori[column_name], bins=20)

In [None]:
# Correlation heatmap of the model inputs and the three targets.
# `train_ori` also contains the parsed `date_time` column, and the way
# DataFrame.corr() treats non-numeric columns changed in pandas 2.0
# (`numeric_only` now defaults to False). Selecting the numeric columns by name
# keeps the heatmap identical on every pandas version.
sns.heatmap(train_ori[feature_col + target_col].corr(), cmap='Greys')

Most of the features appear to have a second-degree (quadratic) relationship with all target variables, except sensor 3, which has a logarithmic relationship.

In [None]:
# Plot each sensor column (grid rows) against each target (grid columns) as a
# bivariate histogram.
# squeeze=False keeps `ax` two-dimensional even when one of the lists has a
# single entry, so the `ax[i, j]` indexing below always works.
fig, ax = plt.subplots(len(continous_col), len(target_col), figsize=(14, 14), squeeze=False)

for i, sensor_name in enumerate(continous_col):
    for j, target_name in enumerate(target_col):
        sns.histplot(x=sensor_name, y=target_name, data=train_ori, ax=ax[i, j])

# Lay the grid out once, after every panel has been drawn, instead of
# recomputing the layout on each loop iteration.
fig.tight_layout()

### Try to predict using Random Forest Regressor with default parameters.

In [None]:
# Baseline: one RandomForestRegressor per target, default hyper-parameters.

# Seed shared by the hold-out split and the forests. Without a fixed
# random_state on the forest, the printed metrics (and the submission built
# from these models) change on every run.
SEED = 0


def fit_basic_rf(features, target, label, seed=SEED):
    """Fit a default RandomForestRegressor on a 75/25 split and print hold-out metrics.

    Parameters
    ----------
    features : DataFrame
        Model inputs.
    target : Series
        Target values aligned with `features`.
    label : str
        Short target name inserted into the printed report lines.
    seed : int
        `random_state` used for both `train_test_split` and the forest.

    Returns
    -------
    tuple
        (model, X_train, X_test, y_train, y_test, y_predict, score, msle), where
        `score` is `model.score` on the hold-out part (R^2 for scikit-learn
        regressors) and `msle` is the mean squared log error on the same part.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.25, random_state=seed
    )
    model = RandomForestRegressor(random_state=seed).fit(X_train, y_train)
    y_predict = model.predict(X_test)
    score = model.score(X_test, y_test)
    msle = mean_squared_log_error(y_test, y_predict)
    print(f"Score for {label} RandomForestRegressor basic:", score)
    print(f"MSLE for {label} prediction:", msle)
    return model, X_train, X_test, y_train, y_test, y_predict, score, msle


X = train_ori[feature_col]
y_co = train_ori[target_col[0]]
y_bz = train_ori[target_col[1]]
y_no = train_ori[target_col[2]]

# Every name the original cell defined is still bound here, so cells that read
# the models, splits or metrics keep working.
(RF_co, X_train_co, X_test_co, y_train_co, y_test_co,
 y_predict_co, score_co_basic, MSLE_co_RF_basic) = fit_basic_rf(X, y_co, "CO")

(RF_bz, X_train_bz, X_test_bz, y_train_bz, y_test_bz,
 y_predict_bz, score_bz_basic, MSLE_bz_RF_basic) = fit_basic_rf(X, y_bz, "Benzene")

(RF_no, X_train_no, X_test_no, y_train_no, y_test_no,
 y_predict_no, score_no_basic, MSLE_no_RF_basic) = fit_basic_rf(X, y_no, "NO")



The scores and MSLE values seem acceptable.

In [None]:
# Predict the three targets for the competition test set.
# The inputs are selected by name with the same `feature_col` list used to fit
# the models, rather than by position ("every column after the first"). The
# models then always receive the columns they were trained on, in the same
# order, even if the test file gains or reorders columns.
X_realtest = test_ori[feature_col]
y_realtest_co = RF_co.predict(X_realtest)
y_realtest_bz = RF_bz.predict(X_realtest)
y_realtest_no = RF_no.predict(X_realtest)

In [None]:
# Assemble the submission frame: the test timestamps followed by one
# prediction column per target, in the same column order as before.
final = pd.DataFrame({'date_time': test_ori['date_time'].values}).assign(
    target_carbon_monoxide=y_realtest_co,
    target_benzene=y_realtest_bz,
    target_nitrogen_oxides=y_realtest_no,
)

In [None]:
# Write the submission file to the working directory, without the index column.
final.to_csv(path_or_buf='submission.csv', index=False)