<img src="https://carbontracker.org/wp-content/uploads/2019/08/air-pollution-chimney-clouds-459728-1024x683.jpg">

```

1. Situation
Data on the daily timeseries relating to weather information (temperature and humidity) and the input values of 5 sensors. Correspondingly, time series for the target_carbon_monoxide, target_benzene, and target_nitrogen_oxides are provided.

2. Task
Predict the values of target_carbon_monoxide, target_benzene, and target_nitrogen_oxides using weather information and sensor data 

3. Action
- Exploratory Data Analysis
- Feature Engineering
- Fine-tuning, selection and training of ML models
- Testing on the test set
   
4. Result
- Achieve a highly performing predictive model which can help understand the interaction between pollution and the atmosphere
- Interpret the model to gain trust and look for ways to improve performance (Feature elimination)
- Gain an understanding of the system through the ML model
```

# Libraries

In [None]:
import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt
from matplotlib import dates
%matplotlib inline

import seaborn as sns
sns.set_style("darkgrid")

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, learning_curve, GridSearchCV,KFold
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, VotingRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import r2_score, mean_squared_error

from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

!pip install shap
import shap
shap.initjs()

import ipywidgets as widgets

from statsmodels.graphics.tsaplots import plot_acf,plot_pacf

# Reading and understanding training data

In [None]:
# Load the training split of the Tabular Playground Series (July 2021) from the Kaggle input folder.
train = pd.read_csv(
    "../input/tabular-playground-series-jul-2021/train.csv"
)
# Preview the first five rows.
train.head()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns of `train`.
train.describe()

- Temperature between two extremes
- No clear pattern yet

In [None]:
# Show the dtype pandas inferred for every column when reading the CSV.
print(train.dtypes)

In [None]:
# Count missing values per column; a non-zero entry would call for imputation or row removal.
print("Null values in Features :\n", train.isnull().sum())

- No null values

In [None]:
# Normalise the timestamp column name and parse it into real datetimes.
train.rename(columns={"date_time": "datetime"}, inplace=True)
train["datetime"] = pd.to_datetime(train["datetime"])

# Calendar labels used by the later analysis and as model features.
timestamp = train["datetime"]
train["Day"] = timestamp.dt.day_name()      # e.g. "Monday"
train["Month"] = timestamp.dt.month_name()  # e.g. "March"
train.head()

# EDA of features
### Analyse behaviour of atmospheric variables over the weekdays

- Since, it is a time series, let's analyse the change of the features with time
- Plot of deg_C over the weekdays

In [None]:
# Overlay deg_C for 10-16 March 2010 on one axis, one line per calendar day.
fig, ax = plt.subplots(figsize=(20, 10))
ax.set_title("Plot of deg_C with time in the first week")

for day in range(10, 17):
    # Inclusive window from 00:00 to 23:00 of the given day.
    in_day = train.datetime.between(f"2010-03-{day} 00:00:00", f"2010-03-{day} 23:00:00")
    one_day = train[in_day]
    sns.lineplot(data=one_day, x="datetime", y="deg_C", ax=ax)
    # Re-applied after each draw: a tick every 12 hours, labelled hour-minute.
    ax.xaxis.set_major_formatter(dates.DateFormatter('%H-%M'))
    ax.xaxis.set_major_locator(dates.HourLocator(interval=12))

plt.show()

In [None]:
# Overlay deg_C for 17-23 March 2010 on one axis, one line per calendar day.
fig, ax = plt.subplots(figsize=(20, 10))
ax.set_title("Plot of deg_C with time in the second week")

for day in range(17, 24):
    # Inclusive window from 00:00 to 23:00 of the given day.
    in_day = train.datetime.between(f"2010-03-{day} 00:00:00", f"2010-03-{day} 23:00:00")
    one_day = train[in_day]
    sns.lineplot(data=one_day, x="datetime", y="deg_C", ax=ax)
    # Re-applied after each draw: a tick every 12 hours, labelled hour-minute.
    ax.xaxis.set_major_formatter(dates.DateFormatter('%H-%M'))
    ax.xaxis.set_major_locator(dates.HourLocator(interval=12))

plt.show()

- The temperature rises in the morning to a peak in the afternoon, followed by a drop at night
- Also, a cyclical function of time
- Similarly, let's plot for relative_humidity, absolute_humidity

In [None]:
# Overlay relative_humidity for 10-16 March 2010 on one axis, one line per calendar day.
fig, ax = plt.subplots(figsize=(20, 10))
ax.set_title("Plot of relative_humidity with time in the first week")

for day in range(10, 17):
    # Inclusive window from 00:00 to 23:00 of the given day.
    in_day = train.datetime.between(f"2010-03-{day} 00:00:00", f"2010-03-{day} 23:00:00")
    one_day = train[in_day]
    sns.lineplot(data=one_day, x="datetime", y="relative_humidity", ax=ax)
    # Re-applied after each draw: a tick every 12 hours, labelled hour-minute.
    ax.xaxis.set_major_formatter(dates.DateFormatter('%H-%M'))
    ax.xaxis.set_major_locator(dates.HourLocator(interval=12))

plt.show()

- Spike (at 2) and drop (at around 14)

In [None]:
# Overlay absolute_humidity for 10-16 March 2010 on one axis, one line per calendar day.
fig, ax = plt.subplots(figsize=(20, 10))
ax.set_title("Plot of absolute_humidity with time in the first week")

for day in range(10, 17):
    # Inclusive window from 00:00 to 23:00 of the given day.
    in_day = train.datetime.between(f"2010-03-{day} 00:00:00", f"2010-03-{day} 23:00:00")
    one_day = train[in_day]
    sns.lineplot(data=one_day, x="datetime", y="absolute_humidity", ax=ax)
    # Re-applied after each draw: a tick every 12 hours, labelled hour-minute.
    ax.xaxis.set_major_formatter(dates.DateFormatter('%H-%M'))
    ax.xaxis.set_major_locator(dates.HourLocator(interval=12))

plt.show()

- No clear pattern as such

### Analyse behaviour of sensor data over the weekdays

In [None]:
# Overlay sensor_1 for 10-16 March 2010 on one axis, one line per calendar day.
fig, ax = plt.subplots(figsize=(20, 10))
ax.set_title("Plot of sensor_1 with time in the first week")

for day in range(10, 17):
    # Inclusive window from 00:00 to 23:00 of the given day.
    in_day = train.datetime.between(f"2010-03-{day} 00:00:00", f"2010-03-{day} 23:00:00")
    one_day = train[in_day]
    sns.lineplot(data=one_day, x="datetime", y="sensor_1", ax=ax)
    # Re-applied after each draw: a tick every 12 hours, labelled hour-minute.
    ax.xaxis.set_major_formatter(dates.DateFormatter('%H-%M'))
    ax.xaxis.set_major_locator(dates.HourLocator(interval=12))

plt.show()

In [None]:
# Overlay sensor_2 for 10-16 March 2010 on one axis, one line per calendar day.
fig, ax = plt.subplots(figsize=(20, 10))
ax.set_title("Plot of sensor_2 with time in the first week")

for day in range(10, 17):
    # Inclusive window from 00:00 to 23:00 of the given day.
    in_day = train.datetime.between(f"2010-03-{day} 00:00:00", f"2010-03-{day} 23:00:00")
    one_day = train[in_day]
    sns.lineplot(data=one_day, x="datetime", y="sensor_2", ax=ax)
    # Re-applied after each draw: a tick every 12 hours, labelled hour-minute.
    ax.xaxis.set_major_formatter(dates.DateFormatter('%H-%M'))
    ax.xaxis.set_major_locator(dates.HourLocator(interval=12))

plt.show()

- Dual peaks around early morning and late afternoon

In [None]:
# Overlay sensor_3 for 10-16 March 2010 on one axis, one line per calendar day.
fig, ax = plt.subplots(figsize=(20, 10))
ax.set_title("Plot of sensor_3 with time in the first week")

for day in range(10, 17):
    # Inclusive window from 00:00 to 23:00 of the given day.
    in_day = train.datetime.between(f"2010-03-{day} 00:00:00", f"2010-03-{day} 23:00:00")
    one_day = train[in_day]
    sns.lineplot(data=one_day, x="datetime", y="sensor_3", ax=ax)
    # Re-applied after each draw: a tick every 12 hours, labelled hour-minute.
    ax.xaxis.set_major_formatter(dates.DateFormatter('%H-%M'))
    ax.xaxis.set_major_locator(dates.HourLocator(interval=12))

plt.show()

In [None]:
# Overlay sensor_4 for 10-16 March 2010 on one axis, one line per calendar day.
fig, ax = plt.subplots(figsize=(20, 10))
ax.set_title("Plot of sensor_4 with time in the first week")

for day in range(10, 17):
    # Inclusive window from 00:00 to 23:00 of the given day.
    in_day = train.datetime.between(f"2010-03-{day} 00:00:00", f"2010-03-{day} 23:00:00")
    one_day = train[in_day]
    sns.lineplot(data=one_day, x="datetime", y="sensor_4", ax=ax)
    # Re-applied after each draw: a tick every 12 hours, labelled hour-minute.
    ax.xaxis.set_major_formatter(dates.DateFormatter('%H-%M'))
    ax.xaxis.set_major_locator(dates.HourLocator(interval=12))

plt.show()

In [None]:
# Overlay sensor_5 for 10-16 March 2010 on one axis, one line per calendar day.
fig, ax = plt.subplots(figsize=(20, 10))
ax.set_title("Plot of sensor_5 with time in the first week")

for day in range(10, 17):
    # Inclusive window from 00:00 to 23:00 of the given day.
    in_day = train.datetime.between(f"2010-03-{day} 00:00:00", f"2010-03-{day} 23:00:00")
    one_day = train[in_day]
    sns.lineplot(data=one_day, x="datetime", y="sensor_5", ax=ax)
    # Re-applied after each draw: a tick every 12 hours, labelled hour-minute.
    ax.xaxis.set_major_formatter(dates.DateFormatter('%H-%M'))
    ax.xaxis.set_major_locator(dates.HourLocator(interval=12))

plt.show()

- No peculiarities, but there seems to be a strong correlation between the sensor readings

### Analyse behaviour of target variables over the weekdays

In [None]:
# Overlay target_carbon_monoxide for 10-16 March 2010 on one axis, one line per calendar day.
fig, ax = plt.subplots(figsize=(20, 10))
ax.set_title("Plot of target_carbon_monoxide with time in the first week")

for day in range(10, 17):
    # Inclusive window from 00:00 to 23:00 of the given day.
    in_day = train.datetime.between(f"2010-03-{day} 00:00:00", f"2010-03-{day} 23:00:00")
    one_day = train[in_day]
    sns.lineplot(data=one_day, x="datetime", y="target_carbon_monoxide", ax=ax)
    # Re-applied after each draw: a tick every 12 hours, labelled hour-minute.
    ax.xaxis.set_major_formatter(dates.DateFormatter('%H-%M'))
    ax.xaxis.set_major_locator(dates.HourLocator(interval=12))

plt.show()

In [None]:
# Overlay target_carbon_monoxide for 17-23 March 2010 on one axis, one line per calendar day.
fig, ax = plt.subplots(figsize=(20, 10))
ax.set_title("Plot of target_carbon_monoxide with time in the second week")

for day in range(17, 24):
    # Inclusive window from 00:00 to 23:00 of the given day.
    in_day = train.datetime.between(f"2010-03-{day} 00:00:00", f"2010-03-{day} 23:00:00")
    one_day = train[in_day]
    sns.lineplot(data=one_day, x="datetime", y="target_carbon_monoxide", ax=ax)
    # Re-applied after each draw: a tick every 12 hours, labelled hour-minute.
    ax.xaxis.set_major_formatter(dates.DateFormatter('%H-%M'))
    ax.xaxis.set_major_locator(dates.HourLocator(interval=12))

plt.show()

In [None]:
# Overlay target_carbon_monoxide for 24-30 March 2010 on one axis, one line per calendar day.
fig, ax = plt.subplots(figsize=(20, 10))
ax.set_title("Plot of target_carbon_monoxide with time in the third week")

for day in range(24, 31):
    # Inclusive window from 00:00 to 23:00 of the given day.
    in_day = train.datetime.between(f"2010-03-{day} 00:00:00", f"2010-03-{day} 23:00:00")
    one_day = train[in_day]
    sns.lineplot(data=one_day, x="datetime", y="target_carbon_monoxide", ax=ax)
    # Re-applied after each draw: a tick every 12 hours, labelled hour-minute.
    ax.xaxis.set_major_formatter(dates.DateFormatter('%H-%M'))
    ax.xaxis.set_major_locator(dates.HourLocator(interval=12))

plt.show()

- The values are depressed around the weekend (Saturday and Sunday)
- The sensors might be located in an industrial area
- Let's check for benzene and nitrogen_oxides

In [None]:
# Overlay target_benzene for 10-16 March 2010 on one axis, one line per calendar day.
fig, ax = plt.subplots(figsize=(20, 10))
ax.set_title("Plot of target_benzene with time in the first week")

for day in range(10, 17):
    # Inclusive window from 00:00 to 23:00 of the given day.
    in_day = train.datetime.between(f"2010-03-{day} 00:00:00", f"2010-03-{day} 23:00:00")
    one_day = train[in_day]
    sns.lineplot(data=one_day, x="datetime", y="target_benzene", ax=ax)
    # Re-applied after each draw: a tick every 12 hours, labelled hour-minute.
    ax.xaxis.set_major_formatter(dates.DateFormatter('%H-%M'))
    ax.xaxis.set_major_locator(dates.HourLocator(interval=12))

plt.show()

In [None]:
# Overlay target_nitrogen_oxides for 10-16 March 2010 on one axis, one line per calendar day.
fig, ax = plt.subplots(figsize=(20, 10))
ax.set_title("Plot of nitrogen_oxides with time in the first week")

for day in range(10, 17):
    # Inclusive window from 00:00 to 23:00 of the given day.
    in_day = train.datetime.between(f"2010-03-{day} 00:00:00", f"2010-03-{day} 23:00:00")
    one_day = train[in_day]
    sns.lineplot(data=one_day, x="datetime", y="target_nitrogen_oxides", ax=ax)
    # Re-applied after each draw: a tick every 12 hours, labelled hour-minute.
    ax.xaxis.set_major_formatter(dates.DateFormatter('%H-%M'))
    ax.xaxis.set_major_locator(dates.HourLocator(interval=12))

plt.show()

- Similar patterns, it is a function of holidays and time (drop of pollutants in the night)


## Monthly variation of targets values

In [None]:
# Autocorrelation of each target over the first 360 lags, one figure per target.
# NOTE(review): rows look hourly, so 360 lags would cover about 15 days - confirm this is
# the horizon intended by "monthly seasonality".
acf_targets = {
    "target_carbon_monoxide": "carbon_monoxide",
    "target_benzene": "benzene",
    "target_nitrogen_oxides": "nitrogen_oxides",  # was mislabelled "carbon_monoxide"
}

for target_column, label in acf_targets.items():
    fig = plt.figure(figsize=(12, 8))
    ax = fig.add_subplot(211)
    # plot_acf draws onto the supplied axis and returns the owning figure.
    fig = plot_acf(train[target_column], lags=360, ax=ax,
                   title=f"monthly seasonality using ACF - {label}")
    # plt.show() works with every backend; Figure.show() only warns on non-interactive ones.
    plt.show()

- Month is a very important factor for the target values

## Heatmaps of the correlation plots 

In [None]:
# Pairwise correlation of the numeric columns (weather, sensors, targets).
plt.figure(figsize=(15, 15))
# `train` also holds the datetime column and the string-valued Day/Month columns; recent
# pandas raises when corr() meets non-numeric data, so restrict the frame explicitly.
numeric_columns = train.select_dtypes(include="number")
sns.heatmap(numeric_columns.corr(), cmap="Greys")
plt.show()

- High correlation between the sensors and the target values

# Feature engineering

Let's introduce some new features based on temperature and relative humidity. Since I am using classical models, there is no memory of earlier time steps in each input, unlike an LSTM. However, I can create features such as a cumulative sum, which retain some memory of past events. Other features worth considering:
- Cumulative temperature for a single day
- Cumulative Relative humidity for a single day
- Time of day in seconds (seconds elapsed since midnight, so it restarts every day)

In [None]:
# Cumulative features
train["M-D"] = train['datetime'].dt.strftime('%m-%d')
train['deg_C_cumsum'] = train.groupby("M-D")['deg_C'].apply(lambda x: x.cumsum())
train['relative_humidity_cumsum'] = train.groupby("M-D")['relative_humidity'].apply(lambda x: x.cumsum())
train.drop(columns=["M-D"],inplace=True)

In [None]:
# Sanity check: the cumulative temperature should restart on each of the two days.
for day in ("2010-03-10", "2010-03-11"):
    # Inclusive window from 00:00 to 23:00 of the given day.
    one_day = train[train.datetime.between(f"{day} 00:00:00", f"{day} 23:00:00")]
    plt.plot(one_day["datetime"], one_day['deg_C_cumsum'])

plt.xticks(rotation=45)
plt.show()

In [None]:
# Sanity check: the cumulative relative humidity should restart on each of the two days.
for day in ("2010-03-10", "2010-03-11"):
    # Inclusive window from 00:00 to 23:00 of the given day.
    one_day = train[train.datetime.between(f"{day} 00:00:00", f"{day} 23:00:00")]
    plt.plot(one_day["datetime"], one_day['relative_humidity_cumsum'])

plt.xticks(rotation=45)
plt.show()

In [None]:
# Time of day expressed as seconds since midnight (restarts every day).
stamp = train['datetime'].dt
train['Seconds'] = (
    stamp.hour * 3600
    + stamp.minute * 60
    + stamp.second
)

In [None]:
# Sanity check: the seconds-since-midnight feature should form one ramp per day.
for day in ("2010-03-10", "2010-03-11"):
    # Inclusive window from 00:00 to 23:00 of the given day.
    one_day = train[train.datetime.between(f"{day} 00:00:00", f"{day} 23:00:00")]
    plt.plot(one_day["datetime"], one_day['Seconds'])

plt.xticks(rotation=45)
plt.show()

## Covariate drift between training and testing set

<img src= "https://i.morioh.com/201029/ddb97846.webp" width="800" height="400">

In [None]:
# Load the competition test split so its feature distributions can be compared with train.
test = pd.read_csv(
    "../input/tabular-playground-series-jul-2021/test.csv"
)
# Preview the first five rows.
test.head()

In [None]:
# Compare the train and test distribution of every raw input feature with overlaid
# kernel density estimates, one figure per feature (test in red, train in blue).
drift_features = [
    "deg_C", "relative_humidity", "absolute_humidity",
    "sensor_1", "sensor_2", "sensor_3", "sensor_4", "sensor_5",
]

for feature in drift_features:
    plt.figure(figsize=(12, 8))
    plt.title(f"Covariate drift for {feature}")
    sns.kdeplot(test[feature].values, c='r', label='Test set')
    sns.kdeplot(train[feature].values, c='b', label='Train set')
    plt.legend()
    plt.show()

- Apart from deg_C, absolute_humidity and sensor_4, no major drift
- Training and testing will show how much of an impact the drift has.

# Training of the model within the training set

In [None]:
# Preview the frame again now that the engineered columns have been added.
train.head(5)

In [None]:
# List every column currently in `train` before choosing features and targets.
train.columns

In [None]:
# Feature columns fed to the models, in the order the models will see them.
feature_columns = ["Month", "deg_C", 'relative_humidity', 'absolute_humidity',
                   'sensor_1', 'sensor_2', 'sensor_3', 'sensor_4', 'sensor_5',
                   'relative_humidity_cumsum', 'Seconds', 'Day', 'deg_C_cumsum']

# The raw timestamp is neither a feature nor a target. errors="ignore" keeps the cell
# re-runnable: without it a second execution raises KeyError because the column is gone.
train.drop(columns=["datetime"], inplace=True, errors="ignore")

train_X = train[feature_columns]
# Every remaining column that is not a feature is treated as a target.
train_y = train.loc[:, ~train.columns.isin(feature_columns)]

In [None]:
# Confirm the feature/target partition: train_y should hold only the target columns.
print(train_X.columns)
print(train_y.columns)

- Splitting into training and testing set and normalisation of values

In [None]:
# Hold out 30% of the rows for validation; the fixed seed makes the split repeatable.
# NOTE(review): this is a shuffled split of time-ordered data - confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(
    train_X,
    train_y,
    test_size=0.3,
    random_state=30,
)

In [None]:
# Numeric feature columns: int64 columns first, then floats. The test-set cell later builds
# the list in the same way, so the column order handed to the scaler must stay like this.
columns_int = X_train.select_dtypes(include=['int64']).columns.values.tolist()
columns_float = X_train.select_dtypes(include=['float']).columns.values.tolist()
columns_object = X_train.select_dtypes(include=['object']).columns.values.tolist()  # Day, Month
columns_int.extend(columns_float)

# Scale numerical values to [0, 1]; the scaler is fitted on the training rows only.
SS_in = preprocessing.MinMaxScaler()
X_train[columns_int] = SS_in.fit_transform(X_train[columns_int])
X_test[columns_int] = SS_in.transform(X_test[columns_int])

# Encode categorical columns.
# In training, months start from March, so the ordinal scale starts there too; this also
# makes the SHAP plots easier to read.
# Because January and February are missing from the training data, label encoding
# (arbitrary ordering) might give better values than ordinal encoding. Ordinal encoding is
# used purely for interpretation of the model and the system.
months = ["March", "April", "May", "June", "July", "August", "September", "October",
          "November", "December", "January", "February"]
lbl_1 = preprocessing.OrdinalEncoder(categories=[months])
# Fit on the full category list. The result is no longer assigned back to `months`, which
# previously turned the list into an encoder object.
lbl_1.fit(np.reshape(months, (-1, 1)))
# transform() returns an (n, 1) array; ravel() hands pandas a plain 1-D column.
X_train["Month"] = lbl_1.transform(np.reshape(X_train["Month"].to_list(), (-1, 1))).ravel()
X_test["Month"] = lbl_1.transform(np.reshape(X_test["Month"].to_list(), (-1, 1))).ravel()

days = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
lbl_2 = preprocessing.OrdinalEncoder(categories=[days])
lbl_2.fit(np.reshape(days, (-1, 1)))
X_train["Day"] = lbl_2.transform(np.reshape(X_train["Day"].to_list(), (-1, 1))).ravel()
X_test["Day"] = lbl_2.transform(np.reshape(X_test["Day"].to_list(), (-1, 1))).ravel()



In [None]:
# Scale the target columns to [0, 1]; SS_out is kept so predictions can be mapped back later.
SS_out = preprocessing.MinMaxScaler()

# Same ordering rule as for the features: int64 columns first, floats after.
columns_float = y_train.select_dtypes(include=['float']).columns.values.tolist()
columns_int = y_train.select_dtypes(include=['int64']).columns.values.tolist()
columns_int += columns_float

# Fit on the training targets only, then apply the same mapping to the held-out targets.
y_train[columns_int] = SS_out.fit_transform(y_train[columns_int])
y_test[columns_int] = SS_out.transform(y_test[columns_int])


## Models to evaluate

- Catboost
- LightGBM
- RandomForest
- Voting regressor


In [None]:
# CatBoost baseline; MultiOutputRegressor fits one independent regressor per target column.
model1 = CatBoostRegressor(logging_level='Silent')
CAT = MultiOutputRegressor(model1)

CAT.fit(X_train, y_train)

pred = CAT.predict(X_test)
# r2_score takes (y_true, y_pred) and is not symmetric; the arguments were swapped before.
print('Mean r2_score:', r2_score(y_test, pred))

In [None]:
# LightGBM with default settings; MultiOutputRegressor fits one regressor per target column.
model2 = LGBMRegressor()
LGBM = MultiOutputRegressor(model2)

LGBM.fit(X_train, y_train)

pred = LGBM.predict(X_test)
# r2_score takes (y_true, y_pred) and is not symmetric; the arguments were swapped before.
print('Mean r2_score:', r2_score(y_test, pred))

In [None]:
# Random forest with default settings, fitted directly on all target columns at once.
RF = RandomForestRegressor()

RF.fit(X_train, y_train)

pred = RF.predict(X_test)
# r2_score takes (y_true, y_pred) and is not symmetric; the arguments were swapped before.
print('Mean r2_score:', r2_score(y_test, pred))

In [None]:
# Ensemble of the three base estimators; MultiOutputRegressor fits one VotingRegressor
# per target column.
VR = MultiOutputRegressor(VotingRegressor([('lg', model2), ('CAT', model1), ("RF", RF)]))
VR.fit(X_train, y_train)

pred = VR.predict(X_test)
# r2_score takes (y_true, y_pred) and is not symmetric; the arguments were swapped before.
print('Mean r2_score:', r2_score(y_test, pred))

- Baseline catboost is the best model

# Model interpretation

<img src="https://blog.ml.cmu.edu/wp-content/uploads/2019/11/sfLP7d0oKFZ5crb6wt7a9lA.png">

In [None]:
# Derives shap values for the different features
samples = X_train.iloc[0:50,:]

explainer = shap.KernelExplainer(CAT.predict,samples)
shap_values = explainer.shap_values(samples)

In [None]:
# References

# https://www.kaggle.com/vikumsw/explaining-random-forest-model-with-shapely-values
# https://www.analyticsvidhya.com/blog/2019/11/shapley-value-machine-learning-interpretability-game-theory/
# https://shap.readthedocs.io/en/latest/example_notebooks/tabular_examples/model_agnostic/Multioutput%20Regression%20SHAP.html


## Target-specific interpretations

In [None]:
print('Current Plot Shown: Carbon monoxide \n')

# Output 0 is carbon monoxide (same column order as the submission cell).
# Older shap versions return a list with one (samples, features) array per output.
# NOTE(review): newer shap releases are understood to return a single
# (samples, features, outputs) array instead - confirm against the installed version.
if isinstance(shap_values, list):
    target_shap_values = shap_values[0]
else:
    target_shap_values = shap_values[..., 0]

shap.summary_plot(shap_values=target_shap_values,
                  features=samples, plot_size=(20, 20),
                  title="Carbon monoxide")

### Some interesting observations
1. Accumulation of Carbon monoxide mostly happens later in the day (seconds)
2. Higher values of temperature and absolute humidity seem to decrease the level of Carbon monoxide in the atmosphere
3. Also, higher carbon monoxide emissions happen later in the year (Months)

In [None]:
print('Current Plot Shown: Benzene \n')

# Output 1 is benzene (same column order as the submission cell).
# Older shap versions return a list with one (samples, features) array per output.
# NOTE(review): newer shap releases are understood to return a single
# (samples, features, outputs) array instead - confirm against the installed version.
if isinstance(shap_values, list):
    target_shap_values = shap_values[1]
else:
    target_shap_values = shap_values[..., 1]

shap.summary_plot(shap_values=target_shap_values,
                  features=samples, plot_size=(20, 20), title="Benzene"
                  )

### Some interesting observations
1. Higher values of relative and absolute humidity seem to decrease the level of benzene in the atmosphere
2. Also, higher benzene emissions happen later in the year (Months)
3. Lower benzene emissions happen on later days of the week

In [None]:
print('Current Plot Shown: Nitrogen oxide \n')

# Output 2 is nitrogen oxides (same column order as the submission cell).
# Older shap versions return a list with one (samples, features) array per output.
# NOTE(review): newer shap releases are understood to return a single
# (samples, features, outputs) array instead - confirm against the installed version.
if isinstance(shap_values, list):
    target_shap_values = shap_values[2]
else:
    target_shap_values = shap_values[..., 2]

shap.summary_plot(shap_values=target_shap_values,
                  features=samples, plot_size=(20, 20), title="Nitrogen oxide"
                  )

### Some interesting observations
1. Higher nitrogen oxide emissions happen later in the year (Months)
2. Lower values of absolute humidity have a higher effect on nitrogen oxide
3. Higher temperatures have a negative impact on nitrogen oxide levels

# Visualization of fit

In [None]:
# Map scaled predictions and scaled held-out targets back to their original units.
# NOTE(review): `pred` is whatever the most recently executed model cell left behind
# (the VotingRegressor when the notebook is run top to bottom) - confirm that is the
# model meant to be visualised here.
pred_inv = SS_out.inverse_transform(pred)
test_inv = SS_out.inverse_transform(y_test)

In [None]:
# R² of each target separately, computed on the values mapped back to original units.
for position, target_name in enumerate(y_test.columns):
    score = r2_score(test_inv[:, position], pred_inv[:, position])
    print('r2_score for ' + target_name + ': ', round(score, 2))

In [None]:
# For each target, compare prediction and ground truth on the first 50 held-out rows.
n_shown = 50
for position, target_name in enumerate(y_test.columns):
    plt.figure(figsize=(30, 10))
    predicted = pred_inv[:n_shown, position]
    actual = test_inv[:n_shown, position]
    plt.plot(predicted, 'r--', label='Prediction')
    plt.plot(actual, 'g--', label='Real')
    plt.title(target_name)
    plt.legend()
    plt.show()

# Testing on test set and submission

In [None]:
# Reload the competition test split from disk, replacing the frame used for the drift plots.
test = pd.read_csv(
    "../input/tabular-playground-series-jul-2021/test.csv"
)
# Preview the first five rows.
test.head()

### Feature engineering and Data pre-processing

In [None]:
# Apply the same feature engineering to the test split as was applied to train.
test.rename(columns={"date_time": "datetime"}, inplace=True)
test.datetime = pd.to_datetime(test.datetime)
test['Day'] = test['datetime'].dt.day_name()
test["Month"] = test["datetime"].dt.month_name()

# Cumulative features: running total within each day, accumulated in existing row order
# (the frame is assumed to be sorted by time). The key is month-day only, matching the
# grouping used for the training data.
day_key = test['datetime'].dt.strftime('%m-%d')
# GroupBy.cumsum keeps the original row index, so the result assigns straight back;
# apply(lambda x: x.cumsum()) returns a group-keyed index on recent pandas and fails here.
test['deg_C_cumsum'] = test.groupby(day_key)['deg_C'].cumsum()
test['relative_humidity_cumsum'] = test.groupby(day_key)['relative_humidity'].cumsum()

# Time of day expressed as seconds since midnight.
test['Seconds'] = (
    test['datetime'].dt.hour * 3600
    + test['datetime'].dt.minute * 60
    + test['datetime'].dt.second
)


In [None]:
# Same columns, in the same order, as the frame the models were trained on.
# .copy() gives an independent frame: the next cell assigns into X_test, which would
# otherwise be chained assignment on a slice of `test`.
X_test = test[X_train.columns].copy()

In [None]:
# Rebuild the numeric column list exactly as for training (int64 first, then floats) so
# the already-fitted scaler receives the columns in the order it was fitted on.
columns_object = X_test.select_dtypes(include=['object']).columns.values.tolist()
columns_float = X_test.select_dtypes(include=['float']).columns.values.tolist()
columns_int = X_test.select_dtypes(include=['int64']).columns.values.tolist()
columns_int += columns_float

# Reuse the scaler and encoders fitted on the training data; nothing is refitted here.
X_test[columns_int] = SS_in.transform(X_test[columns_int])
for column_name, encoder in (("Month", lbl_1), ("Day", lbl_2)):
    X_test[column_name] = encoder.transform(np.reshape(X_test[column_name].to_list(), (-1, 1)))

                                       

## Model testing and submission

In [None]:
# Predict the (still scaled) targets for the competition test set with the CatBoost model.
pred = CAT.predict(X_test)

In [None]:
# Map the scaled predictions back to the original target units using the target scaler.
pred_inv = SS_out.inverse_transform(pred)
print(pred_inv)

In [None]:
# Assemble the submission: the timestamp plus one column per predicted target, taken from
# the prediction matrix in column order.
submission = pd.DataFrame({
    'date_time': test['datetime'],
    'target_carbon_monoxide': pred_inv[:, 0],
    'target_benzene': pred_inv[:, 1],
    'target_nitrogen_oxides': pred_inv[:, 2],
})
print(submission.head(10))

# Write without the index so the CSV contains only the four columns above.
submission.to_csv("Submission.csv", index=False)

## Still under completion. Please feel free to add suggestions