In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, QuantileTransformer
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import chi2, SelectKBest

In [3]:
train = pd.read_csv('tabular-playground-series-jul-2021/train.csv', parse_dates=['date_time'], index_col='date_time')
test = pd.read_csv('tabular-playground-series-jul-2021/test.csv', parse_dates=['date_time'], index_col='date_time')

In [4]:
corr = train.corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,deg_C,relative_humidity,absolute_humidity,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,target_carbon_monoxide,target_benzene,target_nitrogen_oxides
deg_C,1.0,-0.668002,0.445162,0.017513,0.133167,-0.145437,0.308202,-0.050567,-0.007503,0.109757,-0.205984
relative_humidity,-0.668002,1.0,0.249013,0.09313,-0.035152,-0.102146,0.027002,0.126466,0.016589,-0.022895,0.178608
absolute_humidity,0.445162,0.249013,1.0,0.105977,0.236894,-0.485445,0.567376,0.124945,-0.013756,0.186293,-0.101911
sensor_1,0.017513,0.09313,0.105977,1.0,0.811898,-0.592233,0.643191,0.860849,0.857215,0.842534,0.643044
sensor_2,0.133167,-0.035152,0.236894,0.811898,1.0,-0.819334,0.812454,0.863464,0.821025,0.96444,0.604795
sensor_3,-0.145437,-0.102146,-0.485445,-0.592233,-0.819334,1.0,-0.741439,-0.706006,-0.547567,-0.735545,-0.427622
sensor_4,0.308202,0.027002,0.567376,0.643191,0.812454,-0.741439,1.0,0.64112,0.571412,0.78375,0.235662
sensor_5,-0.050567,0.126466,0.124945,0.860849,0.863464,-0.706006,0.64112,1.0,0.834501,0.877662,0.70827
target_carbon_monoxide,-0.007503,0.016589,-0.013756,0.857215,0.821025,-0.547567,0.571412,0.834501,1.0,0.881884,0.806099
target_benzene,0.109757,-0.022895,0.186293,0.842534,0.96444,-0.735545,0.78375,0.877662,0.881884,1.0,0.664235


In [5]:
columns = set(train.columns) - set(['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides'])
columns_carbon_monoxide = list(columns) + ['target_benzene']
columns_nitrogen_oxides = list(columns) + ['target_carbon_monoxide']
targets = ['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides']

In [6]:
rf_carbon_monoxide = make_pipeline(StandardScaler(), RandomForestRegressor())
rf_benzene = make_pipeline(StandardScaler(), RandomForestRegressor())
rf_nitrogen_oxides = make_pipeline(StandardScaler(), RandomForestRegressor())

In [7]:
X_train, X_test, y_train, y_test = train_test_split(train, train[targets], test_size=0.2)

In [8]:
X_train_carbon_monoxide = X_train[columns_carbon_monoxide]
X_train_benzene = X_train[columns]
X_train_nitrogen_oxides = X_train[columns_nitrogen_oxides]

X_test_carbon_monoxide = X_test[columns_carbon_monoxide]
X_test_benzene = X_test[columns]
X_test_nitrogen_oxides = X_test[columns_nitrogen_oxides]

In [9]:
y_train_carbon_monoxide = y_train['target_carbon_monoxide']
y_test_carbon_monoxide = y_test['target_carbon_monoxide']

y_train_benzene = y_train['target_benzene']
y_test_benzene = y_test['target_benzene']

y_train_nitrogen_oxides = y_train['target_nitrogen_oxides']
y_test_nitrogen_oxides = y_test['target_nitrogen_oxides']

In [10]:
rf_benzene.fit(X_train_benzene, y_train_benzene.values)
y_pred_train_benzene = rf_benzene.predict(X_train_benzene)
y_pred_test_benzene = rf_benzene.predict(X_test_benzene)
(mean_squared_log_error(y_train_benzene, y_pred_train_benzene),
 mean_squared_log_error(y_test_benzene, y_pred_test_benzene))

(0.001235014937368321, 0.00872370501003721)

In [11]:
test['target_benzene'] = rf_benzene.predict(test[columns])

In [12]:
rf_carbon_monoxide.fit(X_train_carbon_monoxide, y_train_carbon_monoxide.values)
y_pred_train_carbon_monoxide = rf_carbon_monoxide.predict(X_train_carbon_monoxide)
y_pred_test_carbon_monoxide = rf_carbon_monoxide.predict(X_test_carbon_monoxide)
(mean_squared_log_error(y_train_carbon_monoxide, y_pred_train_carbon_monoxide),
 mean_squared_log_error(y_test_carbon_monoxide, y_pred_test_carbon_monoxide))

(0.002411873065074066, 0.01669007094499824)

In [13]:
test['target_carbon_monoxide'] = rf_carbon_monoxide.predict(test[columns_carbon_monoxide])

In [14]:
rf_nitrogen_oxides.fit(X_train_nitrogen_oxides, y_train_nitrogen_oxides.values)
y_pred_train_nitrogen_oxides = rf_nitrogen_oxides.predict(X_train_nitrogen_oxides)
y_pred_test_nitrogen_oxides = rf_nitrogen_oxides.predict(X_test_nitrogen_oxides)
(mean_squared_log_error(y_train_nitrogen_oxides, y_pred_train_nitrogen_oxides),
 mean_squared_log_error(y_test_nitrogen_oxides, y_pred_test_nitrogen_oxides))

(0.012134974071959704, 0.07373656325087663)

In [15]:
test['target_nitrogen_oxides'] = rf_nitrogen_oxides.predict(test[columns_nitrogen_oxides])

In [16]:
rf_carbon_monoxide.steps[1][1].feature_importances_

array([0.01541529, 0.01981558, 0.01345775, 0.01274465, 0.03188859,
       0.01995946, 0.10425389, 0.02781555, 0.75464924])

In [17]:
X_train_carbon_monoxide

Unnamed: 0_level_0,sensor_2,sensor_5,sensor_4,relative_humidity,absolute_humidity,sensor_3,sensor_1,deg_C,target_benzene
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2010-04-01 12:00:00,875.0,774.2,1493.5,36.0,0.8797,1003.2,1072.6,19.4,7.6
2010-07-23 04:00:00,720.1,1022.0,1530.0,44.0,1.4801,917.7,848.2,26.1,3.8
2010-08-20 03:00:00,544.4,452.6,1452.2,49.2,1.8720,1266.3,775.7,27.9,1.3
2010-05-19 06:00:00,832.0,761.4,1520.5,53.1,0.9612,933.8,1019.2,16.9,5.5
2010-12-29 14:00:00,687.3,742.6,1066.0,46.3,0.6397,870.0,886.6,11.8,4.6
...,...,...,...,...,...,...,...,...,...
2010-09-04 05:00:00,788.6,903.0,1333.9,50.7,1.4337,768.6,929.3,21.4,4.8
2010-07-16 10:00:00,950.0,1040.0,1497.4,32.0,1.2825,842.1,977.4,29.1,9.8
2010-10-25 09:00:00,1236.8,1610.9,1859.2,65.0,1.4140,520.4,1300.9,18.0,17.7
2010-04-07 14:00:00,792.5,626.0,1383.2,50.2,0.8433,1040.3,997.0,15.1,6.2


In [18]:
submission = test[['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides']]

In [19]:
submission

Unnamed: 0_level_0,target_carbon_monoxide,target_benzene,target_nitrogen_oxides
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2011-01-01 00:00:00,1.404,4.491,206.009
2011-01-01 01:00:00,2.391,7.652,334.853
2011-01-01 02:00:00,1.792,7.665,329.924
2011-01-01 03:00:00,2.471,8.708,366.893
2011-01-01 04:00:00,1.428,6.704,247.548
...,...,...,...
2011-04-04 10:00:00,2.590,12.605,393.111
2011-04-04 11:00:00,2.222,10.497,348.134
2011-04-04 12:00:00,2.452,12.277,376.657
2011-04-04 13:00:00,1.961,11.153,295.075


In [20]:
submission.to_csv('my_submission.csv')