In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, QuantileTransformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import chi2, SelectKBest

In [3]:
train = pd.read_csv('tabular-playground-series-jul-2021/train.csv', parse_dates=['date_time'], index_col='date_time')
test = pd.read_csv('tabular-playground-series-jul-2021/test.csv', parse_dates=['date_time'], index_col='date_time')

In [4]:
corr = train.corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,deg_C,relative_humidity,absolute_humidity,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,target_carbon_monoxide,target_benzene,target_nitrogen_oxides
deg_C,1.0,-0.668002,0.445162,0.017513,0.133167,-0.145437,0.308202,-0.050567,-0.007503,0.109757,-0.205984
relative_humidity,-0.668002,1.0,0.249013,0.09313,-0.035152,-0.102146,0.027002,0.126466,0.016589,-0.022895,0.178608
absolute_humidity,0.445162,0.249013,1.0,0.105977,0.236894,-0.485445,0.567376,0.124945,-0.013756,0.186293,-0.101911
sensor_1,0.017513,0.09313,0.105977,1.0,0.811898,-0.592233,0.643191,0.860849,0.857215,0.842534,0.643044
sensor_2,0.133167,-0.035152,0.236894,0.811898,1.0,-0.819334,0.812454,0.863464,0.821025,0.96444,0.604795
sensor_3,-0.145437,-0.102146,-0.485445,-0.592233,-0.819334,1.0,-0.741439,-0.706006,-0.547567,-0.735545,-0.427622
sensor_4,0.308202,0.027002,0.567376,0.643191,0.812454,-0.741439,1.0,0.64112,0.571412,0.78375,0.235662
sensor_5,-0.050567,0.126466,0.124945,0.860849,0.863464,-0.706006,0.64112,1.0,0.834501,0.877662,0.70827
target_carbon_monoxide,-0.007503,0.016589,-0.013756,0.857215,0.821025,-0.547567,0.571412,0.834501,1.0,0.881884,0.806099
target_benzene,0.109757,-0.022895,0.186293,0.842534,0.96444,-0.735545,0.78375,0.877662,0.881884,1.0,0.664235


In [5]:
columns = set(train.columns) - set(['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides'])
columns_carbon_monoxide = set(train.columns) - set(['target_carbon_monoxide'])
columns_benzene = set(train.columns) - set(['target_benzene'])
columns_nitrogen_oxides = set(train.columns) - set(['nitrogen_oxides'])
targets = ['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides']

In [6]:
clf_carbon_monoxide = make_pipeline(MinMaxScaler(), GradientBoostingRegressor())
clf_benzene = make_pipeline(MinMaxScaler(), GradientBoostingRegressor())
clf_nitrogen_oxides = make_pipeline(MinMaxScaler(), GradientBoostingRegressor())

In [7]:
X_train, X_test, y_train, y_test = train_test_split(train, train[targets], test_size=0.2)

In [8]:
X_train_no_targets =  X_train[columns]
X_train_carbon_monoxide = X_train[columns_carbon_monoxide]
X_train_benzene = X_train[columns_benzene]
X_train_nitrogen_oxides = X_train[columns_nitrogen_oxides]

X_test_no_targets =  X_test[columns]
X_test_carbon_monoxide = X_test[columns_carbon_monoxide]
X_test_benzene = X_test[columns_benzene]
X_test_nitrogen_oxides = X_test[columns_nitrogen_oxides]

In [9]:
y_train_carbon_monoxide = y_train['target_carbon_monoxide']
y_test_carbon_monoxide = y_test['target_carbon_monoxide']

y_train_benzene = y_train['target_benzene']
y_test_benzene = y_test['target_benzene']

y_train_nitrogen_oxides = y_train['target_nitrogen_oxides']
y_test_nitrogen_oxides = y_test['target_nitrogen_oxides']

In [10]:
clf_carbon_monoxide.fit(X_train_no_targets, y_train_carbon_monoxide.values)
y_pred_train_carbon_monoxide = clf_carbon_monoxide.predict(X_train_no_targets)
y_pred_test_carbon_monoxide = clf_carbon_monoxide.predict(X_test_no_targets)
(mean_squared_log_error(y_train_carbon_monoxide, y_pred_train_carbon_monoxide),
 mean_squared_log_error(y_test_carbon_monoxide, y_pred_test_carbon_monoxide))

(0.014305973981808773, 0.019233010853635647)

In [11]:
clf_benzene.fit(X_train_no_targets, y_train_benzene.values)
y_pred_train_benzene = clf_benzene.predict(X_train_no_targets)
y_pred_test_benzene = clf_benzene.predict(X_test_no_targets)
(mean_squared_log_error(y_train_benzene, y_pred_train_benzene),
 mean_squared_log_error(y_test_benzene, y_pred_test_benzene))

(0.00813076481931209, 0.009585392706960823)

In [12]:
clf_nitrogen_oxides.fit(X_train_no_targets, y_train_nitrogen_oxides.values)
y_pred_train_nitrogen_oxides = clf_nitrogen_oxides.predict(X_train_no_targets)
y_pred_test_nitrogen_oxides = clf_nitrogen_oxides.predict(X_test_no_targets)
(mean_squared_log_error(y_train_nitrogen_oxides, y_pred_train_nitrogen_oxides),
 mean_squared_log_error(y_test_nitrogen_oxides, y_pred_test_nitrogen_oxides))

(0.1324241168330572, 0.14037390261941976)

In [13]:
test['target_carbon_monoxide'] = clf_carbon_monoxide.predict(test[columns])
test['target_benzene'] = clf_benzene.predict(test[columns])
test['target_nitrogen_oxides'] = clf_nitrogen_oxides.predict(test[columns])

In [14]:
clf_carbon_monoxide.fit(X_train_carbon_monoxide, y_train_carbon_monoxide.values)
y_pred_train_carbon_monoxide = clf_carbon_monoxide.predict(X_train_carbon_monoxide)
y_pred_test_carbon_monoxide = clf_carbon_monoxide.predict(X_test_carbon_monoxide)
(mean_squared_log_error(y_train_carbon_monoxide, y_pred_train_carbon_monoxide),
 mean_squared_log_error(y_test_carbon_monoxide, y_pred_test_carbon_monoxide))

(0.009411968948066004, 0.012655177605074395)

In [15]:
clf_benzene.fit(X_train_benzene, y_train_benzene.values)
y_pred_train_benzene = clf_benzene.predict(X_train_benzene)
y_pred_test_benzene = clf_benzene.predict(X_test_benzene)
(mean_squared_log_error(y_train_benzene, y_pred_train_benzene),
 mean_squared_log_error(y_test_benzene, y_pred_test_benzene))

ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.

In [None]:
clf_nitrogen_oxides.fit(X_train_nitrogen_oxides, y_train_nitrogen_oxides.values)
y_pred_train_nitrogen_oxides = clf_nitrogen_oxides.predict(X_train_nitrogen_oxides)
y_pred_test_nitrogen_oxides = clf_nitrogen_oxides.predict(X_test_nitrogen_oxides)
(mean_squared_log_error(y_train_nitrogen_oxides, y_pred_train_nitrogen_oxides),
 mean_squared_log_error(y_test_nitrogen_oxides, y_pred_test_nitrogen_oxides))

In [None]:
carbon_monoxide = clf_carbon_monoxide.predict(test[columns_carbon_monoxide])
benzene = clf_benzene.predict(test[columns_benzene])
nitrogen_oxides = clf_nitrogen_oxides.predict(test[columns_nitrogen_oxides])

In [None]:
test['target_carbon_monoxide'] = carbon_monoxide
test['target_benzene'] = benzene
test['target_nitrogen_oxides'] = nitrogen_oxides

In [None]:
submission = test[['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides']]

In [None]:
submission

In [None]:
submission.to_csv('my_submission.csv')