In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

In [2]:
print("Hello")

Hello


In [3]:
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
import time
import random

In [4]:
train = pd.read_csv('../input/tabular-playground-series-jul-2021/train.csv')
test = pd.read_csv('../input/tabular-playground-series-jul-2021/test.csv')
sub = pd.read_csv('../input/tabular-playground-series-jul-2021/sample_submission.csv')

In [5]:
train['date_time'] = pd.to_datetime(train['date_time'], format = "%Y-%m-%d %H:%M:%S")
test['date_time'] = pd.to_datetime(test['date_time'], format= "%Y-%m-%d %H:%M:%S")

In [6]:
train.info()
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7111 entries, 0 to 7110
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   date_time               7111 non-null   datetime64[ns]
 1   deg_C                   7111 non-null   float64       
 2   relative_humidity       7111 non-null   float64       
 3   absolute_humidity       7111 non-null   float64       
 4   sensor_1                7111 non-null   float64       
 5   sensor_2                7111 non-null   float64       
 6   sensor_3                7111 non-null   float64       
 7   sensor_4                7111 non-null   float64       
 8   sensor_5                7111 non-null   float64       
 9   target_carbon_monoxide  7111 non-null   float64       
 10  target_benzene          7111 non-null   float64       
 11  target_nitrogen_oxides  7111 non-null   float64       
dtypes: datetime64[ns](1), float64(11)
memory usage: 

In [7]:
def make_new_features(df):
    df["month"] = df["date_time"].dt.month
    df["day_of_week"] = df["date_time"].dt.dayofweek
    df["day_of_year"] = df["date_time"].dt.dayofyear
    df["hour"] = df["date_time"].dt.hour
    df["quarter"] = df["date_time"].dt.quarter
    df["week_of_year"] = df["date_time"].dt.isocalendar().week.astype("int")
    df["is_winter"] = df["month"].isin([1, 2, 12])
    df["is_sprint"] = df["month"].isin([3, 4, 5])
    df["is_summer"] = df["month"].isin([6, 7, 8])
    df["is_autumn"] = df["month"].isin([9, 10, 11])
    df["working_hours"] =  df["hour"].isin(np.arange(8, 21, 1)).astype("int")
    df["is_weekend"] = (train["date_time"].dt.dayofweek >= 5).astype("int")
    return df

In [8]:
train = make_new_features(train)
test = make_new_features(test)

In [9]:
train

Unnamed: 0,date_time,deg_C,relative_humidity,absolute_humidity,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,target_carbon_monoxide,...,day_of_year,hour,quarter,week_of_year,is_winter,is_sprint,is_summer,is_autumn,working_hours,is_weekend
0,2010-03-10 18:00:00,13.1,46.0,0.7578,1387.2,1087.8,1056.0,1742.8,1293.4,2.5,...,69,18,1,10,False,True,False,False,1,0
1,2010-03-10 19:00:00,13.2,45.3,0.7255,1279.1,888.2,1197.5,1449.9,1010.9,2.1,...,69,19,1,10,False,True,False,False,1,0
2,2010-03-10 20:00:00,12.6,56.2,0.7502,1331.9,929.6,1060.2,1586.1,1117.0,2.2,...,69,20,1,10,False,True,False,False,1,0
3,2010-03-10 21:00:00,11.0,62.4,0.7867,1321.0,929.0,1102.9,1536.5,1263.2,2.2,...,69,21,1,10,False,True,False,False,0,0
4,2010-03-10 22:00:00,11.9,59.0,0.7888,1272.0,852.7,1180.9,1415.5,1132.2,1.5,...,69,22,1,10,False,True,False,False,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7106,2010-12-31 20:00:00,9.2,32.0,0.3871,1000.5,811.2,873.0,909.0,910.5,1.3,...,365,20,4,52,True,False,False,False,1,0
7107,2010-12-31 21:00:00,9.1,33.2,0.3766,1022.7,790.0,951.6,912.9,903.4,1.4,...,365,21,4,52,True,False,False,False,0,0
7108,2010-12-31 22:00:00,9.6,34.6,0.4310,1044.4,767.3,861.9,889.2,1159.1,1.6,...,365,22,4,52,True,False,False,False,0,0
7109,2010-12-31 23:00:00,8.0,40.7,0.4085,952.8,691.9,908.5,917.0,1206.3,1.5,...,365,23,4,52,True,False,False,False,0,0


In [10]:
test

Unnamed: 0,date_time,deg_C,relative_humidity,absolute_humidity,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,month,...,day_of_year,hour,quarter,week_of_year,is_winter,is_sprint,is_summer,is_autumn,working_hours,is_weekend
0,2011-01-01 00:00:00,8.0,41.3,0.4375,1108.8,745.7,797.1,880.0,1273.1,1,...,1,0,1,52,True,False,False,False,0,0
1,2011-01-01 01:00:00,5.1,51.7,0.4564,1249.5,864.9,687.9,972.8,1714.0,1,...,1,1,1,52,True,False,False,False,0,0
2,2011-01-01 02:00:00,5.8,51.5,0.4689,1102.6,878.0,693.7,941.9,1300.8,1,...,1,2,1,52,True,False,False,False,0,0
3,2011-01-01 03:00:00,5.0,52.3,0.4693,1139.7,916.2,725.6,1011.0,1283.0,1,...,1,3,1,52,True,False,False,False,0,0
4,2011-01-01 04:00:00,4.5,57.5,0.4650,1022.4,838.5,871.5,967.0,1142.3,1,...,1,4,1,52,True,False,False,False,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2242,2011-04-04 10:00:00,23.2,28.7,0.7568,1340.3,1023.9,522.8,1374.0,1659.8,4,...,94,10,2,14,False,True,False,False,1,1
2243,2011-04-04 11:00:00,24.5,22.5,0.7119,1232.8,955.1,616.1,1226.1,1269.0,4,...,94,11,2,14,False,True,False,False,1,1
2244,2011-04-04 12:00:00,26.6,19.0,0.6406,1187.7,1052.4,572.8,1253.4,1081.1,4,...,94,12,2,14,False,True,False,False,1,1
2245,2011-04-04 13:00:00,29.1,12.7,0.5139,1053.2,1009.0,702.0,1009.8,808.5,4,...,94,13,2,14,False,True,False,False,1,1


In [11]:
train['date_time'] = train['date_time'].astype('datetime64[ns]').astype(np.int64)/10**9
test['date_time'] = test['date_time'].astype('datetime64[ns]').astype(np.int64)/10**9

In [12]:
train

Unnamed: 0,date_time,deg_C,relative_humidity,absolute_humidity,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,target_carbon_monoxide,...,day_of_year,hour,quarter,week_of_year,is_winter,is_sprint,is_summer,is_autumn,working_hours,is_weekend
0,1.268244e+09,13.1,46.0,0.7578,1387.2,1087.8,1056.0,1742.8,1293.4,2.5,...,69,18,1,10,False,True,False,False,1,0
1,1.268248e+09,13.2,45.3,0.7255,1279.1,888.2,1197.5,1449.9,1010.9,2.1,...,69,19,1,10,False,True,False,False,1,0
2,1.268251e+09,12.6,56.2,0.7502,1331.9,929.6,1060.2,1586.1,1117.0,2.2,...,69,20,1,10,False,True,False,False,1,0
3,1.268255e+09,11.0,62.4,0.7867,1321.0,929.0,1102.9,1536.5,1263.2,2.2,...,69,21,1,10,False,True,False,False,0,0
4,1.268258e+09,11.9,59.0,0.7888,1272.0,852.7,1180.9,1415.5,1132.2,1.5,...,69,22,1,10,False,True,False,False,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7106,1.293826e+09,9.2,32.0,0.3871,1000.5,811.2,873.0,909.0,910.5,1.3,...,365,20,4,52,True,False,False,False,1,0
7107,1.293829e+09,9.1,33.2,0.3766,1022.7,790.0,951.6,912.9,903.4,1.4,...,365,21,4,52,True,False,False,False,0,0
7108,1.293833e+09,9.6,34.6,0.4310,1044.4,767.3,861.9,889.2,1159.1,1.6,...,365,22,4,52,True,False,False,False,0,0
7109,1.293836e+09,8.0,40.7,0.4085,952.8,691.9,908.5,917.0,1206.3,1.5,...,365,23,4,52,True,False,False,False,0,0


In [13]:
test

Unnamed: 0,date_time,deg_C,relative_humidity,absolute_humidity,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,month,...,day_of_year,hour,quarter,week_of_year,is_winter,is_sprint,is_summer,is_autumn,working_hours,is_weekend
0,1.293840e+09,8.0,41.3,0.4375,1108.8,745.7,797.1,880.0,1273.1,1,...,1,0,1,52,True,False,False,False,0,0
1,1.293844e+09,5.1,51.7,0.4564,1249.5,864.9,687.9,972.8,1714.0,1,...,1,1,1,52,True,False,False,False,0,0
2,1.293847e+09,5.8,51.5,0.4689,1102.6,878.0,693.7,941.9,1300.8,1,...,1,2,1,52,True,False,False,False,0,0
3,1.293851e+09,5.0,52.3,0.4693,1139.7,916.2,725.6,1011.0,1283.0,1,...,1,3,1,52,True,False,False,False,0,0
4,1.293854e+09,4.5,57.5,0.4650,1022.4,838.5,871.5,967.0,1142.3,1,...,1,4,1,52,True,False,False,False,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2242,1.301911e+09,23.2,28.7,0.7568,1340.3,1023.9,522.8,1374.0,1659.8,4,...,94,10,2,14,False,True,False,False,1,1
2243,1.301915e+09,24.5,22.5,0.7119,1232.8,955.1,616.1,1226.1,1269.0,4,...,94,11,2,14,False,True,False,False,1,1
2244,1.301918e+09,26.6,19.0,0.6406,1187.7,1052.4,572.8,1253.4,1081.1,4,...,94,12,2,14,False,True,False,False,1,1
2245,1.301922e+09,29.1,12.7,0.5139,1053.2,1009.0,702.0,1009.8,808.5,4,...,94,13,2,14,False,True,False,False,1,1


In [14]:
columns = test.columns
columns

Index(['date_time', 'deg_C', 'relative_humidity', 'absolute_humidity',
       'sensor_1', 'sensor_2', 'sensor_3', 'sensor_4', 'sensor_5', 'month',
       'day_of_week', 'day_of_year', 'hour', 'quarter', 'week_of_year',
       'is_winter', 'is_sprint', 'is_summer', 'is_autumn', 'working_hours',
       'is_weekend'],
      dtype='object')

In [15]:
columns_train = train.columns
columns_train

Index(['date_time', 'deg_C', 'relative_humidity', 'absolute_humidity',
       'sensor_1', 'sensor_2', 'sensor_3', 'sensor_4', 'sensor_5',
       'target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides',
       'month', 'day_of_week', 'day_of_year', 'hour', 'quarter',
       'week_of_year', 'is_winter', 'is_sprint', 'is_summer', 'is_autumn',
       'working_hours', 'is_weekend'],
      dtype='object')

In [16]:
X = train[columns].values
X_test = test[columns].values
target_1 = train['target_carbon_monoxide'].values.reshape(-1,1)
target_2 = train['target_benzene'].values.reshape(-1,1)
target_3 = train['target_nitrogen_oxides'].values.reshape(-1,1)

In [17]:
import warnings
warnings.filterwarnings("ignore")

In [18]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV



rf = RandomForestRegressor()
rf.fit(X,target_1)
y_target1_rf = rf.predict(X_test)


rf2 = RandomForestRegressor()
rf2.fit(X,target_2)
y_target2_rf = rf2.predict(X_test)


rf3 = RandomForestRegressor()
rf3.fit(X,target_3)
y_target3_rf = rf3.predict(X_test)

In [19]:
from sklearn.ensemble import GradientBoostingRegressor

gb1= GradientBoostingRegressor()
gb1.fit(X,target_1)
y_target1_gb = gb1.predict(X_test)

gb2 = GradientBoostingRegressor()
gb2.fit(X,target_2)
y_target2_gb = gb2.predict(X_test)

gb3 = GradientBoostingRegressor()
gb3.fit(X,target_3)
y_target3_gb = gb3.predict(X_test)

In [20]:
sub['target_carbon_monoxide'] = (y_target1_rf + y_target1_gb) / 2
sub['target_benzene'] = (y_target2_rf + y_target2_gb) / 2
sub['target_nitrogen_oxides'] = (y_target3_rf + y_target3_gb) / 2

In [21]:
sub

Unnamed: 0,date_time,target_carbon_monoxide,target_benzene,target_nitrogen_oxides
0,2011-01-01 00:00:00,1.568980,4.786842,217.499948
1,2011-01-01 01:00:00,2.159295,7.536595,272.689680
2,2011-01-01 02:00:00,1.817201,7.422379,287.332059
3,2011-01-01 03:00:00,1.923184,8.494786,298.560559
4,2011-01-01 04:00:00,1.313949,6.672343,216.958326
...,...,...,...,...
2242,2011-04-04 10:00:00,2.784834,12.442240,405.454617
2243,2011-04-04 11:00:00,2.270162,10.330328,366.791522
2244,2011-04-04 12:00:00,2.547526,12.683659,364.912864
2245,2011-04-04 13:00:00,2.192208,10.777171,355.772448


In [22]:
sub.to_csv('submission.csv',index=False)