In [None]:
# %pip install xgboost

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import copy
import xgboost as xgb

In [None]:
# Seed shared by every stochastic model in the notebook
SEED = 100

# Locations of the competition's train / test / sample-submission CSV files
path_train, path_test, path_sub = (
    '/kaggle/input/tabular-playground-series-jul-2021/train.csv',
    '/kaggle/input/tabular-playground-series-jul-2021/test.csv',
    '/kaggle/input/tabular-playground-series-jul-2021/sample_submission.csv',
)

In [None]:
# Load the training data and preview it. A bare `train.head()` in the middle
# of a cell is evaluated and thrown away, so the preview is printed explicitly.
train = pd.read_csv(path_train)
print(train.head())

# Load the test data and keep a reference to its raw `date_time` column.
test = pd.read_csv(path_test)
test_index = test['date_time']
print(test.head())


### EDA

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the training columns
train.describe()

Takeaways:
* No missing values.
* Data could be scaled. Sensors have similar range but deg_C and humidities don't.

### First correlation analysis (without feature engineering)

In [None]:
# Pairwise correlations over the numeric columns only: `date_time` is not
# numeric, and pandas >= 2.0 raises instead of silently dropping such columns.
corr_matrix = train.select_dtypes(include='number').corr()

# For each target, list every column's correlation with it, strongest first.
for label, target_col in [('Carbon Monoxide', 'target_carbon_monoxide'),
                          ('Benzene', 'target_benzene'),
                          ('Nitrogen Oxides', 'target_nitrogen_oxides')]:
    print(f'{label}\n\n', corr_matrix[target_col].sort_values(ascending=False), '\n')

In [None]:
# Heatmap of each column's correlation with the three targets; rows are sorted
# in descending order by the carbon monoxide column, then benzene, then
# nitrogen oxides as tie-breakers.
heatmap_targets = ['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides']
target_corr = corr_matrix[heatmap_targets].sort_values(heatmap_targets, ascending=False)
sns.heatmap(target_corr)

### Feature engineering 

In [None]:
# Convert the date_time column to datetime format so the `.dt` accessor works
train['date_time'] = pd.to_datetime(train['date_time'])

# Create df copy for feature engineering
train_eng = train.copy()

# day_of_week is an integer for each day of the week (0 is monday, 6 is sunday)
train_eng['day_of_week'] = train_eng['date_time'].dt.dayofweek

# is_weekend is a 0/1 flag: 1 on saturday (5) and sunday (6)
train_eng['is_weekend'] = train_eng['day_of_week'].isin([5, 6]).astype(int)

# time_of_day is an int for every hour of the day, from 0 to 23
train_eng['time_of_day'] = train_eng['date_time'].dt.hour

# is_daylight is a 0/1 flag: 1 for hours 7 to 18 inclusive
train_eng['is_daylight'] = train_eng['time_of_day'].between(7, 18).astype(int)

# is_dark is a 0/1 flag: 1 for hours 19 to 23 and 0 to 6
train_eng['is_dark'] = ((train_eng['time_of_day'] > 18) | (train_eng['time_of_day'] < 7)).astype(int)

# Check: the flags are fully populated, so both the 0 and 1 counts are shown
print(train_eng['is_daylight'].value_counts())
print(train_eng['is_dark'].value_counts())

# Every row must be exactly one of daylight / dark. The flags are computed
# directly as 0/1, so no frame-wide fillna(0) is needed; that call would also
# have hidden genuine missing values in the sensor and target columns.
assert (train_eng['is_daylight'] + train_eng['is_dark'] == 1).all()

In [None]:
# Correlations over numeric columns only: `train_eng` carries a datetime
# `date_time` column, which pandas >= 2.0 refuses to correlate.
corr_matrix_eng = train_eng.select_dtypes(include='number').corr()

# Heatmap of each (original or engineered) column against the three targets,
# rows sorted in descending order by the target columns.
eng_heatmap_targets = ['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides']
sns.heatmap(corr_matrix_eng[eng_heatmap_targets].sort_values(eng_heatmap_targets, ascending=False))

`time_of_day` shows a slight positive correlation with the targets, though it may not be significant.
`is_daylight` does not appear to add value.
`day_of_week`, `is_dark` and `is_weekend` are negatively correlated with the targets and could add value to the model; this can be tested in an automated pipeline.

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

# class to add engineered features, will be used as part of a pipeline for rapid testing
class FeatureCreator(BaseEstimator, TransformerMixin):
    """Add calendar features derived from the ``date_time`` column.

    Each ``add_*`` flag controls whether the matching column is present in the
    transformed frame:

    * ``day_of_week``  - 0 (Monday) to 6 (Sunday)
    * ``is_weekend``   - 1 on Saturday/Sunday, else 0
    * ``time_of_day``  - hour of the day, 0 to 23
    * ``is_daylight``  - 1 for hours 7 to 18 inclusive, else 0
    * ``is_dark``      - 1 for hours 19 to 23 and 0 to 6, else 0
    """

    def __init__(self, add_day_of_week=True, add_is_weekend=True,add_time_of_day=True, add_is_daylight=False, add_is_dark=True):
        self.add_day_of_week = add_day_of_week
        self.add_is_weekend = add_is_weekend
        self.add_time_of_day = add_time_of_day
        self.add_is_daylight = add_is_daylight
        self.add_is_dark = add_is_dark

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X):
        """Return a new frame with the engineered columns, indexed by ``date_time``.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain a ``date_time`` column that ``pd.to_datetime`` can parse.

        Returns
        -------
        pandas.DataFrame
            Copy of ``X`` with the enabled feature columns appended, remaining
            missing values replaced by 0 and ``date_time`` moved to the index.
            The input frame is left untouched.
        """
        # Work on a copy: mutating the caller's frame made the transform
        # non-repeatable (date_time vanished into the index after one call)
        # and triggered SettingWithCopy warnings on sliced inputs.
        X = X.copy()

        X['date_time'] = pd.to_datetime(X['date_time'])
        day_of_week = X['date_time'].dt.dayofweek
        hour = X['date_time'].dt.hour
        X['day_of_week'] = day_of_week
        X['time_of_day'] = hour

        # Flags are computed for every row as 0/1, so each enabled column is
        # fully populated even when no row satisfies the condition.
        if self.add_is_weekend:
            X['is_weekend'] = day_of_week.isin([5, 6]).astype(int)
        if self.add_is_daylight:
            X['is_daylight'] = hour.between(7, 18).astype(int)
        if self.add_is_dark:
            X['is_dark'] = ((hour > 18) | (hour < 7)).astype(int)

        # Kept from the original behaviour: any remaining missing value is zero-filled.
        X = X.fillna(0)

        # The helper columns are always computed (the flags derive from them)
        # and dropped afterwards when not requested.
        if not self.add_day_of_week:
            X = X.drop(columns='day_of_week')
        if not self.add_time_of_day:
            X = X.drop(columns='time_of_day')

        return X.set_index('date_time')

# Manual smoke test for the FeatureCreator class (disabled):
# feat_creator = FeatureCreator()
# train_eng_test = train.copy()
#
# train_eng_test = feat_creator.transform(train_eng_test)
# train_eng_test.head()

### Modeling Time!

This first cross-validation section compares different models for each target variable.



In [None]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_log_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from catboost import CatBoostRegressor


# Number of chronological folds used for cross-validation
splits = 3
time_kfold = TimeSeriesSplit(n_splits=splits)

# Sort by datetime before performing the splits: the folds are cut by row position
train = train.sort_values('date_time')

# Target columns
targets = ['target_nitrogen_oxides', 'target_benzene', 'target_carbon_monoxide']

# DataFrame to store the fold errors: one row per fold, one column per target.
# A float dtype (NaN-initialised) keeps later numeric aggregation off object dtype.
df_rmsle = pd.DataFrame(index=range(splits), columns=targets, dtype=float)

# Alternative model sets, currently disabled. They are kept as real comments
# rather than bare string literals, which are evaluated as no-op expressions.

# Model dict XGboost
# dict_model = {'target_nitrogen_oxides':
#               xgb.XGBRegressor(n_estimators=1000,
#                                max_depth=6,
#                               random_state=SEED),
#              'target_benzene':
#               xgb.XGBRegressor(n_estimators=1000,
#                                max_depth=6,
#                               random_state=SEED),
#              'target_carbon_monoxide':
#               xgb.XGBRegressor(n_estimators=1000,
#                                max_depth=6,
#                               random_state=SEED)}

# Model dict RF
# dict_model = {'target_nitrogen_oxides':
#               RandomForestRegressor(n_estimators=600,
#                                     max_depth=12,
#                                     random_state=SEED),
#              'target_benzene':
#               RandomForestRegressor(n_estimators=1000,
#                                     max_depth=12,
#                                     random_state=SEED),
#              'target_carbon_monoxide':
#               RandomForestRegressor(n_estimators=1000,
#                                     max_depth=12,
#                                     random_state=SEED)}

# Model dict Catboost: one regressor per target; only the learning rate and
# tree depth differ between targets.
catboost_params = {
    'target_nitrogen_oxides': {'learning_rate': 0.05, 'depth': 8},
    'target_benzene': {'learning_rate': 0.01, 'depth': 4},
    'target_carbon_monoxide': {'learning_rate': 0.01, 'depth': 10},
}
dict_model = {
    target_name: CatBoostRegressor(random_state=SEED, verbose=False, **params)
    for target_name, params in catboost_params.items()
}

# Preprocessing applied before every model: engineered calendar features
# first, then standardisation of all resulting columns.
preprocessing_steps = [
    ('feature_creator', FeatureCreator()),
    ('scaler', StandardScaler()),
]
transformation_pipeline = Pipeline(steps=preprocessing_steps)

# Loop over the splits
for idx_fold, (train_ix, val_ix) in enumerate(time_kfold.split(train)):

    # Training and validation sets (to avoid confusion with original test set)
    cv_train, cv_val = train.iloc[train_ix], train.iloc[val_ix]

    # The features do not depend on the target, so the pipeline runs once per
    # fold instead of once per target. `drop` returns new frames, so the
    # pipeline never operates on a slice of `train`.
    X = transformation_pipeline.fit_transform(cv_train.drop(columns=targets))
    X_val = transformation_pipeline.transform(cv_val.drop(columns=targets))

    # Another loop to train for each of the three targets
    for target in targets:

        # Select labels by name (not by column position) and train on log1p(target)
        y_train_log = np.log1p(cv_train[target])
        y_val = cv_val[target]

        # Fit and predict on the log scale
        dict_model[target].fit(X, y_train_log)
        preds = dict_model[target].predict(X_val)

        # expm1 maps predictions back to the original scale; clip at 0 because
        # mean_squared_log_error rejects negative values.
        preds_val = np.clip(np.expm1(preds), 0, None)
        rmsle = mean_squared_log_error(y_val, preds_val) ** (1/2)

        # Single-step .loc write: the chained `df[col].iloc[i] = ...` form can
        # assign to a temporary copy and leave df_rmsle unchanged.
        df_rmsle.loc[idx_fold, target] = rmsle

        print(f'Fold {idx_fold} : RMSLE for {target} is {rmsle}.')

    print('\n')
    
# Average each target's RMSLE over the folds. The cast makes the mean
# well-defined even if the results frame holds object-dtype values.
mean_rmsle = df_rmsle.astype(float).mean()
cox_rmsle = mean_rmsle['target_carbon_monoxide']
ben_rmsle = mean_rmsle['target_benzene']
nox_rmsle = mean_rmsle['target_nitrogen_oxides']

print(f"Overall RMSLE for Monoxide = {cox_rmsle}")
print(f"Overall RMSLE for Benzene = {ben_rmsle}")
# The column is `target_nitrogen_oxides`; "Nitrous Oxide" was a mislabel.
print(f"Overall RMSLE for Nitrogen Oxides = {nox_rmsle}")
print(f'Overall RMSLE = {(cox_rmsle + ben_rmsle + nox_rmsle)/3}')

In [None]:
sub_ex = pd.read_csv(path_sub)

# Fit the preprocessing on the full training features and apply it to the test
# set. New frames (`drop` / `copy`) are passed in and the result is stored under
# a new name, so `train` and `test` stay intact and the cell can be re-run.
X_train = transformation_pipeline.fit_transform(train.drop(columns=targets))
X_test = transformation_pipeline.transform(test.copy())

# Refit each target's model on all training rows and predict the test set
predictions = {}
for target in targets:

    # Labels are selected by name and modelled on the log1p scale
    y_train_log = np.log1p(train[target])

    dict_model[target].fit(X_train, y_train_log)
    preds = dict_model[target].predict(X_test)

    # Back to the original scale, clipped at 0 so no negative value is submitted
    predictions[target] = np.clip(np.expm1(preds), 0, None)

# Assemble the submission once, with the targets in the same order as before
sub = pd.DataFrame(predictions)
sub = sub[['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides']]
sub = pd.concat([sub_ex['date_time'], sub], axis=1)
print(sub.head())
sub.to_csv('submission.csv', index=False)

Next steps:
Maybe experiment with a few neural nets (LSTMs?) and Boosted models.

In the end probably some ensembling.