In [None]:
import os, sys, re, math
import numpy as np
import pandas as pd
import warnings

import matplotlib.pyplot as plt
import seaborn as sns

# Raise pandas' display limits so wide/long frames are shown in full.
for option_name in ('display.max_columns', 'display.max_rows'):
    pd.set_option(option_name, 300)

# List every file available under the Kaggle input directory.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
# Load the competition's train and test tables from the shared input folder.
INPUT_DIR = '../input/tabular-playground-series-jul-2021'
train = pd.read_csv(f'{INPUT_DIR}/train.csv')
test = pd.read_csv(f'{INPUT_DIR}/test.csv')

In [None]:
# Preview the first rows of the training table.
train.head()

In [None]:
# info() prints its report and returns None, so call it once per frame;
# wrapping both calls in a tuple only adds a stray "(None, None)" cell output.
train.info()
test.info()

### Note
1. No null values in train and test - great!

In [None]:
# Show each summary as its own rich table; a tuple of frames would fall back
# to a plain-text repr that is hard to read.
display(train.describe().T)
display(test.describe().T)

### Note
1. features are all numerical

In [None]:
# Parse the timestamp column of both tables into datetime64 values.
for frame in (train, test):
    frame['date_time'] = pd.to_datetime(frame['date_time'])

In [None]:
# Input columns used as model features.
features = [
    'deg_C',
    'relative_humidity',
    'absolute_humidity',
    'sensor_1',
    'sensor_2',
    'sensor_3',
    'sensor_4',
    'sensor_5',
]
# Columns to predict.
targets = [
    'target_benzene',
    'target_carbon_monoxide',
    'target_nitrogen_oxides',
]

In [None]:
# Stack train and test row-wise with a fresh 0..n-1 index.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
all_data = pd.concat([train, test], ignore_index=True)

## Correlation checks

correlation between features

In [None]:
# Pairwise scatter/histogram grid of the input features.
sns.pairplot(data=train[features])

correlation between targets

In [None]:
# Pairwise scatter/histogram grid of the three targets.
sns.pairplot(data=train[targets])

### Note
1. correlation between targets is observed

correlation between targets (Pearson and Spearman heatmaps)

In [None]:
# https://www.kaggle.com/docxian/tabular-playground-7-visualization-baseline
corr_target_pearson = train[targets].corr(method='pearson')
corr_target_spearman = train[targets].corr(method='spearman')

# Draw one annotated heatmap per correlation method on a fixed [-1, 1] colour scale.
for heatmap_title, corr_values in (
    ('Pearson Correlation', corr_target_pearson),
    ('Spearman Correlation', corr_target_spearman),
):
    fig = plt.figure(figsize = (4,3))
    sns.heatmap(corr_values, annot=True, cmap='RdYlGn', vmin=-1, vmax=+1)
    plt.title(heatmap_title)
    plt.show()

### Correlation between features and targets

In [None]:
# Correlate only the numeric feature/target columns. Calling train.corr() on the
# whole frame also pulls in the datetime column, which raises on pandas >= 2.0
# (numeric_only no longer defaults to dropping non-numeric columns).
corr_mat = train[features + targets].corr()
# Rows = features, columns = targets, ordered by correlation with the targets.
sns.heatmap(corr_mat.loc[features, targets].sort_values(targets, ascending=False))

### Note
1. positively correlated - sensor_2, sensor_5, sensor_1, sensor_4
2. negatively correlated - sensor_3, relative_humidity, deg_C
3. poorly correlated - absolute_humidity

In [None]:
# plot targets vs features column-wise:
# one row of panels per feature, one column per target, each panel titled
# with the Pearson correlation of the pair.
fig, ax = plt.subplots(len(features), len(targets), figsize=(16,64))
for row, f in enumerate(features):
    for col, t in enumerate(targets):
        ax_ = ax[row, col]
        corr_t = np.round(train[f].corr(train[t], method='pearson'),4)
        ax_.scatter(train[f], train[t], alpha=0.25, s=4,color='darkred')
        ax_.set_title(t + ' vs ' + f + ', corr=' + str(corr_t))
        ax_.grid()

## Distribution of features 

In [None]:
# Overlay train vs. test kernel density estimates for every feature on a grid.
background_color = "#f6f5f5"
train_color, test_color = '#287094', '#fcd12a'

n_plots = len(features)
n_cols = 3
n_rows = max(1, math.ceil(n_plots / n_cols))  # grid grows with the feature list

# A bare figure is enough: the panels are added through the gridspec below, so
# no extra blank background axes is needed.
fig = plt.figure(figsize=(14,12))
gs = fig.add_gridspec(n_rows, n_cols)
gs.update(wspace=0.2, hspace=0.25)

for cnt_plots, feature in enumerate(features):
    ax = fig.add_subplot(gs[cnt_plots // n_cols, cnt_plots % n_cols])
    # `fill=` replaces kdeplot's deprecated `shade=` keyword.
    sns.kdeplot(train[feature], fill=True, color=train_color, alpha=0.85, linewidth=0, zorder=2, ax=ax)
    sns.kdeplot(test[feature], fill=True, color=test_color, alpha=0.85, linewidth=0, zorder=1, ax=ax)

    for s in ["top","right"]:
        ax.spines[s].set_visible(False)

    ax.set_facecolor(background_color)
    ax.set_yticklabels([])
    ax.tick_params(axis='y', which=u'both',length=0)
    ax.set_ylabel('')

fig.suptitle('Features Distribution')
# Build the legend from explicit colour patches. Passing only labels lets
# matplotlib pair them with artists in draw order (train first), which made the
# previous ['test', 'train'] labels point at the wrong colours.
legend_handles = [
    plt.Rectangle((0, 0), 1, 1, color=train_color, alpha=0.85),
    plt.Rectangle((0, 0), 1, 1, color=test_color, alpha=0.85),
]
fig.legend(legend_handles, ['train', 'test'], ncol=2, facecolor=background_color, edgecolor=background_color, fontsize=10, loc = "upper left")

### Note
1. deg_C, absolute_humidity and sensor_4 show differing distributions (different mean and vars) between train and test

### Check date time

are all hours present?

In [None]:
# First timestamp, last timestamp and covered span for each table.
for frame in (train, test):
    first_ts, last_ts = min(frame.date_time), max(frame.date_time)
    print(first_ts, last_ts, last_ts - first_ts)

In [None]:
# total hours
for frame in (train, test):
    covered = max(frame.date_time) - min(frame.date_time)
    print(covered.total_seconds()/60/60)

In [None]:
# Number of hourly steps between the first and last timestamp of each table
# (compare with the row counts to see whether any hour is missing).
for frame in (train, test):
    hourly_index = pd.date_range(min(frame.date_time), max(frame.date_time), freq='h')
    print(hourly_index.shape)

In [None]:
# First and last entry of the hourly grid spanning each table.
for frame in (train, test):
    hourly_index = pd.date_range(min(frame.date_time), max(frame.date_time), freq='h')
    print(hourly_index[0])
    print(hourly_index[-1])

## Time series plots of features

In [None]:
# One scatter per feature over calendar date: train in green, test in dark red.
my_alpha=0.25
for feature in features:
    fig, ax = plt.subplots(figsize=(16,4))
    for frame, colour in ((train, 'green'), (test, 'darkred')):
        ax.scatter(pd.to_datetime(frame.date_time).dt.date, frame[feature], alpha=my_alpha, color=colour, s=4)
    ax.xaxis.set_major_locator(plt.MaxNLocator(20)) # reduce number of x-axis labels
    plt.xticks(rotation=90)
    ax.set_title(feature)
    ax.grid()

plt.show()

### Note
1. deg_C: expected that temperature rises in the summer and drops in winter
2. absolute_humidity: expected that humidity increases in the summer and drops in winter
3. sensor_4: still unknown

In [None]:
# One scatter per target over calendar date (targets exist in train only).
my_alpha=0.25
for target in targets:
    fig, ax = plt.subplots(figsize=(16,4))
    train_dates = pd.to_datetime(train.date_time).dt.date
    ax.scatter(train_dates, train[target], alpha=my_alpha, color='green', s=4)
    ax.xaxis.set_major_locator(plt.MaxNLocator(20)) # reduce number of x-axis labels
    plt.xticks(rotation=90)
    ax.set_title(target)
    ax.grid()

plt.show()

## Time Series Interactive Plots with Plotly

In [None]:
import plotly
import plotly.express as px
import plotly.graph_objects as go
# Report the installed plotly version.
print(f"plotly version: {plotly.__version__}")

In [None]:
# Interactive line chart per feature with train and test as separate traces.
for feature in features:
    fig = go.Figure()
    for trace_name, frame in (('Train', train), ('Test', test)):
        fig.add_trace(go.Scatter(x=frame.date_time,y=frame[feature],name=trace_name))
    fig.update_layout(title=f'{feature} over time',yaxis_title=feature,xaxis_title='Date')
    fig.show()

### Note
1. it can be observed that there are some blips in the data that appear consistently across the deg_C, absolute_humidity, relative_humidity and sensor_2, sensor_3, sensor_4 readings

In [None]:
# Interactive line chart per target (train only).
for target in targets:
    train_trace = go.Scatter(x=train.date_time,y=train[target],name='Train')
    fig = go.Figure(data=[train_trace])
    fig.update_layout(title=f'{target} over time',yaxis_title=target,xaxis_title='Date')
    fig.show()

### Autocorrelation

As seen in the earlier plots, seasonality patterns are observed, hence displaying autocorrelation plots can confirm it

In [None]:
from statsmodels.graphics import tsaplots

# Stands for Time Series Analysis Plots (TSA Plots)
# Autocorrelation of each feature for lags 0..60.
for f in features:
    fig, ax = plt.subplots(figsize=(10,8))
    tsaplots.plot_acf(train[f], lags=60, ax=ax)
    ax.set_title(f'Auto Correlation of {f}')
    ax.set_xlabel("Lag at k")
    ax.set_ylabel("Correlation coefficient")
    # dots above shaded area are considered lags with autocorrelation problem
    plt.show()

In [None]:
from statsmodels.graphics import tsaplots

# Stands for Time Series Analysis Plots (TSA Plots)
# Autocorrelation of each target for lags 0..60.
for t in targets:
    fig, ax = plt.subplots(figsize=(10,8))
    tsaplots.plot_acf(train[t], lags=60, ax=ax)
    ax.set_title(f'Auto Correlation of {t}')
    ax.set_xlabel("Lag at k")
    ax.set_ylabel("Correlation coefficient")
    # dots above shaded area are considered lags with autocorrelation problem
    plt.show()

In [None]:
# matplotlib's autocorrelation plots
# Autocorrelation of the first differences (row-to-row increments) of each feature.
for f in features:
    fig, ax = plt.subplots(figsize=(10,4))
    increments = train[f].diff().iloc[1:]  # drop the leading NaN produced by diff()
    ax.acorr(increments, maxlags=60)
    ax.set_title('Autocorrelations of increments of ' + f)
    ax.grid()
    plt.show()

In [None]:
# matplotlib's autocorrelation plots
# Autocorrelation of the first differences (row-to-row increments) of each target.
for t in targets:
    plt.figure(figsize=(10,4))
    # Use the target column `t`; the previous code plotted `train[f]`, i.e. the
    # last feature left over from an earlier loop, under every target's title.
    plt.acorr(train[t].diff()[1:], maxlags=60)
    plt.title('Autocorrelations of increments of ' + t)
    plt.grid()
    plt.show()

### Note
1. Clear autocorrelation problem in the features and targets
2. to slice features by datetime to improve granularity of features
3. there are also consistent positive autocorrelation (5 sticks) and negative autocorrelation (6 sticks) patterns

#### Partial autocorrelation

In [None]:
from statsmodels.graphics import tsaplots

# Stands for Time Series Analysis Plots (TSA Plots)
# Partial autocorrelation of each feature for lags 0..60.
for f in features:
    fig, ax = plt.subplots(figsize=(10,8))
    tsaplots.plot_pacf(train[f], lags=60, ax=ax)
    ax.set_title(f'Partial Autocorrelation of {f}')
    ax.set_xlabel("Lag at k")
    ax.set_ylabel("Correlation coefficient")
    # dots above shaded area are considered lags with autocorrelation problem
    plt.show()

In [None]:
from statsmodels.graphics import tsaplots

# Stands for Time Series Analysis Plots (TSA Plots)
# Partial autocorrelation of each target for lags 0..60.
for t in targets:
    fig, ax = plt.subplots(figsize=(10,8))
    tsaplots.plot_pacf(train[t], lags=60, ax=ax)
    ax.set_title(f'Partial Autocorrelation of {t}')
    ax.set_xlabel("Lag at k")
    ax.set_ylabel("Correlation coefficient")
    # dots above shaded area are considered lags with autocorrelation problem
    plt.show()

In [None]:
# =====================================================================================
# =================== Finding the best cutoff for anomalous data ======================
# =====================================================================================

# for i in np.arange(0,0.4,0.02):
#     odd_data = train[train['absolute_humidity']<=i]
#     odd_data.plot(kind='hist',y='target_benzene', title=f'absolute_humidity at {i}')
# for i in np.arange(640,680,5):
#     odd_data = train[train['sensor_4']<=i]
#     odd_data.plot(kind='hist',y='target_benzene', title=f'sensor_4 at {i}')

# Best cutoffs found to be:
# X['sensor_4'] < 665 & X['absolute_humidity'] < 0.22

In [None]:
# Rows falling below either anomaly cutoff.
# NOTE(review): the cutoff comment in the preceding cell joins the two
# conditions with `&`, while this selection uses `|` - confirm which is intended.
low_sensor_4 = train['sensor_4'] < 665
low_absolute_humidity = train['absolute_humidity'] < 0.22
odd_data = train[low_sensor_4 | low_absolute_humidity]
print(odd_data.shape)

## Feature Engineering

In [None]:
# Keep an untouched copy of the combined frame before features are added to it.
all_data_bk = all_data.copy()

In [None]:
# Preview the combined train+test frame.
all_data.head()

In [None]:
# The months will be used for folds split
months = all_data["date_time"].dt.month[:len(train)]
# FE
all_data['year'] = all_data['date_time'].dt.year
all_data['month'] = all_data['date_time'].dt.month
all_data['week'] = all_data['date_time'].dt.week
all_data['day'] = all_data['date_time'].dt.day
all_data['dayofweek'] = all_data['date_time'].dt.dayofweek
all_data["hour"] = all_data["date_time"].dt.hour
all_data["working_hours"] =  all_data["hour"].isin(np.arange(8, 21, 1)).astype("int")
all_data["is_weekend"] = (all_data["date_time"].dt.dayofweek >= 5).astype("int")
all_data['total_mins'] = all_data.date_time.dt.hour*60+all_data.date_time.dt.minute
all_data['saturday'] = (all_data.date_time.dt.weekday==5).astype("int")
all_data["SMC"] = (all_data["absolute_humidity"] * 100) / all_data["relative_humidity"]
all_data['days_from_start'] = (all_data['date_time'] - min(all_data['date_time'])).dt.days

In [None]:
# https://www.kaggle.com/paddykb/catboost-14feature-cross-validation-fe
def pb_add(X, sensor_features=None, lags=None):
    """Add seasonal Fourier terms and shifted-difference sensor features to ``X``.

    The frame is modified in place and also returned.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain a datetime ``date_time`` column and every column named in
        ``sensor_features``. Rows are used in their existing order for shifting.
    sensor_features : sequence of str, optional
        Columns to build shift features for. Defaults to the eight
        weather/sensor columns used by this notebook.
    lags : sequence of int, optional
        Row offsets passed to ``Series.shift``. Defaults to
        ``[-1, -4, -7, -24, -24 * 7]``.

    Returns
    -------
    pandas.DataFrame
        ``X`` with these columns appended, in this order:

        * ``f{k}s`` / ``f{k}c`` for k = 1..4: sine/cosine of the number of whole
          days since the earliest ``date_time``, with a period of ``365 * k`` days.
        * ``fh{k}s`` / ``fh{k}c`` for k = 1..3: sine/cosine of the
          seconds-within-day component of that elapsed time, with a period of
          ``k`` days. The component never exceeds one day, so for k > 1 only
          part of the cycle is covered.
        * for each sensor column and lag: ``{col}_{abs(lag)}b_s`` holding
          ``col.shift(lag) - col`` and ``{col}_{abs(lag)}f_s`` holding
          ``col.shift(-lag) - col``, with missing values replaced by 0.

    Notes
    -----
    With the default negative lags, ``shift(lag)`` pulls values from *later*
    rows, so the ``b`` columns compare against rows after the current one and
    the ``f`` columns against rows before it. The column names are kept as they
    were so existing consumers are unaffected.
    """
    if sensor_features is None:
        sensor_features = [
            'deg_C',
            'relative_humidity', 'absolute_humidity',
            'sensor_1', 'sensor_2', 'sensor_3', 'sensor_4', 'sensor_5']
    if lags is None:
        lags = [-1, -4, -7, -24, -24 * 7]

    diff = X['date_time'] - X['date_time'].min()
    trend = diff.dt.days
    seconds = diff.dt.seconds

    # Yearly harmonics on the day count.
    for k in range(1, 5):
        angle = 2 * math.pi * trend / (365 * k)
        X[f'f{k}s'] = np.sin(angle)
        X[f'f{k}c'] = np.cos(angle)

    # Daily harmonics on the seconds-within-day component.
    for k in range(1, 4):
        angle = 2 * math.pi * seconds / (3600 * 24 * k)
        X[f'fh{k}s'] = np.sin(angle)
        X[f'fh{k}c'] = np.cos(angle)

    for sensor_feature in sensor_features:
        this = X[sensor_feature]
        # 'b' columns shift by `lag`, 'f' columns by `-lag`; each stores the
        # difference to the current row, with edge NaNs filled with 0.
        for suffix, sign in (('b', 1), ('f', -1)):
            for lag in lags:
                feature = f'{sensor_feature}_{abs(lag)}{suffix}_s'
                X[feature] = (this.shift(sign * lag) - this).fillna(0)

    return X

In [None]:
# Build the engineered features on a copy, then discard the raw timestamp column.
all_data = pb_add(all_data.copy()).drop(columns='date_time')

In [None]:
# (rows, columns) of the combined frame after feature engineering.
all_data.shape

In [None]:
# Split the combined frame back into train/test by position: the first
# len(train) rows are the training rows.
target_columns = ['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides']
n_train = len(train)
train_rows = all_data.iloc[:n_train]
test_rows = all_data.iloc[n_train:]

X = train_rows.drop(columns=target_columns)
y = train_rows[target_columns]
y_log = np.log(y)  # models are fitted on the natural log of the targets
X_test = test_rows.drop(columns=target_columns)

for label, frame in (('X_train', X), ('y_train', y), ('X_test', X_test)):
    print(f'{label} shape:', frame.shape)

In [None]:
# from sklearn.preprocessing import StandardScaler

# scaler = StandardScaler().fit(X)
# X_scaled = scaler.transform(X)
# X_test_scaled = scaler.transform(X_test)

In [None]:
# Plain NumPy views of the feature frames. No scaling is applied here; the
# names are kept because the training cell reads them.
X_scaled, X_test_scaled = X.values, X_test.values

In [None]:
# Start from the sample submission; its target columns are overwritten by the training loop.
preds = pd.read_csv('../input/tabular-playground-series-jul-2021/sample_submission.csv')

## Leave-one-group-out

In [None]:
# Sets of hyperparameters optimized by Optuna for each target
cb_params = [
                {'learning_rate': 0.010169009412219588,
                 'l2_leaf_reg': 8.908337085912136,
                 'bagging_temperature': 8.384477224270551,
                 'random_strength': 1.950237493637981,
                 'depth': 6,
                 'grow_policy': 'Lossguide',
                 'leaf_estimation_method': 'Newton'},
                {'learning_rate': 0.166394867169309,
                 'l2_leaf_reg': 8.704675157564441,
                 'bagging_temperature': 3.340826164726799,
                 'random_strength': 1.538518016574368,
                 'depth': 2,
                 'grow_policy': 'Depthwise',
                 'leaf_estimation_method': 'Newton'},
                {'learning_rate': 0.028141156076957437,
                 'l2_leaf_reg': 3.116523267336638,
                 'bagging_temperature': 4.420661209459851,
                 'random_strength': 1.8011752694610028,
                 'depth': 6,
                 'grow_policy': 'Depthwise',
                 'leaf_estimation_method': 'Newton'},
            ]

In [None]:
%%time
warnings.filterwarnings('ignore')
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import LeaveOneGroupOut
from catboost import CatBoostRegressor

all_fi = []
splits = 10
target_names=y_log.columns

for i, target in enumerate(target_names):
    print(f"\nTraining for {target}...")
    logo = LeaveOneGroupOut()
    oof_preds = np.zeros((X_scaled.shape[0],))
    model_preds = 0
    model_fi = 0
    for num, (train_idx, valid_idx) in enumerate(logo.split(X_scaled, y_log, months)):
        X_train, X_valid = X_scaled[[train_idx]], X_scaled[[valid_idx]]
        y_train, y_valid = y_log.loc[train_idx, target], y_log.loc[valid_idx, target]
        model = CatBoostRegressor(random_state=42,
                                 thread_count=4,
                                 verbose=False,
                                 loss_function='RMSE',
                                 eval_metric='RMSE',
                                 od_type="Iter",
                                 early_stopping_rounds=500,
                                 use_best_model=True,
                                 iterations=5000,
                                 **cb_params[i])
        model.fit(X_train, y_train,
                  eval_set=(X_valid, y_valid),
                  verbose=False)
        model_preds += np.exp(model.predict(X_test_scaled)) / splits
        model_fi += model.feature_importances_
        oof_preds[valid_idx] = np.exp(model.predict(X_valid))
        print(f"Fold {num} RMSLE: {np.sqrt(mean_squared_log_error(np.exp(y_valid), oof_preds[valid_idx]))}")
    print(f"\nOverall RMSLE: {np.sqrt(mean_squared_log_error(np.exp(y_log[target]), oof_preds))}")    
    preds[target] = model_preds
    all_fi.append(dict(zip(X.columns, model_fi)))

In [None]:
# Preview the filled-in submission frame.
preds.head()

In [None]:
# (rows, columns) of the submission frame.
preds.shape

In [None]:
# Write the submission file without the index column.
preds.to_csv('submission.csv', index=False)

In [None]:
# TODO - Feature engineering
# TODO - AutoML
# TODO - Classic scikitlearn ML, from catboost import CatBoostRegressor
# TODO - pyGAM