In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**NOTE: this notebook was created in July; with a basic, untuned Random Forest regressor it achieved an RMSE of 0.18893.**

The goal was just to obtain some insights when working with time series.

In [None]:
import pandas as pd
import pandas_profiling as pp
import numpy as np
from datetime import date
import scipy.stats as sp
from scipy.special import boxcox, inv_boxcox
import math

# scikit-learn modules for feature selection and model evaluation
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE, SelectKBest, SelectFromModel, chi2, f_regression, mutual_info_regression
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, StratifiedShuffleSplit
from sklearn.metrics import mean_squared_error
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler, MinMaxScaler


import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Training data: read the CSV and parse the timestamp column in one expression.
train = pd.read_csv("../input/tabular-playground-series-jul-2021/train.csv").assign(
    date_time=lambda frame: pd.to_datetime(frame["date_time"], format="%Y-%m-%d %H:%M:%S")
)

# Test data: same treatment as the training data.
test = pd.read_csv("../input/tabular-playground-series-jul-2021/test.csv").assign(
    date_time=lambda frame: pd.to_datetime(frame["date_time"], format="%Y-%m-%d %H:%M:%S")
)

# Submission template, indexed by its first column.
submission = pd.read_csv(
    "../input/tabular-playground-series-jul-2021/sample_submission.csv",
    index_col=0,
)

In [None]:
# Create a Pandas Profiling report of the training frame to get a quick grasp of the data.
# Being the last expression of the cell, the report object is what the cell displays.
pp.ProfileReport(train)

In [None]:
# Dataset periods: first and last calendar day covered by each dataset.
# `min_train_date` is reused further down as the origin of the `day_index` feature.
min_train_date = train["date_time"].dt.date.min()
min_test_date = test["date_time"].dt.date.min()
max_train_date = train["date_time"].dt.date.max()
max_test_date = test["date_time"].dt.date.max()

print(f'Train dataset period: {min_train_date} to {max_train_date}')
# Fixed: this line used to print the *train* dates under the "Test" label.
print(f'Test dataset period: {min_test_date} to {max_test_date}')

In [None]:
# Defining the targets and original features.
# Column 0 of `train` is `date_time` and its last three columns are the targets;
# everything in between is an original (sensor / weather) feature.
targets = train.columns[-3:]
original_features = train.columns[1:-3]
train_original = train.columns

# Creating date features for seasonality and investigation.
# Both frames are modified in place.
for df in [train, test]:
    timestamps = df["date_time"]
    df["year"] = timestamps.dt.year
    df["month"] = timestamps.dt.month
    # `Series.dt.weekofyear` is deprecated and no longer exists in pandas 2.0;
    # `isocalendar().week` is its replacement. The cast keeps the plain int64
    # dtype that `weekofyear` used to return (isocalendar yields nullable UInt32).
    df["week"] = timestamps.dt.isocalendar().week.astype("int64")
    df["day"] = timestamps.dt.dayofweek
    df["hour"] = timestamps.dt.hour
    # 0/1 season flags (meteorological seasons, northern hemisphere month grouping).
    df["winter"] = df["month"].isin([1, 2, 12]).astype("int")
    df["spring"] = df["month"].isin([3, 4, 5]).astype("int")
    df["summer"] = df["month"].isin([6, 7, 8]).astype("int")
    df["autumn"] = df["month"].isin([9, 10, 11]).astype("int")
    # 0/1 flags: hour in 8..20, hour in 0..11, Saturday/Sunday.
    df["working_hours"] = df["hour"].isin(np.arange(8, 21, 1)).astype("int")
    df["morning"] = df["hour"].isin(np.arange(0, 12, 1)).astype("int")
    df["is_weekend"] = (timestamps.dt.dayofweek >= 5).astype("int")
    # Whole days elapsed since the first day of the training set (vectorised;
    # the time of day is dropped by normalize() before subtracting).
    df['day_index'] = (timestamps.dt.normalize() - pd.Timestamp(min_train_date)).dt.days

#Possible original_features shifts (Did not take the time to fine tune those parameters, so I will let them #):
# train["s1-6"] = train["sensor_1"] - train["sensor_1"].shift(periods=6, fill_value=0)
# train["s2-1"] = train["sensor_2"] - train["sensor_2"].shift(periods=1, fill_value=0)
# train["s2-6"] = train["sensor_2"] - train["sensor_2"].shift(periods=6, fill_value=0)
# train["s2-24"] = train["sensor_2"] - train["sensor_2"].shift(periods=24, fill_value=0)
# train["s2-7*24"] = train["sensor_2"] - train["sensor_2"].shift(periods=7*24, fill_value=0)
# train["s3-6"] = train["sensor_3"] - train["sensor_3"].shift(periods=6, fill_value=0)
# train["s4-6"] = train["sensor_4"] - train["sensor_4"].shift(periods=6, fill_value=0)
# train["s5-6"] = train["sensor_5"] - train["sensor_5"].shift(periods=6, fill_value=0)

# NOTE: `date_time` is deliberately kept as a regular column.
# The former `train.set_index("date_time")` / `test.set_index("date_time")` calls
# discarded their result (set_index returns a new frame), so they never changed
# anything. Really setting the index would break the following cells, which use
# `date_time` as the first column (plot x-axis) and drop it by name before modelling.

In [None]:
# Let's make some simple visualizations to understand our data (features):
colors_train = ["tab:blue", "tab:orange", "tab:green"]
colors_test = ["tab:blue", "tab:orange", "tab:green"]

def plot_features(df=train, df2=test):
    """Plot each original feature and each target against time, one subplot per column.

    Feature panels show `df` (blue) and `df2` (orange) on the same axes; target
    panels show `df` only. Columns are looked up by name (from the module-level
    `original_features` / `targets`), so the plot does not depend on column order.

    Parameters
    ----------
    df : DataFrame holding the features and the targets; its first column is
        used as the x-axis.
    df2 : DataFrame holding the features; its first column is used as the x-axis.
    """
    n_features = len(original_features)
    fig, ax = plt.subplots(n_features + len(targets), 1, figsize=(n_features*2, n_features*6))

    # The first column of each frame is the time axis.
    time_col, time_col2 = df.columns[0], df2.columns[0]

    for axis, col in zip(ax[:n_features], original_features):
        axis.plot(df[time_col], df[col], color=colors_train[0], label="train")
        axis.plot(df2[time_col2], df2[col], color=colors_test[1], label="test")
        axis.set_title(col, fontsize=15, color="crimson")
        axis.legend(loc="upper right")

    for axis, col in zip(ax[n_features:], targets):
        axis.plot(df[time_col], df[col], color=colors_train[0])
        axis.set_title(col, fontsize=15, color="crimson")

plot_features(train, test)

In [None]:
# Let's try to get more granular information about the targets:

granularity = ["month", "day", "hour"]

def plot_targets(df=train):
    """Plot the mean of every target per month, per day of week and per hour.

    The grid has one row per entry of the module-level `granularity` list and
    one column per target.

    Parameters
    ----------
    df : DataFrame containing the target columns and the `granularity` columns.
        (Previously this argument was ignored and the global `train` was always used.)
    """
    # Title prefixes used for each grouping column.
    title_prefix = {"month": "Month", "day": "Day", "hour": "Hours"}
    colors_train = ["mediumblue", "darkorange", "olive"]

    # Rows follow `granularity`, columns follow `targets`; squeeze=False keeps
    # `ax` two-dimensional whatever the two lengths are.
    fig, ax = plt.subplots(
        len(granularity), len(targets),
        figsize=(len(original_features)*2, len(original_features)*2),
        squeeze=False,
    )

    for row, period in enumerate(granularity):
        for i, col in enumerate(targets):
            ax[row, i].plot(df.groupby(period)[col].mean(), color=colors_train[i % len(colors_train)])
            ax[row, i].set_title(f"{title_prefix.get(period, period)} - {col}")

plot_targets(train)

In [None]:
# Let's try to get more granular information about the variables:

granularity = ["month", "day", "hour"]

def plot_features2(df=train):
    """Plot the mean of every original feature per month, per day of week and per hour.

    The grid has one row per original feature and one column per entry of the
    module-level `granularity` list; each column of the grid has its own colour.

    Parameters
    ----------
    df : DataFrame containing the original feature columns and the `granularity`
        columns. (Previously this argument was ignored and the global `train`
        was always used.)
    """
    n_features = len(original_features)
    colors_train = ["tab:blue", "tab:orange", "tab:green"]

    # squeeze=False keeps `ax` two-dimensional whatever the grid size is.
    fig, ax = plt.subplots(
        n_features, len(granularity),
        figsize=(n_features*2, n_features*4),
        squeeze=False,
    )

    for i, period in enumerate(granularity):
        period_color = colors_train[i % len(colors_train)]
        for j, col in enumerate(original_features):
            ax[j, i].plot(df.groupby(period)[col].mean(), color=period_color)
            ax[j, i].set_title(f"{period} - {col}")

plot_features2(train)

In [None]:
# Let's now have a look at the distribution of the features:

fig, ax = plt.subplots(3, 3, figsize=(15, 15))
fig.subplots_adjust(hspace=0.35)  # set once; it does not depend on the loop

# `ax.T.flat` walks the grid column by column, i.e. panel k is ax[k % 3, k // 3].
panels = list(ax.T.flat)

for axis, col in zip(panels, original_features):
    sns.histplot(data=train[col], kde=True, ax=axis)
    # Title taken from the plotted column itself rather than from its position in `train`.
    axis.set_title(f"{col}", fontsize=12, color="crimson")

# Remove every panel that did not receive a feature.
for axis in panels[len(original_features):]:
    fig.delaxes(axis)

In [None]:
# Distribution of each target, annotated with its skewness and kurtosis.
fig, ax = plt.subplots(3, 1, figsize=(10, 10))
fig.subplots_adjust(hspace=0.45)  # set once; it does not depend on the loop

for axis, col in zip(ax, targets):
    sns.histplot(train[col], kde=True, ax=axis)
    axis.set_title(col, fontsize=15, color="crimson")
    axis.set_xlabel('')
    # Hide the top and right borders of the panel.
    axis.spines.top.set_visible(False)
    axis.spines.right.set_visible(False)

# Skewness / kurtosis of every target, keyed by column name.
skew = {col: sp.skew(train[col]) for col in targets}
kurtosis = {col: sp.kurtosis(train[col]) for col in targets}

# (panel index, target, x, y of the skewness line, y of the kurtosis line).
# Positions are in data coordinates and were chosen by hand for each histogram.
annotation_layout = [
    (0, 'target_carbon_monoxide', 10, 500, 450),
    (1, 'target_benzene', 52, 450, 400),
    (2, 'target_nitrogen_oxides', 1200, 550, 500),
]
text_style = dict(fontsize=11, fontweight='light', fontfamily='serif', color='#323232')

for panel, name, x, y_skew, y_kurtosis in annotation_layout:
    ax[panel].text(x, y_skew, f"skewness: {skew[name]:.2f}", **text_style)
    ax[panel].text(x, y_kurtosis, f"kurtosis: {kurtosis[name]:.2f}", **text_style)


In [None]:
# Print the skewness and kurtosis of every original feature.
display("________ORIGINAL_FEATURES_________")

separator = "---------------------------------------------"
for col in original_features:
    feature_values = train[col]
    skewness_value = sp.skew(feature_values)
    kurtosis_value = sp.kurtosis(feature_values)
    print(f"skewness of {col} = {skewness_value}")
    print(f"kurtosis of {col} = {kurtosis_value}")
    print(separator)


In [None]:
# Possible Box-Cox transformation of the targets to improve the RMSE.
#
# sp.boxcox returns (transformed_values, fitted_lambda), so one call per target
# gives both; the previous version ran the lambda optimisation twice per target.
#
# NOTE: the target columns of `train` are overwritten in place. Every later cell
# therefore works on the transformed targets, and re-running this cell would
# transform the already-transformed values again. The fitted lambdas are kept
# because they are needed to invert the transformation.
train['target_carbon_monoxide'], lambda_monoxide = sp.boxcox(train['target_carbon_monoxide'])
train['target_benzene'], lambda_benzene = sp.boxcox(train['target_benzene'])
train['target_nitrogen_oxides'], lambda_nitrogen = sp.boxcox(train['target_nitrogen_oxides'])

# Distribution of each target after the transformation.
fig, ax = plt.subplots(1, 3, figsize=(15, 2))
for axis, col in zip(ax, targets):
    sns.histplot(train[col], kde=True, ax=axis)
    axis.set_title(col, fontsize=15, color="crimson")

display("________TARGETS_________")
for col in targets:
    print(f"skewness of {col} = {sp.skew(train[col])}")
    print(f"kurtosis of {col} = {sp.kurtosis(train[col])}")
    print("---------------------------------------------")

In [None]:
# Correlation heatmap of the original columns (features and targets).
plt.figure(figsize=(10,10))
# `date_time` is dropped explicitly: older pandas silently ignored non-numeric
# columns in corr(), but that implicit behaviour is not guaranteed by newer versions.
train_features_corr = train[train_original].drop(columns="date_time").corr()
# Hide the diagonal and the upper triangle. The mask is built from the matrix
# shape, not from the correlation values, so a correlation of exactly 0 is
# still hidden when it lies in the upper triangle.
upper_triangle = np.triu(np.ones_like(train_features_corr, dtype=bool))
sns.heatmap(train_features_corr , annot=True, annot_kws={"weight": "bold", "fontsize":10}, cmap="coolwarm", mask=upper_triangle)

In [None]:
# Adding features based on other works on air quality.
# The same three columns are added, in place, to both frames; the arithmetic is
# vectorised instead of going through element-wise `apply` calls.
# NOTE: np.log yields -inf / NaN (with a warning) for non-positive humidity,
# where math.log used to raise a ValueError.
for df in [train, test]:
    df['Dew_point'] = (17.27 * df['deg_C']) / (237.7 + df['deg_C']) + np.log(df['absolute_humidity'])
    df['Partial_pressure'] = ((237.7 + df['deg_C']) * 286.8 * df['absolute_humidity']) / 100000
    df['Saturated_wvd'] = (df['absolute_humidity'] * 100) / df['relative_humidity']

In [None]:
# Correlation heatmap of every numeric column of the training frame.
plt.figure(figsize=(20,20))
# Compute the matrix once (it used to be computed twice) and drop `date_time`
# explicitly instead of relying on corr() silently skipping non-numeric columns.
train_corr = train.drop(columns="date_time").corr()
# Shape-based mask for the diagonal and upper triangle, so that correlations of
# exactly 0 in the upper triangle are hidden as well.
upper_triangle = np.triu(np.ones_like(train_corr, dtype=bool))
sns.heatmap(train_corr , annot=True, annot_kws={"weight": "bold", "fontsize":10}, cmap="coolwarm", mask=upper_triangle)

In [None]:
# Let's first try to predict the benzene, considering how highly correlated it is to sensor_2.
# Correlation of every numeric column with the benzene target, strongest first.
# `date_time` is dropped explicitly instead of relying on corr() silently
# skipping non-numeric columns.
train_benzene_corr = (
    train.drop(columns="date_time")
    .corr()['target_benzene']
    .sort_values(ascending=False)
)
train_benzene_corr

In [None]:
# Select the k features that share the most information with a target:
def Select_features(X, Y, n_features):
    '''Return the names of the `n_features` columns of X most informative about Y.

    The rows are split 60/40 with a fixed seed, a StandardScaler is fitted on the
    60% part, and SelectKBest (scored with mutual_info_regression) is fitted on
    that scaled part only.

    Parameters
    ----------
    X : DataFrame of candidate features (its `.columns` provide the names).
    Y : target values aligned with the rows of X.
    n_features : number of features to keep (the `k` of SelectKBest).

    Returns
    -------
    The entries of `X.columns` that were selected.
    '''
    from functools import partial  # local import keeps this cell self-contained

    # Only the training part of the split is needed for the selection.
    X_train, _, Y_train, _ = train_test_split(X, Y, test_size = 0.4, random_state = 123)

    # Scaling
    X_train_scaled = StandardScaler().fit(X_train).transform(X_train)

    # mutual_info_regression draws random numbers internally; seeding it makes
    # the selected features reproducible from one run to the next.
    seeded_mutual_info = partial(mutual_info_regression, random_state=123)
    selector = SelectKBest(seeded_mutual_info, k=n_features)
    selector.fit(X_train_scaled, Y_train)

    # Names are read from X itself (not rebuilt from the global `train`), so the
    # function also works when X is a subset or a differently ordered frame.
    feature_idx = selector.get_support()
    feature_names = X.columns[feature_idx]

    return feature_names

In [None]:
#Let's quickly fit a model to check how accurate we are:

def fit_model(X, Y):
    '''Fit a RandomForestRegressor (fixed random_state=2) on X, Y and return it.'''
    # `fit` returns the estimator itself, so construction and training chain.
    return RandomForestRegressor(random_state=2).fit(X, Y)

def train_and_get_metrics(X, Y):
    '''Fit the Random Forest on 60% of the rows and score it on the other 40%.

    Returns a tuple (mse, predictions): the mean squared error on the held-out
    rows and the predictions made for those rows.
    '''
    # Hold out 40% of the rows for evaluation (fixed seed for reproducibility).
    X_fit, X_holdout, Y_fit, Y_holdout = train_test_split(X, Y, test_size = 0.4, random_state = 12)

    # Standardise with statistics learnt on the fitting rows only, then apply
    # the same transformation to the held-out rows.
    scaler = StandardScaler().fit(X_fit)
    X_fit_scaled = scaler.transform(X_fit)
    X_holdout_scaled = scaler.transform(X_holdout)

    # Train on the scaled fitting rows and the matching target values.
    model = fit_model(X_fit_scaled, Y_fit)

    # Predict the held-out rows and measure the error.
    y_predict_r = model.predict(X_holdout_scaled)
    return mean_squared_error(Y_holdout, y_predict_r), y_predict_r

def evaluate_model_on_features(X, Y):
    '''Train and score the model on (X, Y); return the score as a one-row frame.

    Returns a tuple (display_df, predictions), where display_df has a single
    "MSE" column holding the score computed by train_and_get_metrics.
    '''
    mse, y_predict_r = train_and_get_metrics(X, Y)

    # One-row table, convenient for stacking the results of several runs.
    display_df = pd.DataFrame({"MSE": [mse]})
    return display_df, y_predict_r


In [None]:
# Making different tests with different numbers of predictors (target: benzene).

# Split dataframe for feature selection:
X = train.drop(columns=['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides','date_time'])
Y = train['target_benzene']

# Baseline: every available feature.
all_features_eval_df, y_predict_r = evaluate_model_on_features(X, Y)
all_features_eval_df.index = ['All_features']
eval_frames = [all_features_eval_df]

# Same evaluation restricted to the k best features, for several values of k.
kbest_sizes = (1, 10, 18, 20)
kbest_features = {}
for k in kbest_sizes:
    kbest_features[k] = Select_features(X, Y, k)
    Kbest_features_eval_df, y_predict_r = evaluate_model_on_features(X[kbest_features[k]], Y)
    Kbest_features_eval_df.index = [f'Kbest_features_{k}']
    eval_frames.append(Kbest_features_eval_df)

# Keep the historical names of the selected feature sets available.
A1, A10, A18, A20 = (kbest_features[k] for k in kbest_sizes)

# DataFrame.append is deprecated (and absent from pandas 2.0): collect the
# one-row frames in a list and concatenate them once.
results = pd.concat(eval_frames)

# Check the metrics
results

In [None]:
# Making different tests with different numbers of predictors (target: carbon monoxide).

# Split dataframe for feature selection:
X = train.drop(columns=['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides','date_time'])
Y = train['target_carbon_monoxide']

# Baseline: every available feature.
all_features_eval_df, y_predict_r = evaluate_model_on_features(X, Y)
all_features_eval_df.index = ['All_features']
eval_frames = [all_features_eval_df]

# Same evaluation restricted to the k best features, for several values of k.
kbest_sizes = (1, 10, 18, 20)
kbest_features = {}
for k in kbest_sizes:
    kbest_features[k] = Select_features(X, Y, k)
    Kbest_features_eval_df, y_predict_r = evaluate_model_on_features(X[kbest_features[k]], Y)
    Kbest_features_eval_df.index = [f'Kbest_features_{k}']
    eval_frames.append(Kbest_features_eval_df)

# Keep the historical names of the selected feature sets available.
A1, A10, A18, A20 = (kbest_features[k] for k in kbest_sizes)

# DataFrame.append is deprecated (and absent from pandas 2.0): collect the
# one-row frames in a list and concatenate them once.
results = pd.concat(eval_frames)

# Check the metrics
results

In [None]:
# Making different tests with different numbers of predictors (target: nitrogen oxides).

# Split dataframe for feature selection:
X = train.drop(columns=['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides','date_time'])
Y = train['target_nitrogen_oxides']

# Baseline: every available feature.
all_features_eval_df, y_predict_r = evaluate_model_on_features(X, Y)
all_features_eval_df.index = ['All_features']
eval_frames = [all_features_eval_df]

# Same evaluation restricted to the k best features, for several values of k.
kbest_sizes = (1, 10, 18, 20)
kbest_features = {}
for k in kbest_sizes:
    kbest_features[k] = Select_features(X, Y, k)
    Kbest_features_eval_df, y_predict_r = evaluate_model_on_features(X[kbest_features[k]], Y)
    Kbest_features_eval_df.index = [f'Kbest_features_{k}']
    eval_frames.append(Kbest_features_eval_df)

# Keep the historical names of the selected feature sets available.
A1, A10, A18, A20 = (kbest_features[k] for k in kbest_sizes)

# DataFrame.append is deprecated (and absent from pandas 2.0): collect the
# one-row frames in a list and concatenate them once.
results = pd.concat(eval_frames)

# Check the metrics
results

Based on this simple EDA and these formulas, it is fairly straightforward to look for better combinations of features to feed into the model (especially the time-shift features).