# Table of Contents
<a id="table-of-contents"></a>
- [1 Introduction](#Introduction)
- [2 Import modules](#import-modules)
    - [2.1 Roughly visualise data](#roughly-virtualise-data)
    - [2.2 Preprocessing](#preprocessing)
    - [2.3 Explore data](#explore-data)
- [3 Normalisation](#normalisation)
    - [3.1 Boxcox](#boxcox)
    - [3.2 Detect outlier using IQR](#detect-outlier-using-iqr)
- [4 Modeling](#modeling)
    - [4.1 Initial cross validation](#initial-cross-validation)
    - [4.2 Linear regression](#linear-regression)
    - [4.3 Evaluation](#evaluation)
- [5 Final](#final)
    - [5.1 Export pipeline](#export-pipeline)
    - [5.2 Make submission](#make-submission)

<a id="import-modules"></a>
# Import modules

In [None]:
# OS
import os

# Data format
import datetime

# Copying
from copy import copy

# Data processing
import pandas as pd

# Data visualisation
import seaborn as sns
from matplotlib import pyplot as plt
import plotly.express as px

# Widgets
import ipywidgets as widgets

# Exporter
from inspect import getsource

# Math and model
import numpy as np
import scipy
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression

# Normaliser
from scipy.special import (
    boxcox,
    inv_boxcox
)

In [None]:
# Load the competition train/test tables from the Kaggle input directory
Train_data = pd.read_csv('../input/tabular-playground-series-jul-2021/train.csv')
Test_data = pd.read_csv('../input/tabular-playground-series-jul-2021/test.csv')


# Copy dataframes to test pipeline
temp_Train_data = copy(Train_data)
temp_Test_data = copy(Test_data)

# Make pipeline folder; exist_ok makes re-running this cell harmless while
# still raising if 'pipeline' exists but is not a directory
os.makedirs('pipeline', exist_ok=True)

In [None]:
# Show the raw train and test tables in a two-tab widget
kaggle_data_tabs = widgets.Tab()
kaggle_data_tabs.children = [widgets.Output(), widgets.Output()]  # one Output per tab

for tab_position, (tab_label, tab_frame) in enumerate(
    (('Train', Train_data), ('Test', Test_data))
):
    kaggle_data_tabs.set_title(tab_position, tab_label)
    # Output captured inside this context is rendered in the matching tab
    with kaggle_data_tabs.children[tab_position]:
        display(tab_frame)

display(kaggle_data_tabs)

<a id="roughly-virtualise-data"></a>
## Roughly visualise data

In this section, we draw a pair plot of the training data to look for relationships between columns.

<a id="roughly-pairplot"></a>
### Quantile Pairplot

In [None]:
# Draw seaborn's pair plot for the working copy of the training table
sns.pairplot(temp_Train_data)

<a id="parallel-coordinates"></a>
### Parallel Coordinates

***This is an interactive section. Feel free to play around with the following plots, e.g. by moving columns or selecting a region.***

In [None]:
# set up function
def plot_parallel_coordinate(target: str):
    """
    Draw a parallel-coordinates plot of ``Train_data`` coloured by one target.

    All three ``target_*`` columns are removed from the table and only the
    requested one is appended back as the last column; that column also
    drives the colour scale.

    :params:
    target (str): Name of the target column to keep and colour by
    """
    table = Train_data
    all_targets = [
        'target_benzene',
        'target_nitrogen_oxides',
        'target_carbon_monoxide'
    ]
    features_only = table.drop(all_targets, axis=1)
    plot_frame = pd.concat([features_only, table[[target]]], axis=1)

    # Human-readable axis names for the target columns
    axis_labels = {
        'target_benzene': 'Benzene',
        'target_nitrogen_oxides': 'Nitrogen oxides',
        "target_carbon_monoxide": "Carbon monoxide"
    }
    # Colour range spans the observed values of the chosen target
    colour_limits = [
        min(table[target]),
        max(table[target])
    ]

    fig = px.parallel_coordinates(
        plot_frame,
        color=target,
        labels=axis_labels,
        color_continuous_scale=px.colors.diverging.Tealrose,
        color_continuous_midpoint=2,
        width=800,
        height=500,
        range_color=colour_limits
    )

    fig.show()


#### Benzene

In [None]:
# Parallel-coordinates plot coloured by the benzene target
plot_parallel_coordinate('target_benzene')

#### Nitrogen oxides

In [None]:
# Parallel-coordinates plot coloured by the nitrogen oxides target
plot_parallel_coordinate('target_nitrogen_oxides')

#### Carbon monoxide

In [None]:
# Parallel-coordinates plot coloured by the carbon monoxide target
plot_parallel_coordinate('target_carbon_monoxide')

<a id="preprocessing"></a>
## Preprocessing

### Format datetime

In [None]:
def format_date_time(table: pd.DataFrame):
    """
    Parse the ``date_time`` column from ISO-format strings into datetimes.

    The column is overwritten on the table that was passed in.

    :params:
    table (pd.DataFrame): An input dataframe to reformat
    :return:
    The same dataframe object, with ``date_time`` converted
    """
    parsed = table['date_time'].apply(datetime.datetime.fromisoformat)
    table['date_time'] = parsed
    return table


# Parse date_time on both working copies
temp_Train_data = format_date_time(temp_Train_data)
temp_Test_data = format_date_time(temp_Test_data)


# Save to pipeline
# Mode 'w' (not 'a+'): re-running this cell must replace the file rather than
# append another copy of the function source to it
with open('pipeline/01_format_date_time.py', 'w') as f:
    f.write(getsource(format_date_time))

<a id="explore-data"></a>
## Explore data

### Check null value

In [None]:
# Count missing values per column; rows are column names, one column per table
null_value_report = pd.DataFrame()

for split_name, split_frame in (('Train', temp_Train_data), ('Test', temp_Test_data)):
    for col in split_frame.columns:
        missing_count = sum(split_frame[col].isnull())
        null_value_report.loc[col, split_name] = missing_count

null_value_report

***Lucky!*** We don't find any null values here!

### Check for categorical columns

In [None]:
# Count distinct values per column; rows are column names, one column per table
unique_value_report = pd.DataFrame()

for split_name, split_frame in (('Train', temp_Train_data), ('Test', temp_Test_data)):
    for col in split_frame.columns:
        distinct_count = len(set(split_frame[col]))
        unique_value_report.loc[col, split_name] = distinct_count

unique_value_report

Also, there seem to be no categorical columns in this dataset.

### Seasonality check

The time of day may affect the measurements, so we need to look at the distribution of values for each hour.

In [None]:
# Group the training rows by hour of day
hour_of_day = temp_Train_data.date_time.map(lambda t: t.hour)
grouped = temp_Train_data.groupby(hour_of_day)

# Per-hour mean and variance, with the index labelled 'hour'
group_hour_mean = grouped.mean()
group_hour_mean.index = group_hour_mean.index.rename('hour')

group_hour_var = grouped.var()
group_hour_var.index = group_hour_var.index.rename('hour')

# Tabbed output: first five rows of each statistic in its own tab
kaggle_data_tabs = widgets.Tab()
kaggle_data_tabs.children = [widgets.Output(), widgets.Output()]
for tab_position, (tab_label, tab_stats) in enumerate(
    (('Mean', group_hour_mean), ('Var', group_hour_var))
):
    kaggle_data_tabs.set_title(tab_position, tab_label)
    with kaggle_data_tabs.children[tab_position]:
        display(tab_stats.head(5))
display(kaggle_data_tabs)

In [None]:
# One box-plot panel per hour group
plot = grouped.boxplot(rot=45, fontsize=24, figsize=(90,100), layout=(12,2), sharex=True)

# Flatten the returned axes once and decorate each panel
hour_axes = np.asarray(plot).reshape(-1)
for panel_index, panel in enumerate(hour_axes):
    panel.set_xlabel('')
    panel.set_title(f'Hour: {panel_index}', fontsize=36)

fig = hour_axes[0].get_figure()
plt.show()

In [None]:
%matplotlib inline
# Take hour from date time
temp_Train_data['hour'] = temp_Train_data.date_time.map(lambda t: t.hour)
# Box plots grouped by the new `hour` column; `_` discards the returned axes
_ = temp_Train_data.boxplot(by='hour', fontsize=24, figsize=(90,100), layout=(12,2))

This is sad, since I quite believed that `hour` is related to `target_*`, but the plots show that it only slightly affects them. So I will focus more closely on `target_*`.

In [None]:
%matplotlib inline
plt.ioff()

# style.context() is a context manager: the style only applies to code that
# runs inside the `with` block, so the figure is created and shown in there.
# (Called as a bare statement, as before, it had no effect on the plot.)
with plt.style.context('dark_background'):
    # Box plots of two targets grouped by hour, at a readable size
    plot = temp_Train_data[
        ['hour', 'target_carbon_monoxide', 'target_benzene']
    ].boxplot(
        by='hour',
        fontsize=8,
        figsize=(16, 10),
        layout=(1, 2),
        rot=45
    )
    fig = np.asarray(plot).reshape(-1)[0].get_figure()
    fig.suptitle('Scope plot', size=12)
    plt.show()

### One-way ANOVA

One-way ANOVA (one-way analysis of variance) is another way to check whether a feature varies significantly between groups of samples.
<br/>
First, we set the null hypothesis that there is no difference between hours. If p < 0.05, we reject this hypothesis.

ref. 
* https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.f_oneway.html
* https://en.wikipedia.org/wiki/One-way_analysis_of_variance

In [None]:
# List the columns that remain once date_time and hour are dropped
temp_Train_data.drop(['date_time', 'hour'], axis=1).columns

In [None]:
# Run a one-way ANOVA per column, using the 24 hours of the day as groups
anova_columns = temp_Train_data.drop(['date_time', 'hour'], axis=1).columns
for col in anova_columns:
    # One sample per hour: the column's values for rows recorded in that hour
    col_at = [
        temp_Train_data[temp_Train_data['hour'] == hour][col]
        for hour in range(24)
    ]
    print(f"Columns {col}: ")
    print(scipy.stats.f_oneway(*col_at))

At this scale, `hour` seems to affect `target_*` more. The one-way ANOVA test also suggests that the other features vary significantly between hours. That leads us to data normalisation.
<br/>
But before moving on, we add the `hour` extraction column to the pipeline.
* Note: I don't use `month` because some months have too few samples in the table.

In [None]:
def take_hour(table: pd.DataFrame):
    """
    Add an ``hour`` column holding the hour of each ``date_time`` value.

    The column is written onto the table that was passed in, and that same
    table is returned.
    """
    def hour_of(stamp):
        return stamp.hour

    hours = table['date_time'].map(hour_of)
    table['hour'] = hours
    return table


def take_month(table: pd.DataFrame):
    """
    Add a ``month`` column holding the month of each ``date_time`` value.

    The column is written onto the table that was passed in, and that same
    table is returned.
    """
    def month_of(stamp):
        return stamp.month

    months = table['date_time'].map(month_of)
    table['month'] = months
    return table


# Add the hour column to both working copies
temp_Train_data = take_hour(temp_Train_data)
temp_Test_data = take_hour(temp_Test_data)


# Save to pipeline
# Mode 'w' (not 'a+'): re-running this cell must replace the file rather than
# append another copy of the function source to it
with open('pipeline/02_take_hour.py', 'w') as f:
    f.write(getsource(take_hour))


<a id="normalisation"></a>
# Normalisation

## Filter outliers

First, we look for outliers using a box plot.

In [None]:
# Box plot of the three target columns of the working training table
temp_Train_data.boxplot(
    column=[
        'target_carbon_monoxide',
        'target_benzene',
        'target_nitrogen_oxides'
    ]
)
plt.title("Observe outlier using box plot")

In [None]:
def iqr_filter(table: pd.DataFrame):
    """
    Drop rows whose target values fall outside an IQR-based range.

    The three target columns are processed one after another; each pass keeps
    only rows with ``Q1 - 1.25 * IQR <= value <= Q3 + 1.25 * IQR``, where the
    quartiles are computed on the rows that survived the previous pass.

    Train data only (the target columns must be present).
    """
    target_columns = (
        'target_carbon_monoxide',
        'target_benzene',
        'target_nitrogen_oxides',
    )
    for target in target_columns:
        values = table[target]
        lower_quartile = values.quantile(0.25)
        upper_quartile = values.quantile(0.75)
        spread = upper_quartile - lower_quartile

        # Both bounds are inclusive
        above_floor = values >= (lower_quartile - 1.25 * spread)
        below_ceiling = values <= (upper_quartile + 1.25 * spread)
        table = table[above_floor & below_ceiling]

    return table


# Remove target outliers from the working training table
temp_Train_data = iqr_filter(temp_Train_data)
# Mode 'w' (not 'a+'): re-running this cell must replace the file rather than
# append another copy of the function source to it
with open('pipeline/03_iqr_filter.py', 'w') as f:
    f.write(getsource(iqr_filter))

In [None]:
# Same three-target box plot, drawn again to inspect the filtered table
temp_Train_data.boxplot(
    column=[
        'target_carbon_monoxide',
        'target_benzene',
        'target_nitrogen_oxides'
    ]
)
plt.title("Box plot after filtering")

<a id="boxcox"></a>
## Boxcox

In this section, we will use `boxcox` to normalise numerical data. `boxcox` is a transformation that can convert a non-normal variable into an approximately normal one.
* You may try any other normaliser that gives better results

In [None]:
def boxcox_normalise(table: pd.DataFrame):
    """
    Return a new table with a Box-Cox transform (lambda 0.0001) applied.

    Time-related columns, ``relative_humidity`` and ``deg_C`` are copied over
    unchanged; every other column is transformed. Column order is preserved.
    """
    untouched = {
        # I don't want to normalise time, I don't think it's useful
        'date_time',
        'hour',
        'month',
        # These columns seems normal already
        'relative_humidity',
        # Negativable
        'deg_C',
    }

    normalised = pd.DataFrame()
    for name in table.columns:
        column = table[name]
        normalised[name] = column if name in untouched else boxcox(column, 0.0001)
    return normalised


# Transform both working copies
boxcox_Train_data = boxcox_normalise(temp_Train_data)
boxcox_Test_data = boxcox_normalise(temp_Test_data)

# Mode 'w' (not 'a+'): re-running this cell must replace the file rather than
# append another copy of the function source to it
with open('pipeline/04_boxcox_normalise.py', 'w') as f:
    f.write(getsource(boxcox_normalise))

<a id="detect-outlier-using-iqr"></a>
## Detect outlier using IQR

In [None]:
# Box plot of the three target columns after the Box-Cox transform
boxcox_Train_data.boxplot(
    column=[
        'target_carbon_monoxide',
        'target_benzene',
        'target_nitrogen_oxides'
    ]
)

In [None]:
# Sort columns
def sort_columns(table: pd.DataFrame):
    """
    Move the three target columns to the end of the table.

    Feature columns keep their relative order; the targets follow in the
    fixed order carbon monoxide, benzene, nitrogen oxides.

    Train features only (the target columns must be present).
    """
    targets = ['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides']
    features = [name for name in table.columns if name not in targets]
    return table[features + targets]

# Put the target columns last in the transformed training table
boxcox_Train_data = sort_columns(boxcox_Train_data)

# Mode 'w' (not 'a+'): re-running this cell must replace the file rather than
# append another copy of the function source to it
with open('pipeline/03.1_sort_columns.py', 'w') as f:
    f.write(getsource(sort_columns))

#### Final filter

In [None]:
# Box plot of the transformed targets before the final filter
boxcox_Train_data.boxplot(
    column=[
        'target_carbon_monoxide',
        'target_benzene',
        'target_nitrogen_oxides'
    ]
)

In [None]:
# Apply the IQR filter again, this time to the Box-Cox transformed table,
# then redraw the target box plot to inspect the result
boxcox_Train_data = iqr_filter(boxcox_Train_data)
boxcox_Train_data.boxplot(
    column=[
        'target_carbon_monoxide',
        'target_benzene',
        'target_nitrogen_oxides'
    ]
)


#### Final visualisation

### Pairplot

In [None]:
# Draw seaborn's pair plot for the transformed, filtered training table
sns.pairplot(boxcox_Train_data)

# Modeling

<a id="initial-cross-validation"></a>
## Initial cross validation

In [None]:
# Index the training table by timestamp. The result has to be assigned back:
# set_index returns a new frame and leaves the original unchanged.
try:
    boxcox_Train_data = boxcox_Train_data.set_index('date_time')
except KeyError:
    # date_time is no longer a column, e.g. it is already the index
    pass
num_fold = 5
kf = KFold(n_splits=num_fold, shuffle=True, random_state=1234)

print(kf)

# Keep each fold's (train, test) positional index arrays for the later cells
K_FOLD = []
for train_index, test_index in kf.split(boxcox_Train_data):
    print("TRAIN:", train_index, "TEST:", test_index)
    K_FOLD.append((train_index, test_index))

In [None]:
# Best-effort cleanup: drop the label 'test' if it exists, ignore it otherwise.
# NOTE(review): without an `axis`/`columns` argument, drop() targets a row
# label, not a column — confirm which one was intended.
try:
    boxcox_Train_data.drop('test', inplace=True)
except KeyError:
    pass

<a id="linear-regression"></a>
## Linear regression

In [None]:
# Index the training table by timestamp, unless date_time is no longer a column
if 'date_time' in boxcox_Train_data.columns:
    boxcox_Train_data = boxcox_Train_data.set_index('date_time')

def RMSLE(pred, act):
    """
    Root mean squared logarithmic error between Box-Cox scaled values.

    Both inputs are first mapped back with ``inv_boxcox`` (lambda 0.0001);
    the error is then computed on ``log(value + 1)``.
    """
    pred_original = inv_boxcox(pred, 0.0001)
    act_original = inv_boxcox(act, 0.0001)
    log_gap = np.log(pred_original + 1) - np.log(act_original + 1)
    mean_squared_gap = np.mean(log_gap ** 2)
    return mean_squared_gap ** 0.5


# Target columns are the same for every fold, so look them up once
y_columns = list(
    boxcox_Train_data.columns[
        boxcox_Train_data.columns.str.startswith('target')
    ]
)

# (key in the per-fold model dict, target column, label printed in the log)
target_specs = [
    ('carbon_monoxide', 'target_carbon_monoxide', 'Target: Carbon Monoxide'),
    ('benzene', 'target_benzene', 'Target: Benzene'),
    ('nitrogen_oxides', 'target_nitrogen_oxides', 'Target: Nitrogen oxides '),
]

# One dict of fitted regressors per fold, in fold order
models = []
for k, (Train, Test) in enumerate(K_FOLD):
    print(f"K: {k}")
    fold_train = boxcox_Train_data.iloc[Train]
    fold_test = boxcox_Train_data.iloc[Test]

    X_Train = fold_train.drop(y_columns, axis=1)
    y_Train = fold_train[y_columns]

    X_Test = fold_test.drop(y_columns, axis=1)
    y_Test = fold_test[y_columns]

    model = {}
    for model_key, target_column, label in target_specs:
        reg = LinearRegression().fit(X_Train, y_Train[target_column])
        train_score = reg.score(X_Train, y_Train[target_column])
        test_score = reg.score(X_Test, y_Test[target_column])

        # Every model is scored against its own target column. The nitrogen
        # oxides model used to be compared with the carbon monoxide values,
        # which made its reported RMSLE meaningless.
        prediction = reg.predict(X_Test)
        RMSLE_score = RMSLE(prediction, y_Test[target_column])

        print(label)
        print(f'Train score: {train_score}')
        print(f'Test score: {test_score}')
        print(f'RMSLE: {RMSLE_score}')
        print('-' * 36)

        model[model_key] = reg

    models.append(model)
    print('=' * 36)

<a id="final"></a>
# Final
<a id="make-submission"></a>
## Make submission

In [None]:
# Load the sample submission and index it by timestamp
submission = (
    pd.read_csv('../input/tabular-playground-series-jul-2021/sample_submission.csv')
    .set_index('date_time')
)

In [None]:
# Start every target at zero; fold predictions are accumulated on top later
for target_column in (
    'target_carbon_monoxide',
    'target_benzene',
    'target_nitrogen_oxides',
):
    submission[target_column] = 0

In [None]:
# Index the test table by timestamp, unless date_time is no longer a column
if 'date_time' in boxcox_Test_data.columns:
    boxcox_Test_data = boxcox_Test_data.set_index('date_time')

# (submission column, key in each fold's model dict)
ensemble_parts = (
    ('target_carbon_monoxide', 'carbon_monoxide'),
    ('target_benzene', 'benzene'),
    ('target_nitrogen_oxides', 'nitrogen_oxides'),
)

# Average the per-fold predictions by adding 1/num_fold of each one
for k in range(num_fold):
    for submission_column, model_key in ensemble_parts:
        fold_prediction = models[k][model_key].predict(boxcox_Test_data)
        submission[submission_column] += fold_prediction/num_fold

In [None]:
# Invert the Box-Cox transform (lambda 0.0001) on the accumulated predictions,
# then write the result to submission.csv
submission = inv_boxcox(submission, 0.0001)
submission.to_csv('submission.csv')

<a id="export"></a>
## Export

Finally, we have a preprocessing pipeline for further use. In this section, we try saving and loading the pipeline.

In [None]:
def preprocess_train(table: pd.DataFrame):
    """
    Run the full training preprocessing chain on a raw training table.

    Steps, in order: parse ``date_time``, add ``hour``, IQR-filter the
    targets, Box-Cox normalise, IQR-filter again, move targets last, and
    finally index the table by ``date_time``.
    """
    steps = (
        format_date_time,
        take_hour,
        iqr_filter,
        boxcox_normalise,
        iqr_filter,
        sort_columns,
    )
    for step in steps:
        table = step(table)
    return table.set_index('date_time')


# Mode 'w' (not 'a+'): re-running this cell must replace the file rather than
# append another copy of the function source to it
with open('pipeline/preprocess_train.py', 'w') as f:
    f.write(getsource(preprocess_train))

In [None]:
def preprocess_test(table: pd.DataFrame):
    """
    Run the test-time preprocessing chain on a raw test table.

    Steps, in order: parse ``date_time``, add ``hour``, Box-Cox normalise,
    and finally index the table by ``date_time``. No outlier filtering is
    applied here.
    """
    for step in (format_date_time, take_hour, boxcox_normalise):
        table = step(table)
    return table.set_index('date_time')


# Mode 'w' (not 'a+'): re-running this cell must replace the file rather than
# append another copy of the function source to it
with open('pipeline/preprocess_test.py', 'w') as f:
    f.write(getsource(preprocess_test))

In [None]:
# Replay the exported pipeline on the untouched training table
Train_data = preprocess_train(Train_data)

# The replay must reproduce the table that was built step by step above.
# Shape is checked first: comparing column indexes of different lengths raises
# ValueError instead of producing a clean assertion failure.
assert boxcox_Train_data.shape == Train_data.shape
assert all(boxcox_Train_data.columns == Train_data.columns)
assert np.all(boxcox_Train_data.values == Train_data.values)

In [None]:
# Replay the exported test pipeline on the untouched test table
Test_data = preprocess_test(Test_data)

In [None]:
# The replayed test table must match the one built step by step above:
# same shape, same column labels, same cell values
assert boxcox_Test_data.shape == Test_data.shape
assert all(boxcox_Test_data.columns == Test_data.columns)
assert np.all(Test_data.values == boxcox_Test_data.values)