# [Tabular Playground Series - Jul 2021](https://www.kaggle.com/c/tabular-playground-series-jul-2021/data)

* [Scipy 2019](https://www.youtube.com/watch?v=v5ijNXvlC5A)
* [Scipy 2016](https://www.youtube.com/watch?v=JNfxr4BQrLk)
* [PyCon 2017](https://www.youtube.com/watch?v=zmfe2RaX)
* [Konrad](https://www.kaggle.com/konradb/practical-time-series-pt-1-the-basics)

## In this notebook, we will see how to analyze time series data.

# Import Libraries

In [None]:
# Import Basic
import numpy as np 
import pandas as pd
import altair as alt
import warnings
import itertools
from random import random
from pandas.plotting import autocorrelation_plot
from pandas.api.types import is_string_dtype, is_numeric_dtype, is_categorical_dtype

# Fastai
from fastai.tabular.all import *
from sklearn import ensemble
from sklearn import model_selection

# Prophet
from fbprophet import Prophet

# DataViz
import matplotlib.pyplot as plt
import seaborn as sns   
from random import gauss

#Statmodels
import statsmodels.formula.api as smf
import statsmodels.api as sm
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.ar_model import AR
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller

# Settings: how much of a DataFrame pandas prints, and the matplotlib theme.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 50)
plt.style.use('fivethirtyeight')

# Utility Code

In [None]:
def feature_importance(model, X):
    """Build an Altair bar chart of a fitted model's feature importances.

    Parameters
    ----------
    model
        Object exposing a ``feature_importances_`` array.
    X
        DataFrame whose ``columns`` supply the feature names. They are paired
        with the importances by position, so the column order is assumed to
        match the order the model was fitted with.

    Returns
    -------
    A layered Altair chart (bars plus value labels), sorted from most to
    least important. Importances are multiplied by 100 and rounded to
    4 decimals.
    """
    names = X.columns.tolist()
    scores = (model.feature_importances_ * 100).tolist()
    ranked = (
        pd.DataFrame({'feature': names, 'importance': scores})
        .round(4)
        .sort_values(by='importance', ascending=False)
    )

    base = alt.Chart(ranked).mark_bar().encode(
        x='importance',
        y=alt.Y('feature', sort='-x'),
        tooltip=['feature', 'importance'],
    )

    # dx=3 nudges each label to the right so it does not sit on top of its bar.
    labels = base.mark_text(align='left', baseline='middle', dx=3).encode(
        text='importance:Q'
    )

    layered = base + labels
    return layered.properties(width=400).configure_axis(labelFontSize=13, titleFontSize=16)

# Read Data

In [None]:
# Directory holding the competition CSV files.
PATH = '/kaggle/input/tabular-playground-series-jul-2021'

# Read the training set, the test set and the sample submission, in that order.
train, test, ss = (
    pd.read_csv(f'{PATH}/{stem}.csv')
    for stem in ('train', 'test', 'sample_submission')
)

# EDA (WIP)

In [None]:
# Parse `date_time` into pandas datetimes. Only `train` is converted in this
# cell; `test['date_time']` is left exactly as it was read from the CSV.
train['date_time'] = pd.to_datetime(train['date_time'])

In [None]:
# Column dtypes of the training frame (date_time has just been parsed).
train.dtypes

In [None]:
# (rows, columns) of the training and test frames.
train.shape, test.shape

In [None]:
# First three training rows.
train.head(3)

In [None]:
# Training timestamps, after the pd.to_datetime conversion.
train['date_time']

In [None]:
# Test timestamps; unlike train['date_time'], these have not been passed through pd.to_datetime.
test['date_time']

In [None]:
# Add the three target columns to test, each filled with the placeholder -1.0
# (presumably so test lines up with train when the two frames are concatenated).
test = test.assign(**{
    target: -1.
    for target in ('target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides')
})

In [None]:
# Stack train and test into one frame with short column names (ds, yco, yb, yno).
# `date_time` is parsed on both inputs first: train may already be datetime
# while test is still text, and concatenating the two as-is would leave `ds`
# as an object column mixing Timestamps and strings.
df = (
    pd.concat(
        [
            frame.assign(date_time=pd.to_datetime(frame['date_time']))
            for frame in (train, test)
        ],
        axis=0,
        ignore_index=True,
    )
    .rename(columns={
        'date_time': 'ds',
        'target_carbon_monoxide': 'yco',
        'target_benzene': 'yb',
        'target_nitrogen_oxides': 'yno',
    })
)

In [None]:
# Combined frame: train rows first, then the test rows carrying -1.0 placeholder targets.
df

In [None]:
# Carbon monoxide over time for the combined frame.
# The concat step renamed `target_carbon_monoxide` to `yco`, so the old column
# name raises KeyError on `df`. pd.to_datetime leaves an already-datetime `ds`
# unchanged and parses it if it still contains strings.
# Test rows hold the -1.0 placeholder target, so they plot as a flat line at -1.
fig, ax = plt.subplots(1, 1)
ax.plot(pd.to_datetime(df['ds']), df['yco'])
ax.set(title='Carbon monoxide over time', xlabel='date_time', ylabel='target_carbon_monoxide');

In [None]:
# Work on copies so the feature engineering below leaves train/test untouched.
tr, te = train.copy(), test.copy()

Now we use the `add_datepart` function from the fastai library to create features from the `date_time` column.

# Feature Engineering using **fastai**

In [None]:
# Expand `date_time` into calendar features with fastai's add_datepart. The
# feature list used for modelling refers to the resulting `date_time*` columns
# (date_timeYear, date_timeMonth, ..., date_timeElapsed).
# NOTE(review): add_datepart appears to drop the source column by default, in
# which case re-running this cell without re-creating tr/te would fail — confirm.
tr = add_datepart(tr, 'date_time')
te = add_datepart(te, 'date_time')

Let's take a look at the new features added to our dataframe.

In [None]:
# First rows of the training copy, now including the add_datepart columns.
tr.head()

In [None]:
# Column names, non-null counts and dtypes of the engineered training frame.
tr.info()

# Split the data into train & validation sets

We will do this step later.

In [None]:
# Model inputs: the three weather readings, the five sensor channels, and the
# calendar columns derived from `date_time` (all prefixed with `date_time`).
feats = [
    'deg_C',
    'relative_humidity',
    'absolute_humidity',
    *(f'sensor_{channel}' for channel in range(1, 6)),
    *(
        f'date_time{part}'
        for part in (
            'Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear',
            'Is_month_end', 'Is_month_start',
            'Is_quarter_end', 'Is_quarter_start',
            'Is_year_end', 'Is_year_start',
            'Elapsed',
        )
    ),
]

In [None]:
# Feature matrix and the three regression targets taken from the training copy.
# `y` used to be referenced here without ever being assigned, which raised
# NameError on a fresh kernel; it is now defined before being displayed.
X = tr.loc[:, feats]
y = tr.loc[:, ['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides']]
y.head()

In [None]:
# Hold out the last 25% of rows as a validation set.
# The call used to contain an empty positional argument (`tr, ,`), which is a
# SyntaxError, and its result was thrown away.
# shuffle=False keeps the original row order, so validation rows are the
# trailing rows of `tr` (assumed to be the most recent ones — confirm that the
# training file is sorted by time).
# Note: the modelling cells below still fit on all of `tr`; this split is not
# yet used by them.
tr_fit, tr_valid = model_selection.train_test_split(
    tr, test_size=0.25, shuffle=False
)
tr_fit.shape, tr_valid.shape

# Modeling

Let's train a `RandomForestRegressor` for each of the three targets on the complete training dataset.

In [None]:
# One random forest per target (carbon monoxide, benzene, nitrogen oxides).
# random_state is fixed so that re-running the notebook reproduces the same
# trees, feature importances and submission; without it every run differs.
rf1, rf2, rf3 = (
    ensemble.RandomForestRegressor(n_estimators=40, n_jobs=-1, random_state=42)
    for _ in range(3)
)

In [None]:
# 1-D NumPy arrays of the three targets, in the order the models are trained.
# Series.ravel() is deprecated in recent pandas releases; .to_numpy() returns
# the same 1-D array without the warning.
y1 = tr['target_carbon_monoxide'].to_numpy()
y2 = tr['target_benzene'].to_numpy()
y3 = tr['target_nitrogen_oxides'].to_numpy()

Model to predict `target_carbon_monoxide`

In [None]:
# Fit the carbon monoxide model on every row of `tr`; nothing is held out for validation here.
rf1.fit(tr[feats], y1)

In [None]:
# Predict carbon monoxide for the test rows. Only the `feats` columns are passed,
# so the -1.0 placeholder targets in `te` never reach the model.
yp1 = rf1.predict(te[feats])

Model to predict `target_benzene`

In [None]:
# Fit the benzene model on every row of `tr`; nothing is held out for validation here.
rf2.fit(tr[feats], y2)

In [None]:
# Predict benzene for the test rows from the same feature columns used in training.
yp2 = rf2.predict(te[feats])

Model to predict `target_nitrogen_oxides`

In [None]:
# Fit the nitrogen oxides model on every row of `tr`; nothing is held out for validation here.
rf3.fit(tr[feats], y3)

In [None]:
# Predict nitrogen oxides for the test rows from the same feature columns used in training.
yp3 = rf3.predict(te[feats])

# Feature Importance Plots

**Model 1**
<br>
sensor_2 & sensor_1 seem to be important features.

In [None]:
# Feature importances of the carbon monoxide model, shown as percentages.
feature_importance(rf1, tr[feats])

**Model 2**
<br>
sensor_2 seems to be the strongest signal.

In [None]:
# Feature importances of the benzene model, shown as percentages.
feature_importance(rf2, tr[feats])

**Model 3**
<br>
Here the date_timeElapsed feature we created has the second highest importance.

In [None]:
# Feature importances of the nitrogen oxides model, shown as percentages.
feature_importance(rf3, tr[feats])

In [None]:
# Preview the sample submission to see the column names and order the submission file should use.
ss.head()

# Submission

In [None]:
# Create the submission dataframe: the original test timestamps followed by
# one prediction column per target, in the sample-submission order.
sub = pd.DataFrame({'date_time': test['date_time']}).assign(
    target_carbon_monoxide=yp1,
    target_benzene=yp2,
    target_nitrogen_oxides=yp3,
)

In [None]:
# Write the submission file; index=False keeps the row index out of the CSV.
sub.to_csv('sub1.csv', index=False)

In [None]:
# Sanity-check the first rows of the submission frame.
sub.head()