# References
- https://github.com/thekimk/All-About-Time-Series-Analysis/
- https://github.com/JWarmenhoven/ISLR-python
- https://www.geeksforgeeks.org/python-pandas-series/


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml
from scipy import stats
from statsmodels.datasets import get_rdataset
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.stats.diagnostic import acorr_ljungbox
from statsmodels.tsa.stattools import adfuller, kpss

# Handling Time-series Data with Pandas

## Main data structures of Pandas: DataFrame and Series
<img src="https://media.geeksforgeeks.org/wp-content/uploads/dataSER-1.png">

## Read data



In [None]:
# Load the Advertising CSV, keeping only columns 1-4 (column 0 is skipped),
# and print a summary of the resulting frame.
advertising_url = 'https://raw.githubusercontent.com/JWarmenhoven/ISLR-python/master/Notebooks/Data/Advertising.csv'
df = pd.read_csv(advertising_url, usecols=list(range(1, 5)))
df.info()

## Print the loaded data

In [None]:
# Load the Default workbook and show its first three rows.
default_url = 'https://raw.githubusercontent.com/JWarmenhoven/ISLR-python/master/Notebooks/Data/Default.xlsx'
df = pd.read_excel(default_url)
df.head(n=3)

In [None]:
# Load the Default workbook and show its last five rows.
default_url = 'https://raw.githubusercontent.com/JWarmenhoven/ISLR-python/master/Notebooks/Data/Default.xlsx'
df = pd.read_excel(default_url)
df.tail(n=5)

In [None]:
# Load the Advertising CSV (columns 1-4 only) and show summary statistics
# for the three advertising-channel columns.
advertising_url = 'https://raw.githubusercontent.com/JWarmenhoven/ISLR-python/master/Notebooks/Data/Advertising.csv'
df = pd.read_csv(advertising_url, usecols=list(range(1, 5)))
channel_columns = ['Radio', 'TV', 'Newspaper']
df[channel_columns].describe()

## Select a few features from the data

In [None]:
# Load the Hitters CSV and show its first three rows.
hitters_url = 'https://raw.githubusercontent.com/JWarmenhoven/ISLR-python/master/Notebooks/Data/Hitters.csv'
df = pd.read_csv(hitters_url)
df.head(n=3)

In [None]:
# Columns to keep from the frame loaded in the previous cell.
feature_names = ['Years', 'Hits']

# Select every row of just those columns and preview the first five rows.
X = df.loc[:, feature_names]
X.head(n=5)

## Add a new feature to the data

In [None]:
# Load the Credit CSV, keeping columns 1-11 (column 0 is skipped), and show
# its first three rows.
credit_url = 'https://raw.githubusercontent.com/JWarmenhoven/ISLR-python/master/Notebooks/Data/Credit.csv'
credit = pd.read_csv(credit_url, usecols=[*range(1, 12)])
credit.head(n=3)

In [None]:
# Add a numeric copy of the 'Student' column: 'No' -> 0, 'Yes' -> 1.
student_codes = {'No': 0, 'Yes': 1}
credit['Student2'] = credit['Student'].map(student_codes)
credit.head(n=3)

## Removing features from the data

In [None]:
# Load the Heart CSV unmodified and show its first three rows.
heart_url = 'https://raw.githubusercontent.com/JWarmenhoven/ISLR-python/master/Notebooks/Data/Heart.csv'
df = pd.read_csv(heart_url)
df.head(n=3)

In [None]:
# Reload the Heart CSV, remove the 'Unnamed: 0' column, then discard every
# row that still contains a missing value.
heart_url = 'https://raw.githubusercontent.com/JWarmenhoven/ISLR-python/master/Notebooks/Data/Heart.csv'
df = pd.read_csv(heart_url)
df = df.drop(columns='Unnamed: 0').dropna()
df.head(n=3)

In [None]:
# Reload the Heart CSV, remove the 'ChestPain' column, then discard every
# row that still contains a missing value.
heart_url = 'https://raw.githubusercontent.com/JWarmenhoven/ISLR-python/master/Notebooks/Data/Heart.csv'
df = pd.read_csv(heart_url)
df = df.drop(columns='ChestPain').dropna()
df.head(n=3)

## Visualization

In [None]:
# Download OpenML dataset 41187 and keep its DataFrame representation.
co2_bunch = fetch_openml(data_id=41187, as_frame=True)
co2_data = co2_bunch.frame
co2_data

In [None]:
# Plot the CO2 concentration against calendar time.
#
# Plotting `co2_data['co2']` directly uses the frame's default row-number
# index, so the x-axis would show sample positions even though it is labelled
# 'Year'.  Build a datetime index for the plot instead.
# NOTE(review): assumes the OpenML frame exposes integer 'year', 'month' and
# 'day' columns (as in scikit-learn's Mauna Loa CO2 example) -- confirm.
co2_dates = pd.to_datetime(co2_data[['year', 'month', 'day']])

# Re-index a temporary copy only; `co2_data` itself is left unchanged so the
# other cells that use it keep working.
co2_data.set_index(co2_dates)['co2'].plot()
plt.xlabel('Year')
plt.ylabel('CO2 Concentration (PPM)')
plt.show()

In [None]:
# Fetch the MASS 'deaths' R dataset, keep only its 'value' column, and index
# the rows by month-start dates beginning in January 1974.
deaths_raw = get_rdataset('deaths', package='MASS').data
deaths = deaths_raw[['value']]
deaths.index = pd.date_range('1974-01-01', periods=len(deaths_raw), freq='MS')
deaths

In [None]:
# Draw the monthly deaths series and label both axes on the returned Axes.
mortality_ax = deaths['value'].plot()
mortality_ax.set_xlabel('Year')
mortality_ax.set_ylabel('The Number of Deaths from Respiratory Disease')
plt.show()

## Exercise

In [None]:
# Fetch the AirPassengers R dataset, keep only its 'value' column, and index
# the rows by month-start dates beginning in January 1949.
passengers_raw = get_rdataset('AirPassengers').data
data = passengers_raw[['value']]
data.index = pd.date_range(start='1949/1/1', periods=len(passengers_raw), freq='MS')
data.info()

In [None]:
# TODO: Print the first 5 rows of the dataset


In [None]:
# TODO: Print the last 5 rows of the dataset.


In [None]:
# TODO: Get a series object from the data


In [None]:
# TODO: Plot the number of air passengers over time


# White Noise

In [None]:
# Draw 300 samples from a normal distribution with a fixed seed so the
# figures are reproducible.
wn = stats.norm.rvs(size=300, random_state=0)

# One row, three panels: raw series, autocorrelation, partial autocorrelation.
fig, ax = plt.subplots(1, 3, figsize=(17, 3))
series_ax, acf_ax, pacf_ax = ax

series_ax.plot(wn)
series_ax.set_xlabel('Time')
series_ax.set_ylabel('Value')

plot_acf(wn, ax=acf_ax)
plot_pacf(wn, ax=pacf_ax, method='ywm')
for lag_ax in (acf_ax, pacf_ax):
    lag_ax.set_xlabel('Lags')

plt.tight_layout()
plt.show()

# Stationarity Test

## Ljung-Box Test: Test randomness
- p < 0.05: The data is not independently distributed, i.e. significant autocorrelation is present. (Non-stationary)
- p ≥ 0.05: The data is randomly distributed. (Stationary)

In [None]:
# Ljung-Box test on the white-noise series at lag 20.
#
# Current statsmodels releases return a DataFrame (one row per requested lag,
# columns 'lb_stat' and 'lb_pvalue').  Tuple-unpacking a DataFrame iterates
# over its column *names*, so the previous `a, b = acorr_ljungbox(...)` form
# printed the strings 'lb_stat' / 'lb_pvalue' instead of the test results.
# Pull the columns out explicitly; `.to_numpy()` keeps them as one-element
# arrays, matching what the old tuple-returning API produced.
lb_result = acorr_ljungbox(wn, lags=[20])
lb_stat = lb_result['lb_stat'].to_numpy()
lb_pvalue = lb_result['lb_pvalue'].to_numpy()
print(f'lb_stat: {lb_stat}, lb_pvalue: {lb_pvalue}')

## Augmented Dickey-Fuller (ADF) test: Existence of trend
- p < 0.05: Trend does not exist. (Stationary)
- p ≥ 0.05: Trend exists. (Non-stationary)

In [None]:
# Augmented Dickey-Fuller test on the white-noise series.  Only the first two
# entries of the result (test statistic and p-value) are reported.
adf_result = adfuller(wn)
adf, p_value = adf_result[0], adf_result[1]
print(f'adf: {adf:f}, p_value: {p_value:f}')

## Kwiatkowski-Phillips-Schmidt-Shin (KPSS) test: Existence of seasonality
- p ≥ 0.05: Seasonality does not exist. (Stationary)
- p < 0.05: Seasonality exists. (Non-stationary)

In [None]:
# KPSS test on the white-noise series with automatic lag selection.
#
# The lag-count keyword of `kpss` is `nlags`; the older `lags` spelling was
# deprecated and is rejected with a TypeError by current statsmodels releases.
# The result is unpacked as (statistic, p-value, lags used, critical values).
kpss_stats, p_value, lags, crit = kpss(wn, nlags='auto')
print(f'kpss_stats: {kpss_stats:f}, p_value: {p_value:f}')

## Exercise

In [None]:
# TODO: Run ADF test on the CO2 concentration data

In [None]:
# TODO: Run KPSS test on the number of deaths from respiratory disease