# 1. Time-Series Data - Load, Clean, Visualize

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file available under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the births CSV with pandas' default type inference and preview the first rows.
df = pd.read_csv("/kaggle/input/others/daily-total-female-births-CA.csv")
df.head()

## Data Analysis

In [None]:
# Show the container type and the per-column dtypes / non-null counts.
# `type(df)` is not the last expression of the cell, so it has to be printed
# explicitly; otherwise the notebook silently discards the result.
print(type(df))
df.info()

## The date column is read as `object` (plain strings), so we need to tell pandas to parse it as a date type using the `parse_dates` option

In [None]:
# Re-read the file, asking pandas to parse the first column (position 0) as dates.
df = pd.read_csv("/kaggle/input/others/daily-total-female-births-CA.csv", parse_dates=[0])
df.head()

In [None]:
# Re-inspect the column dtypes after reloading with parse_dates.
df.info()

## For efficiency we import data as time-series

In [None]:
# The parsed date column becomes the index and the births column becomes the values.
# `read_csv(..., squeeze=True)` was deprecated in pandas 1.4 and removed in 2.0;
# squeezing the resulting one-column DataFrame is the supported equivalent and
# also works on older pandas versions.
series = pd.read_csv(
    "/kaggle/input/others/daily-total-female-births-CA.csv",
    parse_dates=[0],
    index_col=[0],
).squeeze("columns")
series.head()

In [None]:
# Print the shape, then show the object's type as the cell's output.
print(series.shape)
type(series)

## Filtering by Time

In [None]:
# Display the series (indexed by the parsed date column) as the cell's output.
series

In [None]:
# Partial-string selection on the date index: every observation dated May 1959.
print(series.loc['1959-05'])

## Descriptive Analysis

In [None]:
# Summary statistics of the series values, as computed by pandas' describe().
series.describe()

## Time Series - Visualization

In [None]:
import matplotlib.pyplot as plt
# df was loaded without index_col, so the x-axis here is the default integer row index.
df['births'].plot()

In [None]:
# Use the date column as the index so that the plot's x-axis shows dates.
df.index = df['date']
print(df.head())
df.births.plot()

## Let's zoom in

In [None]:
# Zoom in: keep only rows dated strictly after 1959-03-01 and strictly before 1959-06-01.
dfplot = df.loc[(df.date > '1959-03-01') & (df.date < '1959-06-01')]
dfplot['births'].plot()

## Plotting Trend lines using seaborn

In [None]:
import seaborn as sns
# Reload the data so the index is the default integer row number again;
# that row number is what gets passed to regplot as x below.
df = pd.read_csv("/kaggle/input/others/daily-total-female-births-CA.csv", parse_dates=[0])
# Scatter of births against row position with a fitted linear regression line.
sns.regplot(x=df.index.values, y=df.births)

### Increasing Trend

## Polynomial Trendline

In [None]:
# Quadratic trend line: regplot fits a 2nd-order polynomial when order=2.
sns.regplot(x=df.index.values, y=df.births, order=2)

In [None]:
# Cubic trend line: regplot fits a 3rd-order polynomial when order=3.
sns.regplot(x=df.index.values, y=df.births, order=3)

## EDA of us-airlines-monthly-aircraft-miles-flown.csv

In [None]:
# Load the monthly US airline miles data, parsing the first column as dates.
# NOTE(review): this uses a relative '../input' path while the births data above is
# read from the absolute '/kaggle/input' path - they match only when the working
# directory is a sibling of 'input'; confirm.
us_miles = pd.read_csv('../input/others/us-airlines-monthly-aircraft-miles-flown.csv', parse_dates=[0])
us_miles.head()

In [None]:
# Line plot of MilesMM against the default integer row index.
# Author's observation: the series shows seasonality, peaking towards the end of the year.
us_miles.MilesMM.plot()

In [None]:
# Linear trend line of MilesMM against row position.
sns.regplot(x=us_miles.index.values, y=us_miles.MilesMM)

## Seasonality Removal - Aggregate by year and take the mean of MilesMM

In [None]:
# Extract the calendar year from the parsed Month column into its own column.
us_miles['year'] = us_miles.Month.dt.year
us_miles.head(n=5)

In [None]:
# Preview the first 5 rows of every year.
# Several columns must be selected with a list: indexing a GroupBy with a bare
# tuple (`['year', 'MilesMM']`) was deprecated in pandas 1.0 and raises in 2.0.
us_miles.groupby('year')[['year', 'MilesMM']].head(5)

## Let's take the mean value of milesMM for each year

In [None]:
# Mean MilesMM per calendar year: print the table, then plot it.
yearly_mean = us_miles.groupby('year')['MilesMM'].mean()
print(yearly_mean)
yearly_mean.plot()
plt.title('Seasonality Removed - MilesMM')

## Creating Lag Plots
Used to check whether the values of a single feature depend on their own previous values; for example, a day's temperature is usually within about 3 to 5 °C of the previous day's value

In [None]:
# Lag-1 feature: each row receives the MilesMM value of the row before it.
us_miles['lags'] = us_miles.MilesMM.shift(periods=1)
us_miles.head(n=5)

In [None]:
# Scatter each MilesMM value against its lag-1 value.
sns.scatterplot(data=us_miles, x='lags', y='MilesMM')
plt.title('Positive Correlation is exhibited')

In [None]:
# Alternatively, pandas ships a helper that draws a lag plot directly from the column.
from pandas.plotting import lag_plot
lag_plot(us_miles['MilesMM'])

## Auto-Correlation Plots
Used to check the correlation of a feature with delayed (lagged) copies of itself

In [None]:
from pandas.plotting import autocorrelation_plot
autocorrelation_plot(us_miles['MilesMM'])

# X-axis is the lag value: +1 (like the previous plot), +2, +3, ... 90 lags, and
# Y-axis is the correlation between the actual MilesMM and its lagged values; +ve means positive correlation and -ve vice versa.

# From the plot we see the first 5 lags are highly correlated with the actual MilesMM feature.

## End of TS - Data - Load, Clean, Visualize

# 2. Feature Engineering

## A time series data must be transformed to be modeled into a supervised learning problem.

## The process of creating or inventing new features from the time series dataset is also called feature engineering (date --> Insights)

## Types of Features

1. Date-time Features --> These are the components of the time step itself for each observation (Date --> Day, Month, Year)

2. Lag Features --> These are values at prior time steps

3. Window Features --> Summary of values over a fixed window

_______________________________________________________________

## Window Features :
* Rolling Window : Add a summary of the values for the previous time steps
* Expanding Window : Include all previous data in the series

##  Date-time Features

In [None]:
# Reload the births data (first column parsed as dates) for the feature-engineering section.
df = pd.read_csv('../input/others/daily-total-female-births-CA.csv', parse_dates=[0])
df.head()

In [None]:
# Build the feature table as a copy of df extended with the calendar
# components (year, month, day of month) of the date column.
features = df.assign(
    Year=df['date'].dt.year,
    Month=df['date'].dt.month,
    Days=df['date'].dt.day,
)

# Preview the new Year / Month / Days columns.
features.head()

## Lag Features

In [None]:
# Lag features: the births value observed 1 and 2 time steps earlier.
# The original used shift(365) for 'lag2'. That contradicts the column name, and
# the data appears to cover a single year of daily rows (only 1959 dates are used
# in this notebook - TODO confirm), in which case a 365-step shift is entirely NaN.
features['lag1'] = df['births'].shift(1)
features['lag2'] = df['births'].shift(2)

features.head()

## Window Features

In [None]:
# Rolling-window summaries over the current row and the rows just before it:
#   window=2 -> mean of the values at n and n-1
#   window=3 -> max of the values at n, n-1 and n-2 (mean / max / min all work the same way)
features['Roll_mean'] = df.births.rolling(2).mean()
features['Roll_Max'] = df.births.rolling(3).max()

features.head(n=10)

## Expanding Features (considers all the values up to a particular date and performs an operation on them)

In [None]:
# Expanding window: running maximum of births over all rows up to and including the current one.
features['Expanding_Max'] = df.births.expanding().max()
features.head(n=10)

# 3. Resampling 
## Changing Frequency of the data to our convenient format (year-->month, month-->week, week-->day)
## simply changing the frequency of the available data to match the frequency of the required forecast

1. Upsampling (Quarterly data --> Monthly)
2. Downsampling (Quarterly data --> Yearly)

In [None]:
# Load the monthly airline miles data (first column parsed as dates) for the resampling section.
df = pd.read_csv('../input/others/us-airlines-monthly-aircraft-miles-flown.csv', parse_dates=[0])
df.head()

## Downsampling

In [None]:
# Downsampling (12 months = 4 quarters)
# 'Q' - quarterly, 'A' - annually
# NOTE(review): pandas 2.2+ deprecates these aliases in favour of 'QE' / 'YE' - confirm the target pandas version.
quaterly_miles = df.resample('Q', on='Month').mean() # each quarter groups 3 monthly rows; take their mean
quaterly_miles.head()

In [None]:
# Downsample to yearly frequency, summing the monthly values that fall in each year.
yearly_miles = df.resample('A', on='Month').sum()
yearly_miles.head()

## Upsampling

In [None]:
# Upsample to daily frequency. This only creates the daily structure; the days
# that have no observation are filled in by the interpolation cells that follow.
daily_miles = df.resample('D', on='Month').mean()
daily_miles.head(50)

In [None]:
# Fill the missing days between two consecutive monthly observations by
# interpolating linearly between them.
interpolated_df = daily_miles.interpolate(method='linear')
interpolated_df.head(50)

In [None]:
# Plot the linearly interpolated daily series.
interpolated_df.plot()

In [None]:
# For a smoother fill, replace the linear interpolation with a spline
# (order=2 is quadratic; order=3 would be cubic).
poly_interpolated_df = daily_miles.interpolate(method='spline', order=2)
poly_interpolated_df.head(50)

In [None]:
# Plot the spline-interpolated daily series.
poly_interpolated_df.plot()

In [None]:
# Compare the smoothness of the two interpolations.
# Each bare DataFrame.plot() call opens its own figure, so the original drew two
# unrelated charts; drawing both on the same axes makes the comparison direct.
# The legend labels assume each frame plots a single line (one value column).
ax = interpolated_df.plot()
poly_interpolated_df.plot(ax=ax, linestyle='--')
ax.legend(['linear', 'spline (order=2)'])

# 4. Decomposing Time Series Model - For Detection

1. Additive Model -->
pred(t) = Level(Avg value) + Trend(+ve / -ve trend) + Seasonality(short term cycles in series) + noise(random variation)
2. Multiplicative Model -->
pred(t) = Level(Avg value) * Trend(+ve / -ve trend) * Seasonality(short term cycles in series) * noise(random variation)

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose
# Reload the monthly miles data and index it by the parsed Month column,
# so that the series handed to seasonal_decompose carries a date index.
df = pd.read_csv('../input/others/us-airlines-monthly-aircraft-miles-flown.csv', parse_dates=[0])
df.index = df.Month
df.head()

In [None]:
# Additive decomposition of MilesMM.
results_add = seasonal_decompose(df['MilesMM'], model='additive')
results_add.plot()
# Panels: original, trend, seasonality, noise (residual values)

In [None]:
# Multiplicative decomposition of MilesMM.
results_mul = seasonal_decompose(df['MilesMM'], model='multiplicative')
results_mul.plot()
# Panels: original, trend, seasonality, noise (residual values)

# 5. Differencing Time Series (For removing trend and seasonality from the data)

In [None]:
# Lag-1 copy of MilesMM: each row holds the value of the row before it.
df['lags'] = df.MilesMM.shift(periods=1)
df.head(n=5)

In [None]:
# First-order difference: each value minus the value one row earlier.
df['MilesMM-lags'] = df.MilesMM.diff()
df.head(n=5)

In [None]:
# Baseline: decompose the undifferenced series to see its trend, seasonal and residual parts.
df.index = df['Month']
result_1 = seasonal_decompose(df.MilesMM, model='additive')
result_1.plot()

In [None]:
# Lag-1 differencing is expected to remove the trend but not the seasonality - check it.
# On the y-axis of the trend panel the range is much smaller, i.e. little trend is left.
# Select the differenced column by name: the original positional `df.iloc[1:, 3]`
# silently picks a different column if cells are run in another order.
# The first row is skipped because a lag-1 difference has no value there.
result_2 = seasonal_decompose(df['MilesMM-lags'].iloc[1:], model='additive')
result_2.plot()

In [None]:
# Preview the frame with the lag and difference columns added so far.
df.head()

In [None]:
# Plot the original (undifferenced) series for reference.
df['MilesMM'].plot()

In [None]:
# Plot the lag-1 differenced series.
# NOTE(review): the original remark here was "Seasonality for first 3 values" - meaning unclear, confirm with the author.
df['MilesMM-lags'].plot()

In [None]:
# Lag-12 difference: each value minus the value 12 rows (months) earlier.
df['MilesMM-lags12'] = df.MilesMM.diff(12)
df['MilesMM-lags12'].plot()

In [None]:
# Show the first 20 rows, enough to see where the lag-12 difference starts having values.
df.head(20)

In [None]:
# The lag-12 (seasonal) difference of the original series is expected to remove
# the seasonality - check it.
# On the y-axis the range is smaller, i.e. little seasonality or trend is left.
# Select the column by name: the original positional `df.iloc[12:, 4]` depends on
# the order in which the columns were created.
# The first 12 rows are skipped because a lag-12 difference has no value there.
result_3 = seasonal_decompose(df['MilesMM-lags12'].iloc[12:], model='additive')
result_3.plot()