In [50]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # Data visualization library
import seaborn as sns # Data visualization library

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found there.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [51]:
# Seasonal Decomposition
from statsmodels.tsa.seasonal import seasonal_decompose

# Lag Scatter plots
from pandas.plotting import lag_plot

# Autocorrelation plots
from pandas.plotting import autocorrelation_plot

# Numerical Computation and Model Metrics 
from math import sqrt
from numpy import mean, polyfit 
from sklearn.metrics import mean_squared_error

# Sklearn models 
from sklearn.linear_model import LinearRegression 

# Statistical Tests and plots for checking white noise 
from random import gauss
from random import seed

# Questions to be asked:
1. Ask relevant questions related to the timestamps
2. Whose timestamp? How are those timestamps generated?
3. At which instant were those timestamps recorded?
4. Guesstimating time stamps to make sense of data
5. The questions determine the data pipeline 
6. Local or universal time. Most timestamps are stored according to the UTC
7. Guesstimating timestamps to make sense of data
8. Is it user behaviour or network behaviour?
9. Date-specific API calls???
10. Psychological Time-Discounting is a manifestation of a phenomenon known as psychological distance, which names our tendency to be more optimistic(and less realistic) when making assessments or estimates that are more distant from us. 
11. Humans know when time is passing.

In [52]:
# Load the daily female-births CSV. The first column becomes the (parsed) date index
# and the single remaining column is collapsed into a Series.
# `read_csv(squeeze=True)` was deprecated in pandas 1.4 and removed in 2.0, so the
# frame is squeezed explicitly instead.
series = pd.read_csv(
    "../input/daily-total-female-births-in-california-1959/daily-total-female-births-CA.csv",
    header=0,
    index_col=0,
    parse_dates=True,
).squeeze("columns")
series.head(10)

In [53]:
# Usual descriptive pass: number of observations, the January 1959 slice, summary statistics.
for report in (series.size, series.loc['1959-01'], series.describe()):
    print(report)

# Basic Feature Engineering
Time Series data must be re-framed as a supervised learning dataset before we can start using
machine learning algorithms. There is no concept of input and output features in time series.
Instead, we must choose the variable to be predicted and use feature engineering to construct
all of the inputs that will be used to make predictions for future time steps. In this tutorial,
you will discover how to perform feature engineering on time series data with Python to model
your time series problem with machine learning algorithms.
After completing this tutorial, you will know:
*  The rationale and goals of feature engineering time series data.
*  How to develop basic date-time based input features.
* How to develop more sophisticated lag and sliding window summary statistics features.
Let’s dive in

I can enumerate all the properties of a time stamp and consider which of them might be useful for the problem, such as:
* Minutes Elapsed
* Hour of Day
* Business Hours

Lag features:
* Weekend or not
* Season of the year
* Business quarter of the year
* Daylight savings or not
* Public holiday or not
* Leap year or not

Adding domain-specific features is a good start for time-series.

**Creating Rolling and Expanding Window summary statistic features**

In [54]:
# Rolling-window summary features (min / mean / max) placed next to the target column.
temps = pd.DataFrame(series.values)
width = 3

# NOTE(review): shifting by `width - 1` means the row for time t summarises
# t-4 .. t-2 and skips t-1; a shift of 1 may have been intended — confirm.
shifted = temps.shift(width - 1)
window = shifted.rolling(window=width)

summaries = [getattr(window, stat)() for stat in ('min', 'mean', 'max')]
dataframe = pd.concat(summaries + [temps], axis=1)
dataframe.columns = ['min', 'mean', 'max', 't+1']
print(dataframe.head(5))

In [55]:
# Show the raw values and their shifted copy, separated by one blank line.
print(temps, shifted, sep="\n\n")

In [56]:
# Expanding-window summary features: each row summarises every observation up to and
# including that row, and the target column is the following observation.
temps = pd.DataFrame(series.values)
window = temps.expanding()
summaries = [window.min(), window.mean(), window.max()]
next_step = temps.shift(-1)
dataframe = pd.concat(summaries + [next_step], axis=1)
dataframe.columns = ['min', 'mean', 'max', 't+1']
print(dataframe.head(5))

# Time Series Visualization

1. Line Plots
2. Histograms and Density Plots
3. Box and Whisker Plots
4. Heat Maps
5. Lag Plots or Scatter Plots
6. Autocorrelation Plots

In [57]:
# Load the Melbourne daily minimum temperatures with the first column as the date index.
# `read_csv(squeeze=True)` was removed in pandas 2.0, so the single data column is
# collapsed into a Series explicitly.
melbourne = pd.read_csv(
    "../input/melbourne-temperature/daily-minimum-temperatures-in-me.csv",
    header=0,
    index_col=0,
    parse_dates=True,
    on_bad_lines='warn',
).squeeze("columns")

# Strip any literal '?' characters from the raw strings, then cast to float.
# `regex=False` makes '?' a plain character rather than a regex quantifier.
melbourne = melbourne.str.replace('?', '', regex=False).astype(float)

# Plot the series as a dashed line ('k--' would be the same style in black).
melbourne.plot(style='--')
plt.show()

In [58]:
# One subplot per calendar year: group by year end, then lay the years out as columns.
groups = melbourne.groupby(pd.Grouper(freq='A'))
years = pd.DataFrame({stamp.year: chunk.values for stamp, chunk in groups})

years.plot(subplots=True, legend=False)
plt.show()

In [59]:
# Histogram of the Melbourne series: distribution of the values, ignoring time order.
melbourne.hist()
plt.show()

In [60]:
# Kernel density estimate: a smoothed view of the same distribution.
melbourne.plot.kde()
plt.show()

In [61]:
# Box-and-whisker plot per column of `years` (one box per calendar year).
years.boxplot()
plt.show()

In [62]:
# Number of groups in the current `groups` object.
len(groups)

In [63]:
# Box plot per month for the year 1990.
one_year = melbourne.loc['1990']
groups = one_year.groupby(pd.Grouper(freq='M'))

# Each month becomes its own column; shorter months are padded with NaN by concat.
months = pd.concat([pd.DataFrame(chunk.values) for _, chunk in groups], axis=1)
months.columns = range(1, 13)
months.boxplot()
plt.show()

In [64]:
# Display the `years` frame (one column per calendar year).
years

In [65]:
# Heat map of the series: pd.Grouper(freq='A') groups the observations by year, and
# after transposing each row is one year and each column one position within the year.
groups = melbourne.groupby(pd.Grouper(freq='A'))
years = pd.DataFrame({stamp.year: chunk.values for stamp, chunk in groups}).T
plt.matshow(years, interpolation=None, aspect='auto')
plt.show()

In [66]:
# Display the transposed `years` frame (one row per calendar year) used for the heat map.
years

In [67]:
# Do the same for a particular month as well
# Extract for the month and all particular years 

In [68]:
# Lag plot of the series with pandas' default lag.
lag_plot(melbourne)
plt.show()

In [69]:
# Scatter the series against each of its first `lags` lagged copies, one panel per lag.
values = pd.DataFrame(melbourne.values)
lags = 7

# Column 't' is the series itself; column 't-k' is the series shifted by k steps.
columns = [values] + [values.shift(lag) for lag in range(1, lags + 1)]
dataframe = pd.concat(columns, axis=1)
columns = ['t'] + ['t-' + str(lag) for lag in range(1, lags + 1)]
dataframe.columns = columns

# Use a fixed figure number instead of a leftover loop index.
plt.figure(1)

for lag in range(1, lags + 1):
    # Select the panel BEFORE drawing. Scattering first puts each lag on the
    # previous panel, so titles are off by one and the last panel stays empty.
    ax = plt.subplot(240 + lag)
    ax.set_title('t vs. t-' + str(lag))
    ax.scatter(x=dataframe['t'].values, y=dataframe['t-' + str(lag)].values)

plt.show()

In [70]:
# Autocorrelation of the Melbourne series across lags.
autocorrelation_plot(melbourne)
plt.show()

# Observations
# The given data has a very strong seasonal component

# Resampling and Interpolation
1. **Upsampling** - For eg., from hours to minutes. Using interpolation - Depends upon the use-case
2. **Downsampling** - For eg., from minutes to hours. using aggregation(mean, median and mode) - Depends upon the use-case

1. **Upsampling calls for interpolations, which can be linear as well as complex**
2. **Situations under which one would do downsampling of data**
    *  When we want to study observations from a particular time frame
    *  The original resolution of the data isn't sensible
    *  Match against data at a lower frequency. In such cases one would simply aggregate the data or downsample rather than simply aggregating points
3. **Situations under which one would do upsampling of data**
    *  Irregular time series
    *  Inputs are sampled at different frequencies
    *  Knowledge of time-series dynamics determines the interpolation
4. **Why I would even need to upsample data**
    *  Upsampling data - here, resampling data, if data isn't available at the same frequency at which one would want to make predictions
    *  Feature Engineering - create more features, helps in deriving more useful insights 
5. **Identify the problem statement. See what is being asked to predict. Then check if the data is originally sampled at the asked frequency. If yes, then go ahead. 
     Else resampling would be required**

In [71]:
# Let's try and test upsampling on shampoo dataset
def parser(x):
    """Parse a raw date label into a ``datetime``.

    The label is prefixed with ``'190'`` and parsed as ``'%Y-%m'``, so an input
    such as ``'1-01'`` becomes 1901-01-01.

    Args:
        x: Date label string of the form ``'<year digit>-<month>'``.

    Returns:
        The corresponding ``datetime.datetime`` (day defaults to 1).
    """
    # `pd.datetime` was deprecated in pandas 1.0 and removed in 2.0; use the
    # standard-library class directly.
    from datetime import datetime
    return datetime.strptime('190' + x, '%Y-%m')

# Load the shampoo sales with the first column parsed into dates by `parser`.
# `read_csv(squeeze=True)` was removed in pandas 2.0, so squeeze explicitly.
series = pd.read_csv(
    '../input/shampoo-saled-dataset/shampoo_sales.csv',
    header=0,
    index_col=0,
    parse_dates=True,
    date_parser=parser,
).squeeze("columns")

# The index now holds full timestamps, displayed in '%Y-%m-%d' form
print(series)
print("\n\n")
plt.plot(series)
plt.xticks(rotation=45)
plt.show()

In [72]:
# Upsample the series to daily frequency; days without an observation become NaN.
upsampled = series.resample('D').mean()

# Fill the NaN gaps with a second-order spline (not a straight-line interpolation).
interpolated = upsampled.interpolate(method='spline', order=2)

# Show the first 32 daily values, then plot the interpolated series.
first_days = interpolated.head(32)
print(first_days)
print("\n\n")
plt.plot(interpolated)
plt.xticks(rotation=45)
plt.show()

In [73]:
# In a similar fashion, I can downsample data as well
# Perhaps I want to predict for quarterly data
# Perhaps I want to predict for annual data

# Power Transforms - Data Transforms 

**Why do we even need to transform data?**
1. Data Transformation is required to remove noise(white noise) or improve signal in time series forecasting. 
2. Basically its a procedure to introduce trend into the data

**Observations**
1. Identify when to use and how to explore a square root transform 
2. Identify when to use and how to explore a log transform 
3. Using box-cox transform to perform square root and log transforms and identify the best transform available for the dataset 

In [74]:
# Airline passengers dataset: drop '#' from the column name and collapse to a Series.
series = (
    pd.read_csv('../input/air-passengers/AirPassengers.csv', header=0, index_col=0, parse_dates=True)
    .rename(columns=lambda label: label.replace('#', ''))
    .squeeze()
)
print("\n\n")

# Two stacked panels on figure 1: line plot on top, histogram below.
plt.figure(1)
top_ax = plt.subplot(211)
series.plot(ax=top_ax)
bottom_ax = plt.subplot(212)
bottom_ax.hist(series)
plt.show()

In [75]:
# Square Root Transform
# Contrive a quadratic series (1, 4, 9, ..., 99**2) to motivate the transform.
series = [n ** 2 for n in range(1, 100)]

# Figure 1: line plot on top, histogram underneath.
plt.figure(1)
line_ax = plt.subplot(211)
line_ax.plot(series)
hist_ax = plt.subplot(212)
hist_ax.hist(series)

plt.show()

1. Such a series follows a quadratic growth trend. 
2. Its transformation would provide a linear line plot with a uniformly distributed histogram. 

In [76]:
# Log Transform AND Exponential Series. Convert them to LINEAR form by taking their LOG
# As required, add a bias term(intercept) value to convert negative AND zero marked values as their NATURAL LOG isn't defined. 

In [77]:
# Upsampling calls for interpolations, which can be linear as well as complex
# Different values of lambda provide different types of transforms
# Just get the lambda value 
# lambda = -1.0 is a reciprocal transform.
# lambda = -0.5 is a reciprocal square root transform.
# lambda = 0.0 is a log transform.
# lambda = 0.5 is a square root transform.
# lambda = 1.0 is no transform

# Smoothing data
1. Purposes of smoothing - Smoothing data is strongly related to imputing missing data, check for measurement spikes and errors or both 
2. Smoothing is used for Data Preparation, Feature Engineering and even for making predictions. 
3. Smoothing removes noise from time-series and is used to fine-grain values between time-steps.


**Moving Average Smoothing**
1. Specify a window size called the window width. Window width w is slid along the time series to calculate aggregated values. Two main types of moving averages that are used - centered and trailing moving averages. 

2. Centered MA - Can be used to remove trend and seasonality components from the dataset; it is not generally used for forecasting.

**Centered and Trailing Moving Average**

**Data Expectations**
1. Data should be stationary, i.e., there should be no prevalence of trend or seasonality components. They can be removed using the differencing method as described earlier. 

**MA as data preparation**
1. Create a smoothed version of the original dataset. 
2. It reduces the random variation in the latest value with respect to the preceding values and better expose the structure of the underlying causal processes. 


**Exponential Smoothing**
1. To be studied later 

In [78]:
# The MA-especially the trailing MA, is the same as rolling mean
# The rolling window works as a lag feature 

# Predict female -births using MA 
# Different values of lambda provide different types of transforms
# lambda = -1.0 is a reciprocal transform.
# lambda = -0.5 is a reciprocal square root transform.
# lambda = 0.0 is a log transform.
# lambda = 0.5 is a square root transform.
# lambda = 1.0 is no transform

In [79]:
# Reload the births data. `read_csv(squeeze=True)` was removed in pandas 2.0,
# so the single data column is squeezed into a Series explicitly.
series = pd.read_csv(
    "../input/daily-total-female-births-in-california-1959/daily-total-female-births-CA.csv",
    header=0,
    index_col=0,
    parse_dates=True,
).squeeze("columns")

# Seed the history with the first `window` observations; everything after is a target.
X = series.values
window = 3
history = list(X[:window])
targets = list(X[window:])
preds = []

# Walk forward: forecast each target as the mean of the `window` most recent
# observations, then reveal the true value and add it to the history.
for obs in targets:
    yhat = mean(history[-window:])
    preds.append(yhat)
    history.append(obs)
    print('predicted=%f, target=%f' % (yhat, obs))

# Calculate RMSE
rmse = sqrt(mean_squared_error(targets, preds))
print("The RMSE value is: %.3f" % rmse)

In [80]:
# MA as a prediction 

# Temporal Structure in Time Series 

**Observations**:
1. White noise is nothing but a random sequence of numbers
2. Checking if a series is white noise
3. Statistical and diagnostic plots to check for white noise

**Conditions for a TS to be white noise**
1. Mean=0 with constant variance
2. The observations are independent of one another, with negligible or no autocorrelation.
3. A WHITE NOISE can't be predicted

**Why it matters?**
1. **Predictability** - If a time series is white noise, it simply can't be predicted
2. **Model diagnostics** - The series of errors from a time series forecast model should ideally be a white noise

**How white noise can help**?
1. If a model blatantly predicts white noise, then it's garbage - discard it
2. If the residuals of a learning algorithm, after prediction on unseen data isn't complete white noise, there is scope of improvement in the model as more and more of main signal's information can be modelled for efficient predictions.

**How to check for white noise**?
1. Non-zero mean, variability of variance and effect of lagged features - autocorrelation
2. Create line plots, calculate summary statistics of the entire dataset or its subset and create autocorrelation plots as well

In [81]:
# Fix the seed so the draws are reproducible.
seed(1)

# White-noise series: 1000 Gaussian draws with mean 0.0 and standard deviation 1.0.
series = pd.Series([gauss(0.0, 1.0) for _ in range(1000)])

# Summary statistics
summary = series.describe()
print(summary)
print("\n\n")

# The series is not split into sub-samples here; it is plotted as a whole instead.
# Line plot
line_ax = plt.subplot(211)
series.plot(ax=line_ax)
plt.show()

# Histogram to check whether the distribution looks Gaussian
plt.hist(series)
plt.show()

# Autocorrelation plot for verification
autocorrelation_plot(series)
plt.show()

# Random Walk in Time Series

In [82]:
# Plot 1000 independent random integers drawn from 0..9.
from random import randrange, seed
series = [randrange(10) for _ in range(1000)]
plt.plot(series)
plt.show()

**This plot is nothing but a sequence of random numbers, not a drunkard's walk**

A random walk is different from a list of random numbers because the next value in the sequence is a modification of the previous value in the sequence. The process used to generate the series forces dependence from one-time step to the next. This dependence provides some consistency from step-to-step rather than the large jumps that a series of independent, random numbers provides. It is this dependency that gives the process its name as a random walk or a drunkard’s walk.

In [83]:
# Build and plot a random walk; seeding keeps the walk identical between runs.
from random import random
from random import seed
seed(1)

# Start at -1 or +1, then repeatedly add a -1/+1 step to the previous position.
random_walk = [-1 if random() < 0.5 else 1]
for _ in range(1, 1000):
    step = -1 if random() < 0.5 else 1
    random_walk.append(random_walk[-1] + step)

# Line plot of the walk
plt.subplot(311)
plt.plot(random_walk)
plt.show()
print("\n\n")

# Autocorrelation of the walk
plt.subplot(312)
autocorrelation_plot(random_walk)
plt.show()
print("\n\n")

# Histogram of the visited positions
plt.subplot(313)
plt.hist(random_walk)
plt.show()


In [84]:
# Augmented Dickey-Fuller test of stationarity on the random walk

# Import the test package
from statsmodels.tsa.stattools import adfuller
result = adfuller(random_walk)

# Positions 0, 1 and 4 of the result hold the statistic, p-value and critical values.
adf_stat, p_value, critical_values = result[0], result[1], result[4]
print('ADF Statistic: %f' % adf_stat)
print('p-value: %f' % p_value)
print('Critical Values:')
for level in critical_values:
    print('\t%s: %.3f' % (level, critical_values[level]))

In [85]:
# First difference of the random walk: each value minus its predecessor.
diff = [current - previous for previous, current in zip(random_walk, random_walk[1:])]
plt.plot(diff)
plt.show()

**Check if a time series is a random walk**
1. The time series shows a strong temporal dependence that decays linearly or in a similar pattern. 
2. The time series is non-stationary, and making it stationary shows no obviously learnable structure in the data
3. The persistence model provides the best source of reliable predictions

In [86]:
# Additive decomposition of the random walk into trend, seasonal and residual parts.
# NOTE(review): period=1 is used here; the "all trend, nothing else" outcome described
# below is presumably a consequence of that setting rather than a property of random
# walks — confirm against the seasonal_decompose documentation.
result_random_walk = seasonal_decompose(random_walk, model='additive', period=1)
result_random_walk.plot()
plt.show()

# In this decomposition no seasonal or residual component shows up; the series is
# attributed to the trend.
# Let's remove the trend

rem_trend = random_walk-result_random_walk.trend
plt.plot(rem_trend)
plt.show()

# Hence nothing remains after the estimated trend component has been removed

# Decomposing a time series data

# Time Series Components
**Break a time series down into systematic and unsystematic components**
1. Systematic: Components of time series that have consistency or recurrence and can be described or modelled. 
2. Non-Systematic: Components of the time series that can't be modelled
3. It is a useful tool for analysis and helps us with forecasting


**A given time series is thought to consist of three systematic components (level, trend and seasonality) and one non-systematic component (noise). These components are**
1. Level: Average value in time series
2. Trend: Increasing or decreasing value in time series
3. Seasonality: Repeating short-term cycle in time series
4. Noise: The random variation in the series, i.e, the non-systematic component
5. Remember, it is not possible to perfectly break a given time series into an additive or multiplicative model 

In [87]:
# Contrived upward-trending series: the values 1..99, each with a random 0..9 added.
series = [base + randrange(10) for base in range(1, 100)]
print(series)
print("\n")

# Additive decomposition with period=1, followed by the standard four-panel plot.
result = seasonal_decompose(series, model='additive', period=1)
print("\n")
result.plot()
plt.show()

In [88]:
# HENCE, the entire series has been taken as a trend component and there is no seasonality 
# This form of decomposition is the most naive form of decomposition. There exist some advanced forms like Seasonal and Trend decomposition, using Loess or STL decomposition
# Caution is advised when using automated decomposition methods

**Summary of Learnings**:
1. Structure of decomposing time series into level, trend, seasonality and noise
2. Automatically decompose a time series dataset with Python 
3. Decompose an additive or multiplicative model and plot the results 

# Use and Remove Trends
1. Importance and types of trends that may exist in a time series and identify them 
2. Use a simple differencing method to remove them 
3. Model a linear trend and remove it from a time series dataset 

**Types of trends**
1. Deterministic trends: Those that increase or decrease consistently 
2. Stochastic trends: Those that increase or decrease inconsistently 

**Furthermore**
1. Global and local trends 


**Identifying a trend**:
1. Identification and addition or removal of trend is a subjective process. Adding linear and non-linear trends to the data is also subjective
2. A trend can be prevalent, or added to the forecasting problem as an input 

**Methods of Trend Removal**:
1. De-Trend by differencing 
2. De-Trend by model fitting 


1. There may exist linear as well as non-linear trends in the dataset
2. The former can be removed by linear models, whereas the latter can be looked at through non-linear or exponential models

In [89]:
# Use a linear model to detrend a time series 
def parser(x):
    """Parse a raw date label into a ``datetime``.

    The label is prefixed with ``'190'`` and parsed as ``'%Y-%m'``, so an input
    such as ``'1-01'`` becomes 1901-01-01.

    Args:
        x: Date label string of the form ``'<year digit>-<month>'``.

    Returns:
        The corresponding ``datetime.datetime`` (day defaults to 1).
    """
    # `pd.datetime` was deprecated in pandas 1.0 and removed in 2.0; use the
    # standard-library class directly.
    from datetime import datetime
    return datetime.strptime('190' + x, '%Y-%m')

# Load the shampoo sales with the first column parsed into dates by `parser`.
# `read_csv(squeeze=True)` was removed in pandas 2.0, so squeeze explicitly.
series = pd.read_csv(
    '../input/shampoo-saled-dataset/shampoo_sales.csv',
    header=0,
    index_col=0,
    parse_dates=True,
    date_parser=parser,
).squeeze("columns")

# Fit a linear model: the only input feature is the 0-based position of each
# observation, shaped (n_samples, 1) as scikit-learn expects.
X = np.arange(len(series)).reshape(-1, 1)
y = series.values

model = LinearRegression()
model.fit(X, y)

# Estimate the trend as the model's fitted values
trend = model.predict(X)

# Plot the observations with the fitted trend on top
plt.plot(y)
plt.plot(trend)
plt.show()

In [90]:
# The second line drawn in the previous plot is the fitted linear trend.
# De-trend: subtract the fitted value from each observation.
detrended = [observed - fitted for observed, fitted in zip(y, trend)]

# Plot the de-trended series
plt.plot(detrended)
plt.show()

In [91]:
# Decompose the de-trended series again and plot the trend it reports.
# NOTE(review): period=1 is used here as well — confirm this is the intended period.
detrended_decomposed = seasonal_decompose(detrended, model='additive', period=1)

# Keep in mind: this is a naive decomposition scheme and does not account for non-linear structure
plt.plot(detrended_decomposed.trend)
plt.show()

# Plot the residual 
plt.plot(detrended_decomposed.resid)

# Using and Removing Seasonality

1. It may result in a clearer signal
2. Additional information about the seasonal component of time series can provide new information

**Identification of trend and seasonality components is subjective**:
1. For example:
   Time of Day, Daily, Weekly, Monthly and yearly 

**Seasonal Adjustment with Modelling**:
1. Once fit, the model can then be used to calculate a seasonal component for any time index. In the case of temperature data, the time index would be the day of the year. We can then estimate the seasonal component for the day of the year for any historical observations or any new observations in the future
2. Use NumPy's polyfit() function, which fits a polynomial of a chosen order to a dataset. First, we can create a dataset of time index (day in this case) to observation. We could take a single year of data or all the years. Ideally, we try and test on both of them 
3. Use the polyfit() function to estimate the coefficients of the polynomial function 

In [92]:
# Model seasonality in the Melbourne data with a polynomial.
# Every observation is mapped to its position modulo 365, folding all years onto one.
X = [position % 365 for position in range(len(melbourne))]
y = melbourne.values

# The seasonal pattern resembles a sine wave; a fourth-degree polynomial is used to fit it.
deg = 4
coeff = np.polyfit(X, y, deg)
print(coeff)
print("\n")
# The coefficients are ordered from the highest power down to the constant term.

In [93]:
# Evaluate the fitted polynomial at every folded day index to get the seasonal curve.
curve = []
for day in X:
    # Start from the constant term (last coefficient)...
    fitted = coeff[-1]

    # ...then add the remaining terms from the highest power down to the first.
    for power, weight in zip(range(deg, 0, -1), coeff):
        fitted += day ** power * weight
    curve.append(fitted)

# Overlay the fitted curve on the original observations.
plt.plot(melbourne.values, color="blue")
plt.plot(curve, color="yellow")
plt.show()

In [94]:
# Subtract the fitted seasonal curve from the observations and inspect what is left.
season_diff = np.array(
    [observed - fitted for observed, fitted in zip(melbourne.values, curve)]
)

# Seasonally adjusted series on top, its histogram underneath.
top_ax = plt.subplot(211)
top_ax.plot(season_diff)
bottom_ax = plt.subplot(212)
bottom_ax.hist(season_diff)

# NOTE(review): the original note called the remainder a random walk; nothing in this
# cell tests that claim — confirm before relying on it.

plt.show()

# Stationarity in Time Series Data

In [95]:
# There are important assumptions about the nature of data that needs to be handled during modelling 
# For eg., one of the key assumptions is that the summary statistics of data should be consistent throughout, i.e., constant mean and variance
# Hence, toning down the time component in time series is of utmost significance as it is dependent upon past observations, and hence, can contribute to erroneous modelling
# Since a time-dependent time series has inconsistent summary statistics, it is referred to as non-stationarity data 
# Hence, it must be treated and converted into stationary format. Such data don't have seasonal or trend effects.

# There are some terms associated with time series data
# A stationary process is one that exhibits a stationary series of observations
# A stationary model is one that describes a stationary series of observations 
# A time series that does not exhibit a trend is trend stationary 
# One that doesn't exhibit seasonality is seasonal stationary 
# A strictly stationary series is one whose joint distribution of variables is time invariant 


# How non-stationarity can be checked?
# Check for line plots
# Check for statistical tests. T-tests or other standard statistical tests can't be applied as they work best for data as raw values, not residuals. 
# Checking for mean and variance - summary statistics, isn't always the best indicator for determining if a time series is stationary or not. As even white noise can conform to this observation. 
# Furthermore, they follow an underlying assumption that the data samples are independent and uncorrelated with one another, which is often not the case with time series. 

In [96]:
# Draw a line plot for the Daily Births dataset.
# `read_csv(squeeze=True)` was removed in pandas 2.0, so squeeze explicitly.
series = pd.read_csv(
    "../input/daily-total-female-births-in-california-1959/daily-total-female-births-CA.csv",
    header=0,
    index_col=0,
    parse_dates=True,
).squeeze("columns")
series.plot()
plt.show()

In [97]:
# Additive decomposition of the births series.
# NOTE(review): period=1 is used; confirm whether a weekly period was intended for daily data.
decomposed_series = seasonal_decompose(series, model="additive", period=1)

# Plot the estimated trend in the first of three stacked panels
plt.subplot(311)
plt.plot(decomposed_series.trend)

# Plot the estimated seasonal component in the second panel
plt.subplot(312)
plt.plot(decomposed_series.seasonal)


In [98]:
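# If the Augmented Dickey-Fuller test rejects H0 (a unit root is present), the series can be treated as stationary.
# NOTE: no ADF test is run on this series in the cells above - run adfuller(series) to confirm.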