In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # Data visualization library
import seaborn as sns # Data visualization library

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Questions to be asked:
1. Ask relevant questions related to the timestamps
2. Whose timestamp? How are those timestamps generated?
3. At which instant were those timestamps recorded?
4. Guesstimating time stamps to make sense of data
5. I need to stay in sync with my data pipeline; only then can I answer all the relevant questions.
6. Local or universal time. Most timestamps are stored according to the UTC
7. Guesstimating timestamps to make sense of data
8. Is it user behaviour or network behaviour?
9. Date-specific API calls???
10. Psychological time discounting is a manifestation of a phenomenon known as psychological distance, which names our tendency to be more optimistic (and less realistic) when making assessments or estimates that are more distant from us.
11. Humans know time is passing.

In [None]:
# Load the daily births CSV: the first row is the header and the first column is
# parsed as dates and used as the index.
# read_csv's `squeeze=True` was deprecated in pandas 1.4 and removed in 2.0, so the
# single remaining data column is collapsed to a Series with `.squeeze("columns")`.
series = pd.read_csv(
    "../input/daily-total-female-births-in-california-1959/daily-total-female-births-CA.csv",
    header=0,
    index_col=0,
    parse_dates=True,
).squeeze("columns")
series.head(10)

In [None]:
# Quick descriptive pass: observation count, the January 1959 slice, then summary statistics
for summary in (series.size, series['1959-01'], series.describe()):
    print(summary)

# Basic Feature Engineering
Time Series data must be re-framed as a supervised learning dataset before we can start using
machine learning algorithms. There is no concept of input and output features in time series.
Instead, we must choose the variable to be predicted and use feature engineering to construct
all of the inputs that will be used to make predictions for future time steps. In this tutorial,
you will discover how to perform feature engineering on time series data with Python to model
your time series problem with machine learning algorithms.
After completing this tutorial, you will know:
*  The rationale and goals of feature engineering time series data.
*  How to develop basic date-time based input features.
* How to develop more sophisticated lag and sliding window summary statistics features.
Let’s dive in

I can enumerate all the properties of a timestamp and consider which of them might be useful for the problem, such as:
* Minutes Elapsed
* Hour of Day
* Business Hours

Date-based features:
* Weekend or not
* Season of the year
* Business quarter of the year
* Daylight savings or not
* Public holiday or not
* Leap year or not

Adding domain-specific features is a good start for time-series.

**Creating Rolling and Expanding Window summary statistic features**

In [None]:
# Create Rolling Window Statistics
# Each row t receives the min / mean / max of the `width` observations immediately
# before it (t-width .. t-1), paired with the value observed at t as the target.
temps = pd.DataFrame(series.values)
width = 3
# Shift by exactly one step so the rolling window ends at t-1. Shifting by
# `width - 1` left a gap: with width=3 the window summarised t-4..t-2 and never
# saw the most recent observation before the target.
shifted = temps.shift(1)
window = shifted.rolling(window=width)
dataframe = pd.concat([window.min(), window.mean(), window.max(), temps], axis=1)
dataframe.columns = ['min', 'mean', 'max', 't+1']
# The first `width` rows are NaN because a full window of prior values is not yet available.
print(dataframe.head(5))

In [None]:
# Show the raw values and their shifted copy one after the other, separated by a blank line
print(temps, '', shifted, sep='\n')

In [None]:
# Expanding-window features: running min / mean / max over every observation up to
# and including row t, with the following observation (shift(-1)) as the target.
temps = pd.DataFrame(series.values)
window = temps.expanding()
summary_stats = [getattr(window, stat)() for stat in ('min', 'mean', 'max')]
dataframe = pd.concat(summary_stats + [temps.shift(-1)], axis=1)
dataframe.columns = ['min', 'mean', 'max', 't+1']
print(dataframe.head(5))

# Time Series Visualization

1. Line Plots
2. Histograms and Density Plots
3. Box and Whisker Plots
4. Heat Maps
5. Lag Plots or Scatter Plots
6. Autocorrelation Plots

In [None]:
# Reading data in a proper format.
# The first column is parsed as dates and used as the index; malformed rows only
# raise a warning. read_csv's `squeeze=True` was deprecated in pandas 1.4 and
# removed in 2.0, so the single data column is collapsed with `.squeeze("columns")`.
melbourne = pd.read_csv(
    "../input/melbourne-temperature/daily-minimum-temperatures-in-me.csv",
    header=0,
    index_col=0,
    parse_dates=True,
    on_bad_lines='warn',
).squeeze("columns")

# Strip literal '?' characters from the raw text before converting to float.
# regex=False is required: a bare '?' is not a valid regular expression.
# astype(str) keeps this step working even if the column was already parsed as numeric.
melbourne = melbourne.astype(str).str.replace('?', '', regex=False).astype(float)

# Plotting the series
melbourne.plot(style='--')  # dashed line; a style such as 'k--' would also fix the colour
plt.show()

In [None]:
# Create subplots as well: one panel per calendar year.
# freq='A' groups the series by year end, so each group holds one year's observations.
groups = melbourne.groupby(pd.Grouper(freq='A'))

# Build the frame in a single step, one column per year. Wrapping each year's values
# in a Series lets pandas pad shorter years with NaN; assigning raw arrays column by
# column raises ValueError as soon as two years differ in length (leap or partial years).
years = pd.DataFrame({name.year: pd.Series(group.values) for name, group in groups})

years.plot(subplots=True, legend=False)
plt.show()

In [None]:
# Histogram of the observed values, using the default binning of Series.hist()
melbourne.hist()
plt.show()

In [None]:
# Kernel density estimate: a smoothed view of the distribution of the values
melbourne.plot.kde()
plt.show()

In [None]:
# Box-and-whisker plot with one box per column of `years`
# (in top-to-bottom execution order, each column is one calendar year)
years.boxplot()
plt.show()

In [None]:
# Number of groups in `groups`; when cells are run top to bottom this is the
# annual (freq='A') grouping of `melbourne`, i.e. the number of years covered
len(groups)

In [None]:
# Box plot of a single year (1990) broken down by month.
one_year = melbourne['1990']
groups = one_year.groupby(pd.Grouper(freq='M'))

# Each month's values become one column, stacked side by side; months with fewer
# days are padded with NaN by the column-wise concat.
monthly_frames = [pd.DataFrame(month_values.values) for _label, month_values in groups]
months = pd.concat(monthly_frames, axis=1)
months.columns = range(1, 13)
months.boxplot()
plt.show()

In [None]:
# Inspect the per-year frame (one column per calendar year)
years

In [None]:
# Heat map of the whole series.
# pd.Grouper(freq='A') yields one group per calendar year (not per month), so after
# the transpose each row is a year and each column is a day position within that year.
groups = melbourne.groupby(pd.Grouper(freq='A'))

# Wrapping each year's values in a Series lets pandas pad shorter years with NaN
# instead of raising ValueError when years differ in length.
years = pd.DataFrame({name.year: pd.Series(group.values) for name, group in groups}).T

plt.matshow(years, interpolation=None, aspect='auto')
plt.show()

In [None]:
# Inspect the transposed frame used for the heat map (one row per calendar year)
years

In [None]:
# TODO: build the same heat map for a single month as well

In [None]:
# Lag scatter plot: pairs every observation with the one that follows it
# (lag=1, which is also lag_plot's default)
from pandas.plotting import lag_plot
lag_plot(melbourne, lag=1)
plt.show()

In [None]:
# Create multiple scatter plots for different lags
values = pd.DataFrame(melbourne.values)
lags = 7

# Column 't' is the series itself; 't-1' ... 't-n' are the same values shifted
# down by 1..n steps, so each row lines an observation up with its predecessors.
lagged = [values] + [values.shift(i) for i in range(1, lags + 1)]
dataframe = pd.concat(lagged, axis=1)
columns = ['t'] + ['t-' + str(i) for i in range(1, lags + 1)]
dataframe.columns = columns

# Size the grid from `lags`; the three-digit shorthand (240 + i) only addresses
# a fixed 2x4 grid and breaks for more than eight panels.
ncols = 4
nrows = -(-lags // ncols)  # ceiling division
fig = plt.figure()

# Plotting the data
for i in range(1, lags + 1):
    # Create the panel first and draw into it explicitly. Calling plt.scatter before
    # plt.subplot put each lag's points on the previous panel, so every title was
    # off by one and the final panel stayed empty.
    ax = fig.add_subplot(nrows, ncols, i)
    ax.set_title('t vs. t-' + str(i))
    ax.scatter(x=dataframe['t'].values, y=dataframe['t-' + str(i)].values)

fig.tight_layout()  # keep panel titles from overlapping the tick labels of the row above
plt.show()

In [None]:
# Autocorrelation plot: correlation of the series with lagged copies of itself,
# drawn across the range of lags
from pandas.plotting import autocorrelation_plot
autocorrelation_plot(melbourne)
plt.show()

#Observations
#The given data has a very strong seasonal component

# Resampling and Interpolation
1. Upsampling - e.g., from hours to minutes, using interpolation. The method depends on the use case.
2. Downsampling - e.g., from minutes to hours, using aggregation (mean, median or mode). The method depends on the use case.

1. Upsampling calls for interpolations, which can be linear as well as complex
2. Situations under which we do downsampling of data:
-> When we want to study observations from a particular time frame
-> The original resolution of the data isn't sensible
-> Matching against data at a lower frequency. In such cases I would aggregate or downsample the data rather than simply dropping points

3. Situations under which I would upsample the data:
-> Irregular time series
-> Inputs are sampled at different frequencies
-> Knowledge of time-series dynamics determines the interpolation

# Smoothing data
1. Purposes of smoothing - Smoothing data is strongly related to imputing missing data, checking for measurement spikes and errors, or both
2. Data Preparation


Exponential Smoothing:
1. 

In [None]:
# Upsampling calls for interpolations, which can be linear as well as complex
