# Time Series Analysis - Data Normalisation and Pivot Table

A reference notebook showing how to normalise time-series data and how to create a pivot table.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# import libraries
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the raw Tesla price history from the Kaggle input directory and preview it.
tesla_csv_path = '../input/tesla-stock-price/Tesla.csv - Tesla.csv.csv'
tsla = pd.read_csv(tesla_csv_path)
tsla.head()

In [None]:
# Parse the Date column into timestamps, then promote it to the frame's index.
tsla = tsla.assign(Date=pd.to_datetime(tsla["Date"])).set_index('Date')
tsla.head()

In [None]:
# Keep just the Close column, as a single-column DataFrame.
tsla = tsla[['Close']]
tsla.head()

In [None]:
# Line chart of the whole Close series.
tsla.plot.line(figsize=(15, 8))

In [None]:
# Select the rows dated in 2011 (partial-string lookup on the datetime index) and plot them.
tsla_2011 = tsla.loc['2011', :]
tsla_2011.plot(kind='line', figsize=(15, 8))

In [None]:
# Build a daily calendar spanning the first to the last 2011 observation and
# align the prices to it; days absent from the original index come back as NaN.
first_day, last_day = tsla_2011.index.min(), tsla_2011.index.max()
all_days = pd.date_range(first_day, last_day)
tsla_2011_all = tsla_2011.reindex(index=all_days)
tsla_2011_all.head(n=10)

In [None]:
# Plot the re-indexed series; the NaN days show up as gaps in the line.
tsla_2011_all.plot.line(figsize=(15, 8))

In [None]:
# Fill each missing day with the most recent earlier value (forward fill), then re-plot.
tsla_2011_all = tsla_2011_all.ffill(axis=0)
tsla_2011_all.plot(kind='line', figsize=(15, 8))

In [None]:
# Down-sample the filled daily series to business days ('B'), keeping the
# first value in each bin, and plot the result.
tsla_2011_business = tsla_2011_all.resample('B').first()
tsla_2011_business.plot(figsize=(15, 8))

In [None]:
# Fun graph: one Close line per calendar year, each with its own legend entry.
import matplotlib.pyplot as plt
close_by_year = tsla.groupby(tsla.index.year)['Close']
close_by_year.plot(legend=True)

In [None]:
# Group the data by year and overlay every year as its own line on one set of
# axes, using the day of the year as the shared x-axis.

fig, ax = plt.subplots(figsize=(15,8))
for year, year_df in tsla.groupby(tsla.index.year):
    # assign() returns a new frame instead of writing into the group handed out
    # by groupby; in-place column assignment on that group may trigger
    # SettingWithCopyWarning on some pandas versions.
    # NOTE: despite its name, the "Date" column holds the day of the year.
    year_df = year_df.assign(Date=year_df.index.day_of_year)
    year_df.plot(kind="line", x='Date', y='Close', ax=ax, label=year)

ax.legend(loc='lower right')
plt.show()

In [None]:
# Alternative: let pivot_table lay the data out as day-of-year rows by year columns.
# The series is first re-indexed onto every calendar day in its range and
# forward-filled, so days missing from the original index still get a value.
all_days = pd.date_range(tsla.index.min(), tsla.index.max())
tsla = tsla.reindex(all_days)
tsla = tsla.ffill()
day_of_year = tsla.index.day_of_year
calendar_year = tsla.index.year
piv_tsla = tsla.pivot_table(values='Close', index=day_of_year, columns=[calendar_year])
piv_tsla.head()

In [None]:
# Show the last five rows of the pivot table.
piv_tsla.tail(n=5)

In [None]:
# Plot the pivot table: one line per year column.
piv_tsla.plot.line(figsize=(15, 8))

In [None]:
# For comparison: reload the raw data and pivot it WITHOUT re-indexing first,
# then count the NaN values in each year column.
tsla = pd.read_csv('../input/tesla-stock-price/Tesla.csv - Tesla.csv.csv')
tsla = tsla.assign(Date=pd.to_datetime(tsla["Date"])).set_index('Date')
raw_day_of_year = tsla.index.day_of_year
raw_year = tsla.index.year
piv_tsla_bad = tsla.pivot_table(values='Close', index=raw_day_of_year, columns=[raw_year])
piv_tsla_bad.isna().sum()

In [None]:
# Plot the un-reindexed pivot to see the gaps left by the NaN cells.
piv_tsla_bad.plot(kind='line', figsize=(15, 8))

In [None]:
# See how badly forward-filling the pivot table itself works out.
piv_tsla_bad_filled = piv_tsla_bad.ffill()
piv_tsla_bad_filled.plot(figsize=(15, 8))

In [None]:
# Look at the last rows after forward-filling; note the 2017 column, where
# ffill repeats the previous valid value down the rows.
piv_tsla_bad.ffill().tail(n=5)

In [None]:
# Smoother graph: pivot per ISO week instead of per day.
# Re-index onto every calendar day in the range and forward-fill first, so
# days missing from the original index still contribute a value.
all_days = pd.date_range(start=tsla.index.min(), end=tsla.index.max())
tsla = tsla.reindex(all_days).ffill()
# Pair the ISO week with the ISO year rather than the calendar year. ISO weeks
# straddle New Year: early-January days can belong to week 52/53 of the
# previous ISO year and late-December days to week 1 of the next one. Grouping
# by calendar year would average those days with the opposite end of the year.
iso_calendar = tsla.index.isocalendar()
piv_tsla_weekly = tsla.pivot_table(index=iso_calendar.week, columns=[iso_calendar.year], values="Close")
piv_tsla_weekly.plot(figsize=(15,8))

In [None]:
# Same weekly view via resampling: average Close per week ('W'), then pivot
# the weekly means into ISO-week rows by ISO-year columns.
tsla_resample = tsla.resample('W')['Close'].mean().to_frame()
# Use the ISO year that matches the ISO week number; a weekly bin dated in
# early January can belong to week 52/53 of the previous ISO year, and pairing
# it with the calendar year would merge it with that year's late-December bin.
iso_calendar = tsla_resample.index.isocalendar()
# values='Close' keeps the columns a flat index of years, consistent with the
# other pivot tables in this notebook.
piv_tsla_weekly_resample = tsla_resample.pivot_table(index=iso_calendar.week, columns=[iso_calendar.year], values='Close')
piv_tsla_weekly_resample.plot(figsize=(15,8))

In [None]:
# Normalise the data so that each year starts at 100 and every other day is
# expressed relative to that starting value.
# This makes growth comparable between years.
# The baseline is the first available value in each year column: bfill() pulls
# the earliest non-NaN value up to the first row. For a year that has a value
# on day 1 this is the Jan 1st value; for a year with no day-1 value (e.g. the
# series starts part-way through that year) it avoids dividing by NaN, which
# would blank out the whole column.
baseline = piv_tsla.bfill().iloc[0]
norm_df = piv_tsla.div(baseline).mul(100)
norm_df.plot(figsize=(15,8))

In [None]:
# Smooth the normalised curves with a 10-row rolling mean, then plot.
ten_row_window = norm_df.rolling(window=10)
rolling_norm_df = ten_row_window.mean()
rolling_norm_df.plot.line(figsize=(15, 8))
plt.show()

In [None]:
# Another way to normalise the data: standardise each year column (subtract
# its mean, divide by its standard deviation), then smooth with a 10-row
# rolling mean.
column_mean = piv_tsla.mean()
column_std = piv_tsla.std()
standardised = piv_tsla.sub(column_mean).div(column_std)
norm_df_2 = standardised.rolling(window=10).mean()
norm_df_2.plot()