In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.offline as pyo
import warnings
import os
import sys

from statsmodels.graphics.tsaplots import plot_acf

warnings.filterwarnings('ignore')
pyo.init_notebook_mode()
%matplotlib inline

In [None]:
# Load the AAVE-USD price history with the `Date` column parsed into a datetime index.
# `infer_datetime_format` is no longer passed: it is deprecated since pandas 2.0,
# where its strict format inference became the default and the flag has no effect.
df = pd.read_csv(
    '../input/crypto-historical-price/data/AAVE-USD.csv',
    index_col='Date',
    parse_dates=True,
)
df.head(10)

# Dataset Basic Know-How

In [None]:
# Unpack the frame's (rows, columns) shape once and report both numbers.
record_count, factor_count = df.shape
print(f'Number of records in the data : {record_count}')
print(f'Number of time-dependent factors in the data : {factor_count}')

In [None]:
# Summary statistics (pandas `describe`) of the loaded frame, shown as the cell output.
df.describe()

In [None]:
# Use min/max of the index rather than its first/last entry so the reported
# range is correct even when the CSV rows are not in chronological order.
print(f'Start date : {df.index.min()}')
print(f'End date : {df.index.max()}')

# DataSet Know-How : Percentage of Missing Values

In [None]:
# Percentage of missing entries per column, largest first.
na_mask = df.isna()
missing_pct = na_mask.sum() / na_mask.count() * 100
missing_pct.to_frame('% Missing').sort_values(by=['% Missing'], ascending=False)

# DataSet Know-How : Cross Correlation Amongst the Variables

What is Cross-Correlation? 

When the correlation is measured between two different variables, then it is known as Cross-Correlation.

In [None]:
# Pairwise correlation of the columns, drawn as an annotated heatmap on a
# fixed [-1, 1] colour scale.
plt.figure(figsize=(12, 8))
corr_matrix = df.corr()
heatmap_ax = sns.heatmap(corr_matrix, vmin=-1, vmax=1, annot=True, linewidth=2)
heatmap_ax.set_title('Cross Correlation amongst the Variables')
plt.show()

The cross-correlation plot above shows that `Close` and `Adj Close` are effectively identical. `Open`, `High`, `Low` and `Close` are also almost perfectly correlated with one another (correlation > 0.95). All of these price variables seem to have a moderately strong correlation with the `Volume` variable.

# Data Visualization

In [None]:
def interactive_plot(df, target, title='Interactive Plot'):
    """
    Draw one column of a dataframe as an interactive Plotly line chart.

    The x-axis carries a range slider plus range-selector buttons
    (last 1, 5 and 7 months, and the full range).

    Parameters:
    -----------
    df : pandas.DataFrame
        Data to plot; its index is used for the x-axis.

    target : str
        Name of the column of `df` drawn on the y-axis.

    title : str, optional; default : 'Interactive Plot'
        Title shown on the figure.

    """

    trace = go.Scatter(
        name=target,
        x=df.index,
        y=df[target],
        mode='lines',
        line={'width': 3, 'color': 'royalblue'},
    )

    # "Look back N months" buttons, followed by one that restores the full range.
    month_spans = (1, 5, 7)
    selector_buttons = [
        {'count': months, 'label': f'{months}m', 'step': 'month', 'stepmode': 'backward'}
        for months in month_spans
    ]
    selector_buttons.append({'step': 'all'})

    x_axis = {
        'rangeslider': {'visible': True},
        'rangeselector': {'buttons': selector_buttons},
    }
    layout = {'autosize': False, 'width': 900, 'title': title, 'xaxis': x_axis}

    go.Figure([trace], layout=layout).show()

In [None]:
# Interactive line chart of the `Open` column over the frame's index.
interactive_plot(df, 'Open', title='Data of Open Price')

In [None]:
# Interactive line chart of the `Volume` column over the frame's index.
interactive_plot(df, 'Volume', title='Data of the Volume time series')

# DataSet Know-How : Auto-Correlation Plots

The autocorrelation plot tells us whether there is any correlation between a variable's present value and its past values.

Another use of autocorrelation is to get a visual indication of whether seasonality exists in the data.

In [None]:
# Autocorrelation of the `Open` series for up to 150 lags, drawn on a dedicated axes.
# use_vlines=False plots the markers without vertical stems; the trailing ';'
# suppresses the textual repr of the returned object in the cell output.
fig, ax = plt.subplots(figsize=(10, 4))
plot_acf(df['Open'], ax=ax, use_vlines=False, lags = 150, title='AutoCorrelation of the Open Data');

The autocorrelation plot of the `Open` data shows that after roughly 15 days the autocorrelation becomes relatively weak. The plot also shows a repetitive pattern after about 100 days. Does this suggest that there is some seasonality present in the data?

In [None]:
# Autocorrelation of the `Volume` series for up to 150 lags, drawn on a dedicated axes.
# use_vlines=False plots the markers without vertical stems; the trailing ';'
# suppresses the textual repr of the returned object in the cell output.
fig, ax = plt.subplots(figsize=(10, 4))
plot_acf(df['Volume'], ax=ax, use_vlines=False, lags=150, title='AutoCorrelation of the Volume Data');

# DataSet Know-How : Lag Plots

A lag plot is a scatter plot of a variable's values against its own values a fixed number of steps earlier, so it gives a visual impression of the autocorrelation at that lag.

In [None]:
def plot_lags(data, lags):
    """
    Function to plot the lag plots of a data.

    One lag plot is drawn per entry of `lags`, arranged on a grid with
    two columns.

    Parameters:
    -----------
    data : pandas.Series
        Represents the data whose lag plots needs to be generated.

    lags : array-like
        Represents the array of different type of lags. Must contain an
        even, non-zero number of entries so the two-column grid is filled.

    Raises:
    -------
    ValueError
        If `lags` is empty or holds an odd number of entries.

    """

    n_cols = 2

    if len(lags) == 0:
        raise ValueError('At least one pair of lags is required.')
    if len(lags) % n_cols != 0:
        raise ValueError('Cannot evenly divide the lag plots.')

    # squeeze=False keeps `axes` two-dimensional even for a single row; without
    # it a two-element `lags` yields a 1-D array and 2-D indexing fails.
    fig, axes = plt.subplots(nrows=len(lags) // n_cols, ncols=n_cols,
                             figsize=(10, 5), squeeze=False)
    # NOTE(review): top=2.4 places the top of the subplot grid above the nominal
    # figure area; presumably this relies on the notebook renderer cropping to
    # the drawn content -- confirm before changing figsize or this value.
    fig.subplots_adjust(wspace=0.4, hspace=0.4, top=2.4)

    # axes.flat walks the grid row by row, matching the order of `lags`.
    for axis, lag in zip(axes.flat, lags):
        axis.set_title(f'Lag : {lag}')
        pd.plotting.lag_plot(data, lag=lag, ax=axis, alpha=0.69)

    # plt.show() (as used by the other plotting cells) instead of fig.show(),
    # which warns on non-GUI backends such as the inline one.
    plt.show()
    

## DataSet Know-How : Lag Plot for the Open Data

In [None]:
# Lag plots of the `Open` series at six lags, from 1 step up to 90 steps.
plot_lags(df['Open'], [1, 5, 15, 30, 60, 90]);

The lag plots of the `Open` data above support the earlier observation that the autocorrelation of the series weakens after about 15 days.

## DataSet Know-How : Lag Plot for the Volume Data

In [None]:
# Lag plots of the `Volume` series at six lags, from 1 step up to 90 steps.
plot_lags(df['Volume'], [1, 5, 15, 30, 60, 90]);