# Distribution Shifts

+ Consider our stock data. 
+ We are interested in testing for changes in the return distribution of our sample data around the onset of the COVID-19 pandemic.

In [1]:
# Load the dotenv IPython extension and run it; presumably this reads a .env
# file into the process environment (FEATURES_DATA is read via os.getenv in a
# later cell) — confirm against the python-dotenv docs.
%load_ext dotenv
%dotenv

import sys
# Add the shared source folder to the import path. The path is relative, so it
# is presumably resolved against the working directory the kernel runs in.
sys.path.append("../../05_src")

# `logger` is presumably a project-local module found via the path added above.
from logger import get_logger
_logs = get_logger(__name__)

In [2]:
import dask
# The query-planning option is set before dask.dataframe is imported;
# presumably the option is read at import time, so keep this order
# (confirm against the installed dask version).
dask.config.set({'dataframe.query-planning': True})
import dask.dataframe as dd
import pandas as pd
import numpy as np
import os
from glob import glob

In [3]:
# Directory holding the feature parquet files, taken from the environment.
ft_dir = os.getenv("FEATURES_DATA")
if not ft_dir:
    # Without this check a missing variable surfaces as an opaque
    # "NoneType + str" TypeError when the glob pattern is built.
    raise RuntimeError(
        "Environment variable FEATURES_DATA is not set; "
        "it must point to the directory containing the feature parquet files."
    )

# Sort the matches so the files are read in the same order on every run
# (glob returns them in arbitrary, filesystem-dependent order).
ft_glob = sorted(glob(os.path.join(ft_dir, '*.parquet')))
if not ft_glob:
    raise FileNotFoundError(f"No .parquet files found in {ft_dir!r}.")

# Read every file with dask, materialise into a single pandas DataFrame, and
# move the index into a regular column.
df = dd.read_parquet(ft_glob).compute().reset_index()

## Data Preparation

+ First, prepare five datasets, each with returns from the start of March of a given year through the end of February of the following year.
+ For each data set, we can compute some descriptive statistics.
+ We observe that there may be some distribution changes.

In [4]:
def _march_to_march(frame, start_year):
    """Return the rows of `frame` whose 'Date' is on or after 1 March of
    `start_year` and strictly before 1 March of the following year."""
    window_start = f"{start_year}-03-01"
    window_end = f"{start_year + 1}-03-01"
    in_window = (frame['Date'] >= window_start) & (frame['Date'] < window_end)
    return frame[in_window]


# One frame per March-to-March window, named after the year the window starts.
df_2018 = _march_to_march(df, 2018)
df_2019 = _march_to_march(df, 2019)
df_2020 = _march_to_march(df, 2020)
df_2021 = _march_to_march(df, 2021)
df_2022 = _march_to_march(df, 2022)

In [5]:
# Summary statistics of returns for the window starting 1 March 2018.
df_2018.loc[:, 'returns'].describe()

count    121490.000000
mean          0.008862
std           0.340117
min          -0.973106
25%          -0.007950
50%           0.000912
75%           0.009282
max          61.683842
Name: returns, dtype: float64

In [6]:
# Summary statistics of returns for the window starting 1 March 2019.
df_2019.loc[:, 'returns'].describe()

count    123327.000000
mean          0.008416
std           0.281940
min          -0.892941
25%          -0.007516
50%           0.001028
75%           0.008948
max          40.907243
Name: returns, dtype: float64

In [7]:
# Summary statistics of returns for the window starting 1 March 2020.
df_2020.loc[:, 'returns'].describe()

count    123499.000000
mean          0.010048
std           0.486798
min          -0.794548
25%          -0.012919
50%           0.001297
75%           0.016129
max         136.020301
Name: returns, dtype: float64

In [8]:
# Summary statistics of returns for the window starting 1 March 2021.
df_2021.loc[:, 'returns'].describe()

count    124735.000000
mean          0.011876
std           0.435168
min          -0.839560
25%          -0.008815
50%           0.000739
75%           0.010304
max          85.929291
Name: returns, dtype: float64

In [9]:
# Summary statistics of returns for the window starting 1 March 2022.
df_2022.loc[:, 'returns'].describe()

count    124021.000000
mean          0.009247
std           0.298627
min          -0.783247
25%          -0.012258
50%           0.000171
75%           0.012640
max          52.553910
Name: returns, dtype: float64

# Kolmogorov-Smirnov Test

+ The KS test can be accessed via the scipy library: `scipy.stats.kstest`
+ This function can be used to perform two-sample tests: passing two arrays of observations compares their empirical distributions.
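+ The null hypothesis is that the two samples are drawn from the same distribution.
+ With samples this large (over 120,000 observations each), even small differences produce very small p-values, so also compare the size of the KS statistic across tests.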

In [10]:
from scipy.stats import kstest

# Two-sample KS test: window starting March 2018 vs. window starting March 2019.
# Missing returns are dropped from each sample before testing.
returns_2018_obs = df_2018['returns'].dropna()
returns_2019_obs = df_2019['returns'].dropna()
kstest(returns_2018_obs, returns_2019_obs)

KstestResult(statistic=0.012290504975285277, pvalue=1.850509583438257e-08, statistic_location=0.017902198410837622, statistic_sign=-1)

In [11]:
# Two-sample KS test: window starting March 2019 vs. window starting March 2020.
# Missing returns are dropped from each sample before testing.
returns_2019_obs = df_2019['returns'].dropna()
returns_2020_obs = df_2020['returns'].dropna()
kstest(returns_2019_obs, returns_2020_obs)

KstestResult(statistic=0.13591212536833863, pvalue=0.0, statistic_location=0.01746190224532107, statistic_sign=1)

In [12]:
# Two-sample KS test: window starting March 2020 vs. window starting March 2021.
# Missing returns are dropped from each sample before testing.
returns_2020_obs = df_2020['returns'].dropna()
returns_2021_obs = df_2021['returns'].dropna()
kstest(returns_2020_obs, returns_2021_obs)

KstestResult(statistic=0.10015613356025355, pvalue=0.0, statistic_location=0.017609724541885585, statistic_sign=-1)

In [13]:
# Two-sample KS test: window starting March 2021 vs. window starting March 2022.
# Missing returns are dropped from each sample before testing.
returns_2021_obs = df_2021['returns'].dropna()
returns_2022_obs = df_2022['returns'].dropna()
kstest(returns_2021_obs, returns_2022_obs)

KstestResult(statistic=0.06202337702997382, pvalue=2.053099238782615e-208, statistic_location=-0.011759943380979188, statistic_sign=-1)