In [1]:
import pandas as pd
from scipy.stats import chi2
import numpy as np

## Discretization of the set into 37, 74, and 148 states

In [2]:
def descritezation37(df):
    df['Val']=df['Val']*4
    df['Down']=df['Down']*4
    df['Up']=df['Up']*4
    df = df.round({'Val':0, 'Down':0, 'Up':0})
    return df

In [3]:
def descritezation74(df):
    df['Val']=df['Val']*8
    df['Down']=df['Down']*8
    df['Up']=df['Up']*8
    df = df.round({'Val':0, 'Down':0, 'Up':0})
    return df

In [4]:
def descritezation148(df):
    df['Val']=df['Val']*16
    df['Down']=df['Down']*16
    df['Up']=df['Up']*16
    df = df.round({'Val':0, 'Down':0, 'Up':0})
    return df

## Set normalization

In [5]:
def norm(df):
    temp = min(min(df['Down']),min(df['Val']))
    df['Val']=df['Val']-temp
    df['Down']=df['Down']-temp
    df['Up']=df['Up']-temp
    return df

## Representation of a chain as a matrix of sequential states

In [6]:
def view(df):
    temp = df.copy()
    temp['step1']=temp['Val'].shift(-1)
    temp['step2']=temp['Val'].shift(-2)
    return temp

## Splitting the chain into time intervals

In [7]:
def period(df,year1,month1,day1,year2,month2,day2):
    temp = df[(df['Date'] >= str(year1) + '-' + str(month1) + '-' + str(day1)) & (df['Date'] <= str(year2) + '-' + str(month2) + '-' + str(day2))]
    return temp

## Сalculating chain frequencies

In [8]:
def freq(df):
    temp = df.copy()
    value=temp.sort_values('Val').Val.drop_duplicates().reset_index()
    freq_i =temp.groupby('Val').count().reset_index()
    freq_i.drop(['Date', 'Up','step1','step2'], axis='columns', inplace=True)
    freq_i.rename(columns={'Down': 'Count_freq_i'}, inplace=True)
    freq_ij=temp.groupby(['Val','step1']).count().reset_index()
    freq_ij.drop(['Date', 'Up','step2'], axis='columns', inplace=True)
    freq_ij.rename(columns={'Down': 'Count_freq_ij'}, inplace=True)
    freq_ij2=temp.groupby(['Val','step1','step2']).count().reset_index()
    freq_ij2.drop(['Date', 'Up',], axis='columns', inplace=True)
    freq_ij2.rename(columns={'Down': 'Count_freq_ij2'}, inplace=True)
    freq_j=temp.groupby('step1').count().reset_index()
    freq_j.drop(['Date', 'Up','Val','step2'], axis='columns', inplace=True)
    freq_j.rename(columns={'Down': 'Count_freq_j'}, inplace=True)
    freq_0i0=temp.groupby('step1').count().reset_index()
    freq_0i0.drop(['Date', 'Up','Val','step2'], axis='columns', inplace=True)
    freq_0i0.rename(columns={'Down': 'Count_freq_0i0'}, inplace=True)
    freq_0ij=temp.groupby(['step1','step2']).count().reset_index()
    freq_0ij.drop(['Date', 'Up','Val'], axis='columns', inplace=True)
    freq_0ij.rename(columns={'Down': 'Count_freq_0ij'}, inplace=True)
    return freq_i,freq_ij,freq_ij2,freq_j,freq_0i0,freq_0ij

## Сalculating statistics to test the independence hypothesis

In [9]:
def independent(temp, conf):
    N=len(temp.sort_values('Val').Val.drop_duplicates())
    n=len(temp)
    freq_i,freq_ij,freq_ij2,freq_j,freq_0i0,freq_0ij = freq(temp)
    h=temp[['Val','step1']].merge(freq_i,how='left', on='Val',suffixes=['','_freq_i']).merge(freq_j,how='left', on='step1',suffixes=['','_freq_j']).merge(freq_ij,how='left', on=['Val','step1'],suffixes=['','_freq_ij']).drop_duplicates()
    h=h.fillna(0) 
    h['t']=(h.Count_freq_ij-h.Count_freq_i*h.Count_freq_j/n)**2/(h.Count_freq_i*h.Count_freq_j/n)
    t=h.t.sum()
    t_chi=chi2.ppf(conf, ((N**1-1)*(N-1)) )
    return t,t_chi

## Сalculating statistics to test chain order hypothesis

In [10]:
def order1(temp, conf):
    freq_i,freq_ij,freq_ij2,freq_j,freq_0i0,freq_0ij = freq(temp)
    h=temp[['Val','step1', 'step2']].merge(freq_ij2,how='left',on=['Val', 'step1', 'step2'],suffixes=['','freq_ij2'])
    h=h.merge(freq_0i0, how='left', on='step1', suffixes=['','_0i0'])
    h=h.merge(freq_0ij,how='left',on=['step1', 'step2'],suffixes=['','freq_0ij'])
    h=h.merge(freq_ij ,how='left',on=['Val','step1'],suffixes=['','freq_ij'])
    h=h.merge(freq_i, how='left',on=['Val'],suffixes=['','freq_i']).drop_duplicates().rename(columns={'val':'val freq_ij2'})
    N = len(temp.sort_values('Val').Val.drop_duplicates())
    h['t']=(h.Count_freq_ij-h.Count_freq_i*h.Count_freq_0ij/h.Count_freq_0i0)**2/(h.Count_freq_i*h.Count_freq_0ij/h.Count_freq_0i0)
    t = h.t.sum()
    t_chi = chi2.ppf(conf, ((N**2-N**1)*(N-1)) )
    return t,t_chi

##  Сalculating statistics to test the uniformity hypothesis

In [11]:
def uniform(temp, conf):
    f1=temp[temp.index<temp.index[0] + len(temp)//2]
    f2=temp[temp.index>=temp.index[0] + len(temp)//2]
    N = len(temp.Val.drop_duplicates())
    d = len(temp[["Val","step1"]].drop_duplicates())
    freq_i,freq_ij,freq_ij2,freq_j,freq_0i0,freq_0ij = freq(f1)
    h1 = f1[['Val','step1']].merge(freq_i,how='left',on=['Val'],suffixes=['','freq_i']).merge(freq_ij,how='left',on=['Val','step1'],suffixes=['','freq_ij']).drop_duplicates()
    freq_i,freq_ij,freq_ij2,freq_j,freq_0i0,freq_0ij = freq(f2)
    h2 = f2[['Val','step1']].merge(freq_i,how='left',on=['Val'],suffixes=['','freq_i']).merge(freq_ij,how='left',on=['Val','step1'],suffixes=['','freq_ij']).drop_duplicates()
    freq_i,freq_ij,freq_ij2,freq_j,freq_0i0,freq_0ij = freq(temp)
    h10 = h1.merge(freq_i,how='left',on=['Val'],suffixes=['','freq_i0']).merge(freq_ij,how='left',on=['Val','step1'],suffixes=['','freq_ij0']).drop_duplicates()
    h20 = h2.merge(freq_i,how='left',on=['Val'],suffixes=['','freq_i0']).merge(freq_ij,how='left',on=['Val','step1'],suffixes=['','freq_ij0']).drop_duplicates()
    h10['t'] = h10.Count_freq_ij*np.log(h10.Count_freq_ij*h10.Count_freq_ifreq_i0/(h10.Count_freq_i*h10.Count_freq_ijfreq_ij0))
    h20['t'] = h20.Count_freq_ij*np.log(h20.Count_freq_ij*h20.Count_freq_ifreq_i0/(h20.Count_freq_i*h20.Count_freq_ijfreq_ij0))
    t = h10.t.sum()+h20.t.sum()
    t_chi = chi2.ppf(conf, (d-N)*(2-1))
    return t,t_chi

### Hypothesis testing over a 2-year interval with 37 discrete states and a confidence level of 0.99

In [12]:
data = pd.read_excel('Data.xlsx')
data['Date'] = pd.to_datetime(data['Date'],format="%d.%m.%Y")

In [13]:
temp = descritezation37(data)
temp = norm(temp)
temp = period(temp,2012,1,15,2014,1,15)
temp = view(temp)

In [14]:
print("(t, t_chi)")
print(independent(temp,0.99))
print(order1(temp,0.99))
print(uniform(temp,0.99))

(t, t_chi)
(3300.433344864523, 559.3059462065796)
(3977.451659336085, 11482.055760113757)
(42.36023828774787, 85.95017624510335)
