In [2]:
from statsmodels.sandbox.stats.runs import runstest_1samp
import scipy.stats as st
import numpy as np
import pandas as pd

# On Random Data

In [2]:
# generate random array of True and False
x = np.random.normal(size=100) > 0

(-0.8730565722994805, 0.3826322430748368)

In [3]:
true_count = sum(x)
false_count = len(x) - true_count
# expected mean runs and variance + stdev for independent trials
mean = 2*true_count*false_count/len(x) + 1
variance = (mean-1)*(mean-2)/(len(x)-1)
stdev = variance**.5

true_count, false_count, mean, variance

(44, 56, 50.28, 24.032711111111112)

In [4]:
# manually count runs
run_count = 1
for i, val in enumerate(x[1:]):
    prev_val = x[i]
    if val is not prev_val:
        run_count += 1
run_count        

46

In [5]:
# calculate z statistic and p value
z = (run_count - mean)/variance**.5
p_value = 2*(1 - st.norm.cdf(abs(z)))
# z and p_value should match output of runstest_1samp
z, p_value

(-0.8730565722994805, 0.3826322430748368)

In [6]:
# verify against statsmodels function
runstest_1samp(x)

(-0.8730565722994805, 0.3826322430748368)

# On SPY

In [5]:
df = pd.read_csv('SPY.csv')
df['pct_change'] = df['Adj Close'].pct_change()
df['positive_return'] = df['pct_change'] > 0
df[['Adj Close', 'pct_change', 'positive_return']].head()

Unnamed: 0,Adj Close,pct_change,positive_return
0,26.299288,,False
1,26.486324,0.007112,True
2,26.542448,0.002119,True
3,26.822998,0.01057,True
4,26.93524,0.004185,True


In [11]:
len(df[df['pct_change']<0])

3103

In [21]:
total_days = len(df['positive_return'])
up_days = sum(df['positive_return'])
down_days = total_days - up_days
up_pct, down_pct = round(100*up_days/total_days, 2), round(100*down_days/total_days, 2)

print(f'up days: {up_days} ({up_pct}%)\ndown/even days: {down_days} ({down_pct}%)')

up days: 3690 (53.84%)
down/even days: 3164 (46.16%)


In [24]:
run_count = 1
for i, val in enumerate(df['positive_return'].values[1:]):
    prev_val = df['positive_return'].values[i]
    if val is not prev_val:
        run_count += 1
expected_run_count = 2*up_days*down_days/len(df['positive_return']) + 1

print(f'expected run count {expected_run_count}\nactual run count: {run_count}')

expected run count 3407.8164575430405
actual run count 3502


In [25]:
runstest_1samp(df['positive_return'])

(2.2889188671396647, 0.022084066820886827)