# Time series demo 3: AR(p) models on real tick data

**Guest lecture**

Columbia IEOR 4729 : _Model Based Trading: Theory and Practice_

Q McCallum (http://qethanm.cc)

In [None]:
import numpy as np
import statsmodels.api as sm
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
## %matplotlib inline

In [None]:
## Seed NumPy's global random number generator so that any random draws
## made later in the session are reproducible from run to run.
np.random.seed( 4729 )

In [None]:
## Path to a local copy of the pickled, bz2-compressed trade data, relative
## to the directory the notebook is run from.
## (Presumably GOOG trades for September 2019, going by the file name.)
data_file = "data/GOOG-trades-20190901-20190930-382d94c05e6b384c.pkl.bz2"

In [None]:
## Load the pickled data set; pandas infers the bz2 compression from the
## file extension.
## NOTE: unpickling can execute arbitrary code -- only load pickle files
## that come from a source you trust.
data_raw = pd.read_pickle( data_file )

In [None]:
## Which fields did we get?
data_raw.columns

In [None]:
## How many records are we working with?
len( data_raw )

In [None]:
## That's a lot of data for this example, so let's work with a much smaller
## piece of it: the 1000:2000 slice of the PRICE column (at most 1,000
## values, skipping the first 1,000 -- not the very start of the data).

## For the sake of example, we are making the (very big!) assumption that the
## data will fit an autoregressive model...
y_ar_tick = data_raw[ "PRICE" ][ 1000:2000 ]

In [None]:
## Peek at the first ten prices in the slice.
y_ar_tick[:10]

In [None]:
## First look at the raw price slice, drawn as a wide line chart.
tick_series = pd.Series( y_ar_tick )
_ = tick_series.plot( figsize = ( 20 , 6 ) , title = "AR(?) data" )

## The path wanders the way a random walk does ...

In [None]:
## Standard diagnostics: draw the autocorrelation (ACF) plot, then the
## partial autocorrelation (PACF) plot, for the price slice.
for correlogram in ( sm.graphics.tsa.plot_acf , sm.graphics.tsa.plot_pacf ) :
    _ = correlogram( y_ar_tick )


So ... these ACF and PACF plots are indicative of a random walk.  Notice the slow decay of the ACF/correlogram, compared to what you see with white noise.

In [None]:
## Still, let's try to fit this series to a model.
## Our trusty for() loop makes a return: fit one model per candidate order
## and keep each fit's scores so the candidates can be compared afterwards.

## Candidate ( p , q ) orders. q is always 0 here, so each one is a pure AR(p).
ar_p_to_try = [
    (5,0) ,
    (4,0) ,
    (3,0) ,
    (2,0) ,
    (1,0) ,
]

## One dict per fitted candidate: its order, coefficients, AIC and BIC.
param_search_results = []

print( "(Remember: lowest AIC wins)" )

for ar_p in ar_p_to_try :
    print( "trying parameters: {}".format( ar_p ) )

    ## trend="nc" fits without a constant term; disp=0 keeps the optimizer quiet.
    ## NOTE(review): sm.tsa.ARMA was deprecated in statsmodels 0.12 and removed
    ## in 0.13 -- on newer versions this needs porting to sm.tsa.ARIMA.
    model_testing = sm.tsa.ARMA( y_ar_tick , ar_p ).fit( trend="nc" , disp=0 )

    param_search_results.append(
        {
            "order"  : ar_p ,
            "params" : model_testing.params ,
            "aic"    : model_testing.aic ,
            "bic"    : model_testing.bic ,
        }
    )

    print( "model params: {}".format( model_testing.params ) )
    print( "AIC:     {}".format( model_testing.aic ) )
    print( "BIC:     {}".format( model_testing.bic ) )
    print()

## Report the winner instead of leaving the reader to eyeball the output above.
best_result = min( param_search_results , key = lambda result : result[ "aic" ] )
print( "lowest AIC: {} (AIC: {})".format( best_result[ "order" ] , best_result[ "aic" ] ) )

In [None]:
## Fit the largest candidate, AR(5), on its own so it can be inspected in
## detail: build the model first, then fit it without a constant term.
testing_params = ( 5 , 0 )
ar_tick_model = sm.tsa.ARMA( y_ar_tick , testing_params )
fit_ar_tick = ar_tick_model.fit( disp=0 , trend="nc" )

In [None]:
## Show the summary report for the model fitted to the raw prices.
fit_ar_tick.summary()

In [None]:
## Overlay the model's fitted values on the observed prices to see how
## closely the fit tracks the data.
tick_vs_model = pd.DataFrame(
    { "y_ar" : y_ar_tick , "model" : fit_ar_tick.fittedvalues }
)
_ = tick_vs_model.plot(
    figsize = ( 20 , 6 ) ,
    title = "AR( ? ) series: reality/tick (y_ar) vs prediction (model)"
)

In [None]:
## Residual diagnostics: a well-specified model should leave no structure
## behind, so draw the ACF and then the PACF of the residuals.
for correlogram in ( sm.graphics.tsa.plot_acf , sm.graphics.tsa.plot_pacf ) :
    _ = correlogram( fit_ar_tick.resid )

In [None]:
## ##############################################################

## Let's try this again ...

In [None]:
## Second attempt: treat the prices as an integrated series and model the
## tick-to-tick changes (the first difference) instead of the levels.
y_ar_tick_diff = y_ar_tick.diff( periods = 1 )

In [None]:
## How does this diff look?
## Let's inspect the first twenty values (the first one has no predecessor
## to subtract, so diff() leaves it as NaN).
y_ar_tick_diff[:20]

In [None]:
## We drop the first term, since the first item in the diff()
## result is NaN ... and that _really_ throws off the rest
## of the code.
##
## The series is rebuilt from y_ar_tick rather than sliced in place, so that
## re-running this cell cannot trim off additional values.

y_ar_tick_diff = y_ar_tick.diff()[1:]

In [None]:
## Inspect the first twenty values of the differenced series again.
y_ar_tick_diff[:20]

In [None]:
## Well, how does the data look now?
## (Titled differently from the raw-price plot so the two figures can be
## told apart.)
_ = pd.Series( y_ar_tick_diff ).plot(
    title = "AR(?) data (first difference)" ,
    figsize = ( 20 , 6 )
)

## If the price series really was a random walk, its differences should no
## longer wander: expect noise scattered around zero instead.

In [None]:
## Same diagnostics as before, now on the differenced series: the ACF plot
## followed by the PACF plot.
for correlogram in ( sm.graphics.tsa.plot_acf , sm.graphics.tsa.plot_pacf ) :
    _ = correlogram( y_ar_tick_diff )


In [None]:
## Now, let's try to fit this again, this time on the differenced series:
## fit one model per candidate order and keep each fit's scores so the
## candidates can be compared afterwards.

## Candidate ( p , q ) orders. q is always 0 here, so each one is a pure AR(p).
ar_p_to_try = [
    (5,0) ,
    (4,0) ,
    (3,0) ,
    (2,0) ,
    (1,0) ,
]

## One dict per fitted candidate: its order, coefficients, AIC and BIC.
param_search_results = []

print( "(Remember: lowest AIC wins)" )

for ar_p in ar_p_to_try :
    print( "trying parameters: {}".format( ar_p ) )

    ## trend="nc" fits without a constant term; disp=0 keeps the optimizer quiet.
    ## NOTE(review): sm.tsa.ARMA was deprecated in statsmodels 0.12 and removed
    ## in 0.13 -- on newer versions this needs porting to sm.tsa.ARIMA.
    model_testing = sm.tsa.ARMA( y_ar_tick_diff , ar_p ).fit( trend="nc" , disp=0 )

    param_search_results.append(
        {
            "order"  : ar_p ,
            "params" : model_testing.params ,
            "aic"    : model_testing.aic ,
            "bic"    : model_testing.bic ,
        }
    )

    print( "model params: {}".format( model_testing.params ) )
    print( "AIC:     {}".format( model_testing.aic ) )
    print( "BIC:     {}".format( model_testing.bic ) )
    print()

## Report the winner instead of leaving the reader to eyeball the output above.
best_result = min( param_search_results , key = lambda result : result[ "aic" ] )
print( "lowest AIC: {} (AIC: {})".format( best_result[ "order" ] , best_result[ "aic" ] ) )

In [None]:
## Fit an AR(3) to the differenced series on its own so it can be inspected
## in detail: build the model first, then fit it without a constant term.
testing_params = ( 3 , 0 )
ar_tick_diff_model = sm.tsa.ARMA( y_ar_tick_diff , testing_params )
fit_ar_tick_diff = ar_tick_diff_model.fit( disp=0 , trend="nc" )

In [None]:
## Show the summary report for the model fitted to the _differenced_ series
## (fit_ar_tick_diff), not the earlier raw-price fit (fit_ar_tick).
fit_ar_tick_diff.summary()

In [None]:
## Overlay the model's fitted values on the observed price changes to see
## how closely the fit tracks the differenced data.
diff_vs_model = pd.DataFrame(
    { "y_ar" : y_ar_tick_diff , "model" : fit_ar_tick_diff.fittedvalues }
)
_ = diff_vs_model.plot(
    figsize = ( 20 , 6 ) ,
    title = "AR( ? ) series: reality/tick (y_ar) vs prediction (model)"
)

In [None]:
## Residual diagnostics for the differenced-series model: draw the ACF and
## then the PACF of its residuals to check for leftover structure.
for correlogram in ( sm.graphics.tsa.plot_acf , sm.graphics.tsa.plot_pacf ) :
    _ = correlogram( fit_ar_tick_diff.resid )