## Create dummy data

In [1]:
import yfinance as yf
# Fetch the last ten years of history for the XWD.TO ticker and keep only the
# "Open" column; the next cell turns it into percentage returns.
index = yf.Ticker("XWD.TO")
index_returns = index.history(period="10y")  # NOTE: despite the name, this is the raw history frame, not returns
r = index_returns.loc[:, "Open"]

In [2]:
# Convert the "Open" series into period-over-period percentage changes,
# expressed in percent (hence the * 100). The first row is NaN after
# pct_change() and is dropped.
#
# The series is derived from `index_returns` rather than from `r` itself so
# that re-running this cell is idempotent (the previous `r = r.pct_change()`
# form would compute returns-of-returns on a second execution).
r = index_returns['Open'].pct_change().dropna() * 100
r

Date
2011-05-19    0.207463
2011-05-20    0.455487
2011-05-24   -1.154162
2011-05-25   -0.875717
2011-05-26    1.262089
                ...   
2021-05-12   -0.455189
2021-05-13   -0.504572
2021-05-14    1.236137
2021-05-17    0.516590
2021-05-18   -0.218034
Name: Open, Length: 2509, dtype: float64

## Test out arch model

In [3]:
from arch import arch_model
# Constant-mean GARCH(1,1) with normal innovations on the percentage returns.
# These keyword values are the library defaults, spelled out for the reader.
am = arch_model(r, mean="Constant", vol="GARCH", p=1, q=1, dist="normal")

In [4]:
# Estimate the model parameters. With default arguments the optimiser's
# per-iteration log is printed below the cell; the returned result object is
# kept in `res` for the summary and forecast cells that follow.
res = am.fit()

Iteration:      1,   Func. Count:      6,   Neg. LLF: 16114.861992806535
Iteration:      2,   Func. Count:     15,   Neg. LLF: 752153.6328243011
Iteration:      3,   Func. Count:     24,   Neg. LLF: 3504.0171689396484
Iteration:      4,   Func. Count:     32,   Neg. LLF: 3552.7127038993694
Iteration:      5,   Func. Count:     39,   Neg. LLF: 2887.171220917003
Iteration:      6,   Func. Count:     44,   Neg. LLF: 2887.092341376512
Iteration:      7,   Func. Count:     49,   Neg. LLF: 2887.091410701239
Iteration:      8,   Func. Count:     54,   Neg. LLF: 2887.0914039175414
Iteration:      9,   Func. Count:     58,   Neg. LLF: 2887.091403918342
Optimization terminated successfully    (Exit mode 0)
            Current function value: 2887.0914039175414
            Iterations: 9
            Function evaluations: 58
            Gradient evaluations: 9


In [5]:
# Print the plain-text summary table of the fitted GARCH model held in `res`.
print(res.summary())

                     Constant Mean - GARCH Model Results                      
Dep. Variable:                   Open   R-squared:                       0.000
Mean Model:             Constant Mean   Adj. R-squared:                  0.000
Vol Model:                      GARCH   Log-Likelihood:               -2887.09
Distribution:                  Normal   AIC:                           5782.18
Method:            Maximum Likelihood   BIC:                           5805.49
                                        No. Observations:                 2509
Date:                Wed, May 19 2021   Df Residuals:                     2508
Time:                        11:13:08   Df Model:                            1
                                 Mean Model                                 
                 coef    std err          t      P>|t|      95.0% Conf. Int.
----------------------------------------------------------------------------
mu             0.0644  1.417e-02      4.547  5.437e-06 [3.

In [6]:
# Forecast from the fitted model. With reindex=False the forecast frame only
# contains rows for the forecast origin(s) instead of being padded to the full
# sample length.
forecasts = res.forecast(reindex=False)
# 'h.1' is the one-step-ahead column. The frame is indexed by date, so take the
# first row by position with .iloc (plain `[0]` relies on pandas' deprecated
# integer-as-position fallback for non-integer indexes).
forecasts.variance['h.1'].iloc[0]

0.8799900048475655

## Create dataset for linear model

In [7]:
import tqdm
import numpy as np
# Build, for every position i from `start` up to the second-to-last return:
#   * garch_estimates[i]  - one-step-ahead variance forecast from a GARCH model
#                           fitted on the `garch_window` returns before i
#   * true_volatility[i]  - standard deviation of the `vol_window` returns
#                           ending at (and including) i
# Both lists are left-padded with None so that list position == position in `r`;
# the padding is removed later with dropna().
#
# NOTE(review): the GARCH value stored here is a *variance*, while the target is
# a standard deviation. The values are left as-is so downstream cells keep
# working unchanged - confirm whether a square root was intended.
start = 2000        # first position for which a forecast is produced
garch_window = 200  # number of past returns used to fit each GARCH model
vol_window = 100    # number of returns in the realised-volatility window

garch_estimates = [None] * start
true_volatility = [None] * start

for i in tqdm.tqdm(range(start, len(r) - 1)):
    # Get estimated GARCH variance from the window strictly before i
    r_filtered = r.iloc[i - garch_window:i]
    am = arch_model(r_filtered)
    res = am.fit(disp="off")  # silence the optimiser log inside the loop
    forecasts = res.forecast(reindex=False)
    # .iloc[0]: positional access; `[0]` on a date-indexed Series is deprecated
    garch_estimates.append(forecasts.variance['h.1'].iloc[0])

    # Get true volatility over the trailing window that ends at i (inclusive)
    r_filtered = r.iloc[i - vol_window + 1:i + 1]
    true_volatility.append(np.std(r_filtered))

100%|█████████████████████████████████████████████████████████████████████████████| 508/508 [00:23<00:00, 21.90it/s]


In [8]:
import pandas as pd
# Assemble the regression dataset: realised volatility, the GARCH forecast and
# a random 0/1 dummy regressor.
SEED = 42  # fixed so the random dummy column (and the fit below) is reproducible
rng = np.random.default_rng(SEED)

# dtype=float converts the leading None placeholders in both lists to NaN, so
# the dropna() below removes exactly the padded rows.
result = pd.DataFrame(
    {
        'true_volatility': true_volatility,
        'garch_volatility_prediction': garch_estimates,
    },
    dtype=float,
)

# Indicator that is 1 when a uniform(0, 1) draw exceeds 0.8, otherwise 0.
# Drawn for every row (before dropna) so the column length matches the frame.
result['dummy'] = (rng.uniform(0, 1, result.shape[0]) > 0.8).astype('int64')

result = result.dropna()
result

Unnamed: 0,true_volatility,garch_volatility_prediction,dummy
2000,0.787779,0.483027,0
2001,0.779430,0.484552,0
2002,0.781971,0.448278,1
2003,0.777944,0.632526,0
2004,0.777648,0.558970,0
...,...,...,...
2503,0.678074,0.414559,1
2504,0.676145,2.715654,0
2505,0.677314,1.923364,0
2506,0.687561,1.416646,1


## Fit linear model


In [9]:
import statsmodels.api as sm
# Ordinary least squares of realised volatility on the GARCH forecast and the
# dummy regressor. add_constant() supplies the intercept column ('const').
y = result['true_volatility']
X = sm.add_constant(result[['garch_volatility_prediction', 'dummy']])
mod = sm.OLS(endog=y, exog=X)
res = mod.fit()

In [10]:
# Print the plain-text regression table for the OLS fit now held in `res`.
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:        true_volatility   R-squared:                       0.213
Model:                            OLS   Adj. R-squared:                  0.209
Method:                 Least Squares   F-statistic:                     68.14
Date:                Wed, 19 May 2021   Prob (F-statistic):           6.32e-27
Time:                        11:13:32   Log-Likelihood:                -360.86
No. Observations:                 508   AIC:                             727.7
Df Residuals:                     505   BIC:                             740.4
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                                  coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
const             

## Prediction


In [11]:
# Coefficients of the fitted linear model (`res` is the OLS result here).
intercept = res.params['const']
garch_vol = res.params['garch_volatility_prediction']
dummy_var = res.params['dummy']

# Fit a fresh GARCH model on the last 200 returns - the same window length used
# when building the training set. The slice and the fit are bound to new names
# so that `r` (full return series) and `res` (OLS result) are NOT overwritten;
# previously this cell could not be re-run because `res` had been replaced by
# the GARCH result, which has no 'const' parameter.
r_recent = r.iloc[-200:]

# Get arch forecast
garch_fit = arch_model(r_recent).fit(disp="off")
# One-step-ahead variance; .iloc[0] takes the single forecast row by position
# (plain `[0]` on a date-indexed Series is deprecated in pandas).
forecast = garch_fit.forecast(reindex=False).variance['h.1'].iloc[0]

# Prediction: evaluate the linear model for both values of the dummy regressor.
for dummy, header in ((1, "For dummy equals 1"), (0, "\nFor dummy equals 0")):
    print(header)
    print("------------------")
    pred = intercept + garch_vol*forecast + dummy_var*dummy
    print(pred)

For dummy equals 1
------------------
0.9625378738654022

For dummy equals 0
------------------
0.9923481739556161
