In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import numpy as np
import statsmodels.api as sm
from functools import partial as par
import functools as ft
import time
from ctypes import c_void_p, Structure, c_double, c_int, cdll, cast, POINTER
from numpy.ctypeslib import ndpointer
from sklearn.model_selection import train_test_split as tts
def F(*z):
    """Pipe a single value through a chain of functions.

    The first positional argument is the starting value; every following
    argument is a one-argument callable applied left to right, so
    ``F(v, f, g)`` evaluates ``g(f(v))``. With no callables the starting
    value is returned unchanged.
    """
    result = z[0]
    for stage in z[1:]:
        result = stage(result)
    return result
def FF(*z):
    """Map a chain of functions over an iterable and return a list.

    The first positional argument is the iterable; each following argument
    is a one-argument callable. ``FF(xs, f, g)`` yields
    ``[g(f(x)) for x in xs]``; the maps are chained lazily and only
    materialised when the final list is built.
    """
    chained = ft.reduce(lambda items, fn: map(fn, items), z)
    return list(chained)
def fyx(f, *x):
    """Bind trailing positional arguments of ``f``.

    Returns a callable that forwards its own positional arguments first and
    then appends the arguments captured here: ``fyx(f, a)(b)`` calls
    ``f(b, a)``.
    """
    def with_trailing_args(*y):
        return f(*y, *x)
    return with_trailing_args

# Load the compiled shared library "a.so" from the current working directory
# through ctypes. `c_wls_iter` calls its `wls_iter` entry point.
lib = cdll.LoadLibrary("./a.so")

In [2]:
# Load the panel from disk.
# NOTE(review): the code below assumes index level 1 identifies the ticker
# (the commented-out example slices it with "AAPL") -- confirm.
df = pd.read_hdf("sp500.h5")

# Group the volume series once by index level 1 and reuse the grouping below.
volume_by_ticker = df["volume"].groupby(level = 1)

# Exponentially weighted mean of volume (half-life 60 rows) inside each group.
# transform() always returns a result aligned with df's own index. The earlier
# groupby(...).apply(...) form prepends the group key as an extra index level
# under pandas >= 2.0, which breaks the df.loc[temp.index, ...] assignment and
# makes the level=1 grouping on `temp` group by the wrong level.
temp = volume_by_ticker.transform(lambda x: x.ewm(halflife = 60).mean())

# Shift by one row within each group so "naive" for a row is the smoothed
# volume up to the previous row of the same group.
df.loc[temp.index, "naive"] = temp.groupby(level = 1).shift(1)
# print(df.loc[pd.IndexSlice["2000-01-01":"2000-01-10", "AAPL"], :])
# -> Appendix A1
window = 5
# k{i}: volume lagged by i rows within its group, relative to the current
# row's "naive" value, minus one (k0 uses the unlagged volume).
for i in range(window + 1):
    name = f"k{i}"
    df[name] = volume_by_ticker.shift(i) / df["naive"] - 1

In [3]:
# Two-level index slices that split on the first index level at 2015-01-01 and
# keep every value of the second level. `tvSlice` selects rows up to that
# label, `testSlice` rows from that label onwards.
# NOTE(review): pandas label slices include both endpoints, so rows labelled
# exactly 2015-01-01 (if any exist) are selected by both slices -- confirm.
tvSlice = pd.IndexSlice[:"2015-01-01", :]
testSlice = pd.IndexSlice["2015-01-01":, :]

In [4]:
# Rows selected by `tvSlice`, keeping only those where every k0..k{window}
# column is present (non-NaN).
df_nona = (
    df.loc[tvSlice]
    .dropna(subset = [f"k{lag}" for lag in range(window + 1)])
)

In [5]:
# Sorted list of the distinct values of the "uspn" index level.
uspn = sorted(set(df_nona.index.get_level_values("uspn").to_list()))
print(len(uspn))

# Regressors are k1..k{window}; the response is k0; weights are sp_weight.
feature_cols = [f"k{i + 1}" for i in range(window)]
X, Y, w = [], [], []
for x in uspn:
    table = df_nona.loc[
        pd.IndexSlice[:, x],
        ["sp_weight"] + [f"k{i}" for i in range(window + 1)]
    ]
    w.append(table["sp_weight"].to_numpy())
    Y.append(table["k0"].to_numpy())
    # column_stack yields a row-major (rows, window) array directly, instead
    # of a transposed view of a (window, rows) array.
    X.append(np.column_stack([table[col].to_numpy() for col in feature_cols]))
X = np.concatenate(X)
Y = np.concatenate(Y)
w = np.concatenate(w)
tot = w.shape[0]
print(tot)
# The first `step` shuffled rows form the first batch, the remaining
# tot // 10 rows the second batch.
step = tot - tot // 10
print(step)

# Seeded generator so the shuffle (and therefore the split and every number
# derived from it) is reproducible across kernel restarts.
SPLIT_SEED = 0
rng = np.random.default_rng(SPLIT_SEED)
perm = rng.permutation(tot)
w = w[perm]
# The memory layout of a fancy-indexing result is not guaranteed; X is later
# handed to C code by raw pointer, so make the row-major layout explicit.
X = np.ascontiguousarray(X[perm])
Y = Y[perm]
X1, Y1, w1 = X[:step], Y[:step], w[:step]
X2, Y2, w2 = X[step:], Y[step:], w[step:]

484
1629303
1466373


In [6]:
def c_wls_iter(
    y_next, x_next, w_next, n2, m,
    x = np.array([]), xTwx = None, xTwy = None,
    n1 = 0, update = False
):
    """Call the C routine ``lib.wls_iter`` and unpack its result buffer.

    Parameters
    ----------
    y_next, x_next, w_next : array-like
        Data for the new batch, passed to C as double buffers.
        NOTE(review): presumably response (n2,), regressors (n2, m) and
        weights (n2,) -- confirm against the C source.
    n2 : int
        Size passed to C for the new batch; ``yhat_next`` has ``n2`` entries.
    m : int
        Size passed to C for the model; ``xTwx`` is returned as ``(m, m)``.
    x : array-like, optional
        Buffer passed as the first C argument (empty by default).
        NOTE(review): presumably the earlier (n1, m) regressors, needed only
        when ``update`` is true -- confirm.
    xTwx, xTwy : array-like, optional
        State from a previous call; zeros of shape ``(m, m)`` / ``(m, 1)``
        are used when omitted.
    n1 : int, optional
        Size passed to C for the earlier data; ``yhat`` has ``n1 + n2``
        entries when ``update`` is true.
    update : bool, optional
        Forwarded to C; also decides whether ``yhat`` is returned.

    Returns
    -------
    tuple
        ``(xTwx, xTwy, predict, yhat_next, yhat)`` sliced, in that order,
        from the flat result buffer: ``m * m`` values reshaped to ``(m, m)``,
        then ``m``, ``m`` and ``n2`` values, and finally the last
        ``n1 + n2`` values (``None`` when ``update`` is false).

    Notes
    -----
    The returned arrays are views onto the buffer whose pointer the C
    function returns. NOTE(review): ownership and lifetime of that buffer
    are decided inside ``a.so`` and are not visible here.
    """
    # ctypes only converts plain Python ints when no argtypes are declared,
    # so normalise NumPy integers / bools before the call.
    n1, n2, m = int(n1), int(n2), int(m)
    update = bool(update)

    tot_len = n1 + 2 * n2 + m * (m + 2)
    if xTwx is None:
        xTwx = np.zeros((m, m))
    if xTwy is None:
        xTwy = np.zeros((m, 1))

    def as_c_double(a):
        # Fresh float64 copy in row-major order. astype('d') would keep the
        # input's memory layout (order='K'), so an F-ordered or transposed
        # array would reach the C code with its elements in the wrong order.
        return np.array(a, dtype = np.float64, order = "C")

    # Keep the converted arrays in locals so their buffers stay alive for
    # the duration of the C call.
    x_next = as_c_double(x_next)
    w_next = as_c_double(w_next)
    y_next = as_c_double(y_next)
    x = as_c_double(x)
    xTwx = as_c_double(xTwx)
    xTwy = as_c_double(xTwy)

    # The result length depends on the arguments, so restype is set per call.
    lib.wls_iter.restype = ndpointer(dtype = c_double, shape = (tot_len,))
    results = lib.wls_iter(
        c_void_p(x.ctypes.data),
        c_void_p(xTwx.ctypes.data),
        c_void_p(xTwy.ctypes.data),
        c_void_p(x_next.ctypes.data),
        c_void_p(w_next.ctypes.data),
        c_void_p(y_next.ctypes.data),
        n1, n2, m, int(update)
    )

    # Unpack the flat buffer: [xTwx | xTwy | predict | yhat_next | ...].
    xTwx = results[:m * m].reshape(m, m)
    xTwy = results[m * m : m * (m + 1)]
    predict = results[m * (m + 1) : m * (m + 2)]
    yhat_next = results[m * (m + 2) : m * (m + 2) + n2]
    yhat = results[-n1 - n2:] if update else None
    return xTwx, xTwy, predict, yhat_next, yhat

In [10]:
# Benchmark: statsmodels WLS versus the C routine, first on the first `step`
# rows and then on all `tot` rows (the C side only receives the remaining
# rows plus the xTwx / xTwy state from the first call).
# Only the fit / C call is timed; printing and summary rendering are kept
# outside the timed spans so the two methods are compared like for like.

start = time.perf_counter()
test = sm.WLS(Y1, X1, weights = w1, missing = "drop").fit()
sm_time_first = time.perf_counter() - start
print(test.summary())

start = time.perf_counter()
results = c_wls_iter(Y1, X1, w1, step, window)
c_time_first = time.perf_counter() - start
xTwx = results[0]
xTwy = results[1]
for value in results[2]:
    print(value)

print("Time Usage")
print(f"statsmodels WLS {step}:", sm_time_first)
print(f"C code {step}:", c_time_first)
# NOTE(review): this message is printed unconditionally; the agreement is
# judged by eye from the output above, not checked in code.
print("We get the same results for 2 methods above.")

start = time.perf_counter()
test = sm.WLS(Y, X, weights = w, missing = "drop").fit()
sm_time_full = time.perf_counter() - start
print(test.summary())

start = time.perf_counter()
results = c_wls_iter(Y2, X2, w2, tot - step, window,
                        xTwx = xTwx, xTwy = xTwy, n1 = step)
c_time_next = time.perf_counter() - start
for value in results[2]:
    print(value)

print(f"statsmodels WLS {tot}:", sm_time_full)
print(f"C code next {tot - step}:", c_time_next)
# NOTE(review): printed unconditionally, see above.
print("We get the same results for 2 methods above.")

                                 WLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.161
Model:                            WLS   Adj. R-squared (uncentered):              0.161
Method:                 Least Squares   F-statistic:                          5.619e+04
Date:                Fri, 24 Feb 2023   Prob (F-statistic):                        0.00
Time:                        01:21:17   Log-Likelihood:                     -1.9748e+06
No. Observations:             1466373   AIC:                                  3.950e+06
Df Residuals:                 1466368   BIC:                                  3.950e+06
Df Model:                           5                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------