In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the daily GSPC quotes and derive the per-day log return.
prices = pd.read_csv("data/GSPC.csv").rename(columns={"Date": "date"})
# pd.to_datetime replaces astype("datetime64[D]"): pandas >= 2.0 only supports
# the s/ms/us/ns datetime units and raises on day resolution. normalize()
# pins every timestamp to midnight, i.e. the day granularity the old cast
# asked for.
prices["date"] = pd.to_datetime(prices["date"]).dt.normalize()
prices["price"] = prices["Adj Close"]
# Log return r_t = ln(P_t) - ln(P_{t-1}). The first row has no predecessor,
# so its NaN is replaced with 0.
prices["return"] = np.log(prices["price"]).diff().fillna(0)
# Keep only the columns the rest of the notebook consumes.
prices = prices[["date", "price", "return"]]
prices.head()

Unnamed: 0,date,price,return
0,2000-01-03,1455.219971,0.0
1,2000-01-04,1399.420044,-0.039099
2,2000-01-05,1402.109985,0.00192
3,2000-01-06,1403.449951,0.000955
4,2000-01-07,1441.469971,0.02673


In [3]:
# Read the vol-surface history, turn its (unnamed) index into a "date" column,
# and join the price/return columns on that date. merge defaults to an inner
# join, so only dates present in both tables survive.
# NOTE: a later cell rebinds `prices` to a NumPy array, so this cell has to run
# before that one.
df = (
    pd.read_parquet("data/spx_vol_surface_history_full_data_23_cal_arb_free.parquet")
    .reset_index()
    .rename(columns={"index": "date"})
    .merge(prices, on="date")
)
df.head()

Unnamed: 0,date,ttm_one_month_moneyness_pt_seven,ttm_one_month_moneyness_pt_eightfive,ttm_one_month_moneyness_pt_one,ttm_one_month_moneyness_pt_oneonefive,ttm_one_month_moneyness_pt_onethree,ttm_three_month_moneyness_pt_seven,ttm_three_month_moneyness_pt_eightfive,ttm_three_month_moneyness_pt_one,ttm_three_month_moneyness_pt_oneonefive,...,ttm_two_year_moneyness_pt_eightfive,ttm_two_year_moneyness_pt_one,ttm_two_year_moneyness_pt_oneonefive,ttm_two_year_moneyness_pt_onethree,r_squared,mean_error,mean_absolute_error,observation,price,return
0,2000-01-03,0.00350819,0.310908,0.204889,0.130579,2.364229e-14,0.124386,0.278589,0.220996,0.169352,...,0.271207,0.228578,0.206192,0.189288,0.99461,9e-06,0.001197,114.0,1455.219971,0.0
1,2000-01-04,0.1318434,0.32004,0.240044,0.145056,2.89569e-13,0.41371,0.300571,0.230718,0.18286,...,0.280368,0.24752,0.215297,0.19333,0.997887,-6e-06,0.001046,114.0,1399.420044,-0.039099
2,2000-01-05,3.694933e-14,0.342747,0.22967,0.144152,7.160933e-14,0.12469,0.29515,0.234337,0.188956,...,0.281653,0.247433,0.217685,0.200575,0.998167,-7e-06,0.00108,110.0,1402.109985,0.00192
3,2000-01-06,-4.915936e-32,0.155414,0.186371,0.144371,2.119899e-14,0.043123,0.25011,0.235561,0.164629,...,0.276084,0.244555,0.213681,0.200158,0.980879,-2e-06,0.002502,107.0,1403.449951,0.000955
4,2000-01-07,0.7350337,0.400577,0.205546,0.125253,1.41984e-13,0.393098,0.303779,0.220236,0.170683,...,0.268053,0.237572,0.212611,0.192597,0.999162,3e-06,0.001425,128.0,1441.469971,0.02673


In [4]:
# Discard every row holding at least one missing value, then report how many
# rows are left.
df = df.dropna(axis=0, how="any")
print(df.shape[0])

5822


In [5]:
# Every date carries a 5 x 5 grid of surface values, stored in a NumPy array
# laid out as (N, ttm, moneyness):
#   ttm       = [0.08333, 0.25, 0.5, 1, 2] years
#   moneyness = [0.7, 0.85, 1, 1.15, 1.3]
# cols_map sends each DataFrame column name to its (ttm_idx, moneyness_idx)
# position in that grid; keys are generated ttm-major, moneyness-minor.
cols_map = {
    f"ttm_{ttm_label}_moneyness_pt_{mny_label}": (ttm_idx, mny_idx)
    for ttm_idx, ttm_label in enumerate(
        ("one_month", "three_month", "six_month", "one_year", "two_year")
    )
    for mny_idx, mny_label in enumerate(
        ("seven", "eightfive", "one", "oneonefive", "onethree")
    )
}
# Zero-filled placeholder: one 5 x 5 grid per row of df.
surface_arr = np.zeros(shape=(df.shape[0], 5, 5))

In [6]:
# Copy each mapped DataFrame column into its (ttm, moneyness) slot of the
# surface array, across all rows at once.
for col, idx in cols_map.items():
    ttm_idx, mny_idx = idx
    surface_arr[:, ttm_idx, mny_idx] = df[col].to_numpy()

In [7]:
# Scalar summaries of each day's surface (axis 1 = ttm index, axis 2 =
# moneyness index, per the column mapping).
one_year_slice = surface_arr[:, 3, :]   # ttm index 3, all moneyness points
centre_column = surface_arr[:, :, 2]    # moneyness index 2, all maturities

# skew: mean of the two points flanking the centre (indices 1 and 3) minus the
# centre value, on the ttm-index-3 slice.
skews = (one_year_slice[:, 1] + one_year_slice[:, 3]) / 2 - one_year_slice[:, 2]
# slope: centre value at ttm index 4 minus centre value at ttm index 1.
slopes = centre_column[:, 4] - centre_column[:, 1]
# level: centre value at ttm index 3.
levels = centre_column[:, 3]

In [8]:
# Pull the return and price columns out of df as independent NumPy copies.
# NOTE: from here on `prices` names this array, no longer the DataFrame that
# was loaded from the CSV.
ret = df["return"].to_numpy(copy=True)
prices = df["price"].to_numpy(copy=True)

# Sanity check: all six arrays should report the same number of rows.
print(len(surface_arr), len(ret), len(prices), sep="\n")
print(skews.shape, slopes.shape, levels.shape, sep="\n")

5822
5822
5822
(5822,)
(5822,)
(5822,)


In [9]:
# Persist the surface array together with the return/price series and the
# three derived summaries in a single uncompressed .npz archive.
np.savez(
    "data/vol_surface_with_ret_cal_arb_free.npz",
    surface=surface_arr,
    ret=ret,
    price=prices,
    slopes=slopes,
    skews=skews,
    levels=levels,
)

In [10]:
# Re-open the .npz archive and list the names of the arrays stored in it.
data = np.load(file="data/vol_surface_with_ret_cal_arb_free.npz")
print(data.files)

['surface', 'ret', 'price', 'slopes', 'skews', 'levels']


In [11]:
# Assemble one feature row per date by placing the four 1-D series side by
# side along a new trailing axis: columns are (ret, skew, slope, level).
v = np.stack([ret, skews, slopes, levels], axis=-1)
print(v.shape)

(5822, 4)
