# Extracting Features using PCA

$X_{t}(m, \tau) = \ln \bar{\sigma}_{t}(m, \tau), \ \text{where} \ \bar{\sigma}_{t}(m, \tau) \in \bar{\Sigma}_{t}$

$U_{t}(m, \tau) = \ln \bar{\sigma}_{t}(m, \tau) - \ln \bar{\sigma}_{t-1}(m, \tau) \, \text{for} \, (m, \tau) \in \mathcal{I}_0.$


**We perform PCA on** $\{U_{t}(m, \tau), (m, \tau) \in \mathcal{I}_0\}$

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('predicted_iv_new.csv')
df.head()

Unnamed: 0,date,tau,m,IV
0,2024-01-02,0.027397,-0.510826,0.337169
1,2024-01-02,0.027397,-0.223144,0.223939
2,2024-01-02,0.027397,-0.105361,0.210374
3,2024-01-02,0.027397,-0.051293,0.210528
4,2024-01-02,0.027397,-0.025318,0.212029


In [3]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import numpy as np

In [4]:
df.sort_values(by=['date', 'tau', 'm'], inplace=True)
df.reset_index(drop=True, inplace=True)
df['Xt'] = np.log(df['IV'])
df['U_mt'] = df.groupby(['m', 'tau'])['Xt'].diff()
df.replace(np.nan, 0, inplace=True)
df.head()

Unnamed: 0,date,tau,m,IV,Xt,U_mt
0,2024-01-02,0.027397,-0.510826,0.337169,-1.087172,0.0
1,2024-01-02,0.027397,-0.223144,0.223939,-1.496383,0.0
2,2024-01-02,0.027397,-0.105361,0.210374,-1.558868,0.0
3,2024-01-02,0.027397,-0.051293,0.210528,-1.558138,0.0
4,2024-01-02,0.027397,-0.025318,0.212029,-1.551032,0.0


In [5]:
dates = df['date'].unique()
dates

array(['2024-01-02', '2024-01-03', '2024-01-04', '2024-01-05',
       '2024-01-08', '2024-01-09', '2024-01-10', '2024-01-11',
       '2024-01-12', '2024-01-16', '2024-01-17', '2024-01-18',
       '2024-01-19', '2024-01-22', '2024-01-23', '2024-01-24',
       '2024-01-25', '2024-01-26', '2024-01-29', '2024-01-30',
       '2024-01-31'], dtype=object)

In [6]:
dates = df['date'].unique()
starting_date = dates[0]
pivot = df.pivot(index='date',columns=['m','tau'],values='U_mt')
K = pivot.cov().values
print(pivot.shape)
eigenvalues, eigenvectors = np.linalg.eigh(K)
sorted_indices = np.argsort(eigenvalues)[::-1]
sorted_eigenvectors = eigenvectors[:, sorted_indices]
k = 5
top_eigenvectors = sorted_eigenvectors[:, :k]
print(top_eigenvectors.shape)
features = pd.DataFrame()
for date in dates[1:]:
    xt = df[df['date'] == date].sort_values(by=['m', 'tau'])['Xt'].values
    x0 = df[df['date'] == starting_date].sort_values(by=['m', 'tau'])['Xt'].values
    u = xt - x0
    feature = np.dot(u, top_eigenvectors)
    features[date] = feature
features = features.T
features.head()

(21, 154)
(154, 5)


Unnamed: 0,0,1,2,3,4
2024-01-03,-0.028288,-0.067036,0.014254,0.000928,0.009124
2024-01-04,0.050312,0.091208,-0.048933,-0.004202,-0.051605
2024-01-05,0.115499,0.185125,-0.058014,-0.005276,-0.087394
2024-01-08,0.088601,-0.221109,0.500685,0.033823,0.195944
2024-01-09,0.126621,-0.075945,0.367717,0.032924,0.122525
