In [1]:
!pip install -q pandas requests numpy scipy matplotlib dask scikit-learn

In [3]:
import pandas as pd
import requests
import json
import numpy as np
# import matplotlib.pyplot as plt
# import seaborn as sns
from scipy.stats import norm
from datetime import datetime
import os
from decimal import Decimal, getcontext
from multiprocessing import Pool

import dask.dataframe as dd
from dask.multiprocessing import get
from dask import delayed

from sklearn.linear_model import LinearRegression
from datetime import datetime
# from sklearn.decomposition import PCA
# from sklearn.preprocessing import StandardScaler

In [4]:
import logging

# Root logger: INFO and above, each record prefixed with timestamp and level.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

In [5]:
class api_keys:
    """
    Pool of Alpha Vantage API keys used for one notebook session.

    Keys are taken from the ``ALPHAVANTAGE_API_KEYS`` environment variable
    (comma-separated, surrounding whitespace ignored) when it is set and
    contains at least one key. Otherwise the keys embedded below are used,
    so existing callers keep working without any configuration.

    Attributes:
        session: construction timestamp formatted as ``YYYY-MM-DD HH:MM:SS``
            (also printed when the object is created).
        list: tuple of API key strings, in the order they should be tried.
    """
    # Name of the environment variable that overrides the embedded keys.
    ENV_VAR = "ALPHAVANTAGE_API_KEYS"

    def __init__(self):
        self.session = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print(self.session)
        env_keys = tuple(
            key.strip()
            for key in os.environ.get(self.ENV_VAR, "").split(",")
            if key.strip()
        )
        # SECURITY: credentials committed in a notebook are readable by anyone
        # with access to the file or its history. Prefer the environment
        # variable; the embedded keys remain only as a backward-compatible
        # fallback and should be rotated and removed.
        self.list = env_keys or (
           # "LULKG4NIIZEJI6D2",
            "YPOLRZMHBE5X9RKF",
            "0C4JF7CVP6E14A6N",
            "2YOP96J6F1HGCWPA", # d2
            "3ZFGTFL238KPNCIY", # d3
            "BXPI10OBKLNR0QQ8", # d4
            "K8Q66F1AA7SDI317", # d5
            "MC1DGRM5VJ6QQ4U3", # mp1
            "KTPJ50N8JC8RKEGK", # mp2
            "BC3IFP07ZZFZICJB", # mp3
            "6Y3LB06GFNIU0OB1", # mp4
            "60B1KPJJNGHZIDTC", # bits 1
            "F0NZ9L4D8K4V3QAV", # bits 2
            "8TV5GX40IW442BWY", # bits 3
            "JDGY7JGQ2CGEZ6ER", # d6
            "JT9NDBWC8VFGML3R", # d7
            "SOYVTYHXUFZNH2A1", # d8
            "G5MW2DYRW52UQKEO", # d9
            "QBYT5P0FZ79JJBHT", # d10
            "BY7HI8VLA18UUNVY", #d11
            "WUD5HQDYD0KDK2MH" #d12
        )

    def iterator(self):
        """Yield the keys of ``self.list`` one at a time, in order."""
        yield from self.list

In [6]:
class getData:
    """
    Use alpha vantage api to get SPY options data

    One ``HISTORICAL_OPTIONS`` request is issued per calendar day of the
    requested range. Keys come from ``api_keys().list`` and are consumed in
    order: whenever a response carries no ``'message'`` field the current key
    is treated as exhausted and the next one is tried.
    """
    API_URL = 'https://www.alphavantage.co/query'
    # Seconds to wait for the server before a request counts as failed.
    REQUEST_TIMEOUT = 30

    def __init__(self, symbol='SPY'):
        """
        Args:
            symbol: ticker whose option chain is downloaded (default 'SPY').
        """
        self.api_keys = api_keys().list
        self.key_iterator = iter(self.api_keys)
        self.symbol = symbol

    def _get_next_key(self):
        """Return the next unused API key, or None once all keys are used."""
        try:
            return next(self.key_iterator)
        except StopIteration:
            print("API keys exhausted.")
            return None

    def _request_day(self, query_date_str, api_key):
        """
        Request one day of option data and return the decoded JSON payload.

        Returns None when the request fails at the transport level or the
        body is not valid JSON, so one bad response cannot abort the download.
        """
        try:
            response = requests.get(
                self.API_URL,
                params={
                    'function': 'HISTORICAL_OPTIONS',
                    'symbol': self.symbol,
                    'date': query_date_str,
                    'apikey': api_key,
                },
                timeout=self.REQUEST_TIMEOUT,
            )
            return response.json()
        except (requests.RequestException, ValueError) as exc:
            # Only the exception type is printed: the full message of a
            # requests error can contain the URL, which includes the API key.
            print(f"Request for {query_date_str} failed: {type(exc).__name__}")
            return None

    def _fetch_data(self, start_date, end_date):
        """
        Download the option chain for every calendar day in the range.

        Args:
            start_date: first day of the range (anything ``pd.date_range``
                accepts, e.g. 'YYYY-MM-DD').
            end_date: last day of the range, inclusive.

        Returns:
            list with one entry per successfully fetched day; each entry is
            the ``'data'`` field of that day's response. Days for which the
            API reports a non-success message are skipped. The download stops
            early, returning what was collected, once every key is used up.
        """
        data = []
        current_key = self._get_next_key()
        if current_key is None:
            return data  # no key available at all

        for query_date in pd.date_range(start=start_date, end=end_date):
            query_date_str = query_date.strftime('%Y-%m-%d')
            # At most one attempt per configured key for each date.
            retries = len(self.api_keys)
            while retries > 0:
                retries -= 1
                d = self._request_day(query_date_str, current_key)
                if d is None:
                    # Transport/parse failure: try again with the same key.
                    continue
                if isinstance(d, dict) and 'message' in d:
                    if d['message'] == 'success':
                        data.append(d['data'])
                        print(f"Fetched data for {query_date_str}")
                    else:
                        print("No data found")
                        print(d)
                    break  # this date is settled, move to the next one
                # No 'message' field: the key is treated as used up.
                print(f"API key {current_key} exhausted: at {query_date_str}")
                current_key = self._get_next_key()
                if current_key is None:
                    print(f"API keys exhausted at {query_date_str}")
                    # Without a key every further request would fail, so stop
                    # instead of sending one doomed request per remaining day.
                    return data
                print(f"Using key: {current_key}")
        return data

    def run(self, start_date, end_date, savefile):
        """
        Fetch the range and write the result as JSON.

        Args:
            start_date: first day of the range.
            end_date: last day of the range, inclusive.
            savefile: output file name, resolved against the current working
                directory.
        """
        data = self._fetch_data(start_date, end_date)
        savefile = os.path.join(os.getcwd(), savefile)
        with open(savefile, 'w') as f:
            json.dump(data, f)

class preprocessData:
    """
    create a dataframe from the json file with required features

    The JSON file is expected to hold a list of lists of option records (the
    layout written by ``getData.run``); each record must provide the keys
    'expiration', 'date', 'delta' and 'implied_volatility'.
    """
    # Column names of the per-record table built in fit().
    _STAT_COLUMNS = ['date', 'expiry_date', 'tte', 'tau', 'delta', 'IV', 'sigma', 'N_inv', 'm']
    # Column names of the frame returned by fit().
    _OUTPUT_COLUMNS = ['date', 'm', 'tau', 'delta', 'IV']

    def __init__(self, json_path):
        """
        Args:
            json_path: path of the JSON file to read in fit().
        """
        self.json_path = json_path
        # NOTE: this changes the precision of the current decimal context
        # (not just this object), so later Decimal arithmetic in the same
        # thread is rounded to 8 significant digits as well.
        getcontext().prec = 8

    def _get_stats(self, obj):
        """
        Derive the per-option features from one raw record.

        Args:
            obj: mapping with 'expiration' and 'date' as 'YYYY-MM-DD' strings
                and 'delta' and 'implied_volatility' convertible to float.

        Returns:
            list ``[date, expiry_date, tte, tau, delta, IV, sigma, N_inv, m]``
            where tte is the number of days to expiry, tau = tte / 365
            (Decimal), delta is the absolute delta, IV is the implied
            volatility (np.float64), sigma is the same value as a Decimal,
            N_inv is ``norm.ppf(delta * exp(q * tau))`` and
            ``m = 0.5 * sigma**2 * tau + N_inv * sigma * sqrt(tau)``.
        """
        exp_date = datetime.strptime(obj['expiration'], '%Y-%m-%d')
        curr_date = datetime.strptime(obj['date'], '%Y-%m-%d')
        tte = (exp_date - curr_date).days
        tau = Decimal(tte) / Decimal(365)
        delta = abs(float(obj['delta']))
        # Exactly 0 or 1 would send norm.ppf to -inf/+inf; nudge inside (0, 1).
        if delta == 1:
            delta = 0.999
        elif delta == 0:
            delta = 0.001
        IV = np.float64(obj['implied_volatility'])
        # presumably a constant continuous dividend yield — TODO confirm
        q = Decimal('0.01')
        # NOTE(review): for deltas close to 1 the product delta * exp(q * tau)
        # can exceed 1, for which norm.ppf returns nan — confirm whether such
        # records are meant to end up with a NaN 'm'.
        N_inv = norm.ppf(delta * float(np.exp(float(q) * float(tau))))
        sigma = Decimal(IV)
        m = (Decimal('0.5') * sigma**2 * tau) + (Decimal(N_inv) * sigma * tau.sqrt())
        current_date = curr_date.strftime('%Y-%m-%d')
        expiry_date = exp_date.strftime('%Y-%m-%d')
        return [current_date, expiry_date, tte, tau, delta, IV, sigma, N_inv, m]

    @delayed
    def delayed_stats(self, obj):
        """Lazy (dask ``delayed``) counterpart of ``_get_stats``; unused by fit()."""
        return self._get_stats(obj)

    def fit(self):
        """
        Load the JSON file and build the mean-IV table.

        Returns:
            DataFrame with columns ['date', 'm', 'tau', 'delta', 'IV']: the
            mean IV per (date, m, tau, delta) group, keeping only rows with
            IV < 1. An input without any option record yields an empty frame
            with these columns.
        """
        with open(self.json_path, 'r') as f:
            data = json.load(f)

        # results = [self.delayed_stats(d) for d in data]
        # results = dd.from_delayed(results)
        # df = results.compute(scheduler='processes')

        # df = dd.from_pandas(df, npartitions=4)
        # merge all sublists in data into one list
        records = [self._get_stats(item) for sublist in data for item in sublist]
        if not records:
            # Assigning nine column names to a column-less frame raises, so an
            # empty download is answered with an empty, correctly-shaped frame.
            return pd.DataFrame(columns=self._OUTPUT_COLUMNS)

        df = pd.DataFrame(records, columns=self._STAT_COLUMNS)
        df = df.groupby(['date', 'm', 'tau', 'delta'])['IV'].mean().reset_index()
        df = df[df.IV < 1]
        return df

class fitSurface:
    """
    columns = ['date','m','tau','delta','IV']
    fit a surface to the IV data with a discrete grid of m and tau
    following Dumas et al. (1998)

    The surface is a linear regression of IV on
    (m, tau, m**2, tau**2, m*tau), evaluated on the fixed grid
    ``grid_tau`` x ``grid_m``.
    """
    # Regressors, in the order used for both fitting and prediction.
    _FEATURES = ['m', 'tau', 'm_squared', 'tau_squared', 'm_tau']

    def __init__(self, df):
        """
        Args:
            df: DataFrame with at least the columns 'date', 'm', 'tau', 'IV'.

        Raises:
            ValueError: if ``df`` has no 'date' column.
        """
        self.df = df
        # Maturities in years (given in days, divided by 365).
        self.grid_tau = [x/365 for x in [10,30,60,91,122,152,182,273,365,547,730]]
        # Natural log of each listed ratio.
        self.grid_m = [np.log(m) for m in [0.6, 0.8, 0.9, 0.95, 0.975, 1, 1.025, 1.05, 1.1, 1.2, 1.3, 1.5, 1.75, 2]]
        try:
            self.dates = self.df['date'].unique()
        except (KeyError, TypeError, AttributeError, IndexError) as exc:
            raise ValueError('Input dataframe must have a "date" column') from exc

    def _fit_and_get_model_params(self):
        """
        Fit the quadratic surface on all rows of ``self.df``.

        The derived regressors are built on a copy, so the caller's
        DataFrame is left untouched.

        Returns:
            the fitted ``LinearRegression`` model.
        """
        features = self.df.assign(
            m_squared=self.df['m']**2,
            tau_squared=self.df['tau']**2,
            m_tau=self.df['m'] * self.df['tau'],
        )
        X = features[self._FEATURES]
        y = features['IV']
        model = LinearRegression()
        model.fit(X, y)
        return model

    def _predict(self, tau, m, model):
        """
        Evaluate the fitted surface at one (tau, m) point.

        Returns:
            the predicted IV, floored at 0.01.
        """
        input_data = pd.DataFrame({
            'm': [m],
            'tau': [tau],
            'm_squared': [m**2],
            'tau_squared': [tau**2],
            'm_tau': [m * tau],
        })
        iv_ = max(0.01, model.predict(input_data)[0])
        return iv_

    def fit(self):
        """
        Fit the surface and tabulate it on the grid for every date.

        Returns:
            DataFrame with columns ['date', 'tau', 'm', 'IV'], one row per
            (date, tau, m) combination, ordered by date, then tau, then m,
            with a unique 0..n-1 index.
        """
        # NOTE(review): one regression is fitted on the observations of all
        # dates pooled together, so every date receives the same grid values —
        # confirm whether a separate fit per date was intended.
        model = self._fit_and_get_model_params()

        # The model does not depend on the date: evaluate the grid once.
        grid = [
            (tau, m, self._predict(tau, m, model))
            for tau in self.grid_tau
            for m in self.grid_m
        ]
        # Build all rows first and create the frame once, instead of growing
        # it with pd.concat inside the loops.
        rows = [
            (date, tau, m, iv)
            for date in self.dates
            for (tau, m, iv) in grid
        ]
        return pd.DataFrame(rows, columns=['date', 'tau', 'm', 'IV'])

In [None]:
# Optional download step, currently disabled: getData.run would fetch the
# option chain for each day of the range and save it as JSON in the working
# directory. Note that the file name here ('..._20.json') differs from the one
# read below ('..._23.json'), and that the name `get` would shadow the `get`
# imported from dask.multiprocessing if these lines were re-enabled.
# get = getData()
# get.run('2016-01-01', '2016-12-31', 'spy_options_data_20.json')

# Build the mean-IV table (columns date, m, tau, delta, IV) from the saved JSON.
preprocess = preprocessData('./spy_options_data_23.json')
df = preprocess.fit()
print(df.head())

# Fit the IV surface and tabulate it on the (tau, m) grid for every date.
fit = fitSurface(df)
predicted_iv = fit.fit()
print(predicted_iv)

# Persist the gridded surface; the DataFrame index is written as the first column.
predicted_iv.to_csv('./predicted_iv_23q1.csv')

2025-05-06 15:18:59
No data found
{'endpoint': 'Historical Options', 'message': 'No data for symbol SPY on date 2016-01-01. Please specify a valid combination of symbol and trading day.', 'data': []}
No data found
{'endpoint': 'Historical Options', 'message': 'No data for symbol SPY on date 2016-01-02. Please specify a valid combination of symbol and trading day.', 'data': []}
No data found
{'endpoint': 'Historical Options', 'message': 'No data for symbol SPY on date 2016-01-03. Please specify a valid combination of symbol and trading day.', 'data': []}


JSONDecodeError: Expecting value: line 1 column 1 (char 0)