## Transform a stock option input file into a bigquery dataset
- Calculate equilibrium stock price where call and put implied volatility are equal
- Isolate strikes that have a bid and a probability of expiring in the money between 5% and 95% (about +- 1.6 standard deviations of equilibrium price)
- Calculate moneyness for all strikes filtered above
- Calculate implied volatility of all strikes filtered above
- generate json formatted output
- write daily output files to bigquery

<b>Data Source:</b> https://historicaloptiondata.com

### Destination Table Field Definitions
- partition_date - constant date used to send all records to the same partition (1970-01-01)
- quote_date - market closing date for which the data was captured
- expiry_date - date on which the options expire
- days_to_expiry - calendar days between quote_date and expiry_date not including the quote_date
- underlying_price - the price of the underlying asset
- atm_price - adjusted underlying price such that the implied volatility of calls and puts are the same
- atm_iv - the implied volatility for both calls and puts using the atm_price as the current price
- strike_prices
 - strike_price - price at which an option can be exercised
 - call_bid - bid price for the call option 
 - call_ask - ask price for the call option
 - call_volume -  number of call contracts traded
 - call_open_interest - number of open call contracts
 - call_moneyness - probability that the call option will close in the money on the expiry date (based on atm_iv)
 - call_iv - the implied volatility of the call option using the midpoint between the call bid and call ask price
 - put_bid - bid price for the put option 
 - put_ask - ask price for the put option
 - put_volume -  number of put contracts traded
 - put_open_interest - number of open put contracts
 - put_moneyness - probability that the put option will close in the money on the expiry date (based on atm_iv)
 - put_iv - the implied volatility of the put option using the midpoint between the put bid and put ask price
- sampling_key - a random number between 0 and 1. Facilitates repeatable data sampling without the need for a hash key 


## Codebase

In [39]:
#declare dependencies and constants
from google.cloud import bigquery
import pandas as pd
import pandas_market_calendars as mcal
import datetime
import math
import mibian
import scipy
import json
import random
import sys

#ticker symbols kept from the input file; each symbol gets its own ndjson file and bigquery table
STOCK_SYMBOLS = ['SPY']
#bigquery project and dataset that, together with the upper-cased symbol, form the destination table id
PROJECT_ID = 'expiry-week'
DATASET_ID = 'option_quotes'
#suffix of the intermediate ndjson filename; reassigned in the RUN PROGRAM section
JOB_ID = '' #supplied as an argument when running program

In [40]:
def get_interest_rate(quote_date):
    """
    return the fed funds rate that was in effect on the supplied quote date
    - reads the FEDFUNDS csv and keeps only rows dated on or before quote_date
    - returns the FEDFUNDS value of the most recent remaining row
    """
    rates = pd.read_csv('gs://expiry-week-data/options/FEDFUNDS.csv', parse_dates=['DATE'])

    #drop every rate published after the quote date
    on_or_before = rates['DATE'].dt.date <= quote_date
    rates = rates[on_or_before]

    #the latest remaining DATE is the rate in effect
    latest = rates['DATE'].idxmax()
    return rates.loc[latest]['FEDFUNDS']


In [41]:
def calc_call_iv(stock_price, strike_price, interest_rate, days_to_expiry, call_price):
    """
    derive the implied volatility of a call option from its price
    - builds a mibian.BS model from [stock price, strike price, interest rate, days to expiry]
    - returns the model's impliedVolatility divided by 100 (annualized, as a decimal value)
    """
    model_inputs = [stock_price, strike_price, interest_rate, days_to_expiry]
    implied_vol = mibian.BS(model_inputs, callPrice=call_price).impliedVolatility
    #presumably mibian reports the figure in percent, hence the rescale - confirm against mibian docs
    return implied_vol / 100

In [42]:
def calc_put_iv(stock_price, strike_price, interest_rate, days_to_expiry, put_price):
    """
    derive the implied volatility of a put option from its price
    - builds a mibian.BS model from [stock price, strike price, interest rate, days to expiry]
    - returns the model's impliedVolatility divided by 100 (annualized, as a decimal value)
    """
    model_inputs = [stock_price, strike_price, interest_rate, days_to_expiry]
    implied_vol = mibian.BS(model_inputs, putPrice=put_price).impliedVolatility
    #presumably mibian reports the figure in percent, hence the rescale - confirm against mibian docs
    return implied_vol / 100

In [43]:
def merge_options(df_expiry):
    """
    pair up calls and puts that share a strike price (straddle layout)
    - call rows keep every input column except option_type
    - put rows contribute only bid, ask, volume and open interest
    - bid/ask/volume/open_interest are prefixed with call_ or put_
    - inner join: strikes present on only one side are dropped
    """
    is_call = df_expiry['option_type'] == 'call'
    is_put = df_expiry['option_type'] == 'put'

    call_names = {'bid': 'call_bid', 'ask': 'call_ask', 'volume': 'call_volume', 'open_interest' : 'call_open_interest'}
    calls = (
        df_expiry[is_call]
        .reset_index(drop=True)
        .drop(columns='option_type')
        .rename(columns=call_names)
    )

    put_fields = ['bid', 'ask', 'volume', 'open_interest', 'strike_price']
    put_names = {'bid': 'put_bid', 'ask': 'put_ask', 'volume': 'put_volume', 'open_interest' : 'put_open_interest'}
    puts = df_expiry.loc[is_put, put_fields].reset_index(drop=True).rename(columns=put_names)

    return calls.merge(puts, on='strike_price', how='inner')


In [87]:
def create_base_record(quote_date, expiry_date, days_to_expiry, underlying_price, atm_price, atm_iv):
    """
    Build the non repeating part of an option record
    - partition_date is the constant '1970-01-01'
    - atm_price is rounded to 2 decimals, atm_iv to 3 decimals
    - strike_prices starts as an empty list for the caller to fill
    - sampling_key is a fresh random number in [0, 1) rounded to 5 decimals
    """
    return {
        'partition_date': '1970-01-01',
        'quote_date': quote_date,
        'expiry_date': expiry_date,
        'days_to_expiry': days_to_expiry,
        'underlying_price': underlying_price,
        'atm_price': round(atm_price, 2),
        'atm_iv': round(atm_iv, 3),
        'strike_prices': [],
        'sampling_key': round(random.random(), 5),
    }


In [45]:
def calc_put_moneyness(atm_price, atm_iv, strike_price, days_to_expiry):
    """
    calculate the probability that a put option will close in the money on expiry date
    Note that the probability of a call option with the same strike price closing in the money
    will be 1 - the probability of the put option closing in the money

    - atm_price: adjusted underlying price used as the current price (must be > 0)
    - atm_iv: annualized implied volatility as a decimal value (must be > 0)
    - strike_price: strike price of the option (must be > 0)
    - days_to_expiry: calendar days until expiry (must be > 0)
    - returns the standard normal cdf of log(strike / atm_price) scaled by the
      volatility over the remaining life of the option
    """
    #import the submodule explicitly: a bare "import scipy" does not load scipy.stats
    #on scipy releases without lazy submodule loading
    from scipy.stats import norm

    #scale the annualized volatility down to the time remaining (365 day year)
    iv_to_expiry = atm_iv * math.sqrt(days_to_expiry / 365)

    #distance between strike and atm price measured in standard deviations
    zscore = math.log(strike_price /  atm_price) / iv_to_expiry
    return norm.cdf(zscore)
    

In [46]:
def create_strike_record(row):
    """
    Build the record for a single strike price from a straddle row
    - bid, ask, volume and open interest are copied through unchanged
    - call_moneyness is derived as 1 - put_moneyness
    - moneyness and implied volatility values are rounded to 3 decimals
    """
    put_moneyness = row['put_moneyness']
    return {
        'strike_price': row['strike_price'],
        #call attributes
        'call_bid': row['call_bid'],
        'call_ask': row['call_ask'],
        'call_volume': row['call_volume'],
        'call_open_interest': row['call_open_interest'],
        'call_moneyness': round(1 - put_moneyness, 3),
        'call_iv': round(row['call_iv'], 3),
        #put attributes
        'put_bid': row['put_bid'],
        'put_ask': row['put_ask'],
        'put_volume': row['put_volume'],
        'put_open_interest': row['put_open_interest'],
        'put_moneyness': round(put_moneyness, 3),
        'put_iv': round(row['put_iv'], 3),
    }
    

In [47]:
def process_expiry_date(df_expiry, interest_rate):
    """
    turn the options of one expiry date into a single output record
    - df_expiry: call and put rows sharing one quote date and expiry date
    - interest_rate: rate handed to the implied volatility calculations
    - returns the base record with one strike record per retained strike price
    """
    #transpose calls and puts into a straddle layout
    straddles = merge_options(df_expiry)

    #the strike closest to the underlying price anchors the at the money calculation
    distance = abs(straddles['strike_price'] - straddles['underlying_price'])
    anchor = straddles.loc[distance.idxmin()]

    underlying_price = anchor['underlying_price']
    days_to_expiry = (anchor['expiry_date'] -  anchor['quote_date']).days
    anchor_call_mid = (anchor['call_bid'] + anchor['call_ask']) / 2
    anchor_put_mid = (anchor['put_bid'] + anchor['put_ask']) / 2

    #adjusted underlying price and implied volatility where call and put implied volatility match
    #(center_underlying_price is defined outside this section)
    atm_price, atm_iv = center_underlying_price(underlying_price, anchor['strike_price'], interest_rate,
        days_to_expiry, anchor_call_mid, anchor_put_mid)

    def put_moneyness(row):
        return calc_put_moneyness(atm_price, atm_iv, row['strike_price'], days_to_expiry)

    def call_iv(row):
        mid_price = (row['call_bid'] + row['call_ask']) / 2
        return calc_call_iv(atm_price, row['strike_price'], interest_rate, days_to_expiry, mid_price)

    def put_iv(row):
        mid_price = (row['put_bid'] + row['put_ask']) / 2
        return calc_put_iv(atm_price, row['strike_price'], interest_rate, days_to_expiry, mid_price)

    #keep only strikes whose put moneyness lies strictly between .05 and .95
    straddles['put_moneyness'] = straddles.apply(put_moneyness, axis=1)
    in_range = (straddles['put_moneyness'] > .05) & (straddles['put_moneyness'] < .95)
    straddles = straddles[in_range].reset_index(drop=True)

    #implied volatilities use atm_price as the current price and the bid/ask midpoint as the option price
    straddles['call_iv'] = straddles.apply(call_iv, axis=1)
    straddles['put_iv'] = straddles.apply(put_iv, axis=1)

    #non repeating part of the record, dates formatted as YYYY-MM-DD strings
    record = create_base_record(
        anchor['quote_date'].strftime('%Y-%m-%d'),
        anchor['expiry_date'].strftime('%Y-%m-%d'),
        days_to_expiry, underlying_price, atm_price, atm_iv)

    #one nested strike record per retained strike price
    record['strike_prices'].extend(create_strike_record(row) for _, row in straddles.iterrows())
    return record

    

In [92]:
def process_quote_date(df_daily, quote_date, symbol):
    """
    transform the options of one symbol and quote date
    - builds one record per expiry date found in df_daily
    - appends the records as newline delimited json to '<SYMBOL>_<JOB_ID>.ndjson',
      the file that is later loaded into bigquery
    """
    #fed funds rate that was effective on the supplied quote date
    interest_rate = get_interest_rate(quote_date)

    #one json line per quote_date / expiry_date combination
    ndjson_lines = []
    for expiry_date in df_daily['expiry_date'].unique():
        same_expiry = df_daily['expiry_date'] == expiry_date
        df_expiry = df_daily[same_expiry].reset_index(drop=True)
        ndjson_lines.append(json.dumps(process_expiry_date(df_expiry, interest_rate)) + '\n')

    #append mode: every quote date of a job accumulates in the same file
    out_path = '{}_{}.ndjson'.format(symbol.upper(), JOB_ID)
    with open(out_path, 'a') as out_file:
        out_file.writelines(ndjson_lines)
  

In [49]:
def end_of_week_dates(start_date, end_date):
    """
    return a list of end of week trading days between start_date and end_date
    - start and end dates must be passed in as date values (without time component)
    - candidates are the Fridays from the first one on or after start_date up to end_date
    - a Friday found in the NYSE holiday list is replaced by the day before it
    """
    one_day = datetime.timedelta(days=1)
    one_week = datetime.timedelta(days=7)

    #days until the first Friday on or after start_date (weekday(): Monday=0 ... Sunday=6)
    days_to_friday = (4 - start_date.weekday()) % 7
    friday = start_date + datetime.timedelta(days=days_to_friday)

    #load stock exchange holidays
    market_holidays = mcal.get_calendar('NYSE').holidays().holidays

    eow_dates = []
    while friday <= end_date:
        eow_dates.append(friday - one_day if friday in market_holidays else friday)
        friday += one_week
    return eow_dates
  


In [91]:
def load_to_bq(symbol):
    """
    load the transformed ndjson file of a symbol into bigquery
    - source file: '<SYMBOL>_<JOB_ID>.ndjson' in the working directory
    - destination table: PROJECT_ID.DATASET_ID.<SYMBOL>
    - blocks until the load job has finished, then prints the table's row count
    """
    ticker = symbol.upper()
    table_id = f'{PROJECT_ID}.{DATASET_ID}.{ticker}'
    source_path = f'{ticker}_{JOB_ID}.ndjson'

    client = bigquery.Client()
    load_config = bigquery.LoadJobConfig(
        source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
    )

    with open(source_path, 'rb') as ndjson_file:
        load_job = client.load_table_from_file(ndjson_file, table_id, job_config=load_config)

    #wait for the load job to complete
    load_job.result()

    #fetch the table metadata (API request) to report its row count
    loaded_table = client.get_table(table_id)
    print("Loaded {} rows to table {}".format(loaded_table.num_rows, table_id))
  

In [93]:
def main(in_filepath):
    """
    capture options data on a weekly cadence
    file path is path to options input file
    example: gs://expiry-week-data/options/SPY_2020.csv

    - keeps only symbols listed in STOCK_SYMBOLS, options that expire after their
      quote date, and options with a bid of at least .05
    - for every symbol, processes each end of week quote date and then loads the
      symbol's ndjson file into bigquery
    - raises ValueError when no rows are left after filtering
    """
    #positions of the input file columns to load and the names assigned to them
    input_columns = [0,1,5,6,7,8,10,11,12,13]

    column_names = ['underlying_symbol', 'underlying_price', 'option_type', 'expiry_date', 'quote_date',
        'strike_price', 'bid', 'ask', 'volume', 'open_interest']

    #exclude options that have already expired and options with no meaningful bid
    df_options = pd.read_csv(in_filepath, usecols=input_columns, names=column_names, header=0, parse_dates=[3,4])
    df_options = df_options[(df_options['underlying_symbol'].isin(STOCK_SYMBOLS)) & 
        (df_options['expiry_date'] > df_options['quote_date']) &
        (df_options['bid'] >= .05)]

    #without rows the min/max quote dates are undefined and the date logic below cannot run
    if df_options.empty:
        raise ValueError('no usable option quotes found in {}'.format(in_filepath))

    #get end of week quote dates
    start_date = df_options['quote_date'].dt.date.min()
    end_date = df_options['quote_date'].dt.date.max()
    eow_dates = end_of_week_dates(start_date, end_date)

    #process each stock symbol
    for symbol in STOCK_SYMBOLS:

        #restrict to the current symbol so its file and table never receive another symbol's quotes
        df_symbol = df_options[df_options['underlying_symbol'] == symbol]

        #convert quote timestamps to dates once per symbol instead of once per quote date
        symbol_quote_dates = df_symbol['quote_date'].dt.date

        #process end of week quote dates
        for quote_date in eow_dates:
            df_daily = df_symbol[symbol_quote_dates == quote_date]
            process_quote_date(df_daily, quote_date, symbol)

        #upload records to bigquery
        load_to_bq(symbol)
        

In [81]:
#RUN PROGRAM
#process end-of-day stock options file
#arguments:
#argv[1] = Job ID (used to generate a unique local filename for storing intermediate results)
#argv[2] = Input filepath (example: gs://expiry-week-data/options/SPY_2020.csv)

#read in arguments if running .py program from command line
#endswith also matches when the script is invoked through a path (e.g. ./capture_options.py)
if sys.argv[0].endswith('capture_options.py'):
    JOB_ID = sys.argv[1]
    main(sys.argv[2])

#supply arguments manually if running inside JupyterLab
else:
    JOB_ID = '01'
    in_filepath = 'gs://expiry-week-data/options/SPY_2019.csv'
    #main() returns nothing, so its result is not kept
    main(in_filepath)
    print('All Done!')

All Done!


## Code Exploration Section

In [89]:
#pretty-print one stored record to inspect the generated output
symbol = 'SPY'
filepath = f'{symbol.upper()}_{JOB_ID}.ndjson'

#every line of the file is one json record
with open(filepath, 'r') as f:
    records = list(f)

#show the 8th record with 4 space indentation
record = json.loads(records[7])
print(json.dumps(record, indent=4))


{
    "partition_date": "1970-01-01",
    "quote_date": "2019-01-04",
    "expiry_date": "2019-01-23",
    "days_to_expiry": 19,
    "underlying_price": 252.39,
    "atm_price": 252.43,
    "atm_iv": 0.186,
    "strike_prices": [
        {
            "strike_price": 236.0,
            "call_bid": 17.24,
            "call_ask": 17.59,
            "call_volume": 0,
            "call_open_interest": 23,
            "call_moneyness": 0.943,
            "call_iv": 0.242,
            "put_bid": 0.66,
            "put_ask": 0.69,
            "put_volume": 2,
            "put_open_interest": 76,
            "put_moneyness": 0.057,
            "put_iv": 0.24
        },
        {
            "strike_price": 237.0,
            "call_bid": 16.33,
            "call_ask": 16.67,
            "call_volume": 0,
            "call_open_interest": 48,
            "call_moneyness": 0.931,
            "call_iv": 0.234,
            "put_bid": 0.74,
            "put_ask": 0.77,
            "put_volume": 3,
 