# Clean Data

In [29]:
# Imports
import os
import pandas as pd

# Params

In [30]:
# Ticker symbol: selects the raw input folder and names the cleaned output file
ticker = "QQQ"

## Read And Concat

In [31]:
# Collect one DataFrame per raw file; they are concatenated once at the end
all_options_data_to_concat = []

# Root directory that holds one sub folder per period for this ticker
raw_dir = f'data/raw/{ticker}'

# Keep only real sub folders: this skips .DS_Store as well as any other hidden
# or stray entry that would make the inner os.listdir call fail
folders = [
    entry for entry in sorted(os.listdir(raw_dir))
    if not entry.startswith('.') and os.path.isdir(os.path.join(raw_dir, entry))
]

# Iterate through the period folders
for folder in folders:

    # Print status: the label is the third '_'-separated token of the folder
    # name, cut at its first '-'
    print(f"Folder: {folder.split('_')[2].split('-')[0]}")

    # Path of the sub folder holding this period's files
    folder_path = os.path.join(raw_dir, folder)

    # os.listdir returns entries in arbitrary order, so sort them to keep the
    # row order of the concatenated frame reproducible across machines
    for file in sorted(os.listdir(folder_path)):

        # Get file path
        file_path = os.path.join(folder_path, file)

        # Skip hidden files (e.g. .DS_Store) and nested directories, which
        # cannot be parsed as CSV
        if file.startswith('.') or not os.path.isfile(file_path):
            continue

        # Read file and append to the list for later concatenation
        all_options_data_to_concat.append(pd.read_csv(file_path, low_memory=False))

# Concatenate into one dataframe with a fresh 0..n-1 index
all_options_data = pd.concat(all_options_data_to_concat, ignore_index=True)

Folder: 2012
Folder: 2013
Folder: 2014
Folder: 2015
Folder: 2016
Folder: 2017
Folder: 2018
Folder: 2019
Folder: 2020
Folder: 2021q12
Folder: 2021q3
Folder: 2021q4
Folder: 2022q1
Folder: 2022q2
Folder: 2022q3
Folder: 2022q4
Folder: 2023q1
Folder: 2023q2
Folder: 2023q3
Folder: 2023q4


# Clean Columns And Assign Proper Feature Types

In [32]:
# Rename the raw bracketed headers (all but the first carry a leading space)
# to short snake_case names. Reassigning instead of using inplace=True keeps
# the step chainable.
all_options_data = all_options_data.rename(columns={
    '[QUOTE_UNIXTIME]': 'datetime',
    ' [QUOTE_READTIME]': 'quote_readtime',
    ' [QUOTE_DATE]': 'date',
    ' [QUOTE_TIME_HOURS]': 'time',
    ' [UNDERLYING_LAST]': 'stock_price',
    ' [EXPIRE_DATE]': 'expiration_date',
    ' [EXPIRE_UNIX]': 'expire_unix',
    ' [DTE]': 'days_till_expiration',
    ' [C_DELTA]': 'c_delta',
    ' [C_GAMMA]': 'c_gamma',
    ' [C_VEGA]': 'c_vega',
    ' [C_THETA]': 'c_theta',
    ' [C_RHO]': 'c_rho',
    ' [C_IV]': 'c_iv',
    ' [C_VOLUME]': 'c_volume',
    ' [C_LAST]': 'c_last',
    ' [C_SIZE]': 'c_size',
    ' [C_BID]': 'c_bid',
    ' [C_ASK]': 'c_ask',
    ' [STRIKE]': 'strike',
    ' [P_BID]': 'p_bid',
    ' [P_ASK]': 'p_ask',
    ' [P_SIZE]': 'p_size',
    ' [P_LAST]': 'p_last',
    ' [P_DELTA]': 'p_delta',
    ' [P_GAMMA]': 'p_gamma',
    ' [P_VEGA]': 'p_vega',
    ' [P_THETA]': 'p_theta',
    ' [P_RHO]': 'p_rho',
    ' [P_IV]': 'p_iv',
    ' [P_VOLUME]': 'p_volume',
    ' [STRIKE_DISTANCE]': 'strike_distance_delete',
    ' [STRIKE_DISTANCE_PCT]': 'strike_distance'
})

# Columns parsed as Unix timestamps in seconds
unix_time_columns = ['datetime', 'expire_unix']

# Columns parsed as date / datetime strings
date_columns = ['quote_readtime', 'date', 'expiration_date']

# Columns parsed as plain numbers (NaN where a value cannot be parsed)
numeric_columns = [
    'time', 'stock_price',
    'c_delta', 'c_gamma', 'c_vega', 'c_theta', 'c_rho', 'c_iv',
    'c_volume', 'c_last', 'c_size', 'c_bid', 'c_ask',
    'strike',
    'p_bid', 'p_ask', 'p_size', 'p_last',
    'p_delta', 'p_gamma', 'p_vega', 'p_theta', 'p_rho', 'p_iv', 'p_volume',
    'strike_distance_delete', 'strike_distance',
]

# Cast to appropriate types; errors='coerce' turns unparseable values into
# NaT / NaN instead of raising
for column in unix_time_columns:
    all_options_data[column] = pd.to_datetime(all_options_data[column], unit='s', errors='coerce')

for column in date_columns:
    all_options_data[column] = pd.to_datetime(all_options_data[column], errors='coerce')

for column in numeric_columns:
    all_options_data[column] = pd.to_numeric(all_options_data[column], errors='coerce')

# Days till expiration is stored as an integer. A plain astype(int) raises as
# soon as coercion produced a NaN, so only use it when nothing is missing and
# fall back to the nullable Int64 dtype (missing values become <NA>) otherwise.
dte = pd.to_numeric(all_options_data['days_till_expiration'], errors='coerce')
dte_missing = dte.isna()
if dte_missing.any():
    # fillna(0) is only a placeholder so the truncating int cast succeeds;
    # those positions are masked back to <NA> right after
    all_options_data['days_till_expiration'] = (
        dte.fillna(0).astype(int).astype('Int64').mask(dte_missing)
    )
else:
    all_options_data['days_till_expiration'] = dte.astype(int)

## Add Variable For Call Price

In [33]:
# `call_price` is the bid/ask midpoint of the call; a missing bid or ask
# leaves the midpoint missing as well
all_options_data['call_price'] = (
    all_options_data['c_bid']
    .add(all_options_data['c_ask'])
    .div(2)
)

## Drop Unnecessary Data

In [34]:
# Remove the helper columns that are no longer needed, then discard every
# row that has no call price
unused_columns = ['datetime', 'quote_readtime', 'expire_unix', 'strike_distance_delete']
all_options_data = (
    all_options_data
    .drop(columns=unused_columns)
    .dropna(subset=['call_price'])
)

## Sort Data

In [35]:
# Order rows by quote date first and by expiration date within each quote date
sort_keys = ['date', 'expiration_date']
all_options_data = all_options_data.sort_values(by=sort_keys)

## Convert To Parquet

In [36]:
# Save for later. Create the output directory first so the write does not
# fail on a fresh checkout where data/clean does not exist yet.
clean_dir = 'data/clean'
os.makedirs(clean_dir, exist_ok=True)
all_options_data.to_parquet(f'{clean_dir}/{ticker}.parquet')

In [41]:
# Preview one entry per quote date. Note that GroupBy.first() takes the first
# non-null value of each column separately, so a displayed row can combine
# values from several option rows when earlier rows contain NaN.
options_by_date = all_options_data.groupby('date')
options_by_date.first()

Unnamed: 0_level_0,time,stock_price,expiration_date,days_till_expiration,c_delta,c_gamma,c_vega,c_theta,c_rho,c_iv,...,p_last,p_delta,p_gamma,p_vega,p_theta,p_rho,p_iv,p_volume,strike_distance,call_price
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-01-03,16.0,56.90,2012-01-06,3,1.00000,0.00000,0.00000,-0.00544,0.00466,-0.00036,...,0.00,-0.00313,0.00232,0.00035,-0.00408,0.00000,0.79359,0.0,0.192,10.920
2012-01-04,16.0,57.14,2012-01-06,2,1.00000,0.00000,0.00000,-0.00526,0.00306,-0.00017,...,0.00,-0.00300,0.00221,0.00080,-0.00508,-0.00041,0.94375,0.0,0.195,10.880
2012-01-05,16.0,57.61,2012-01-06,1,1.00000,0.00000,0.00000,0.00000,0.00000,0.00047,...,0.00,-0.00290,0.00210,0.00039,-0.00539,0.00000,1.21355,0.0,0.202,11.610
2012-01-06,16.0,57.78,2012-01-06,0,1.00000,0.00000,0.00000,0.00000,0.00000,3.95079,...,0.00,-0.00300,0.00202,-0.00002,-0.00464,0.00000,1.90573,0.0,0.204,11.730
2012-01-09,16.0,57.63,2012-01-13,4,0.97987,0.00738,0.00331,-0.03356,0.00539,0.99438,...,0.00,-0.00374,0.00204,0.00074,-0.00407,0.00000,0.74188,0.0,0.202,11.705
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-12-22,16.0,408.44,2023-12-22,0,1.00000,0.00000,0.00000,-0.02625,0.00471,-0.00010,...,0.01,0.00000,0.00001,-0.00012,-0.00535,0.00000,3.54337,0.0,0.412,168.465
2023-12-26,16.0,410.88,2023-12-26,0,1.00000,0.00000,0.00000,-0.03773,0.00690,0.01133,...,0.01,-0.00033,0.00007,0.00087,-0.00519,-0.00043,1.34640,0.0,0.173,70.770
2023-12-27,16.0,411.50,2023-12-27,0,1.00000,0.00000,0.35981,0.00000,0.00000,0.00206,...,0.00,-0.00048,0.00011,0.00015,-0.00458,-0.00002,1.28269,5.0,0.165,67.725
2023-12-28,16.0,411.21,2023-12-28,0,1.00000,0.00000,0.00000,0.00000,0.00000,0.00248,...,0.00,-0.00093,0.00013,0.00070,-0.00464,0.00000,1.27869,0.0,0.164,67.335
