# Setup

In [1]:
pip install autofeat

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting autofeat
  Downloading autofeat-2.0.10-py3-none-any.whl (24 kB)
Collecting pint (from autofeat)
  Downloading Pint-0.21-py3-none-any.whl (286 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m286.2/286.2 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pint, autofeat
Successfully installed autofeat-2.0.10 pint-0.21


In [2]:
# import modules
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from autofeat import FeatureSelector, AutoFeatRegressor

import matplotlib.pyplot as plt

# Never truncate columns when a DataFrame is rendered.
pd.options.display.max_columns = None

In [3]:
# Mount Google Drive and build the dataset directory from the mount point.
from google.colab import drive

MOUNT_POINT = '/content/drive'
drive.mount(MOUNT_POINT)

# Absolute path derived from the mount point, so reading and writing files does
# not depend on the kernel's current working directory. The trailing slash is
# kept because later cells build file names with `path + '<file>'`.
path = MOUNT_POINT + '/MyDrive/DS301 Final Project/data/'

Mounted at /content/drive


In [4]:
# Load the raw TSLA option quotes into a DataFrame. The column labels are
# supplied here, and header=None states explicitly that the first row of the
# file is data rather than a header line.
names = [
    't_date', 'stock_id', 'stock_symbol', 'expiration_date',
    'strike', 'call_put', 'style', 'symbol',
    'price_bid', 'price_ask', 'date_bid', 'date_ask',
    'size_bid', 'size_ask', 'exchange_bid', 'exchange_ask',
    'volume', 'implied_volatility', 'price_opt',
    'delta', 'gamma', 'theta', 'vega', 'rho',
    'pre_iv', 'implied_yield', 'calc_date', 'dump_time',
]

tsla = pd.read_csv(f"{path}options_tsla.csv", header=None, names=names)

In [5]:
# Preview the loaded frame; for a frame this large pandas renders only the
# first and last rows.
tsla

Unnamed: 0,t_date,stock_id,stock_symbol,expiration_date,strike,call_put,style,symbol,price_bid,price_ask,date_bid,date_ask,size_bid,size_ask,exchange_bid,exchange_ask,volume,implied_volatility,price_opt,delta,gamma,theta,vega,rho,pre_iv,implied_yield,calc_date,dump_time
0,2021-10-19 09:30:00,21400,TSLA,2021-10-22 00:00:00,50.0,C,A,TSLA 211022C00050000,0.00,0.00,2021-10-19 09:24:59,2021-10-19 09:24:59,0,0,*,*,0,-1.0000,870.11,0.00000,0.000000,0.000000,0.00000,0.00000,-1.0000,0.0,2021-10-19 09:25:46,2021-10-19 09:30:00
1,2021-10-19 09:30:00,21400,TSLA,2021-10-22 00:00:00,50.0,P,A,TSLA 211022P00050000,0.00,0.00,2021-10-19 09:25:01,2021-10-19 09:25:01,0,0,*,*,0,-1.0000,870.11,0.00000,0.000000,0.000000,0.00000,0.00000,-1.0000,0.0,2021-10-19 09:25:46,2021-10-19 09:30:00
2,2021-10-19 09:30:00,21400,TSLA,2021-10-22 00:00:00,100.0,C,A,TSLA 211022C00100000,0.00,0.00,2021-10-19 09:25:07,2021-10-19 09:25:07,0,0,*,*,0,-1.0000,870.11,0.00000,0.000000,0.000000,0.00000,0.00000,-1.0000,0.0,2021-10-19 09:25:46,2021-10-19 09:30:00
3,2021-10-19 09:30:00,21400,TSLA,2021-10-22 00:00:00,100.0,P,A,TSLA 211022P00100000,0.00,0.00,2021-10-19 09:25:06,2021-10-19 09:25:06,0,0,*,*,0,-1.0000,870.11,-0.00000,0.000000,-0.000000,0.00000,-0.00000,-1.0000,0.0,2021-10-19 09:25:46,2021-10-19 09:30:00
4,2021-10-19 09:30:00,21400,TSLA,2021-10-22 00:00:00,150.0,C,A,TSLA 211022C00150000,0.00,0.00,2021-10-19 09:25:01,2021-10-19 09:25:01,0,0,*,*,0,-1.0000,870.11,0.00000,0.000000,0.000000,0.00000,0.00000,-1.0000,0.0,2021-10-19 09:25:46,2021-10-19 09:30:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2000823,2021-10-19 16:18:00,21400,TSLA,2024-01-19 00:00:00,1675.0,P,A,TSLA 240119P01675000,857.00,874.05,2021-10-19 15:59:58,2021-10-19 15:59:58,9,95,AO,XO,2,0.4361,864.27,-0.75618,0.000577,-0.099708,4.09159,-18.16715,0.4361,0.0,2021-10-19 16:16:38,2021-10-19 16:18:00
2000824,2021-10-19 16:18:00,21400,TSLA,2024-01-19 00:00:00,1700.0,C,A,TSLA 240119C01700000,70.05,75.00,2021-10-19 15:59:58,2021-10-19 15:59:48,27,1,XO,MP,16,0.4639,864.27,0.26973,0.000549,-0.122301,4.28636,3.61674,0.4639,0.0,2021-10-19 16:16:38,2021-10-19 16:18:00
2000825,2021-10-19 16:18:00,21400,TSLA,2024-01-19 00:00:00,1700.0,P,A,TSLA 240119P01700000,879.50,896.55,2021-10-19 15:59:58,2021-10-19 15:59:58,10,65,AO,XO,0,0.4353,864.27,-0.76402,0.000569,-0.097444,3.92524,-18.19406,0.4353,0.0,2021-10-19 16:16:38,2021-10-19 16:18:00
2000826,2021-10-19 16:18:00,21400,TSLA,2024-01-19 00:00:00,1725.0,C,A,TSLA 240119C01725000,68.00,76.00,2021-10-19 15:59:58,2021-10-19 15:59:58,25,17,XO,BT,60,0.4682,864.27,0.26678,0.000541,-0.122713,4.26275,3.57122,0.4682,0.0,2021-10-19 16:16:38,2021-10-19 16:18:00


# Pre-processing and Cleaning

In [6]:
# Target column: the mid price, i.e. the mean of the bid and ask quotes.
tsla['price'] = tsla['price_bid'].add(tsla['price_ask']).div(2)

In [7]:
# Remove columns that are not used as model inputs: non-numeric and constant
# fields, plus price_bid / price_ask, which the 'price' target is computed from.
# 'call_put' is deliberately kept; it is needed to split calls from puts.
dropped_cols = [
    'stock_id', 'stock_symbol', 'style', 'implied_yield', 'symbol',
    'exchange_bid', 'exchange_ask',
    'price_bid', 'price_ask',
    't_date', 'expiration_date', 'date_bid', 'date_ask', 'calc_date', 'dump_time',
]
tsla = tsla.drop(columns=dropped_cols)

# Removing Outliers and Invalid Values Within Data

In [8]:
# Keep only rows with a usable quote: a mid price below one cent means there
# was no meaningful bid/ask. This is a filter on quotes, not on trading
# activity -- rows with volume == 0 are retained.
#
# A boolean mask is used instead of drop(<index labels>) so rows are selected
# by position; dropping by label would remove extra rows if index labels were
# ever duplicated. `~(price < 0.01)` (rather than `price >= 0.01`) keeps rows
# whose price is NaN, exactly as the label-based drop did.
tsla = tsla[~(tsla["price"] < 0.01)]
tsla

Unnamed: 0,strike,call_put,size_bid,size_ask,volume,implied_volatility,price_opt,delta,gamma,theta,vega,rho,pre_iv,price
4892,50.0,C,2,1,0,3.3919,875.3929,1.00000,0.000000,-0.000101,0.00000,0.00448,-1.0000,825.350
4894,100.0,C,1,1,0,3.3919,875.3929,1.00000,0.000000,-0.000202,0.00000,0.00896,-1.0000,776.550
4896,150.0,C,1,2,0,3.3919,875.3929,1.00000,0.000000,-0.000305,0.00000,0.01344,-1.0000,726.475
4898,200.0,C,2,1,0,3.3919,875.3929,1.00000,0.000000,-0.000611,0.00000,0.01792,-1.0000,675.350
4900,250.0,C,1,1,0,3.3919,875.3929,0.99998,0.000000,-0.004952,0.00009,0.02240,-1.0000,626.500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2000823,1675.0,P,9,95,2,0.4361,864.2700,-0.75618,0.000577,-0.099708,4.09159,-18.16715,0.4361,865.525
2000824,1700.0,C,27,1,16,0.4639,864.2700,0.26973,0.000549,-0.122301,4.28636,3.61674,0.4639,72.525
2000825,1700.0,P,10,65,0,0.4353,864.2700,-0.76402,0.000569,-0.097444,3.92524,-18.19406,0.4353,888.025
2000826,1725.0,C,25,17,60,0.4682,864.2700,0.26678,0.000541,-0.122713,4.26275,3.57122,0.4682,72.000


In [9]:
# Partition the quotes by option type ('C' = call, 'P' = put). The indicator
# column is removed from each partition because it is constant within it.
calls = tsla.loc[tsla['call_put'].eq('C')].drop(columns='call_put')
puts = tsla.loc[tsla['call_put'].eq('P')].drop(columns='call_put')

In [10]:
# Preview the call-option subset; row labels are those of the original frame.
calls

Unnamed: 0,strike,size_bid,size_ask,volume,implied_volatility,price_opt,delta,gamma,theta,vega,rho,pre_iv,price
4892,50.0,2,1,0,3.3919,875.3929,1.00000,0.000000,-0.000101,0.00000,0.00448,-1.0000,825.350
4894,100.0,1,1,0,3.3919,875.3929,1.00000,0.000000,-0.000202,0.00000,0.00896,-1.0000,776.550
4896,150.0,1,2,0,3.3919,875.3929,1.00000,0.000000,-0.000305,0.00000,0.01344,-1.0000,726.475
4898,200.0,2,1,0,3.3919,875.3929,1.00000,0.000000,-0.000611,0.00000,0.01792,-1.0000,675.350
4900,250.0,1,1,0,3.3919,875.3929,0.99998,0.000000,-0.004952,0.00009,0.02240,-1.0000,626.500
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2000818,1600.0,1,256,146,0.4745,864.2700,0.30880,0.000572,-0.133333,4.56806,4.04002,0.4745,87.500
2000820,1650.0,29,20,0,0.4670,864.2700,0.28690,0.000562,-0.126888,4.41730,3.81019,0.4670,78.775
2000822,1675.0,25,17,0,0.4675,864.2700,0.28014,0.000555,-0.125580,4.36716,3.72998,0.4675,76.500
2000824,1700.0,27,1,16,0.4639,864.2700,0.26973,0.000549,-0.122301,4.28636,3.61674,0.4639,72.525


# Feature Engineering

In [11]:
# Separate the call-option target ('price') from the predictor columns.
y = calls['price']
X = calls.drop(columns='price')

# Run a single feature-engineering step, then let AutoFeat select features.
# NOTE(review): the displayed result suggests fit_transform returns a frame
# with a fresh 0..n-1 index while y keeps the original row labels -- pair the
# two by position, not by label. Confirm against the autofeat docs.
afreg = AutoFeatRegressor(feateng_steps=1, verbose=1)
X_new = afreg.fit_transform(X, y)
print('New features: ', afreg.new_feat_cols_)

[AutoFeat] The 1 step feature engineering process could generate up to 84 features.
[AutoFeat] With 997605 data points this new feature matrix would use about 0.34 gb of space.
[feateng] Step 1: transformation of original features
[feateng] Generated 50 transformed features from 12 original features - done.
[feateng] Generated altogether 59 new features in 1 steps
[feateng] Removing correlated features, as well as additions at the highest level
[feateng] Generated a total of 34 additional features
[featsel] Scaling data...done.
[featsel] Feature selection run 1/5
[featsel] Feature selection run 2/5
[featsel] Feature selection run 3/5
[featsel] Feature selection run 4/5
[featsel] Feature selection run 5/5
[featsel] 38 features after 5 feature selection runs
[featsel] 30 features after correlation filtering
[featsel] 29 features after noise filtering
[AutoFeat] Computing 17 new features.
[AutoFeat]    17/   17 new features ...done.
[AutoFeat] Final dataframe with 29 feature columns (17 n

In [12]:
# Preview the engineered call features (original columns plus the ones AutoFeat added).
X_new

Unnamed: 0,strike,size_bid,size_ask,volume,implied_volatility,price_opt,delta,gamma,theta,vega,rho,pre_iv,rho**3,1/delta,vega**3,1/theta,1/pre_iv,theta**3,pre_iv**3,volume**2,strike**3,log(delta),1/size_ask,size_ask**3,log(strike),Abs(pre_iv),size_bid**3,sqrt(volume),log(size_ask)
0,50.0,2.0,1.0,0.0,3.3919,875.3929,1.00000,0.000000,-0.000101,0.00000,0.00448,-1.0000,8.991539e-08,1.000000,0.000000e+00,-9900.990099,-1.000000,-1.030301e-12,-1.000000,0.0,1.250000e+05,0.000000,1.000000,1.0,3.912023,1.0000,8.0,0.000000,0.000000
1,100.0,1.0,1.0,0.0,3.3919,875.3929,1.00000,0.000000,-0.000202,0.00000,0.00896,-1.0000,7.193231e-07,1.000000,0.000000e+00,-4950.495050,-1.000000,-8.242408e-12,-1.000000,0.0,1.000000e+06,0.000000,1.000000,1.0,4.605170,1.0000,1.0,0.000000,0.000000
2,150.0,1.0,2.0,0.0,3.3919,875.3929,1.00000,0.000000,-0.000305,0.00000,0.01344,-1.0000,2.427716e-06,1.000000,0.000000e+00,-3278.688525,-1.000000,-2.837262e-11,-1.000000,0.0,3.375000e+06,0.000000,0.500000,8.0,5.010635,1.0000,1.0,0.000000,0.693147
3,200.0,2.0,1.0,0.0,3.3919,875.3929,1.00000,0.000000,-0.000611,0.00000,0.01792,-1.0000,5.754585e-06,1.000000,0.000000e+00,-1636.661211,-1.000000,-2.280991e-10,-1.000000,0.0,8.000000e+06,0.000000,1.000000,1.0,5.298317,1.0000,8.0,0.000000,0.000000
4,250.0,1.0,1.0,0.0,3.3919,875.3929,0.99998,0.000000,-0.004952,0.00009,0.02240,-1.0000,1.123942e-05,1.000020,7.290000e-13,-201.938611,-1.000000,-1.214344e-07,-1.000000,0.0,1.562500e+07,-0.000020,1.000000,1.0,5.521461,1.0000,1.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
997600,1600.0,1.0,256.0,146.0,0.4745,864.2700,0.30880,0.000572,-0.133333,4.56806,4.04002,0.4745,6.594024e+01,3.238342,9.532249e+01,-7.500019,2.107482,-2.370353e-03,0.106834,21316.0,4.096000e+09,-1.175061,0.003906,16777216.0,7.377759,0.4745,1.0,12.083046,5.545177
997601,1650.0,29.0,20.0,0.0,0.4670,864.2700,0.28690,0.000562,-0.126888,4.41730,3.81019,0.4670,5.531462e+01,3.485535,8.619274e+01,-7.880966,2.141328,-2.042968e-03,0.101848,0.0,4.492125e+09,-1.248622,0.050000,8000.0,7.408531,0.4670,24389.0,0.000000,2.995732
997602,1675.0,25.0,17.0,0.0,0.4675,864.2700,0.28014,0.000555,-0.125580,4.36716,3.72998,0.4675,5.189428e+01,3.569644,8.329085e+01,-7.963051,2.139037,-1.980439e-03,0.102175,0.0,4.699422e+09,-1.272466,0.058824,4913.0,7.423568,0.4675,15625.0,0.000000,2.833213
997603,1700.0,27.0,1.0,16.0,0.4639,864.2700,0.26973,0.000549,-0.122301,4.28636,3.61674,0.4639,4.730988e+01,3.707411,7.875279e+01,-8.176548,2.155637,-1.829321e-03,0.099833,256.0,4.913000e+09,-1.310334,1.000000,1.0,7.438384,0.4639,19683.0,4.000000,0.000000


# Saving the data

In [13]:
# Persist the call features and target as two CSV files. Both are written
# without their index, so row i of one file corresponds to row i of the other.
X_new.to_csv(f"{path}calls_data.csv", index=False)
y.to_csv(f"{path}calls_target.csv", index=False)

# Repeating Feature Engineering for Puts

In [14]:
# Separate the put-option target ('price') from the predictor columns.
# X, y, afreg and X_new are rebound here, replacing the call-option objects.
y = puts['price']
X = puts.drop(columns='price')

# Same configuration as for the calls: one feature-engineering step followed
# by AutoFeat's feature selection.
afreg = AutoFeatRegressor(feateng_steps=1, verbose=1)
X_new = afreg.fit_transform(X, y)
print('New features: ', afreg.new_feat_cols_)

[AutoFeat] The 1 step feature engineering process could generate up to 84 features.
[AutoFeat] With 984489 data points this new feature matrix would use about 0.33 gb of space.
[feateng] Step 1: transformation of original features
[feateng] Generated 45 transformed features from 12 original features - done.
[feateng] Generated altogether 55 new features in 1 steps
[feateng] Removing correlated features, as well as additions at the highest level
[feateng] Generated a total of 32 additional features
[featsel] Scaling data...done.
[featsel] Feature selection run 1/5
[featsel] Feature selection run 2/5
[featsel] Feature selection run 3/5
[featsel] Feature selection run 4/5
[featsel] Feature selection run 5/5
[featsel] 37 features after 5 feature selection runs
[featsel] 29 features after correlation filtering
[featsel] 29 features after noise filtering
[AutoFeat] Computing 18 new features.
[AutoFeat]    18/   18 new features ...done.
[AutoFeat] Final dataframe with 30 feature columns (18 n

In [15]:
# Persist the put features and target as two CSV files. Both are written
# without their index, so row i of one file corresponds to row i of the other.
X_new.to_csv(f"{path}puts_data.csv", index=False)
y.to_csv(f"{path}puts_target.csv", index=False)

In [16]:
# Preview the engineered put features (X_new now refers to the puts result).
X_new

Unnamed: 0,strike,size_bid,size_ask,volume,implied_volatility,price_opt,delta,gamma,theta,vega,rho,pre_iv,rho**3,vega**3,1/pre_iv,1/strike,exp(rho),theta**3,pre_iv**3,volume**2,pre_iv**2,strike**3,1/size_ask,size_ask**3,log(strike),size_bid**3,size_ask**2,sqrt(volume),log(size_ask),implied_volatility**3
0,250.0,0.0,10.0,0.0,2.6161,875.3929,0.00000,0.000000,-0.000019,0.00000,-0.00000,-1.0000,-0.000000,0.000000e+00,-1.000000,0.004000,1.000000e+00,-6.859000e-15,-1.000000,0.0,1.000000,1.562500e+07,0.100000,1000.0,5.521461,0.0,100.0,0.000000,2.302585,17.904534
1,300.0,50.0,11.0,0.0,2.6161,875.3929,-0.00000,0.000000,-0.000668,0.00002,0.00000,-1.0000,0.000000,8.000000e-15,-1.000000,0.003333,1.000000e+00,-2.980776e-10,-1.000000,0.0,1.000000,2.700000e+07,0.090909,1331.0,5.703782,125000.0,121.0,0.000000,2.397895,17.904534
2,310.0,0.0,1.0,0.0,2.6161,875.3929,-0.00001,0.000000,-0.001193,0.00003,-0.00000,-1.0000,-0.000000,2.700000e-14,-1.000000,0.003226,1.000000e+00,-1.697936e-09,-1.000000,0.0,1.000000,2.979100e+07,1.000000,1.0,5.736572,0.0,1.0,0.000000,0.000000,17.904534
3,320.0,0.0,1.0,0.0,2.6161,875.3929,-0.00001,0.000000,-0.002057,0.00005,-0.00000,-1.0000,-0.000000,1.250000e-13,-1.000000,0.003125,1.000000e+00,-8.703679e-09,-1.000000,0.0,1.000000,3.276800e+07,1.000000,1.0,5.768321,0.0,1.0,0.000000,0.000000,17.904534
4,330.0,2.0,1.0,0.0,2.6161,875.3929,-0.00002,0.000000,-0.003435,0.00009,-0.00000,-1.0000,-0.000000,7.290000e-13,-1.000000,0.003030,1.000000e+00,-4.053034e-08,-1.000000,0.0,1.000000,3.593700e+07,1.000000,1.0,5.799093,8.0,1.0,0.000000,0.000000,17.904534
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
984484,1600.0,13.0,78.0,63.0,0.4371,864.2700,-0.73164,0.000602,-0.105726,4.22072,-18.05741,0.4371,-5887.980688,7.518992e+01,2.287806,0.000625,1.438025e-08,-1.181804e-03,0.083511,3969.0,0.191056,4.096000e+09,0.012821,474552.0,7.377759,2197.0,6084.0,7.937254,4.356709,0.083511
984485,1650.0,9.0,69.0,8.0,0.4358,864.2700,-0.74895,0.000586,-0.101431,4.26633,-18.13077,0.4358,-5960.034119,7.765391e+01,2.294631,0.000606,1.336308e-08,-1.043547e-03,0.082768,64.0,0.189922,4.492125e+09,0.014493,328509.0,7.408531,729.0,4761.0,2.828427,4.234107,0.082768
984486,1675.0,9.0,95.0,2.0,0.4361,864.2700,-0.75618,0.000577,-0.099708,4.09159,-18.16715,0.4361,-5995.983183,6.849775e+01,2.293052,0.000597,1.288567e-08,-9.912656e-04,0.082939,4.0,0.190183,4.699422e+09,0.010526,857375.0,7.423568,729.0,9025.0,1.414214,4.553877,0.082939
984487,1700.0,10.0,65.0,0.0,0.4353,864.2700,-0.76402,0.000569,-0.097444,3.92524,-18.19406,0.4353,-6022.667229,6.047817e+01,2.297266,0.000588,1.254354e-08,-9.252632e-04,0.082483,0.0,0.189486,4.913000e+09,0.015385,274625.0,7.438384,1000.0,4225.0,0.000000,4.174387,0.082483
