In [1]:
import yfinance as yf

# Yahoo Finance symbol for the index to fetch (^GSPC is the S&P 500)
ticker = '^GSPC'

# Fetch history at a one-day interval for the requested date window
data = yf.download(
    ticker,
    interval='1d',
    start='2021-12-20',
    end='2022-03-18',
)

# Persist the downloaded frame as a CSV in the working directory
data.to_csv('historical_data.csv')

# Print the first rows as a quick sanity check of the download
print(data.head())


[*********************100%%**********************]  1 of 1 completed

                   Open         High          Low        Close    Adj Close  \
Date                                                                          
2021-12-20  4587.899902  4587.899902  4531.100098  4568.020020  4568.020020   
2021-12-21  4594.959961  4651.140137  4583.160156  4649.229980  4649.229980   
2021-12-22  4650.359863  4697.669922  4645.529785  4696.560059  4696.560059   
2021-12-23  4703.959961  4740.740234  4703.959961  4725.790039  4725.790039   
2021-12-27  4733.990234  4791.490234  4733.990234  4791.189941  4791.189941   

                Volume  
Date                    
2021-12-20  4635700000  
2021-12-21  4072430000  
2021-12-22  3319610000  
2021-12-23  2913040000  
2021-12-27  2770290000  





In [1]:
import yfinance as yf

from pathlib import Path

# Yahoo Finance symbol for the index to fetch (^GSPC is the S&P 500)
ticker = '^GSPC'

# Destination CSV for the underlying price series
output_path = Path('./data/underlying_price/historical_data_sp500.csv')

# Download history at a one-day interval and keep only the adjusted close,
# relabelled as "Underlying". rename() returns a new frame, so the downloaded
# frame is not mutated in place.
# NOTE(review): assumes the download exposes an "Adj Close" column, as in the
# output recorded with this cell - confirm for the installed yfinance version.
data = yf.download(ticker, start='2021-01-01', end='2024-12-19', interval='1d')
data = data.rename(columns={"Adj Close": "Underlying"})["Underlying"]

# Create the target folder first: to_csv does not create missing parent
# directories, so a fresh checkout would otherwise fail here.
output_path.parent.mkdir(parents=True, exist_ok=True)
data.to_csv(output_path)

# Print the first rows as a quick sanity check of the series
print(data.head())


[*********************100%%**********************]  1 of 1 completed

Date
2021-01-04    3700.649902
2021-01-05    3726.860107
2021-01-06    3748.139893
2021-01-07    3803.790039
2021-01-08    3824.679932
Name: Underlying, dtype: float64





In [5]:
import yfinance as yf

# Underlying equity whose option chain is inspected (AAPL = Apple)
ticker_symbol = "AAPL"
ticker = yf.Ticker(ticker_symbol)

# Expiration dates reported for this symbol
expirations = ticker.options
print("Available Expiration Dates:", expirations)

# Take the first reported expiration and fetch its option chain
chosen_expiration = expirations[0]
option_chain = ticker.option_chain(chosen_expiration)

# Split the chain into its call and put tables
calls, puts = option_chain.calls, option_chain.puts

# Preview the first rows of each side; sep="\n" puts the table on the line
# after its heading
print("\nCalls:", calls.head(), sep="\n")
print("\nPuts:", puts.head(), sep="\n")

# Distinct strike values found in the calls table
strike_prices = calls['strike'].unique()
print("\nStrike Prices:", strike_prices, sep="\n")


Available Expiration Dates: ('2024-12-20', '2024-12-27', '2025-01-03', '2025-01-10', '2025-01-17', '2025-01-24', '2025-01-31', '2025-02-21', '2025-03-21', '2025-04-17', '2025-06-20', '2025-07-18', '2025-08-15', '2025-09-19', '2025-12-19', '2026-01-16', '2026-06-18', '2026-12-18', '2027-01-15')

Calls:
        contractSymbol             lastTradeDate  strike  lastPrice     bid  \
0  AAPL241220C00005000 2024-12-10 20:58:31+00:00     5.0     242.70  245.70   
1  AAPL241220C00010000 2024-12-16 16:01:02+00:00    10.0     239.15  240.70   
2  AAPL241220C00015000 2024-11-19 20:20:20+00:00    15.0     213.65  235.70   
3  AAPL241220C00020000 2024-08-02 14:52:49+00:00    20.0     203.60  207.85   
4  AAPL241220C00030000 2024-11-01 16:54:22+00:00    30.0     192.45  212.80   

      ask    change  percentChange  volume  openInterest  impliedVolatility  \
0  247.10  0.000000       0.000000       2            10          16.390630   
1  242.05  7.599991       3.282224       7             1        

In [None]:
import pandas as pd
from statsmodels.tsa.arima.model import ARIMA
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
import tqdm

# CSV holding the option price history whose gaps are to be filled
file_path = './sp500_options_price_C4525.csv'
data = pd.read_csv(file_path)

# Parse the 'Unnamed: 0' column as timestamps and use it as the index
data['Unnamed: 0'] = pd.to_datetime(data['Unnamed: 0'])
data = data.set_index('Unnamed: 0')

# Series taken from the 'C4525' column
strike_4525 = data['C4525']

# Index labels at which the series is null; these are the rows to fill later
missing_indices = strike_4525.index[strike_4525.isnull()]

# Linearly interpolated copy, used as gap-free input for model training
strike_4525_filled = strike_4525.interpolate(method='linear')


# Define a function to evaluate ARIMA models using cross-validation
def evaluate_arima_model(data, p_values, d_values, q_values, splits=5):
    """Grid-search ARIMA orders using time-series cross-validation.

    Each (p, d, q) combination is fitted on every training fold produced by
    ``TimeSeriesSplit`` and scored by the mean squared error of its forecast
    over the matching test fold. The combination with the lowest average
    error across folds wins.

    Args:
        data: Series to model. It is linearly interpolated before fitting.
        p_values: Iterable of candidate ``p`` orders (drives the progress bar).
        d_values: Iterable of candidate ``d`` orders.
        q_values: Iterable of candidate ``q`` orders.
        splits: Number of folds passed to ``TimeSeriesSplit``.

    Returns:
        Tuple ``(best_cfg, best_score)``: the winning ``(p, d, q)`` and its
        average MSE, or ``(None, inf)`` when no combination could be scored
        (a ``RuntimeWarning`` is emitted in that case).
    """
    import warnings

    best_score, best_cfg = float("inf"), None
    tscv = TimeSeriesSplit(n_splits=splits)
    data_filled = data.interpolate(method='linear')  # Ensure no missing values for training

    # The inner iterables are re-walked once per outer value, so materialise
    # them in case a one-shot iterator was passed.
    d_values = list(d_values)
    q_values = list(q_values)

    # The notebook does `import tqdm`, which binds the *module*; calling the
    # module raises TypeError. Resolve the progress-bar class so this works
    # whether `tqdm` names the module or the class itself.
    progress = getattr(tqdm, "tqdm", tqdm)

    failed_configs = 0
    for p in progress(p_values):
        for d in d_values:
            for q in q_values:
                try:
                    cv_errors = []
                    for train_index, test_index in tscv.split(data_filled):
                        train, test = data_filled.iloc[train_index], data_filled.iloc[test_index]
                        model_fit = ARIMA(train, order=(p, d, q)).fit()
                        predictions = model_fit.forecast(steps=len(test))
                        cv_errors.append(mean_squared_error(test, predictions))
                    avg_error = np.mean(cv_errors)
                except Exception:
                    # Best-effort search: a configuration that cannot be
                    # fitted or scored is skipped, but counted for reporting.
                    failed_configs += 1
                    continue
                if avg_error < best_score:
                    best_score, best_cfg = avg_error, (p, d, q)

    if best_cfg is None:
        # Surface a fully failed search instead of silently returning None.
        warnings.warn(
            f"No ARIMA configuration could be evaluated "
            f"({failed_configs} configurations raised an error).",
            RuntimeWarning,
            stacklevel=2,
        )
    return best_cfg, best_score

# Candidate orders for the grid search: 10 x 3 x 10 = 300 (p, d, q)
# combinations, each fitted once per cross-validation fold, so this cell is slow.
p_values = range(0, 10)
d_values = range(0, 3)
q_values = range(0, 10)

# Cross-validated search for the order with the lowest average forecast error
best_order, best_error = evaluate_arima_model(strike_4525, p_values, d_values, q_values)

# The search returns None when no configuration could be scored; stop with a
# clear message instead of handing order=None to ARIMA.
if best_order is None:
    raise RuntimeError("ARIMA grid search produced no usable (p, d, q) order")

# Refit the winning order on the full linearly interpolated series
best_model = ARIMA(strike_4525_filled, order=best_order)
best_model_fit = best_model.fit()

# Predict over the span from the first to the last missing timestamp
best_predictions = best_model_fit.predict(start=missing_indices.min(), end=missing_indices.max())

# Overwrite only the originally missing rows; the assigned Series is aligned
# on its index, so predictions for non-missing rows in the span are not used.
strike_4525_filled_best = strike_4525.copy()
strike_4525_filled_best.loc[missing_indices] = best_predictions

# Add the filled data back to the original dataset
data['C4525_Filled_CV'] = strike_4525_filled_best




  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get

In [18]:
# Store the filled series as a new column, then write the frame to the same
# CSV file name that the data was loaded from.
# NOTE(review): the value stored here is `strike_4525_filled` (the linear
# interpolation), not `strike_4525_filled_best`, although the column name ends
# in "_CV" - confirm which series is intended.
output_csv = "sp500_options_price_C4525.csv"
data['C4525_Filled_CV'] = strike_4525_filled
data.to_csv(output_csv)