In [None]:
import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima.model import ARIMA


In [None]:
# Step 1: Data Collection
# Download the daily price history for the S&P 500 index (^GSPC) via yfinance.
ticker = "^GSPC"
data = yf.download(ticker, start="1980-01-01", end="2024-07-31")
# Turn the date index into an ordinary 'Date' column; rows get a fresh integer index.
data = data.reset_index()
# Take an explicit copy: later cells assign new values into `df`, and assigning
# into a bare column selection of `data` raises pandas' SettingWithCopyWarning.
df = data[['Date', 'Close']].copy()


[*********************100%%**********************]  1 of 1 completed


In [None]:
# Display the Date/Close frame as the cell's rich output (sanity check of the download).
df

Unnamed: 0,Date,Close
0,1980-01-02,105.760002
1,1980-01-03,105.220001
2,1980-01-04,106.519997
3,1980-01-07,106.809998
4,1980-01-08,108.949997
...,...,...
11233,2024-07-24,5427.129883
11234,2024-07-25,5399.220215
11235,2024-07-26,5459.100098
11236,2024-07-29,5463.540039


In [None]:
# Count the NaN entries in every column and report the per-column totals
missing_values = df.isnull().sum()
print('Missing values: {}'.format(missing_values))


Missing values: Date     0
Close    0
dtype: int64


In [None]:
# Log transformation
# Rebuild `df` from the raw `data` frame instead of overwriting df['Close'] in
# place. The cell is then safe to re-run (no log-of-log on a second execution)
# and it no longer assigns into a column selection, which is what triggered
# pandas' SettingWithCopyWarning.
df = data[['Date', 'Close']].assign(Close=lambda frame: np.log(frame['Close']))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Close'] = np.log(df['Close'])


In [None]:
# Hold out the final 30 rows for testing; every earlier row is training data.
# The mask compares index labels with a row count, so it relies on `df`
# carrying a 0..n-1 integer index.
holdout_start = len(df) - 30
msk = df.index < holdout_start
df_train = df.loc[msk].copy()
df_test = df.loc[~msk].copy()

In [None]:
!pip install pmdarima

Collecting pmdarima
  Downloading pmdarima-2.0.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl.metadata (7.8 kB)
Downloading pmdarima-2.0.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pmdarima
Successfully installed pmdarima-2.0.4


In [None]:
# Imported here rather than in the first cell because pmdarima is only
# installed by the cell directly above.
import pmdarima as pm

# Let pmdarima pick a non-seasonal ARIMA order for the training 'Close' series.
# stepwise=False disables the stepwise search shortcut.
auto_arima = pm.auto_arima(
    df_train['Close'],
    seasonal=False,
    stepwise=False,
)
auto_arima

In [None]:
# Show the fitted model's summary table as the cell output.
auto_arima.summary()

In [None]:
# Forecast one step per held-out row (the test window is the last 30 rows)
n_test_rows = df_test.shape[0]
forecast_test_auto = auto_arima.predict(n_periods=n_test_rows)

In [None]:
# Inspect the most recent 251 rows (the same window that is plotted further down).
df.tail(251)

In [None]:
# Creating the forecast column
# Write the forecast into the trailing rows *by position*. When the right-hand
# side of a .loc assignment is a pandas Series, pandas aligns it on index
# labels, so a forecast whose index differs from `df`'s would silently leave
# NaNs. Converting to a plain array removes that dependency.
forecast_values = np.asarray(forecast_test_auto, dtype=float)
# Derive the window from the forecast itself instead of repeating the literal 30,
# so it always matches the number of predicted periods.
forecast_start = len(df) - len(forecast_values)
df['Forecast'] = np.nan
df.loc[df.index[forecast_start:], 'Forecast'] = forecast_values

In [None]:
# Keep only the most recent 251 rows for the chart.
# NOTE(review): the variable name says "120" but the frame holds 251 rows; the
# name is kept because the plotting cell refers to it.
df_last_120 = df.tail(251).copy()

In [None]:
# Chart the actual and forecast log-closes on a single set of axes
fig, ax = plt.subplots(figsize=(12, 6))
dates = df_last_120['Date']
ax.plot(dates, df_last_120['Close'], label='Actual Data')
# Forecast is NaN outside the hold-out window, so only the trailing segment is drawn.
ax.plot(dates, df_last_120['Forecast'], linestyle='--', label='Forecasted Data')
ax.set_title('S&P 500 Historical Closing Prices and 30-Day Forecast (Last 251 Work Days(1 year))')
ax.set_xlabel('Date')
ax.set_ylabel('Log of Closing Prices')
ax.legend()
plt.show()