# Experimenting with ARIMA

This notebook demonstrates how to download and transform stock data for use with ARIMA.

In [52]:
import yfinance as yf
import pandas as pd 
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import numpy as np
from statsmodels.tsa.arima.model import ARIMA
import warnings

warnings.simplefilter("ignore", DeprecationWarning)
warnings.simplefilter("ignore", FutureWarning)

In [53]:
def download_stock_data(ticker_symbol):
    """
    Download stock data with error handling and validation
    """
    try:
        # Create ticker object and get history
        yticker = yf.Ticker(ticker_symbol)
        df = yticker.history(period='max')
        
        if df.empty:
            raise ValueError(f"No data downloaded for {ticker_symbol}")
            
        print(f"Downloaded {len(df)} days of {ticker_symbol} data")
        
        # Basic validation
        required_columns = ['Open', 'Close', 'Volume']
        missing_cols = [col for col in required_columns if col not in df.columns]
        if missing_cols:
            raise ValueError(f"Missing required columns: {missing_cols}")
            
        return df
    
    except Exception as e:
        print(f"Error downloading {ticker_symbol}: {str(e)}")
        return None

def transform_stock_data(df, ticker):
    """
    Transform stock data for ARIMA
    """
    if df is None:
        return None
    
    # Reset index to make the date a column
    df = df.reset_index()
    
    # Create DataFrame with Prophet-like structure
    arima_df = pd.DataFrame()
    arima_df['ds'] = pd.to_datetime(df['Date']).dt.tz_localize(None)  # Remove timezone
    arima_df['y'] = df['Close'].shift(-3).astype(float)    # 3-day ahead target
    
    # Add additional features
    arima_df['volume'] = df['Volume'].astype(float)
    arima_df['open'] = df['Open'].astype(float)
    arima_df['close'] = df['Close'].astype(float)
    arima_df['range'] = df['Open'].astype(float) - df['Close'].astype(float)

    # Calculate moving averages
    arima_df['ma20'] = df['Close'].rolling(window=20).mean()
    arima_df['ma50'] = df['Close'].rolling(window=50).mean()
    
    # Calculate volatility (20-day rolling standard deviation)
    arima_df['volatility'] = df['Close'].rolling(window=20).std()
    
    # Add day of week as a feature
    arima_df['day_of_week'] = arima_df['ds'].dt.dayofweek
    
    # Split data into train/validation/test sets (70/20/10)
    total_days = len(df)
    train_end = int(total_days * 0.7)
    val_end = int(total_days * 0.9)

    # Initialize split column
    arima_df['split'] = 'train'
    
    # Use loc for setting values to avoid SettingWithCopyWarning
    arima_df.loc[train_end:val_end-1, 'split'] = 'validation'
    arima_df.loc[val_end:, 'split'] = 'test'

    return arima_df

In [54]:
## Download Stock Data
# Download data for QQQ
ticker = "QQQ"
df = download_stock_data(ticker)

print("\nRaw data sample:")
df.head()

Downloaded 6526 days of QQQ data

Raw data sample:


Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,Capital Gains
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1999-03-10 00:00:00-05:00,43.400427,43.426955,42.684161,43.34737,5232000,0.0,0.0,0.0
1999-03-11 00:00:00-05:00,43.665711,43.91773,42.710689,43.559597,9688600,0.0,0.0,0.0
1999-03-12 00:00:00-05:00,43.400435,43.426964,42.153601,42.49847,8743600,0.0,0.0,0.0
1999-03-15 00:00:00-05:00,42.816827,43.771849,42.365844,43.718792,6369000,0.0,0.0,0.0
1999-03-16 00:00:00-05:00,43.904485,44.275882,43.426974,44.090183,4905800,0.0,0.0,0.0


In [55]:
## Transform Data for ARIMA
# Transform data into ARIMA format
arima_df = transform_stock_data(df, ticker)

print("\nTransformed data sample:")
arima_df.tail(10)


Transformed data sample:


Unnamed: 0,ds,y,volume,open,close,range,ma20,ma50,volatility,day_of_week,split
6516,2025-02-03,529.599976,40580800.0,513.469971,518.109985,-4.640015,519.090001,518.198866,7.855021,0,test
6517,2025-02-04,522.919983,26046800.0,518.630005,524.469971,-5.839966,519.384499,518.635276,7.944794,1,test
6518,2025-02-05,529.25,21134100.0,521.820007,526.849976,-5.029968,519.499998,519.125076,8.039909,2,test
6519,2025-02-06,527.98999,19434500.0,527.359985,529.599976,-2.23999,520.220998,519.633735,8.275242,3,test
6520,2025-02-07,528.299988,29605500.0,530.140015,522.919983,7.220032,520.603496,519.99262,8.210902,4,test
6521,2025-02-10,535.900024,20852800.0,527.219971,529.25,-2.030029,521.706496,520.462132,7.784828,0,test
6522,2025-02-11,538.150024,19325500.0,525.960022,527.98999,-2.029968,522.827995,520.852131,6.901888,1,test
6523,2025-02-12,,25009400.0,522.299988,528.299988,-6.0,523.988995,521.328401,5.587003,2,test
6524,2025-02-13,,28714800.0,529.97998,535.900024,-5.920044,524.948996,521.868015,5.908908,3,test
6525,2025-02-14,,17835900.0,536.01001,538.150024,-2.140015,526.202496,522.341807,5.917672,4,test


In [56]:
# Clean data by removing NaN values
arima_df_clean = arima_df.dropna()
print(f"Original shape: {arima_df.shape}")
print(f"Clean shape: {arima_df_clean.shape}")

Original shape: (6526, 11)
Clean shape: (6474, 11)


In [57]:
## Train ARIMA Model
def train_arima_model(df, order=(10,1,10)):
    """
    Train ARIMA model on price data
    """
    # Get training data
    train_data = df[df['split'].isin(['train', 'validation'])]
    
    # Prepare exogenous variables
    exog = train_data[['volume', 'range', 'ma20', 'ma50', 'volatility', 'day_of_week']]
    
    # Train on prices
    model = ARIMA(train_data['close'], 
                  order=order,
                  exog=exog)
    
    return model.fit()

def make_predictions(model, df):
    """
    Make price predictions
    """
    # Split data into train/validation/test
    train_mask = df['split'].isin(['train', 'validation'])
    train_data = df[train_mask]
    test_data = df[~train_mask]
    
    # Prepare exogenous variables
    train_exog = train_data[['volume', 'range', 'ma20', 'ma50', 'volatility', 'day_of_week']]
    test_exog = test_data[['volume', 'range', 'ma20', 'ma50', 'volatility', 'day_of_week']]
    
    # Get in-sample predictions for train/validation
    train_pred = model.predict(start=0, end=len(train_data)-1, exog=train_exog)
    
    # Get out-of-sample predictions for test
    test_pred = model.forecast(steps=len(test_data), exog=test_exog)
    
    # Combine predictions
    predictions = pd.concat([train_pred, pd.Series(test_pred, index=test_data.index)])
    
    # Shift predictions forward by 3 days to match target
    predictions = predictions.shift(3)
    
    return predictions

# Train model
model = train_arima_model(arima_df_clean)

# Make predictions
predictions = make_predictions(model, arima_df_clean)

# Add predictions to DataFrame
arima_df_clean['yhat'] = predictions
arima_df_clean['yhat_lower'] = predictions * 0.95  # Simple 5% confidence interval
arima_df_clean['yhat_upper'] = predictions * 1.05


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


An unsupported index was provided. As a result, forecasts cannot be generated. To use the model for forecasting, use one of the supported classes of index.


Non-stationary starting autoregressive parameters found. Using zeros as starting parameters.


Non-invertible starting MA parameters found. Using zeros as starting parameters.


Maximum Likelihood optimization failed to converge. Check mle_retvals


No supported index is available. Prediction results will be given with an integer index beginning at `start`.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: 

In [58]:
## Visualize Results
def plot_arima_analysis(forecast_df, split_column='split'):
    # Create figure with secondary y-axis
    fig = make_subplots(rows=2, cols=1, 
                       shared_xaxes=True,
                       vertical_spacing=0.1,
                       subplot_titles=('Actual vs Predicted', 'Residuals'))
    
    # Calculate residuals
    forecast_df['residuals'] = forecast_df['y'] - forecast_df['yhat']
    
    # Define colors for each split to match Prophet style
    split_colors = {
        'train': 'rgb(0, 114, 178)',      # Blue
        'validation': 'rgb(240, 228, 66)', # Yellow
        'test': 'rgb(230, 159, 0)'        # Orange
    }
    
    # Add traces for each split (train/validation/test)
    for split in forecast_df[split_column].unique():
        mask = forecast_df[split_column] == split
        split_data = forecast_df[mask]
        color = split_colors[split]
        
        # Main plot
        fig.add_trace(
            go.Scatter(x=split_data.ds, y=split_data.y,
                      name=f'Actual ({split})',
                      mode='markers',
                      marker=dict(color=color)),
            row=1, col=1)
        
        fig.add_trace(
            go.Scatter(x=split_data.ds, y=split_data.yhat,
                      name=f'Predicted ({split})',
                      mode='lines',
                      line=dict(color=color)),
            row=1, col=1)
            
        # Confidence intervals
        fig.add_trace(
            go.Scatter(x=split_data.ds, y=split_data.yhat_upper,
                      fill=None,
                      mode='lines',
                      line=dict(width=0),
                      showlegend=False),
            row=1, col=1)
            
        # Extract RGB values from color string and add alpha
        rgb_values = [int(x) for x in color.replace('rgb(', '').replace(')', '').split(',')]
        rgba_color = f'rgba({rgb_values[0]}, {rgb_values[1]}, {rgb_values[2]}, 0.2)'
        
        fig.add_trace(
            go.Scatter(x=split_data.ds, y=split_data.yhat_lower,
                      fill='tonexty',
                      mode='lines',
                      line=dict(width=0),
                      fillcolor=rgba_color,
                      name=f'CI ({split})'),
            row=1, col=1)
        
        # Residuals plot
        fig.add_trace(
            go.Scatter(x=split_data.ds, y=split_data.residuals,
                      name=f'Residuals ({split})',
                      mode='markers',
                      marker=dict(color=color)),
            row=2, col=1)
    
    # Update layout
    fig.update_layout(
        height=800,
        showlegend=True,
        title_text="ARIMA Forecast Analysis with Residuals",
        template='plotly_white'
    )
    
    # Update axes labels
    fig.update_xaxes(title_text="Date", row=2, col=1)
    fig.update_yaxes(title_text="Value", row=1, col=1)
    fig.update_yaxes(title_text="Residual", row=2, col=1)
    
    return fig

# Plot analysis
fig = plot_arima_analysis(arima_df_clean)
fig.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

