Get Naive model on the same tickers used for my sample

In [1]:
import requests
import yfinance as yf
import pandas as pd
import random
import numpy as np
from prophet import Prophet
from prophet.diagnostics import cross_validation, performance_metrics
from sklearn.metrics import mean_squared_error

In [2]:
rmses_df = pd.read_csv('combined_model_rmse.csv')
rmses_df.head()

Unnamed: 0,ticker,volatility,avg_price,len,rmse_standard,rmse_tuned,winsorized_pct
0,ADYEY,Medium,16.389898,1347,5.08775,7.321585,0.397179
1,NMM,Medium,63.829555,4339,23.137863,18.612367,0.373358
2,MARK,High,214.811263,5455,10.280774,4.45447,0.273877
3,VIGI,Low,63.282331,2251,12.586773,3.847514,0.54598
4,GDRX,Medium-High,17.561466,1102,2.904431,1.844597,0.259528


In [3]:
tickers = rmses_df['ticker'].tolist()
tickers

['ADYEY',
 'NMM',
 'MARK',
 'VIGI',
 'GDRX',
 'FTGS',
 'GOVX',
 'DFEM',
 'WABC',
 'SDA',
 'AAOI',
 'CRGY',
 'JHMD',
 'XLK',
 'FTLS',
 'CL',
 'MPWR',
 'THQ',
 'EFX',
 'FFTY',
 'EFSC',
 'WDC',
 'JG',
 'MYRG',
 'DBC',
 'VFLO',
 'CCL',
 'SPTN',
 'OILD',
 'REZI',
 'WBD',
 'BILS',
 'GMPR',
 'AHEXY',
 'UDMY',
 'ARTY',
 'GFAI',
 'FPEI',
 'VINC',
 'VERV',
 'HLVX',
 'CLDT',
 'RELI',
 'GXO',
 'AZTA',
 'FBEC',
 'ILPT',
 'WTW',
 'CDP',
 'CNK',
 'PFS',
 'TPC',
 'TCHP',
 'VYX',
 'GWW',
 'TDTT',
 'WB',
 'TMO',
 'CVLT',
 'USIO',
 'CNMD',
 'SGBX',
 'MUST',
 'SUI',
 'QNCX',
 'FOXA',
 'XOM',
 'INVA',
 'AIXI',
 'CMA',
 'WU',
 'EFV',
 'HIGH',
 'WEX',
 'DEO',
 'ANGO',
 'HOMB',
 'DCOM',
 'SAH',
 'NEAR',
 'CALX',
 'OEC',
 'IVOL',
 'FCPT',
 'SNCY',
 'BLV',
 'XLU',
 'STIP',
 'PGF',
 'WJRYY',
 'GMNI',
 'STRL',
 'FAF',
 'BSY',
 'ETNB',
 'HPE',
 'FTDR',
 'DY',
 'MUB',
 'VGIT',
 'SGRY',
 'SE',
 'XLP',
 'ARMN',
 'MRNA',
 'APP',
 'RDCM',
 'ENGN',
 'AOR',
 'PED',
 'BBW',
 'CYTK',
 'APM',
 'GDLC',
 'IGLB',
 'ACON',
 'BL

In [4]:
def load_data(ticker):
    """
    Downloads historical market data for a given ticker symbol.

    Parameters:
    ticker (str): The ticker symbol of the stock to download data for.

    Returns:
    pd.DataFrame: A DataFrame containing the historical market data for the specified ticker.
    """
    data = yf.download(ticker, period='max') # returns relevant data in df
    data.reset_index(inplace=True) # reset multindex, output is index list of tuples
    cols = list(data.columns) # convert index to list
    # cols[0] = ('Date', '') 
    # cols = [i[0] for i in cols] # return first element of cols tuples
    # data.columns = cols # set as column names
    data['Date'] = pd.to_datetime(data['Date']).dt.date
    data.Date = data.Date.astype('datetime64[ns]')
    return data

In [5]:
df_temp = load_data('AAPL')
df_temp.columns

[*********************100%%**********************]  1 of 1 completed


Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume'], dtype='object')

In [6]:
def get_volatility(data):
    volatility = data.daily_returns.std() * np.sqrt(252)
    if volatility < 0.2:
        category = "Low"
        percentiles=(0.15, 0.85)
    elif volatility < 0.4:
        category = "Medium-Low"
        percentiles=(0.1, 0.9)
    elif volatility < 0.6:
        category = "Medium"
        percentiles=(0.1, 0.9)
    elif volatility < 0.8:
        category = "Medium-High"
        percentiles=(0.05, 0.95)
    else:
        category = "High"
        percentiles=(0.05, 0.95)
    return category, volatility, percentiles

In [7]:
def get_period_params(data, volatility):
    if len(data)/365 < 8:
        period_unit = int(len(data)/4)
        forecast_period = period_unit
        train_period = len(data)
    else:
        period_unit = 365
        forecast_period = period_unit
        train_period = forecast_period * 4 if volatility < 0.6 else forecast_period * 8
    return train_period, period_unit, forecast_period

In [8]:
cv_func = lambda model_name, train_period, period_unit, forecast_period: cross_validation(model_name, 
                                              initial=f'{train_period} days', 
                                              period=f'{period_unit} days', 
                                              horizon=f'{forecast_period} days', 
                                              parallel="processes")

In [9]:
def naive_forecast(history):
    """
    Performs a naive forecast using the last value in the history.

    Parameters:
    history (pd.Series): Time series data to use as history.

    Returns:
    float: The naive forecast (last value of history).
    """
    return history.iloc[-1]

In [10]:
def naive_cross_validation(data, initial, period, horizon):
    """
    Performs cross-validation for a naive forecasting model.

    Parameters:
    data (pd.DataFrame): Time series data with 'Date' and 'Close' columns.
    initial (int): Initial training period length in days.
    period (int): Period between cutoffs in days.
    horizon (int): Forecast horizon in days.

    Returns:
    pd.DataFrame: DataFrame containing actual values, forecasts, and cutoffs.
    """
    initial_date = data['Date'].iloc[0]
    cutoffs = pd.to_datetime([initial_date + pd.Timedelta(days=initial + i * period)
                               for i in range(1 + (len(data) - initial - horizon) // period)])

    periods_list = []
    for cutoff in cutoffs:
        train_df = data[data['Date'] <= cutoff].copy()
        test_df = data[(data['Date'] > cutoff) & (data['Date'] <= cutoff + pd.Timedelta(days=horizon))].copy()

        # if len(test_df) == 0: # Skip if no data in the horizon
        #     continue

        history = train_df['Close']
        forecasts = [naive_forecast(history)] * len(test_df) # Naive forecast for all points in horizon

        test_df['forecast'] = forecasts
        test_df['cutoff'] = cutoff.date() # Store cutoff date

        periods_list.append(test_df)

    df_cv = pd.concat(periods_list).reset_index(drop=True)
    return df_cv

In [11]:
def calculate_rmse(cv_results):
    """
    Calculates RMSE from cross-validation results DataFrame.

    Parameters:
    cv_results (pd.DataFrame): DataFrame from naive_cross_validation.

    Returns:
    float: RMSE value.
    """
    if cv_results.empty:
        return np.nan  # Return NaN if no cross-validation results

    return np.sqrt(mean_squared_error(cv_results['Close'], cv_results['forecast']))

In [12]:
def get_naive_rmse_for_tickers(tickers):
    """
    Calculates naive model RMSE for a list of stock tickers.

    Parameters:
    tickers (list): List of stock ticker symbols.

    Returns:
    pd.DataFrame: DataFrame with ticker symbols and their naive model RMSEs.
    """
    naive_rmse_results = []
    for ticker in tickers:
        data = load_data(ticker)
        data['daily_returns'] = data.Close.pct_change()
        category, volatility, percentiles = get_volatility(data)
        train_period, period_unit, forecast_period = get_period_params(data, volatility)

        initial_days = train_period
        period_days = period_unit
        horizon_days = forecast_period

        cv_results_naive = naive_cross_validation(
            data[['Date', 'Close']], # Ensure only Date and Close are passed
            initial=initial_days,
            period=period_days,
            horizon=horizon_days
        )
        rmse_naive = calculate_rmse(cv_results_naive)
        naive_rmse_results.append({'Ticker': ticker, 'Naive_RMSE': rmse_naive})
        print(f"Naive RMSE for {ticker}: {rmse_naive:.4f}")

    return pd.DataFrame(naive_rmse_results)

In [13]:
naive_rmse_df = get_naive_rmse_for_tickers(tickers)
naive_rmse_df.head()

[*********************100%%**********************]  1 of 1 completed


ValueError: No objects to concatenate