Visualize how model accuracy varies with the volatility of the ticker.
- pull in apis
- get the volatility score and volatility category for each ticker. At the end, I can put them on a scatterplot with the color reflecting the category and the x-axis reflecting the score. There should be a positive correlation.
- randomly get 30 tickers for each category. fit model and get metrics. 
    - use if statements

In [1]:
import requests
import yfinance as yf
import pandas as pd
import random
import numpy as np
from prophet import Prophet
from prophet.diagnostics import cross_validation, performance_metrics

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# CSV holding the standard model's per-ticker SMAPE results (absolute local path).
standard_model_smapes_path = r"C:\Users\Shane\Desktop\2024.06.27_-_Data_Science\2024.10.13 - Portfolio Projects\2024.12.04 - Swing Ticker\SMAPE Model Eval\standard_model.csv"

In [3]:
# Load the standard-model results and preview the first rows.
standard_model_smapes = pd.read_csv(standard_model_smapes_path)
standard_model_smapes.head()

Unnamed: 0,ticker,volatility,smape,avg_price,len
0,WPP,Medium,0.270578,32.136355,9355
1,CVGW,Medium,0.162737,31.071787,5681
2,DFH,Medium,0.3057,22.145518,1023
3,RCEL,Medium-High,0.915579,13.289237,3209
4,C,Medium-Low,0.325752,79.326248,12132


In [4]:
# Take the tickers from the standard-model results; these are the symbols scored below.
ticker_list = standard_model_smapes['ticker'].tolist()
ticker_list[0:5]

['WPP', 'CVGW', 'DFH', 'RCEL', 'C']

In [5]:
def load_data(ticker):
    """
    Downloads historical market data for a given ticker symbol.

    The maximum available history is requested, the index is moved into a regular
    first column named 'Date', and the column labels are flattened to plain strings.

    Parameters:
    ticker (str): The ticker symbol of the stock to download data for.

    Returns:
    pd.DataFrame: A DataFrame containing the historical market data for the specified ticker,
    with a 'Date' column stored as datetime64[ns] (any time-of-day component is dropped).
    """
    data = yf.download(ticker, period='max')  # returns relevant data in df
    data.reset_index(inplace=True)  # the former index becomes the first column

    # Column labels may be tuples (MultiIndex columns) or plain strings depending on how
    # yfinance shapes its result. Keep the first element of tuples and leave strings alone:
    # indexing a plain string with [0] would truncate it to its first character.
    flat_cols = [col[0] if isinstance(col, tuple) else col for col in data.columns]
    flat_cols[0] = 'Date'  # the first column is always the former index
    data.columns = flat_cols

    # Reduce timestamps to calendar dates, then store them back as datetime64[ns]
    data['Date'] = pd.to_datetime(data['Date']).dt.date
    data['Date'] = data['Date'].astype('datetime64[ns]')
    return data

In [None]:
def get_volatility(data):
    """
    Classify a ticker by the annualized volatility of its daily returns.

    The standard deviation of the 'daily_returns' column is scaled by sqrt(252) and
    compared against fixed upper bounds (0.2, 0.4, 0.6, 0.8). Each band has a label and
    a (lower, upper) percentile pair; anything not below 0.8 falls into "High".

    Parameters:
    data (DataFrame): A pandas DataFrame containing a 'daily_returns' column with daily return values.

    Returns:
    tuple: A tuple containing the volatility category (str), the calculated volatility (float),
    and the (lower, upper) percentile pair associated with that category.
    """
    # (exclusive upper bound, category label, percentile pair), checked in ascending order
    bands = (
        (0.2, "Low", (0.15, 0.85)),
        (0.4, "Medium-Low", (0.1, 0.9)),
        (0.6, "Medium", (0.1, 0.9)),
        (0.8, "Medium-High", (0.05, 0.95)),
    )

    volatility = data.daily_returns.std() * np.sqrt(252)

    for upper_bound, label, pct_pair in bands:
        if volatility < upper_bound:
            return label, volatility, pct_pair

    # No bound matched (this also covers a NaN volatility, which fails every comparison)
    return "High", volatility, (0.05, 0.95)

In [None]:
def get_period_params(data, volatility):
    """
    Derive the training, period-unit and forecast lengths from data length and volatility.

    The number of rows in `data` is divided by 365 to estimate the years of history.
    With fewer than 8 years, the whole history is the training period and both the period
    unit and the forecast period are a quarter of the row count. Otherwise the period unit
    and forecast period are 365, and the training period is 4 times that when volatility
    is below 0.6 and 8 times that in every other case.

    NOTE(review): rows are counted as if each were one calendar day; confirm this is
    intended if `data` only contains trading days.

    Parameters:
    data (DataFrame): A pandas DataFrame containing the data (only its length is used).
    volatility (float): The calculated annualized volatility.

    Returns:
    tuple: A tuple containing:
        - train_period (int): The length of the training period.
        - period_unit (int): The length of the period unit.
        - forecast_period (int): The length of the forecast period.
    """
    n_rows = len(data)

    # Short history: train on everything, step and forecast by a quarter of it
    if n_rows / 365 < 8:
        quarter = int(n_rows / 4)
        return n_rows, quarter, quarter

    # Long history: yearly step and horizon, training window scaled by volatility
    one_year = 365
    years_of_training = 4 if volatility < 0.6 else 8
    return one_year * years_of_training, one_year, one_year

In [None]:
def cv_func(model_name, train_period, period_unit, forecast_period):
    """
    Perform Prophet cross-validation with the specified window parameters.

    Each numeric argument is formatted as an "<n> days" string before being handed to
    prophet.diagnostics.cross_validation, which is run with parallel="processes".

    Parameters:
    model_name: The Prophet model to cross-validate (passed through unchanged).
    train_period (int): Size of the initial training window, in days.
    period_unit (int): Value used for the `period` argument, in days.
    forecast_period (int): Forecast horizon, in days.

    Returns:
    The result of cross_validation for the given model and windows.
    """
    return cross_validation(
        model_name,
        initial=f'{train_period} days',
        period=f'{period_unit} days',
        horizon=f'{forecast_period} days',
        parallel="processes",
    )

In [9]:
# Winsorize Function
# ------------------------------------------------------------------
def dynamic_winsorize(df, column, percentiles, window_size=30):
    """
    Winsorizes data within a rolling window.

    For every row, the lower and upper quantiles of `column` over the trailing
    `window_size` rows are computed. Values below the lower bound are raised to it and
    values above the upper bound are lowered to it. Rows whose window is not yet full
    have no bounds and keep their original value.

    The input DataFrame is modified in place: the columns 'rolling_lower',
    'rolling_upper', 'winsorized' and 'winsorized_pct' are added (or overwritten).

    Args:
        df: DataFrame containing the data.
        column: Name of the column to winsorize.
        percentiles: Tuple containing the lower and upper percentiles.
        window_size: Size of the rolling window.

    Returns:
        The same DataFrame, with the winsorized column and helper columns added.
        'winsorized_pct' holds, on every row, the fraction of rows whose winsorized
        value differs from the original value in `column`.
    """
    rolling_window = df[column].rolling(window=window_size)
    df['rolling_lower'] = rolling_window.quantile(percentiles[0])
    df['rolling_upper'] = rolling_window.quantile(percentiles[1])

    # Start from the raw values, then pull out-of-band points back to the nearest bound
    df['winsorized'] = df[column]
    df.loc[df[column] < df['rolling_lower'], 'winsorized'] = df['rolling_lower']
    df.loc[df[column] > df['rolling_upper'], 'winsorized'] = df['rolling_upper']

    # Share of points that were changed. Compare against the column that was actually
    # winsorized rather than a hard-coded 'Close', so other columns are measured correctly.
    winsorized_percentage = np.mean(df[column] != df['winsorized'])
    df['winsorized_pct'] = winsorized_percentage

    return df

In [10]:
def tune_and_train_final_model(data, df_train, all_params, train_period, period_unit, forecast_period):
    """
    Grid-searches Prophet hyperparameters, refits with the best set, and returns the final SMAPE.

    Every parameter combination is fitted on `df_train` and scored with cross-validation.
    The combination with the lowest SMAPE is then refitted on the last `train_period` rows
    of the original 'Close' prices from `data` (not on `df_train`) and cross-validated again.

    Parameters:
    data (pd.DataFrame): Full price data; must contain 'Date' and 'Close' columns.
    df_train (pd.DataFrame): Training data with 'ds' (date) and 'y' (target) columns used during tuning.
    all_params (list): List of dictionaries containing hyperparameter combinations to be tested.
    train_period (int): Initial training window passed to cross-validation; also the number of
        trailing rows of `data` used to fit the final model.
    period_unit (int): Period value passed to cross-validation.
    forecast_period (int): Forecast horizon passed to cross-validation.

    Returns:
    float: The 'smape' value of the final model, read from the first row of
    performance_metrics(..., rolling_window=1).
    """

    def _cv_smape(fitted_model):
        # Cross-validate an already fitted model and read its SMAPE
        cv_frame = cv_func(fitted_model, train_period, period_unit, forecast_period)
        metrics = performance_metrics(cv_frame, rolling_window=1)
        return metrics['smape'].values[0]

    # Score every candidate combination on the supplied training frame
    grid_scores = [_cv_smape(Prophet(**candidate).fit(df_train)) for candidate in all_params]

    # Rank candidates by SMAPE and keep the parameters of the best one
    ranking = pd.DataFrame(all_params)
    ranking['smape'] = grid_scores
    ranking = ranking.sort_values('smape').reset_index(drop=True)
    best_params_dict = dict(ranking.drop('smape', axis='columns').iloc[0])

    # Go back to the original closing prices for the final fit
    raw_train = data[['Date', 'Close']].iloc[-train_period:]
    raw_train.columns = ['ds', 'y']

    # Fit with the winning parameters and report the cross-validated SMAPE
    final_model = Prophet(**best_params_dict)
    final_model.fit(raw_train)
    return _cv_smape(final_model)

In [None]:
import itertools

def score_ticker_smapes(ticker_list):
    """
    Score and return SMAPE values for a list of tickers using a tuned model.

    For each ticker the function loads its historical data, computes daily returns and
    volatility, winsorizes 'Close' with the percentiles chosen for that volatility, and
    runs a grid search over Prophet hyperparameters via tune_and_train_final_model.

    Tickers for which no price rows are returned are skipped with a RuntimeWarning
    instead of aborting the whole run. A ticker is only recorded once its model has been
    scored, so the three returned lists always have the same length and order.

    Parameters:
    ticker_list (list): A list of ticker symbols to be modeled.

    Returns:
    tuple: A tuple containing:
        - smapes (list): A list of SMAPE values for the modeled tickers.
        - tickers_modeled (list): A list of tickers that were successfully modeled.
        - winsorized_pct (list): A list of the final winsorized percentage values for the modeled tickers.
    """
    import warnings  # local import: only needed for the skip notice below

    smapes = []
    tickers_modeled = []
    winsorized_pct = []
    param_grid = {
        'changepoint_prior_scale': [0.001, 0.01, 0.1, 0.5],
        'seasonality_prior_scale': [0.01, 0.1, 1.0, 10.0]
        }
    # Every combination of the grid values, as a list of keyword dictionaries
    all_params = [dict(zip(param_grid.keys(), v)) for v in itertools.product(*param_grid.values())]

    for ticker in ticker_list:
        data = load_data(ticker)
        if data.empty:
            # Nothing to model; continuing would fail later when reading the last row
            warnings.warn(
                f"No price data returned for {ticker!r}; skipping.",
                RuntimeWarning,
                stacklevel=2,
            )
            continue

        data['daily_returns'] = data.Close.pct_change()
        _category, volatility, percentiles = get_volatility(data)
        train_period, period_unit, forecast_period = get_period_params(data, volatility)
        data = dynamic_winsorize(data, 'Close', percentiles=percentiles)

        # Tune on the winsorized series, restricted to the trailing training window
        df_train = data[['Date', 'winsorized']].iloc[-train_period:]
        df_train.columns = ['ds', 'y']
        ticker_smape = tune_and_train_final_model(
            data, df_train, all_params, train_period, period_unit, forecast_period
        )

        # Record results only after modelling finished so the lists stay aligned
        tickers_modeled.append(ticker)
        winsorized_pct.append(data.winsorized_pct.values[-1])
        smapes.append(ticker_smape)
    return smapes, tickers_modeled, winsorized_pct

In [12]:
# Run the tuned pipeline (download, winsorize, grid search, cross-validate) for every ticker.
smapes, tickers_modeled, winsorized_pct = score_ticker_smapes(ticker_list)

[*********************100%***********************]  1 of 1 completed
20:09:51 - cmdstanpy - INFO - Chain [1] start processing
20:09:51 - cmdstanpy - INFO - Chain [1] done processing
20:09:52 - cmdstanpy - INFO - Chain [1] start processing
20:09:52 - cmdstanpy - INFO - Chain [1] done processing
20:09:53 - cmdstanpy - INFO - Chain [1] start processing
20:09:53 - cmdstanpy - INFO - Chain [1] done processing
20:09:54 - cmdstanpy - INFO - Chain [1] start processing
20:09:54 - cmdstanpy - INFO - Chain [1] done processing
20:09:55 - cmdstanpy - INFO - Chain [1] start processing
20:09:55 - cmdstanpy - INFO - Chain [1] done processing
20:09:56 - cmdstanpy - INFO - Chain [1] start processing
20:09:56 - cmdstanpy - INFO - Chain [1] done processing
20:09:57 - cmdstanpy - INFO - Chain [1] start processing
20:09:57 - cmdstanpy - INFO - Chain [1] done processing
20:09:58 - cmdstanpy - INFO - Chain [1] start processing
20:09:58 - cmdstanpy - INFO - Chain [1] done processing
20:09:59 - cmdstanpy - INFO

In [13]:
# Number of tickers recorded by the run.
len(tickers_modeled)

150

In [14]:
# Number of SMAPE values returned; should equal len(tickers_modeled).
len(smapes)

150

In [15]:
# Collect the tuned-model results into one row per ticker and preview them.
tuned_smape = pd.DataFrame({
    'ticker': tickers_modeled, 
    'smape': smapes,
    'winsorized_pct': winsorized_pct
    })
tuned_smape.head()

Unnamed: 0,ticker,smape,winsorized_pct
0,WPP,0.075833,0.364725
1,CVGW,0.163886,0.363316
2,DFH,0.239105,0.418377
3,RCEL,0.627327,0.267061
4,C,0.241818,0.368035


In [16]:
# Destination CSV for the tuned-model results (absolute local path).

tuned_smape_path = r"C:\Users\Shane\Desktop\2024.06.27_-_Data_Science\2024.10.13 - Portfolio Projects\2024.12.04 - Swing Ticker\Model Performance\tuned_model.csv"

In [17]:
# Write the tuned-model results to disk without the DataFrame index.
tuned_smape.to_csv(tuned_smape_path, index=False)

In [18]:
# Join tuned and standard results on ticker; the shared 'smape' column is split into
# 'smape_tuned' and 'smape_standard' by the suffixes.
combined_model_smape = pd.merge(tuned_smape, standard_model_smapes, on='ticker', suffixes=('_tuned', '_standard'))

In [19]:
# Inspect the column names produced by the merge.
combined_model_smape.columns

Index(['ticker', 'smape_tuned', 'winsorized_pct', 'volatility',
       'smape_standard', 'avg_price', 'len'],
      dtype='object')

In [20]:
# Reorder columns: ticker attributes first, then the two SMAPE scores next to each other.
combined_model_smape = combined_model_smape[['ticker', 'volatility', 'avg_price', 'len', 'smape_standard', 'smape_tuned', 'winsorized_pct']]
combined_model_smape.head()

Unnamed: 0,ticker,volatility,avg_price,len,smape_standard,smape_tuned,winsorized_pct
0,WPP,Medium,32.136355,9355,0.270578,0.075833,0.364725
1,CVGW,Medium,31.071787,5681,0.162737,0.163886,0.363316
2,DFH,Medium,22.145518,1023,0.3057,0.239105,0.418377
3,RCEL,Medium-High,13.289237,3209,0.915579,0.627327,0.267061
4,C,Medium-Low,79.326248,12132,0.325752,0.241818,0.368035


In [21]:
# Row count after the merge.
len(combined_model_smape)

150

In [None]:
# Destination CSV for the combined standard-vs-tuned comparison table (absolute local path).
combined_model_smape_path = r"C:\Users\Shane\Desktop\2024.06.27_-_Data_Science\2024.10.13 - Portfolio Projects\2024.12.04 - Swing Ticker\SMAPE Model Eval\combined_model_smape.csv"

In [23]:
# Write the combined comparison table to disk without the DataFrame index.
combined_model_smape.to_csv(combined_model_smape_path, index=False)