Get a visual representation of how model accuracy varies with the volatility of the ticker.
- Pull in the APIs.
- Get a volatility score and a volatility category for each ticker. At the end, I can put them on a scatterplot with the color reflecting the category and the x-axis reflecting the score. There should be a positive correlation.
- Randomly get 30 tickers for each category. Fit the model and get metrics.
    - Use if statements.

In [1]:
import requests
import yfinance as yf
import pandas as pd
import random
import numpy as np
from prophet import Prophet
from prophet.diagnostics import cross_validation, performance_metrics

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the results of the baseline ("standard") model run. The preview below
# shows one row per ticker with its volatility category, RMSE, average price
# and `len` (presumably the number of rows of price history - TODO confirm).
# NOTE(review): absolute, machine-specific path.
standard_model_rmses = pd.read_csv(r"C:\Users\Shane\Desktop\2024.06.27_-_Data_Science\2024.10.13 - Portfolio Projects\2024.12.04 - Swing Ticker\Model Performance\standard_model.csv")
standard_model_rmses.head()

Unnamed: 0,ticker,volatility,rmse,avg_price,len
0,ADYEY,Medium,5.08775,16.389898,1347
1,NMM,Medium,23.137863,63.829555,4339
2,MARK,High,10.280774,214.811263,5455
3,VIGI,Low,12.586773,63.282331,2251
4,GDRX,Medium-High,2.904431,17.561466,1102


In [3]:
# Score the tuned model on the same tickers the standard model was scored on,
# in the same order, so the two runs can be joined ticker-by-ticker later.
ticker_list = standard_model_rmses['ticker'].tolist()
ticker_list[0:5]

['ADYEY', 'NMM', 'MARK', 'VIGI', 'GDRX']

In [4]:
def load_data(ticker):
    """
    Downloads historical market data for a given ticker symbol.

    Parameters:
    ticker (str): The ticker symbol of the stock to download data for.

    Returns:
    pd.DataFrame: The maximum available history for the ticker with flat,
        single-level column names. The first column is always named 'Date'
        and holds datetime64[ns] values with the time of day dropped.
    """
    data = yf.download(ticker, period='max')
    # Move the date index into an ordinary column so it can be renamed below.
    data = data.reset_index()

    # yfinance may label columns with (field, ticker) tuples or with plain
    # strings depending on version/options. Keep only the first level of
    # tuple labels; indexing a plain string with [0] would wrongly reduce
    # 'Close' to 'C'.
    cols = [col[0] if isinstance(col, tuple) else col for col in data.columns]
    cols[0] = 'Date'  # the former index is always exposed as 'Date'
    data.columns = cols

    # Strip any time-of-day component, then store as datetime64[ns].
    data['Date'] = pd.to_datetime(data['Date']).dt.date
    data['Date'] = data['Date'].astype('datetime64[ns]')
    return data

In [5]:
def get_volatility(data):
    """
    Classify a price history by the annualized volatility of its returns.

    Parameters:
    data (pd.DataFrame): Must expose a `daily_returns` column.

    Returns:
    tuple: (category, volatility, percentiles) where `category` is one of
        "Low", "Medium-Low", "Medium", "Medium-High" or "High", `volatility`
        is std(daily_returns) * sqrt(252), and `percentiles` is the
        (lower, upper) pair associated with that category.
    """
    # 252 is presumably the number of trading days per year - TODO confirm.
    annualized = data.daily_returns.std() * np.sqrt(252)

    # (exclusive upper bound, category label, percentile pair), ascending.
    bands = (
        (0.2, "Low", (0.15, 0.85)),
        (0.4, "Medium-Low", (0.1, 0.9)),
        (0.6, "Medium", (0.1, 0.9)),
        (0.8, "Medium-High", (0.05, 0.95)),
    )
    for upper_bound, label, pct_pair in bands:
        if annualized < upper_bound:
            return label, annualized, pct_pair

    # Everything that matched no band, including a NaN volatility (every
    # `<` comparison is False), lands in the top category.
    return "High", annualized, (0.05, 0.95)

In [6]:
def get_period_params(data, volatility):
    """
    Derive the training window, cutoff spacing and forecast horizon.

    The row count of `data` is compared against eight 365-row "years".

    Parameters:
    data: Any sized object (only `len(data)` is used).
    volatility (float): Volatility score; values below 0.6 get the shorter
        training window when the history is long.

    Returns:
    tuple: (train_period, period_unit, forecast_period).
        - Long history (len(data) / 365 >= 8): period_unit and
          forecast_period are 365; train_period is 4 * 365 when
          volatility < 0.6, otherwise 8 * 365.
        - Short history: train_period is len(data); period_unit and
          forecast_period are a quarter of it, truncated to an int.
    """
    n_rows = len(data)

    if n_rows / 365 >= 8:
        span = 365
        # Keep the `<` form so a NaN volatility selects the longer window.
        if volatility < 0.6:
            window = span * 4
        else:
            window = span * 8
        return window, span, span

    span = int(n_rows / 4)
    return n_rows, span, span

In [7]:
def cv_func(model_name, train_period, period_unit, forecast_period):
    """
    Run Prophet's `cross_validation` with windows expressed in days.

    Parameters:
    model_name: The fitted model object handed to `cross_validation`
        (despite the parameter name, this is the model, not a string).
    train_period (int): Passed as `initial`, formatted as "<n> days".
    period_unit (int): Passed as `period`, formatted as "<n> days".
    forecast_period (int): Passed as `horizon`, formatted as "<n> days".

    Returns:
    Whatever `cross_validation` returns for these settings; callers in this
    notebook pass it straight to `performance_metrics`.
    """
    return cross_validation(
        model_name,
        initial=f'{train_period} days',
        period=f'{period_unit} days',
        horizon=f'{forecast_period} days',
        parallel="processes",
    )

In [8]:
# Winsorize Function
# ------------------------------------------------------------------
def dynamic_winsorize(df, column, percentiles, window_size=30):
    """
    Winsorizes data within a rolling window.

    Each value is compared with the lower/upper quantiles of the trailing
    window that ends at (and includes) that row, and is replaced by the
    bound it falls outside of.

    Args:
        df: DataFrame containing the data. It is modified in place; the
            columns 'rolling_lower', 'rolling_upper', 'winsorized' and
            'winsorized_pct' are added (or overwritten).
        column: Name of the column to winsorize.
        percentiles: Tuple containing the lower and upper percentiles,
            as fractions accepted by `Rolling.quantile`.
        window_size: Size of the rolling window.

    Returns:
        The same DataFrame object with the winsorized column. The scalar
        share of rows whose value was changed is broadcast into
        'winsorized_pct'.
    """
    values = df[column]
    windowed = values.rolling(window=window_size)
    df['rolling_lower'] = windowed.quantile(percentiles[0])
    df['rolling_upper'] = windowed.quantile(percentiles[1])

    # Rows whose bounds are NaN (no full window yet) compare False on both
    # sides and therefore keep their original value.
    too_low = values < df['rolling_lower']
    too_high = values > df['rolling_upper']
    df['winsorized'] = (
        values.mask(too_low, df['rolling_lower'])
              .mask(too_high, df['rolling_upper'])
    )

    # Share of rows changed, measured against the column that was actually
    # winsorized (previously hard-coded to 'Close').
    df['winsorized_pct'] = np.mean(values != df['winsorized'])

    return df

In [9]:
def tune_and_train_final_model(data, df_train, all_params, train_period, period_unit, forecast_period):
    """
    Tunes Prophet hyperparameters by cross-validated RMSE, refits the best
    configuration on the raw closing prices, and returns that model's RMSE.

    Parameters:
    data (pd.DataFrame): Full price history; must contain 'Date' and 'Close'.
        Its last `train_period` rows are used to fit the final model.
    df_train (pd.DataFrame): Data used for the hyperparameter search, with
        'ds' (date) and 'y' (target) columns.
    all_params (list): List of dictionaries containing hyperparameter
        combinations to be tested; each is passed to `Prophet(**params)`.
    train_period (int): Forwarded to `cv_func` (cross-validation `initial`)
        and used as the number of trailing rows for the final fit.
    period_unit (int): Forwarded to `cv_func` (cross-validation `period`).
    forecast_period (int): Forwarded to `cv_func` (cross-validation `horizon`).

    Returns:
    float: The first 'rmse' value reported by
        `performance_metrics(..., rolling_window=1)` for the final model.

    Raises:
    IndexError: If `all_params` is empty.
    """
    rmses = []
    for params in all_params:
        m = Prophet(**params).fit(df_train)
        df_cv = cv_func(m, train_period, period_unit, forecast_period)
        df_p = performance_metrics(df_cv, rolling_window=1)
        rmses.append(df_p['rmse'].values[0])

    # Find best parameters. Only the *position* of the lowest RMSE is taken
    # from the results table; the parameters themselves come from the
    # original dict so their types are not coerced by a DataFrame row
    # (e.g. an int parameter turning into a float).
    tuning_results = pd.DataFrame(all_params)
    tuning_results['rmse'] = rmses
    best_idx = int(tuning_results.sort_values('rmse', kind='stable').index[0])
    best_params_dict = all_params[best_idx]

    # Reload OG data: the final model is fit on the unmodified 'Close' prices.
    # Copy the slice so renaming its columns cannot touch `data`.
    df_final = data[['Date', 'Close']].iloc[-train_period:].copy()
    df_final.columns = ['ds', 'y']

    # Train final model with best parameters
    m = Prophet(**best_params_dict)
    m.fit(df_final)
    df_cv = cv_func(m, train_period, period_unit, forecast_period)
    df_p = performance_metrics(df_cv, rolling_window=1)

    return df_p['rmse'].values[0]

In [10]:
import itertools

def score_ticker_rmses(ticker_list):
    """
    Tune and score a Prophet model for every ticker in `ticker_list`.

    For each ticker the price history is downloaded, classified by
    volatility, winsorized on 'Close', and handed to
    `tune_and_train_final_model` together with a fixed 4 x 4 grid of
    changepoint / seasonality prior scales.

    Parameters:
    ticker_list (iterable of str): Ticker symbols to process, in order.

    Returns:
    tuple: (rmses, tickers_modeled, winsorized_pct) - three parallel lists
        holding the final RMSE, the ticker symbol, and the share of
        winsorized rows for each ticker.
    """
    changepoint_scales = [0.001, 0.01, 0.1, 0.5]
    seasonality_scales = [0.01, 0.1, 1.0, 10.0]
    # Full Cartesian grid; the changepoint scale varies slowest.
    all_params = [
        {'changepoint_prior_scale': cps, 'seasonality_prior_scale': sps}
        for cps in changepoint_scales
        for sps in seasonality_scales
    ]

    rmses, tickers_modeled, winsorized_pct = [], [], []

    for symbol in ticker_list:
        prices = load_data(symbol)
        prices['daily_returns'] = prices['Close'].pct_change()

        # The category label is not needed here, only the score and bounds.
        _, volatility, percentiles = get_volatility(prices)
        tickers_modeled.append(symbol)
        train_period, period_unit, forecast_period = get_period_params(prices, volatility)

        prices = dynamic_winsorize(prices, 'Close', percentiles=percentiles)
        winsorized_pct.append(prices['winsorized_pct'].values[-1])

        # Hyperparameters are tuned on the winsorized series.
        tuning_frame = prices[['Date', 'winsorized']].iloc[-train_period:]
        tuning_frame.columns = ['ds', 'y']

        score = tune_and_train_final_model(
            prices, tuning_frame, all_params, train_period, period_unit, forecast_period
        )
        rmses.append(score)

    return rmses, tickers_modeled, winsorized_pct

In [11]:
# Tune and score every ticker. For each ticker this fits one Prophet model per
# grid entry plus a final refit, each followed by cross-validation (see the
# cmdstanpy log below), so the cell is long-running.
rmses, tickers_modeled, winsorized_pct = score_ticker_rmses(ticker_list)

[*********************100%***********************]  1 of 1 completed
20:01:29 - cmdstanpy - INFO - Chain [1] start processing
20:01:29 - cmdstanpy - INFO - Chain [1] done processing
20:01:30 - cmdstanpy - INFO - Chain [1] start processing
20:01:30 - cmdstanpy - INFO - Chain [1] done processing
20:01:31 - cmdstanpy - INFO - Chain [1] start processing
20:01:31 - cmdstanpy - INFO - Chain [1] done processing
20:01:32 - cmdstanpy - INFO - Chain [1] start processing
20:01:32 - cmdstanpy - INFO - Chain [1] done processing
20:01:33 - cmdstanpy - INFO - Chain [1] start processing
20:01:33 - cmdstanpy - INFO - Chain [1] done processing
20:01:34 - cmdstanpy - INFO - Chain [1] start processing
20:01:34 - cmdstanpy - INFO - Chain [1] done processing
20:01:35 - cmdstanpy - INFO - Chain [1] start processing
20:01:35 - cmdstanpy - INFO - Chain [1] done processing
20:01:36 - cmdstanpy - INFO - Chain [1] start processing
20:01:36 - cmdstanpy - INFO - Chain [1] done processing
20:01:37 - cmdstanpy - INFO

In [12]:
# Sanity check: number of tickers processed.
len(tickers_modeled)

150

In [13]:
# Sanity check: there should be one RMSE per processed ticker.
len(rmses)

150

In [14]:
# Assemble the three parallel result lists into one table, one row per ticker.
tuned_rmse = pd.DataFrame({
    'ticker': tickers_modeled, 
    'rmse': rmses,
    'winsorized_pct': winsorized_pct
    })
tuned_rmse.head()

Unnamed: 0,ticker,rmse,winsorized_pct
0,ADYEY,7.321585,0.397179
1,NMM,18.612367,0.373358
2,MARK,4.45447,0.273877
3,VIGI,3.847514,0.54598
4,GDRX,1.844597,0.259528


In [23]:
# Destination CSV for the tuned-model results.
# NOTE(review): absolute, machine-specific path.

tuned_rmse_path = r"C:\Users\Shane\Desktop\2024.06.27_-_Data_Science\2024.10.13 - Portfolio Projects\2024.12.04 - Swing Ticker\Model Performance\tuned_model.csv"

In [24]:
# Write the tuned results; index=False keeps the row index out of the file.
tuned_rmse.to_csv(tuned_rmse_path, index=False)

In [25]:
# Join tuned and standard results on ticker. Both tables have an 'rmse'
# column, so the suffixes turn them into 'rmse_tuned' and 'rmse_standard'.
combined_model_rmse = pd.merge(tuned_rmse, standard_model_rmses, on='ticker', suffixes=('_tuned', '_standard'))

In [26]:
# Inspect the merged column names (confirms the suffixed RMSE columns).
combined_model_rmse.columns

Index(['ticker', 'rmse_tuned', 'winsorized_pct', 'volatility', 'rmse_standard',
       'avg_price', 'len'],
      dtype='object')

In [27]:
# Reorder columns: ticker descriptors first, then the two RMSEs side by side
# for comparison, then the winsorized share.
combined_model_rmse = combined_model_rmse[['ticker', 'volatility', 'avg_price', 'len', 'rmse_standard', 'rmse_tuned', 'winsorized_pct']]
combined_model_rmse.head()

Unnamed: 0,ticker,volatility,avg_price,len,rmse_standard,rmse_tuned,winsorized_pct
0,ADYEY,Medium,16.389898,1347,5.08775,7.321585,0.397179
1,NMM,Medium,63.829555,4339,23.137863,18.612367,0.373358
2,MARK,High,214.811263,5455,10.280774,4.45447,0.273877
3,VIGI,Low,63.282331,2251,12.586773,3.847514,0.54598
4,GDRX,Medium-High,17.561466,1102,2.904431,1.844597,0.259528


In [28]:
# Sanity check: row count after the merge, to compare with the number of
# tickers modeled above.
len(combined_model_rmse)

150

In [29]:
# Destination CSV for the combined (standard vs. tuned) comparison table.
# NOTE(review): absolute, machine-specific path.
combined_model_rmse_path = r"C:\Users\Shane\Desktop\2024.06.27_-_Data_Science\2024.10.13 - Portfolio Projects\2024.12.04 - Swing Ticker\Model Performance\combined_model_rmse.csv"

In [30]:
# Write the comparison table; index=False keeps the row index out of the file.
combined_model_rmse.to_csv(combined_model_rmse_path, index=False)