In [1]:
#!pip install ace_tools

In [2]:
import yfinance as yf
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler, Normalizer
import pandas_ta as ta
from pandas.tseries.offsets import MonthEnd, BDay, Week
from itertools import product
from sklearn.metrics import mean_absolute_error

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

from datetime import datetime, timedelta
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from skopt import BayesSearchCV

from skopt.space import Real, Integer, Categorical

import time

In [3]:
# Function for loading ETF data
def data_loading(ticker_symbol, start_date, end_date):
    etf_data = yf.Ticker(ticker_symbol)
    etf_history = etf_data.history(start=start_date, end=end_date)
    etf_history.index = etf_history.index.tz_localize(None)
    return etf_history, etf_data

#Function to derive the predictor columns
def etf_predictors(etf_history,  start_date, end_date, etf_data, benchmark_ticker = '^GSPC' ):
    
    # Calculate Daily Returns
    etf_history['Daily Return'] = etf_history['Close'].pct_change()

    # Calculate 21-Day Volatility (standard deviation of daily returns, annualized)
    etf_history['Volatility'] = etf_history['Daily Return'].rolling(window=21).std() * np.sqrt(252)
    #etf_history['Volatility_ta'] = ta.volatility(etf_history['Close'], window=21, annualize=True)
   
    # Get the ETF info
    etf_info = etf_data.info

    # Retrieve the net asset value price (NAV) and total net assets
    nav_price = etf_info['navPrice']
    total_assets = etf_info['totalAssets']

    # Calculate the number of shares outstanding
    shares_outstanding = total_assets / nav_price

    # Calculate Market Capitalization for each day
    # Market Capitalization = Closing Price * Total Number of Shares Outstanding
    etf_history['Market Cap'] = etf_history['Close'] * shares_outstanding

    total_assets = etf_info['totalAssets']
    total_liabilities = etf_info.get('totalLiabilities', 0)  # Handle the case where total liabilities might not be present
    
    # Calculate Book Value per Share
    book_value_per_share = (total_assets - total_liabilities) / shares_outstanding
    
    # Calculate Price to Book (P/B) Ratio
    etf_history['P/B Ratio'] = etf_history['Close'] / book_value_per_share
    
    # Calculate 1-Month Momentum (21 trading days)
    
    etf_history['Momentum'] = ta.mom(etf_history['Close'], length=21)
    
    benchmark_data = yf.download(benchmark_ticker, start=start_date, end=end_date)
    benchmark_data['Daily Return'] = benchmark_data['Close'].pct_change()
    #benchmark_data_history = benchmark_data.dropna()
    benchmark_data.head(10)
    
    # Ensure the indices are time zone-naive
    benchmark_data.index = benchmark_data.index.tz_localize(None)
    combined_data = etf_history[['Close']].join(benchmark_data[['Close']], lsuffix='_ETF', rsuffix='_Benchmark')
    combined_data = combined_data.dropna()
    
    # Calculate rolling beta with a 30-day window
    rolling_beta_21 = rolling_beta(combined_data, window=21)

    # Add the rolling beta to the dataframe
    combined_data.loc[:, 'Rolling Beta 21-day'] = rolling_beta_21
    etf_history['Rolling Beta']=combined_data['Rolling Beta 21-day']
    
    # Calculate daily profitability
    daily_profitabilities = []
    previous_nav = None
    for index, row in etf_history.iterrows():
        current_nav = row['Close']  # Current day's NAV
        #print(current_nav)
        if previous_nav is not None:
            daily_profitability = (current_nav - previous_nav) / previous_nav * 100
            daily_profitabilities.append(daily_profitability)
        else:
            daily_profitabilities.append(None)
        previous_nav = current_nav  # Update previous_nav for the next iteration

    # Add daily profitabilities to ETF dataset
    etf_history['Daily Profitability (%)'] = daily_profitabilities
    
    # Calculate the dividend yield for each day
    dividend_yields = []

    for index, row in etf_history.iterrows():
        # Get the dividend payment for the day
        dividend_payment = row['Dividends']

        # Get the current market price of the ETF for the day
        current_price = row['Close']

        # Calculate the dividend yield for the day
        dividend_yield = (dividend_payment / current_price) * 100

        # Append the dividend yield to the list
        dividend_yields.append(dividend_yield)
      
    etf_history['Div yield'] = dividend_yields

    #Volatility
    etf_history['ATR'] = ta.atr(etf_history['High'], etf_history['Low'], etf_history['Close'], length=21)    
    
    # Compute the Relative Volatility Index (RVI)
    rvi = ta.rvi(etf_history['Close'],length=21)
    etf_history['RVI'] = rvi
    
    #Momentum
    rsi_window = 14  # Window size for RSI calculation
    roc_window = 12  # Window size for ROC calculation
    #rsi_window = 21  
    #roc_window = 21  

    

    etf_history['RSI'] = ta.rsi(etf_history['Close'],length=rsi_window)
    # Calculate Rate of Change (ROC)
    etf_history['ROC'] = ta.roc(etf_history['Close'], length=roc_window)
        
    etf_history['log_returns'] = np.log(etf_history['Close'] / etf_history['Close'].shift(1))

    return etf_history

# Function to calculate rolling beta
def rolling_beta(df, window):
    rolling_cov = df['Close_Benchmark'].rolling(window=window).cov(df['Close_ETF'])
    rolling_var = df['Close_Benchmark'].rolling(window=window).var()
    rolling_beta = rolling_cov / rolling_var
    return rolling_beta



In [4]:
def pre_processing(
    etf_history, 
    train_start_date,
    train_end_date,
    #test_start_date,
    #test_end_date,
    prediction_dates_monthly,
    prediction_dates_weekly,
    feature_columns=None,
    scaling_strategy='StandardScaler', 
    final_end_date='2023-01-01'
):
    """
    Preprocess ETF history data and prepare train/test splits along with 
    dynamically scaled forecast data for both weekly & monthly horizons,
    stored in separate dictionaries.

    Parameters
    ----------
    etf_history : pd.DataFrame
        Full historical data with at least 'log_returns' column plus other features.
    train_start_date : str
        Start date (YYYY-MM-DD) for training data.
    train_end_date : str
        End date (YYYY-MM-DD) for training data.
    
    prediction_dates_monthly : list of str
        List of start dates for each monthly forecast window.
    prediction_dates_weekly : list of str
        List of start dates for each weekly forecast window.
    feature_columns : list of str, optional
        Columns to be used as features. If None, defaults are used.
    scaling_strategy : str, optional
        'StandardScaler' or 'Normalizer'. Defaults to 'StandardScaler'.
    final_end_date : str, optional
        The final end date (YYYY-MM-DD) for the last forecast window.
        Defaults to '2025-01-01'.

    Returns
    -------
    dict
        A dictionary containing:
        - 'train_data_scaled': np.ndarray
        - 'test_data_scaled': np.ndarray
        - 'y_train': pd.Series
        - 'y_test': pd.Series
        - 'scaler': scaler object
        - 'X': pd.DataFrame (all feature columns before splitting)
        - 'etf_history': pd.DataFrame (original data with missing values handled)
        - 'forecast_data_weekly': dict of weekly forecast DataFrame slices
        - 'forecast_data_monthly': dict of monthly forecast DataFrame slices
        - 'forecast_data_weekly_scaled': dict of scaled weekly forecast slices
        - 'forecast_data_monthly_scaled': dict of scaled monthly forecast slices
    """

    # 1. Handle missing values and infinite values
    etf_history.fillna(etf_history.median(), inplace=True)
    etf_history.replace([np.inf, -np.inf], 0, inplace=True)

    # 2. Default feature columns if none are provided
    if feature_columns is None:
        feature_columns = [
            'Volatility', 'Volume', 'Daily Return', 'Market Cap', 
            'P/B Ratio', 'Momentum', 'Rolling Beta', 
            'Daily Profitability (%)', 'ATR', 'RVI', 'RSI', 'ROC'
        ]

    # 3. Select features (X) and target (y)
    X = etf_history.loc[:, feature_columns]
    y = etf_history['log_returns']

    # 4. Split train and test data
    train_data = X.loc[train_start_date : train_end_date]
    #test_data  = X.loc[test_start_date  : test_end_date]
    y_train = y.loc[train_start_date : train_end_date]
    #y_test  = y.loc[test_start_date  : test_end_date]
    
    #print(train_data.head(5))
    #print(train_data.tail(5))
    # 5. Initialize and fit scaler
    if scaling_strategy == 'StandardScaler':
        scaler = StandardScaler()
    elif scaling_strategy == 'Normalizer':
        scaler = Normalizer()
    else:
        raise ValueError(f"Unsupported scaling strategy: {scaling_strategy}")

    train_data_scaled = scaler.fit_transform(train_data)
    #test_data_scaled  = scaler.transform(test_data)

    # 6. Create separate dictionaries for weekly and monthly
    forecast_data_weekly = {}
    forecast_data_monthly = {}

    # ---- Weekly Forecast Slices ----
    for i, start_date in enumerate(prediction_dates_weekly):
        if i < len(prediction_dates_weekly) - 1:
            weekly_end = (pd.to_datetime(start_date) + Week(1) - pd.Timedelta(days=1)).strftime('%Y-%m-%d')
        else:
            weekly_end = final_end_date
        
        w_key = f'forecast_data_{i+1}w'
        forecast_data_weekly[w_key] = X.loc[start_date : weekly_end]

    # ---- Monthly Forecast Slices ----
    for i, start_date in enumerate(prediction_dates_monthly):
        if i < len(prediction_dates_monthly) - 1:
            monthly_end = (pd.to_datetime(start_date) + MonthEnd(0)).strftime('%Y-%m-%d')
        else:
            monthly_end = final_end_date
        
        m_key = f'forecast_data_{i+1}m'
        forecast_data_monthly[m_key] = X.loc[start_date : monthly_end]

    # 7. Scale forecast data (weekly & monthly) and add intercept
    forecast_data_weekly_scaled = {}
    for period_key, period_data in forecast_data_weekly.items():
        if not period_data.empty:
            data_scaled = scaler.transform(period_data)
            forecast_data_weekly_scaled[period_key] = sm.add_constant(data_scaled)

    forecast_data_monthly_scaled = {}
    for period_key, period_data in forecast_data_monthly.items():
        if not period_data.empty:
            data_scaled = scaler.transform(period_data)
            forecast_data_monthly_scaled[period_key] = sm.add_constant(data_scaled)

    # 8. Add a constant to train and test data
    train_data_scaled = sm.add_constant(train_data_scaled)
    #test_data_scaled  = sm.add_constant(test_data_scaled)

    # 9. Return all objects in a dictionary
    return {
        'train_data_scaled': train_data_scaled,
        #'test_data_scaled': test_data_scaled,
        'y_train': y_train,
        #'y_test': y_test,
        'scaler': scaler,
        'X': X,
        'etf_history': etf_history,
        'forecast_data_weekly': forecast_data_weekly,
        'forecast_data_monthly': forecast_data_monthly,
        'forecast_data_weekly_scaled': forecast_data_weekly_scaled,
        'forecast_data_monthly_scaled': forecast_data_monthly_scaled
    }


In [5]:
def model_training(train_data_scaled, y_train):
    # Define the hyperparameters space
    param_space = {
        'n_estimators': (100, 300),  # integer values from 100 to 300
        'max_features': ['sqrt', 'log2'],  # categorical values
        'max_depth': (10, 50),  # integer values from 10 to 50
        'min_samples_split': (2, 10),  # integer values from 2 to 10
        'min_samples_leaf': (1, 4),  # integer values from 1 to 4
        'bootstrap': [True, False],  # categorical values
        'criterion': ['absolute_error', 'squared_error', 'friedman_mse'],  # categorical values for split criterion
        'max_leaf_nodes': (10, 100),  # integer values for maximum leaf nodes
        'min_impurity_decrease': (0.0, 0.2)  # float values for minimum impurity decrease
    }

    # Initialize the Random Forest Regressor
    rf = RandomForestRegressor(random_state=42)

    # Set up the Bayesian Optimization to search for the best hyperparameters
    bayes_search = BayesSearchCV(
        estimator=rf,
        search_spaces=param_space,
        n_iter=32,  # number of iterations for the search
        cv=5,  # cross-validation folds
        n_jobs=-1,  # use all available CPU cores
        verbose=0,
        scoring='neg_mean_squared_error',
        random_state=42,
        n_points=15
    )

    # Fit the model using the best hyperparameters
    bayes_search.fit(train_data_scaled, y_train)

    # Get the best model
    best_model = bayes_search.best_estimator_

    # Display the best model parameters
    print("Best Model Parameters:")
    print(bayes_search.best_params_)

    return best_model

"""def eval_model(best_model,test_data_scaled,y_test,y_train):
    
    test_predictions = best_model.predict((test_data_scaled))
    mae = mean_absolute_error(y_test, test_predictions)
    # Mean Absolute Scaled Error (MASE) - Example calculation assuming seasonal period m=1
    seasonal_naive = np.roll(y_test, 1)  # Shift y_test by 1 for seasonal naive forecast
    #seasonal_mae = np.mean(np.abs(y_test - seasonal_naive))
    seasonal_mae = np.mean(np.abs(y_test[1:] - seasonal_naive[1:]))
    mase = mae / seasonal_mae

    # Mean Absolute Percentage Error (MAPE)
    #mape = np.mean(np.abs((y_test - test_predictions) / y_test)) * 100

    #n = y_test.shape[0]
    #d = np.abs(  np.diff( y_test) ).sum()/(n-1)
    
    #errors = np.abs(y_test - test_predictions )
    #mase_value = errors.mean()/d

    print("Mean Absolute Error (MAE):", mae)
    print("Mean Absolute Scaled Error (MASE):", mase)
    #print("Mean Absolute Scaled Error-2 (MASE-2):", mase_value)
    #print("Mean Absolute Percentage Error (MAPE):", mape)
    
    return mae, mase"""

# Function to make predictions
def predictions(model, forecast_data_scaled):
    forecast_results = {}
    for key, data in forecast_data_scaled.items():
        # Extract the index from the original forecast data
        original_index = forecast_data_scaled[key][:, 1]  # Assuming the index is preserved here
        predictions = model.predict(data)
        forecast_results[key] = pd.DataFrame(
            predictions, 
            columns=["log_returns"], 
            index=original_index
        )
    return forecast_results


In [6]:
def combined_workflow(
    tickers,
    start_date, 
    end_date, 
    train_start_date, 
    train_end_date, 
    #test_start_date, 
    #test_end_date,
    prediction_dates_monthly, 
    prediction_dates_weekly
):
    etf_results = {}
    
    for ticker in tickers:
        # 1. Load and preprocess data
        etf_history, etf_data = data_loading(ticker, start_date, end_date)
        etf_history = etf_predictors(etf_history, start_date, end_date, etf_data)

        # 2. Single call to pre_processing for training/testing & both forecast horizons
        preprocessed = pre_processing(
            etf_history=etf_history, 
            train_start_date=train_start_date, 
            train_end_date=train_end_date,
            #test_start_date=test_start_date, 
            #test_end_date=test_end_date,
            prediction_dates_monthly=prediction_dates_monthly,
            prediction_dates_weekly=prediction_dates_weekly
        )
        
        # Track the start time
        start_time = time.time()
        
        # 3. Train the model
        model = model_training(preprocessed['train_data_scaled'], preprocessed['y_train'])
        
        # Calculate time consumed
        time_consumed = time.time() - start_time

        # Print the time consumed
        print(f"Time consumed for training: {time_consumed:.2f} seconds")
        
        # 4. Evaluate the model
        """mae, mase = eval_model(
            model, 
            preprocessed['test_data_scaled'], 
            preprocessed['y_test'], 
            preprocessed['y_train']
        )"""
        
        # 5. Generate predictions for monthly forecasts
        monthly_forecast_results = predictions(
            model, 
            preprocessed['forecast_data_monthly_scaled']
        )
        
        # 6. Generate predictions for weekly forecasts (optional)
        weekly_forecast_results = predictions(
            model, 
            preprocessed['forecast_data_weekly_scaled']
        )

        # ---------------------------------------------------------------------
        # 7. Build your final dictionary in the desired format
        # ---------------------------------------------------------------------

        # A. Basic structure
        dict_data = {
            "etf_history": etf_history,              # The full ETF dataframe (with predictors, etc.)
            "X": preprocessed["X"],                 # All feature columns used
            "y_train_values": preprocessed["y_train"],  # Training target values

            # Store model performance
            #model results": {
            #    "mae": mae,
            #    "mase": mase,
            #    "time": time_consumed
            #},
        }

        # B. Store monthly forecast predictions in separate keys
        #    Suppose you want up to 12 monthly periods; adjust as needed
        for i_m in range(1, 13):
            # The keys produced by predictions(...) might look like 'forecast_data_1m', 'forecast_data_2m', etc.
            forecast_key = f"forecast_data_{i_m}m"

            if forecast_key in monthly_forecast_results:
                # monthly_forecast_results[forecast_key] might be a DataFrame with 'log_returns'
                df_forecast = monthly_forecast_results[forecast_key]
                
                # e.g. store array of predicted 'log_returns' (or your final target variable)
                dict_data[f"forecast_predictions_df_{i_m}m"] = df_forecast["log_returns"].values
            
            # If a given month's key doesn't exist, you might skip it or handle differently

        # C. Store weekly forecast predictions similarly, if desired
        #    For example, suppose you do up to 48 weekly windows
        # C. Store weekly forecast predictions similarly, if desired
        # For example, suppose you do up to 52 weekly windows
        for i_w in range(1, 53):
            forecast_key = f"forecast_data_{i_w}w"
            if forecast_key in weekly_forecast_results:
                df_forecast = weekly_forecast_results[forecast_key]
                dict_data[f"forecast_predictions_df_{i_w}w"] = df_forecast["log_returns"].values


        # Finally, store this dictionary per ticker
        etf_results[ticker] = dict_data
    
    return etf_results


In [7]:
import pandas as pd

# Example usage
tickers = ['SMH', 'SOXX', 'PSI', 'XSD', 'IYW', 'XLK', 'VGT', 'FTEC', 'IGM', 'IXN', 
          ]
start_date, end_date = '2000-01-01', '2022-12-31'
"""train_start_date, train_end_date, test_start_date, test_end_date = (
    start_date,         # train_start_date
    '2014-01-01',       # train_end_date
    '2014-01-01',       # test_start_date
    '2024-01-01'        # test_end_date
)"""
train_start_date, train_end_date = (
    start_date,         # train_start_date
    '2022-01-01'        # train_end_date
)

# Generate prediction dates for monthly & weekly
prediction_dates_monthly = pd.date_range(
    start='2022-01-01', 
    periods=12, 
    freq='MS'
).strftime('%Y-%m-%d').tolist()

prediction_dates_weekly = pd.date_range(
    start='2022-01-01', 
    periods=52, 
    freq='W-MON'
).strftime('%Y-%m-%d').tolist()


# Run the workflow
results = combined_workflow(
    tickers=tickers,
    start_date=start_date,
    end_date=end_date,
    train_start_date=train_start_date,
    train_end_date=train_end_date,
    #test_start_date=test_start_date,
    #test_end_date=test_end_date,
    prediction_dates_monthly=prediction_dates_monthly,
    prediction_dates_weekly=prediction_dates_weekly
)

# The 'results' object is a dictionary keyed by ticker.
# For each ticker (e.g. 'SMH' or 'SOXX'),to inspect:
#   results[ticker]["etf_history"]
#   results[ticker]["X"]
#   results[ticker]["y_train_values"]
#   results[ticker]["model results"]
#   results[ticker]["forecast_predictions_df_1m"], etc.
#
# Example:
# results['SMH']['forecast_predictions_df_1m']


[*********************100%%**********************]  1 of 1 completed


Best Model Parameters:
OrderedDict([('bootstrap', True), ('criterion', 'friedman_mse'), ('max_depth', 34), ('max_features', 'sqrt'), ('max_leaf_nodes', 100), ('min_impurity_decrease', 0.0), ('min_samples_leaf', 1), ('min_samples_split', 3), ('n_estimators', 138)])
Time consumed for training: 132.35 seconds


[*********************100%%**********************]  1 of 1 completed


Best Model Parameters:
OrderedDict([('bootstrap', False), ('criterion', 'friedman_mse'), ('max_depth', 11), ('max_features', 'log2'), ('max_leaf_nodes', 100), ('min_impurity_decrease', 0.0), ('min_samples_leaf', 1), ('min_samples_split', 2), ('n_estimators', 126)])
Time consumed for training: 132.00 seconds


[*********************100%%**********************]  1 of 1 completed


Best Model Parameters:
OrderedDict([('bootstrap', True), ('criterion', 'friedman_mse'), ('max_depth', 40), ('max_features', 'log2'), ('max_leaf_nodes', 85), ('min_impurity_decrease', 0.0), ('min_samples_leaf', 1), ('min_samples_split', 2), ('n_estimators', 256)])
Time consumed for training: 100.59 seconds


[*********************100%%**********************]  1 of 1 completed


Best Model Parameters:
OrderedDict([('bootstrap', False), ('criterion', 'friedman_mse'), ('max_depth', 35), ('max_features', 'log2'), ('max_leaf_nodes', 100), ('min_impurity_decrease', 0.0), ('min_samples_leaf', 1), ('min_samples_split', 10), ('n_estimators', 216)])
Time consumed for training: 90.83 seconds


[*********************100%%**********************]  1 of 1 completed


Best Model Parameters:
OrderedDict([('bootstrap', False), ('criterion', 'friedman_mse'), ('max_depth', 18), ('max_features', 'sqrt'), ('max_leaf_nodes', 69), ('min_impurity_decrease', 0.0), ('min_samples_leaf', 3), ('min_samples_split', 2), ('n_estimators', 124)])
Time consumed for training: 159.49 seconds


[*********************100%%**********************]  1 of 1 completed


Best Model Parameters:
OrderedDict([('bootstrap', True), ('criterion', 'friedman_mse'), ('max_depth', 11), ('max_features', 'sqrt'), ('max_leaf_nodes', 89), ('min_impurity_decrease', 0.0), ('min_samples_leaf', 2), ('min_samples_split', 5), ('n_estimators', 282)])
Time consumed for training: 167.30 seconds


[*********************100%%**********************]  1 of 1 completed


Best Model Parameters:
OrderedDict([('bootstrap', True), ('criterion', 'friedman_mse'), ('max_depth', 20), ('max_features', 'log2'), ('max_leaf_nodes', 65), ('min_impurity_decrease', 0.0), ('min_samples_leaf', 4), ('min_samples_split', 10), ('n_estimators', 100)])
Time consumed for training: 87.30 seconds


[*********************100%%**********************]  1 of 1 completed


Best Model Parameters:
OrderedDict([('bootstrap', False), ('criterion', 'friedman_mse'), ('max_depth', 32), ('max_features', 'sqrt'), ('max_leaf_nodes', 25), ('min_impurity_decrease', 0.0), ('min_samples_leaf', 4), ('min_samples_split', 9), ('n_estimators', 161)])
Time consumed for training: 36.53 seconds


[*********************100%%**********************]  1 of 1 completed


Best Model Parameters:
OrderedDict([('bootstrap', False), ('criterion', 'squared_error'), ('max_depth', 32), ('max_features', 'sqrt'), ('max_leaf_nodes', 86), ('min_impurity_decrease', 0.0), ('min_samples_leaf', 3), ('min_samples_split', 8), ('n_estimators', 300)])
Time consumed for training: 183.67 seconds


[*********************100%%**********************]  1 of 1 completed


Best Model Parameters:
OrderedDict([('bootstrap', True), ('criterion', 'friedman_mse'), ('max_depth', 25), ('max_features', 'log2'), ('max_leaf_nodes', 87), ('min_impurity_decrease', 0.0), ('min_samples_leaf', 4), ('min_samples_split', 9), ('n_estimators', 199)])
Time consumed for training: 136.58 seconds


In [8]:
def print_results_with_metrics(results):
    """
    Print out results (model summary, evaluation metrics, and some forecast predictions)
    from the results dictionary returned by combined_workflow.
    """
    for ticker, data in results.items():
        print(f"Ticker: {ticker}")
        print("=" * 50)
        
        # 1. Print model summary (if the model is a statsmodels object)
        if "model" in data and hasattr(data["model"], "summary"):
            print("Model Summary:")
            print(data["model"].summary())
            print("\n")
        else:
            print("No statsmodels summary available for this model.\n")
        
        
        
        # 3. Print monthly forecast predictions
        """print("Monthly Forecast Predictions (first 5 values for each period):")
        for i_m in range(1, 13):  # Up to 12 months
            key_m = f"forecast_predictions_df_{i_m}m"
            if key_m in data:
                preds_array = data[key_m]
                print(f"{key_m}: {preds_array[:5]}")  # Print first 5 predictions
        print("\n")

        # 4. Print weekly forecast predictions (first 5 values)
        print("Weekly Forecast Predictions (first 5 values for each period):")
        for i_w in range(1, 53):  # Adjusted for 52 weeks
            key_w = f"forecast_predictions_df_{i_w}w"
            if key_w in data:
                preds_array = data[key_w]
                print(f"{key_w}: {preds_array[:5]}")  # Print first 5 predictions
        print("\n")
        
        print("=" * 50)
        print("\n")"""

print_results_with_metrics(results)

Ticker: SMH
No statsmodels summary available for this model.

Ticker: SOXX
No statsmodels summary available for this model.

Ticker: PSI
No statsmodels summary available for this model.

Ticker: XSD
No statsmodels summary available for this model.

Ticker: IYW
No statsmodels summary available for this model.

Ticker: XLK
No statsmodels summary available for this model.

Ticker: VGT
No statsmodels summary available for this model.

Ticker: FTEC
No statsmodels summary available for this model.

Ticker: IGM
No statsmodels summary available for this model.

Ticker: IXN
No statsmodels summary available for this model.



In [9]:
rf_metrics_clipboard={}
for ticker,data in results.items():
    if "model results" in data:
            metrics_dict = data["model results"]
            rf_metrics_clipboard[ticker]=metrics_dict
            #mae = metrics_dict.get("mae", None)
            #mase = metrics_dict.get("mase", None)

for key,value in rf_metrics_clipboard.items():
    print(key)
    print(value)
rf_metrics_dataframe = pd.DataFrame(rf_metrics_clipboard).T
rf_metrics_dataframe.reset_index(inplace=True)
rf_metrics_dataframe.rename(columns={'index':'ticker'}, inplace=True)
rf_metrics_dataframe

rf_metrics_dataframe.to_clipboard(index=False,sep=',')


### Ratios


In [10]:
def calculate_sharpe_ratio(returns, annual_risk_free_rate=0.1,period='daily'):
    #excess_returns = rate_of_return(returns) - risk_free_rate
    
    # Convert annual risk-free rate to daily rate
    daily_risk_free_rate = (1 + annual_risk_free_rate) ** (1/252) - 1
    
    # Calculate mean daily log return
    mean_return = np.mean(returns)
    
    # Calculate excess daily log return
    excess_return = mean_return - daily_risk_free_rate
    
    # Calculate standard deviation of daily log returns
    std_return = np.std(returns)
    
    # Print diagnostic information
    #print(f"Mean Daily Log Return: {mean_return}")
    #print(f"Excess Daily Log Return: {excess_return}")
    #print(f"Standard Deviation of Daily Log Returns: {std_return}")
    
    # Check for zero standard deviation to avoid division by zero
    if std_return == 0:
        return 0
    
    # Calculate Sharpe ratio
    sharpe_ratio = (excess_return / std_return) * np.sqrt(252)  # Annualize the Sharpe ratio
    return sharpe_ratio
    #return excess_returns / np.std(returns)


def calculate_rachev_ratio(returns, lower_percentile=5, upper_percentile=95):
    # Step 1: Sort the returns
    sorted_returns = np.sort(returns)
    
    # Step 2: Determine the percentiles
    lower_threshold = np.percentile(sorted_returns, lower_percentile)
    upper_threshold = np.percentile(sorted_returns, upper_percentile)
    
    # Step 3: Calculate Expected Shortfall (ES)
    es = np.mean(sorted_returns[sorted_returns <= lower_threshold])
    
    # Step 4: Calculate Expected Gain (EG)
    eg = np.mean(sorted_returns[sorted_returns >= upper_threshold])
    
    # Step 5: Compute the Rachev Ratio
    rachev_ratio = eg / -es
    return rachev_ratio


"""def calculate_volatility_clustering(returns):
    squared_returns = returns ** 2
    n = len(squared_returns)
    
    # Mean of squared returns
    mean_squared_returns = np.mean(squared_returns)
    
    # Calculate the numerator and denominator for autocorrelation at lag 1
    numerator = np.sum((squared_returns[:-1] - mean_squared_returns) * (squared_returns[1:] - mean_squared_returns))
    denominator = np.sum((squared_returns - mean_squared_returns) ** 2)
    
    if denominator == 0:
        return 0  # Avoid division by zero
    
    rho_1 = numerator / denominator
    return rho_1"""
def calculate_volatility_clustering(returns):
    # Ensure returns is a NumPy array
    returns = np.array(returns)
    squared_returns = returns ** 2
    n = len(squared_returns)

    # Mean of squared returns
    mean_squared = np.mean(squared_returns)
    clustering = np.sum((squared_returns - mean_squared) ** 2) / (n - 1 if n > 1 else 1)
    return clustering

def calculate_sortino_ratio(log_returns, target_log_return=0.0):
    """
    Calculate the Sortino Ratio using log returns.
    
    Parameters:
    - log_returns (array-like): Array or list of log returns for the period.
    - target_log_return (float): The target log return. Default is 0, which is often used as a benchmark.
    
    Returns:
    - float: The Sortino Ratio.
    """
    # Convert input to a NumPy array for easier calculations
    log_returns = np.array(log_returns)
    
    # Calculate the average period log return (R)
    avg_log_return = np.mean(log_returns)
    
    # Calculate the Target Downside Deviation (TDD)
    downside_deviation = np.sqrt(
        np.mean(np.square(np.maximum(0, target_log_return - log_returns)))
    )
    
    # Calculate Sortino Ratio
    #sortino_ratio_value = (avg_log_return - target_log_return) / downside_deviation
    epsilon = 1e-8
    
    # Add epsilon to downside_deviation to prevent division by zero
    #if downside_deviation == 0:
     #   return np.nan  # Return NaN if downside deviation is zero
    
    # Calculate Sortino Ratio
    sortino_ratio_value = (avg_log_return - target_log_return) / (downside_deviation + epsilon)
    
    return sortino_ratio_value


In [11]:

def calculate_composite_score(
    forecasted_values, risk_percentage, rachev_ratio, sharpe_ratio, sortino_ratio, volatility_clustering, 
    mean_forecast, std_forecast, mean_rachev, std_rachev, mean_sharpe, std_sharpe, mean_sortino, std_sortino, 
    mean_volatility_clustering, std_volatility_clustering
):
    epsilon = 1e-8  # To prevent division by zero
    forecasted_mean = np.mean(forecasted_values)

    # Normalize the components with epsilon
    forecasted_mean_normalized = (forecasted_mean - mean_forecast) / (std_forecast + epsilon)
    rachev_normalized = (rachev_ratio - mean_rachev) / (std_rachev + epsilon)
    sharpe_normalized = (sharpe_ratio - mean_sharpe) / (std_sharpe + epsilon)
    sortino_normalized = (sortino_ratio - mean_sortino) / (std_sortino + epsilon)
    volatility_clustering_normalized = (volatility_clustering - mean_volatility_clustering) / (std_volatility_clustering + epsilon)

    # Composite score calculation
    score = (
        forecasted_mean_normalized
        - (risk_percentage * rachev_normalized)
        + sharpe_normalized
        + sortino_normalized
        - volatility_clustering_normalized
    )

    return score

def calculate_smoothing(data, alpha=0.1):
    if len(data) == 0:  # Explicitly check if the data is empty
        return []
    smoothed_data = []
    previous_value = data[0]
    for value in data:
        smoothed_value = alpha * value + (1 - alpha) * previous_value
        smoothed_data.append(smoothed_value)
        previous_value = smoothed_value
    return smoothed_data



def process_etf_data_weekly(tickers, etf_dict, smoothing=True):
    etf_pred_dict = {}

    # Determine the forecast periods dynamically from the etf_dict
    sample_etf = next(iter(etf_dict.values()))
    forecast_periods = [key.split('_')[-1] for key in sample_etf.keys() if key.startswith('forecast_predictions_df')]

    for etf_name in tickers:
        etf_pred_dict[etf_name] = {
            f"returns_{period}": etf_dict[etf_name][f"forecast_predictions_df_{period}"]
            for period in forecast_periods
        }

        for period in forecast_periods:
            returns = etf_pred_dict[etf_name][f"returns_{period}"]

            if smoothing:
                # Apply smoothing to returns
                smoothed_returns = calculate_smoothing(returns)
                etf_pred_dict[etf_name][f"smoothed_returns_{period}"] = smoothed_returns

                # Calculate metrics using smoothed returns
                etf_pred_dict[etf_name][f"rachev_ratio_{period}"] = calculate_rachev_ratio(smoothed_returns)
                etf_pred_dict[etf_name][f"sharpe_ratio_{period}"] = calculate_sharpe_ratio(smoothed_returns)
                etf_pred_dict[etf_name][f"sortino_ratio_{period}"] = calculate_sortino_ratio(smoothed_returns)
                etf_pred_dict[etf_name][f"volatility_clustering_{period}"] = calculate_volatility_clustering(smoothed_returns)
            else:
                # Calculate metrics using raw returns
                etf_pred_dict[etf_name][f"rachev_ratio_{period}"] = calculate_rachev_ratio(returns)
                etf_pred_dict[etf_name][f"sharpe_ratio_{period}"] = calculate_sharpe_ratio(returns)
                etf_pred_dict[etf_name][f"sortino_ratio_{period}"] = calculate_sortino_ratio(returns)
                etf_pred_dict[etf_name][f"volatility_clustering_{period}"] = calculate_volatility_clustering(returns)

    return etf_pred_dict


def calculate_means_and_stds_weekly(etf_pred_dict, forecast_period):
    returns_list = [etf_pred_dict[etf][f'returns_{forecast_period}'] for etf in etf_pred_dict]

    # Compute global means and standard deviations
    mean_forecast = np.mean([np.mean(returns) for returns in returns_list])
    std_forecast = np.std([np.mean(returns) for returns in returns_list])

    rachev_ratios = np.array([etf_pred_dict[etf][f'rachev_ratio_{forecast_period}'] for etf in etf_pred_dict])
    mean_rachev = np.mean(rachev_ratios)
    std_rachev = np.std(rachev_ratios)

    sharpe_ratios = np.array([etf_pred_dict[etf][f'sharpe_ratio_{forecast_period}'] for etf in etf_pred_dict])
    mean_sharpe = np.mean(sharpe_ratios)
    std_sharpe = np.std(sharpe_ratios)

    sortino_ratios = np.array([etf_pred_dict[etf][f'sortino_ratio_{forecast_period}'] for etf in etf_pred_dict])
    mean_sortino = np.mean(sortino_ratios)
    std_sortino = np.std(sortino_ratios)

    volatility_clustering = np.array([etf_pred_dict[etf][f'volatility_clustering_{forecast_period}'] for etf in etf_pred_dict])
    mean_volatility_clustering = np.mean(volatility_clustering)
    std_volatility_clustering = np.std(volatility_clustering)

    return (
        mean_forecast, std_forecast, mean_rachev, std_rachev,
        mean_sharpe, std_sharpe, mean_sortino, std_sortino,
        mean_volatility_clustering, std_volatility_clustering
    )


def calculate_scores_for_etfs_weekly(etf_pred_dict, forecast_period, risk_percentage):
    (mean_forecast, std_forecast, mean_rachev, std_rachev, 
     mean_sharpe, std_sharpe, mean_sortino, std_sortino, 
     mean_volatility_clustering, std_volatility_clustering) = calculate_means_and_stds_weekly(etf_pred_dict, forecast_period)

    scores = []
    for etf in etf_pred_dict:
        forecasted_values = etf_pred_dict[etf][f'returns_{forecast_period}']
        rachev_ratio = etf_pred_dict[etf][f'rachev_ratio_{forecast_period}']
        sharpe_ratio = etf_pred_dict[etf][f'sharpe_ratio_{forecast_period}']
        volatility_clustering = etf_pred_dict[etf][f'volatility_clustering_{forecast_period}']
        sortino_ratio = etf_pred_dict[etf][f'sortino_ratio_{forecast_period}']

        # Calculate the composite score
        score = calculate_composite_score(
            forecasted_values, risk_percentage, rachev_ratio, sharpe_ratio, 
            sortino_ratio, volatility_clustering, mean_forecast, std_forecast, 
            mean_rachev, std_rachev, mean_sharpe, std_sharpe, mean_sortino, 
            std_sortino, mean_volatility_clustering, std_volatility_clustering
        )

        scores.append({
            'ETF': etf,
            'Week': forecast_period,
            'RiskPercentage': risk_percentage,
            'Score': score
        })

    return scores


def select_top_etfs_weekly(df_scores, forecast_period):
    if df_scores.empty:
        print(f"No scores available for {forecast_period}. Skipping.")
        return []
    print(f"Processing scores for {forecast_period}:")
    print(df_scores.head())  # Check the top rows of the DataFrame
    top_etfs = df_scores.nlargest(2, 'Score')
    print(f"Top ETFs for {forecast_period}: {top_etfs['ETF'].tolist()}")
    return top_etfs['ETF'].tolist()


def generate_week_ranges(start_date, end_date):
    start = datetime.strptime(start_date, '%Y-%m-%d')
    end = datetime.strptime(end_date, '%Y-%m-%d')
    week_ranges = []

    while start < end:
        week_start = start
        week_end = start + timedelta(days=6)
        if week_end > end:
            week_end = end
        week_ranges.append((week_start.strftime('%Y-%m-%d'), week_end.strftime('%Y-%m-%d')))
        start += timedelta(days=7)

    return week_ranges




def gather_etf_data_for_weeks(selected_tickers, week_ranges):
    # Download all data for each ticker once
    print("Fetching full-year data for all tickers...")
    full_year_data = {}
    for ticker in selected_tickers:
        print(f"Downloading data for {ticker} for the entire year...")
        full_year_data[ticker] = yf.download(ticker, start=week_ranges[0][0], end=week_ranges[-1][1], progress=False)

    # Slice data into weekly ranges
    etf_histories = {}
    for start_date, end_date in week_ranges:
        week = f"{start_date} to {end_date}"
        etf_histories[week] = {}
        for ticker in selected_tickers:
            if ticker in full_year_data and not full_year_data[ticker].empty:
                # Slice the full-year data for the given week
                weekly_data = full_year_data[ticker].loc[start_date:end_date]
                if not weekly_data.empty:
                    etf_histories[week][ticker] = weekly_data
                else:
                    print(f"No data found for {ticker} in {week}")
            else:
                print(f"No data available for {ticker} in the full-year download.")
    return etf_histories




def initialize_shares_for_first_week(top_etfs_1w, etf_histories, week, investment_amount=50000):
    ticker_shares = {}
    first_trading_day_start = week.split(" to ")[0]

    for ticker in top_etfs_1w:
        etf_history = etf_histories.get(week, {}).get(ticker)

        if etf_history is not None:
            first_trading_day = pd.to_datetime(first_trading_day_start) + BDay(1)
            if first_trading_day not in etf_history.index:
                first_trading_day = etf_history.index[etf_history.index.searchsorted(first_trading_day)]

            price_on_first_trading_day = etf_history.loc[first_trading_day, 'Close']
            num_shares = (investment_amount * 0.9975) / price_on_first_trading_day
            ticker_shares[ticker] = num_shares
        else:
            print(f"No data found for {ticker} in {week}")
    return ticker_shares


def manage_etf_portfolio_weekly(
    top_etfs_previous, top_etfs_current, previous_week, current_week, ticker_shares, gathered_data_per_week
):
    etf_histories_for_current_week = gathered_data_per_week.get(current_week, {})
    top2etfs_previous = list(top_etfs_previous)
    top2etfs_current = list(top_etfs_current)

    print(f"\nTop 2 ETFs for {previous_week}: {top2etfs_previous}")
    print(f"Top 2 ETFs for {current_week}: {top2etfs_current}")

    etfs_to_sell = [etf for etf in top2etfs_previous if etf not in top2etfs_current]
    etfs_to_buy = [etf for etf in top2etfs_current if etf not in top2etfs_previous]

    # Ensure one-to-one mapping between sell and buy ETFs
    if len(etfs_to_sell) != len(etfs_to_buy):
        print("Mismatch between ETFs to sell and buy. Adjusting allocation...")
        return ticker_shares  # Abort if mismatched for now, you can implement custom logic

    # Allocate funds ETF-by-ETF
    for etf_sell, etf_buy in zip(etfs_to_sell, etfs_to_buy):
        no_of_shares = ticker_shares.get(etf_sell, 0)
        if no_of_shares > 0:
            # Selling old ETF
            if etf_sell in etf_histories_for_current_week:
                first_trading_day_sell_price = etf_histories_for_current_week[etf_sell].loc[
                    etf_histories_for_current_week[etf_sell].index[0], 'Close'
                ]
                selling_value = no_of_shares * first_trading_day_sell_price * 0.9975
                print(f"Sell {etf_sell}: {no_of_shares:.2f} shares at {first_trading_day_sell_price:.2f}. Total value: {selling_value:.2f}")

                # Remove sold ETF from portfolio
                del ticker_shares[etf_sell]

                # Buying new ETF
                if etf_buy in etf_histories_for_current_week:
                    first_trading_day_buy_price = etf_histories_for_current_week[etf_buy].loc[
                        etf_histories_for_current_week[etf_buy].index[0], 'Close'
                    ]
                    new_shares = (selling_value * 0.9975) / first_trading_day_buy_price
                    print(f"Buy {etf_buy}: {new_shares:.2f} shares at {first_trading_day_buy_price:.2f}.")
                    ticker_shares[etf_buy] = new_shares
                else:
                    print(f"Data for {etf_buy} is missing for {current_week}. Skipping purchase.")
            else:
                print(f"Data for {etf_sell} is missing for {current_week}. Skipping sale.")
        else:
            print(f"No shares found for {etf_sell} to sell.")

    print(f"Updated ticker shares after {current_week}: {ticker_shares}")
    return ticker_shares

def main_weekly(tickers, etf_dict, smoothing=True):
    etf_pred_dict = process_etf_data_weekly(tickers, etf_dict, smoothing=smoothing)

    risk_percentage = 0.10
    weekly_scores = {}

    # Calculate scores for all weeks (1 to 52)
    for week in range(1, 53):  # Updated range for 52 weeks
        week_key = f"{week}w"
        if any(f"returns_{week_key}" in etf_pred_dict[etf] for etf in etf_pred_dict):
            scores = calculate_scores_for_etfs_weekly(etf_pred_dict, week_key, risk_percentage)
            weekly_scores[week_key] = scores

    # Convert scores to DataFrames
    weekly_scores_dfs = {}
    for week_key, scores in weekly_scores.items():
        weekly_scores_dfs[week_key] = pd.DataFrame(scores)

    # Select top ETFs for each week
    top_etfs_weekly = {}
    for week_key, df_scores in weekly_scores_dfs.items():
        top_etfs_weekly[week_key] = df_scores.nlargest(2, 'Score')

    # Generate week ranges for ETF data gathering
    week_ranges = generate_week_ranges('2022-01-01', '2022-12-31')  # Full year for 52 weeks

    # Gather ETF historical data
    etf_histories = gather_etf_data_for_weeks(tickers, week_ranges)

    # Map week keys to date ranges
    week_key_mapping = {f"{i+1}w": week_range for i, week_range in enumerate(etf_histories.keys())}

    # Align top ETFs with historical data
    aligned_top_etfs_weekly = {}
    for week_key, df_scores in weekly_scores_dfs.items():
        forecast_period = week_key_mapping.get(week_key, None)
        if forecast_period:
            aligned_top_etfs_weekly[forecast_period] = select_top_etfs_weekly(df_scores, forecast_period)

    # Initialize and manage portfolio
    ticker_shares = {}
    ticker_shares_per_week = {}
    for i, week_range in enumerate(etf_histories.keys()):
        current_week_key = week_range
        if i == 0:
            # Initialize shares for the first week
            ticker_shares = initialize_shares_for_first_week(
                aligned_top_etfs_weekly.get(current_week_key, []),  # Fallback to empty list if key is missing
                etf_histories,
                current_week_key
            )
            ticker_shares_per_week[current_week_key] = ticker_shares.copy()
        else:
            # Manage portfolio for subsequent weeks
            prev_week_key = list(etf_histories.keys())[i - 1]
            if current_week_key not in aligned_top_etfs_weekly:
                print(f"Warning: Missing alignment for {current_week_key}. Skipping week.")
                continue

            ticker_shares = manage_etf_portfolio_weekly(
                aligned_top_etfs_weekly.get(prev_week_key, []),  # Fallback to empty list
                aligned_top_etfs_weekly[current_week_key],
                prev_week_key,
                current_week_key,
                ticker_shares,
                etf_histories
            )
            ticker_shares_per_week[current_week_key] = ticker_shares.copy()


    # Define the first trading day of the 53rd week
    first_trading_day_53w = '2023-01-01'

    # Identify the 52nd week date ranges
    week_52_range = list(ticker_shares_per_week.keys())[-1]
    week_53_start = first_trading_day_53w
    print(f"Using data for the 52nd week: {week_52_range}")
    print(f"Fetching data starting from the first trading day of the 53rd week: {week_53_start}")

    week_53_end = '2023-01-06'
    etf_values_53w = {}

    # Ensure 52nd week data exists
    if week_52_range in ticker_shares_per_week:
        ticker_shares_52w = ticker_shares_per_week[week_52_range]

        # Fetch the first trading day price of the 53rd week for each ETF
        for ticker, shares in ticker_shares_52w.items():
            print(f"Fetching data for ticker {ticker} starting from {week_53_start}...")
            data = yf.download(ticker, start=week_53_start, end=week_53_end)

            if not data.empty:
                closing_price_53w = data['Close'].iloc[0]
                total_value = shares * closing_price_53w
                etf_values_53w[ticker] = total_value
                print(f"{ticker}: {shares:.2f} shares at ${closing_price_53w:.2f} each, total value: ${total_value:.2f}")
            else:
                print(f"{ticker}: No data available for the 53rd week's first trading day.")
    else:
        print(f"No data available in ticker_shares_per_week for the 52nd week: {week_52_range}")

    # Display total portfolio value for the 53rd week's first trading day
    if etf_values_53w:
        print("\nETF values on the 53rd week's first trading day:")
        total_value = sum(etf_values_53w.values())
        print(f"Total portfolio value: {total_value:.2f}")
        for ticker, value in etf_values_53w.items():
            print(f"{ticker}: {value:.2f}")
    else:
        print("No values could be calculated for the 53rd week's first trading day.")

    return ticker_shares_per_week



In [12]:
# Now call the weekly workflow using the 'results' as etf_dict
ticker_shares_per_week_wo_smoothing = main_weekly(tickers, results, smoothing=False)


Fetching full-year data for all tickers...
Downloading data for SMH for the entire year...
Downloading data for SOXX for the entire year...
Downloading data for PSI for the entire year...
Downloading data for XSD for the entire year...
Downloading data for IYW for the entire year...
Downloading data for XLK for the entire year...
Downloading data for VGT for the entire year...
Downloading data for FTEC for the entire year...
Downloading data for IGM for the entire year...
Downloading data for IXN for the entire year...


[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed

Processing scores for 2022-01-01 to 2022-01-07:
    ETF Week  RiskPercentage     Score
0   SMH   1w             0.1  4.247955
1  SOXX   1w             0.1  4.590096
2   PSI   1w             0.1 -1.378003
3   XSD   1w             0.1 -2.755100
4   IYW   1w             0.1 -1.321248
Top ETFs for 2022-01-01 to 2022-01-07: ['SOXX', 'SMH']
Processing scores for 2022-01-08 to 2022-01-14:
    ETF Week  RiskPercentage     Score
0   SMH   2w             0.1  7.571839
1  SOXX   2w             0.1  4.579854
2   PSI   2w             0.1  1.368427
3   XSD   2w             0.1 -3.241898
4   IYW   2w             0.1 -1.645664
Top ETFs for 2022-01-08 to 2022-01-14: ['SMH', 'SOXX']
Processing scores for 2022-01-15 to 2022-01-21:
    ETF Week  RiskPercentage     Score
0   SMH   3w             0.1  0.023945
1  SOXX   3w             0.1 -2.599741
2   PSI   3w             0.1 -2.784175
3   XSD   3w             0.1 -5.092212
4   IYW   3w             0.1  1.453948
Top ETFs for 2022-01-15 to 2022-01-21: ['IGM




In [13]:
ticker_shares_per_week_with_smoothing = main_weekly(tickers, results, smoothing=True)


Fetching full-year data for all tickers...
Downloading data for SMH for the entire year...
Downloading data for SOXX for the entire year...
Downloading data for PSI for the entire year...
Downloading data for XSD for the entire year...
Downloading data for IYW for the entire year...
Downloading data for XLK for the entire year...
Downloading data for VGT for the entire year...
Downloading data for FTEC for the entire year...
Downloading data for IGM for the entire year...
Downloading data for IXN for the entire year...
Processing scores for 2022-01-01 to 2022-01-07:
    ETF Week  RiskPercentage     Score
0   SMH   1w             0.1  3.485428
1  SOXX   1w             0.1  3.688556
2   PSI   1w             0.1 -0.790099
3   XSD   1w             0.1 -0.803894
4   IYW   1w             0.1 -2.713806
Top ETFs for 2022-01-01 to 2022-01-07: ['SOXX', 'SMH']
Processing scores for 2022-01-08 to 2022-01-14:
    ETF Week  RiskPercentage     Score
0   SMH   2w             0.1  4.572799
1  SOXX   2w

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed

FTEC: 395.27 shares at $93.63 each, total value: $37008.70
Fetching data for ticker IGM starting from 2023-01-01...
IGM: 873.18 shares at $46.63 each, total value: $40720.60

ETF values on the 53rd week's first trading day:
Total portfolio value: 77729.30
FTEC: 37008.70
IGM: 40720.60





In [14]:
def process_etf_data_monthly(tickers, etf_dict, smoothing=True):
    etf_pred_dict = {}

    # Determine forecast periods dynamically from the etf_dict
    sample_etf = next(iter(etf_dict.values()))
    forecast_periods = [key.split('_')[-1] for key in sample_etf.keys() if key.startswith('forecast_predictions_df')]

    for etf_name in tickers:
        etf_pred_dict[etf_name] = {
            f"returns_{period}": etf_dict[etf_name][f"forecast_predictions_df_{period}"]
            for period in forecast_periods
        }

        for period in forecast_periods:
            returns = etf_pred_dict[etf_name][f"returns_{period}"]

            if smoothing:
                # Apply smoothing
                smoothed_returns = calculate_smoothing(returns)
                etf_pred_dict[etf_name][f"smoothed_returns_{period}"] = smoothed_returns

                # Calculate metrics using smoothed returns
                etf_pred_dict[etf_name][f"rachev_ratio_{period}"] = calculate_rachev_ratio(smoothed_returns)
                etf_pred_dict[etf_name][f"sharpe_ratio_{period}"] = calculate_sharpe_ratio(smoothed_returns)
                etf_pred_dict[etf_name][f"sortino_ratio_{period}"] = calculate_sortino_ratio(smoothed_returns)
                etf_pred_dict[etf_name][f"volatility_clustering_{period}"] = calculate_volatility_clustering(smoothed_returns)
            else:
                # Calculate metrics using raw returns
                etf_pred_dict[etf_name][f"rachev_ratio_{period}"] = calculate_rachev_ratio(returns)
                etf_pred_dict[etf_name][f"sharpe_ratio_{period}"] = calculate_sharpe_ratio(returns)
                etf_pred_dict[etf_name][f"sortino_ratio_{period}"] = calculate_sortino_ratio(returns)
                etf_pred_dict[etf_name][f"volatility_clustering_{period}"] = calculate_volatility_clustering(returns)

    return etf_pred_dict


def calculate_means_and_stds_monthly(etf_pred_dict, forecast_period):
    returns_list = [etf_pred_dict[etf][f'returns_{forecast_period}'] for etf in etf_pred_dict]

    # Compute global means and standard deviations
    mean_forecast = np.mean([np.mean(returns) for returns in returns_list])
    std_forecast = np.std([np.mean(returns) for returns in returns_list])

    print(f"\nDebug: Forecast Period = {forecast_period}")
    print(f"All Returns Means: {[np.mean(returns) for returns in returns_list]}")
    print(f"Mean Forecast = {mean_forecast}, Std Forecast = {std_forecast}")

    rachev_ratios = np.array([etf_pred_dict[etf][f'rachev_ratio_{forecast_period}'] for etf in etf_pred_dict])
    print(f"All Rachev Ratios: {rachev_ratios}")
    mean_rachev = np.mean(rachev_ratios)
    std_rachev = np.std(rachev_ratios)

    sharpe_ratios = np.array([etf_pred_dict[etf][f'sharpe_ratio_{forecast_period}'] for etf in etf_pred_dict])
    print(f"All Sharpe Ratios: {sharpe_ratios}")
    mean_sharpe = np.mean(sharpe_ratios)
    std_sharpe = np.std(sharpe_ratios)

    sortino_ratios = np.array([etf_pred_dict[etf][f'sortino_ratio_{forecast_period}'] for etf in etf_pred_dict])
    print(f"All Sortino Ratios: {sortino_ratios}")
    mean_sortino = np.mean(sortino_ratios)
    std_sortino = np.std(sortino_ratios)

    volatility_clustering = np.array([etf_pred_dict[etf][f'volatility_clustering_{forecast_period}'] for etf in etf_pred_dict])
    print(f"All Volatility Clustering: {volatility_clustering}")
    mean_volatility_clustering = np.mean(volatility_clustering)
    std_volatility_clustering = np.std(volatility_clustering)

    print(f"Mean Rachev = {mean_rachev}, Std Rachev = {std_rachev}")
    print(f"Mean Sharpe = {mean_sharpe}, Std Sharpe = {std_sharpe}")
    print(f"Mean Sortino = {mean_sortino}, Std Sortino = {std_sortino}")
    print(f"Mean Volatility Clustering = {mean_volatility_clustering}, Std Volatility Clustering = {std_volatility_clustering}")

    return (
        mean_forecast, std_forecast, mean_rachev, std_rachev,
        mean_sharpe, std_sharpe, mean_sortino, std_sortino,
        mean_volatility_clustering, std_volatility_clustering
    )

def calculate_scores_for_etfs_monthly(etf_pred_dict, forecast_period, risk_percentage, smoothing = True):
    (mean_forecast, std_forecast, mean_rachev, std_rachev, 
     mean_sharpe, std_sharpe, mean_sortino, std_sortino, 
     mean_volatility_clustering, std_volatility_clustering) = calculate_means_and_stds_monthly(etf_pred_dict, forecast_period)

    scores = []
    for etf in etf_pred_dict:
        forecasted_values = etf_pred_dict[etf][f'returns_{forecast_period}']
        rachev_ratio = etf_pred_dict[etf][f'rachev_ratio_{forecast_period}']
        sharpe_ratio = etf_pred_dict[etf][f'sharpe_ratio_{forecast_period}']
        volatility_clustering = etf_pred_dict[etf][f'volatility_clustering_{forecast_period}']
        sortino_ratio = etf_pred_dict[etf][f'sortino_ratio_{forecast_period}']

        # Debugging: Log inputs to composite score calculation
        print(f"\nDebug: ETF = {etf}, Forecast Period = {forecast_period}")
        print(f"Forecasted Values Mean: {np.mean(forecasted_values)}")
        print(f"Rachev Ratio: {rachev_ratio}, Sharpe Ratio: {sharpe_ratio}")
        print(f"Sortino Ratio: {sortino_ratio}, Volatility Clustering: {volatility_clustering}")
        print(f"Means and Stds: Mean Forecast = {mean_forecast}, Std Forecast = {std_forecast}")
        
        # Calculate the composite score
        score = calculate_composite_score(
            forecasted_values, risk_percentage, rachev_ratio, sharpe_ratio, 
            sortino_ratio, volatility_clustering, mean_forecast, std_forecast, 
            mean_rachev, std_rachev, mean_sharpe, std_sharpe, mean_sortino, 
            std_sortino, mean_volatility_clustering, std_volatility_clustering
        )

        scores.append({
            'ETF': etf,
            'Month': forecast_period,
            'RiskPercentage': risk_percentage,
            'Score': score
        })

        # Debugging: Log the calculated score
        print(f"Calculated Score for {etf} ({forecast_period}): {score}")

    return scores

def select_top_etfs_monthly(df_scores, forecast_period):
    if df_scores.empty:
        print(f"No scores available for {forecast_period}. Skipping.")
        return []
    print(f"Processing scores for {forecast_period}:")
    print(df_scores.head())  # Check the top rows of the DataFrame
    top_etfs = df_scores.nlargest(2, 'Score')
    print(f"Top ETFs for {forecast_period}: {top_etfs['ETF'].tolist()}")
    return top_etfs['ETF'].tolist()


def generate_month_ranges(start_date, end_date):
    start = datetime.strptime(start_date, '%Y-%m-%d')
    end = datetime.strptime(end_date, '%Y-%m-%d')
    month_ranges = []

    while start < end:
        month_start = start
        month_end = (start + timedelta(days=31)).replace(day=1) - timedelta(days=1)
        if month_end > end:
            month_end = end
        month_ranges.append((month_start.strftime('%Y-%m-%d'), month_end.strftime('%Y-%m-%d')))
        start = month_end + timedelta(days=1)

    return month_ranges

def gather_etf_data_for_months(tickers, month_ranges):
    etf_histories = {}
    for start_date, end_date in month_ranges:
        month = f"{start_date} to {end_date}"
        etf_histories[month] = {}
        for ticker in tickers:
            etf_data = yf.download(ticker, start=start_date, end=end_date, progress=False)
            if etf_data.empty:
                print(f"No data found for {ticker} in {month}")
                continue
            etf_data.index = pd.to_datetime(etf_data.index)
            etf_histories[month][ticker] = etf_data
    return etf_histories

# Function to initialize shares for the first month
def initialize_shares_for_first_month(top_etfs_1m, etf_histories, month, investment_amount=50000):
    ticker_shares = {}
    first_trading_day_start = month.split(" to ")[0]

    for ticker in top_etfs_1m:
        etf_history = etf_histories.get(month, {}).get(ticker)

        if etf_history is not None:
            first_trading_day = pd.to_datetime(first_trading_day_start) + BDay(1)
            if first_trading_day not in etf_history.index:
                first_trading_day = etf_history.index[etf_history.index.searchsorted(first_trading_day)]

            price_on_first_trading_day = etf_history.loc[first_trading_day, 'Close']
            num_shares = (investment_amount * 0.9975) / price_on_first_trading_day
            print(f"Shares 1st month: ({investment_amount} * 0.9975) / {price_on_first_trading_day}")
            ticker_shares[ticker] = num_shares
            print(f"Ticker: {ticker}, First trading day: {first_trading_day.date()}, Price: {price_on_first_trading_day}, Shares: {num_shares:.2f}")
        else:
            print(f"No data found for {ticker} in {month}")
    return ticker_shares


def manage_etf_portfolio_monthly(
    top_etfs_previous, top_etfs_current, previous_month, current_month, ticker_shares, gathered_data_per_month
):
    etf_histories_for_current_month = gathered_data_per_month.get(current_month, {})

    # Extract ETFs as lists to preserve order
    top2etfs_previous = list(top_etfs_previous)
    top2etfs_current = list(top_etfs_current)

    print(f"Top 2 ETFs for {previous_month}: {top2etfs_previous}")
    print(f"Top 2 ETFs for {current_month}: {top2etfs_current}")

    # Step 1: Identify common ETFs (present in both months)
    etfs_common = [etf for etf in top2etfs_previous if etf in top2etfs_current]

    # Step 2: Identify ETFs to sell (present in the previous month but not in the current month)
    etfs_to_sell = [etf for etf in top2etfs_previous if etf not in top2etfs_current]

    # Step 3: Identify ETFs to buy (present in the current month but not in the previous month)
    etfs_to_buy = [etf for etf in top2etfs_current if etf not in top2etfs_previous]

    # Step 4: Sell ETFs that are no longer in the current top ETFs
    selling_values = {}  # Track selling values for each ETF being sold
    for etf in etfs_to_sell:
        no_of_shares = ticker_shares.get(etf, 0)
        if no_of_shares > 0:
            # Check if data for the ETF is available in the current month
            if etf in etf_histories_for_current_month:
                first_trading_day_current_month = etf_histories_for_current_month[etf].loc[
                    etf_histories_for_current_month[etf].index[0], 'Close'
                ]
                
                selling_value = no_of_shares * first_trading_day_current_month * 0.9975
                selling_values[etf] = selling_value  # Store selling value for this ETF
                print(f"Formula: {no_of_shares} * {first_trading_day_current_month} * 0.9975")
                print(f"Sell {etf}: {no_of_shares:.2f} shares at {first_trading_day_current_month:.2f}. Total value: {selling_value:.2f}\n")
                del ticker_shares[etf]
            else:
                print(f"Data for {etf} is missing for {current_month}. Skipping sale.")
        else:
            print(f"No shares found for {etf} to sell.")

    # Step 5: Buy new ETFs that were not in the previous month's top ETFs
    for etf_to_buy, etf_to_sell in zip(etfs_to_buy, etfs_to_sell):
        selling_value = selling_values.get(etf_to_sell, 0)
        if selling_value > 0 and etf_to_buy in etf_histories_for_current_month:
            first_trading_day_new_etf = etf_histories_for_current_month[etf_to_buy].loc[
                etf_histories_for_current_month[etf_to_buy].index[0], 'Close'
            ]
            new_shares = (selling_value * 0.9975) / first_trading_day_new_etf
            print(f"Formula: ({selling_value} * 0.9975) / {first_trading_day_new_etf}")
            print(f"Buy {etf_to_buy}: {new_shares:.2f} shares at {first_trading_day_new_etf:.2f}.\n")
            ticker_shares[etf_to_buy] = new_shares
        else:
            print(f"Data for {etf_to_buy} is missing or no selling value available. Skipping purchase of {etf_to_buy}.")

    # Step 6: Maintain the order of ETFs in the ticker_shares dictionary based on the current top 2 ETFs
    ordered_ticker_shares = {etf: ticker_shares[etf] for etf in top2etfs_current if etf in ticker_shares}

    print(f"Updated ticker shares after {current_month}: {ordered_ticker_shares}")
    print("")
    return ordered_ticker_shares



def main_monthly(tickers, etf_dict, smoothing=True):
    etf_pred_dict = process_etf_data_monthly(tickers, etf_dict, smoothing=smoothing)

    risk_percentage = 0.10
    monthly_scores = {}

    # Calculate scores for all months
    for month in range(1, 13):
        month_key = f"{month}m"
        if any(f"returns_{month_key}" in etf_pred_dict[etf] for etf in etf_pred_dict):
            scores = calculate_scores_for_etfs_weekly(etf_pred_dict, month_key, risk_percentage)  # Reusing weekly function
            monthly_scores[month_key] = scores

    # Convert scores to DataFrames
    monthly_scores_dfs = {}
    for month_key, scores in monthly_scores.items():
        monthly_scores_dfs[month_key] = pd.DataFrame(scores)

    # Select top ETFs for each month
    top_etfs_monthly = {}
    for month_key, df_scores in monthly_scores_dfs.items():
        top_etfs_monthly[month_key] = df_scores.nlargest(2, 'Score')

    # Generate month ranges for ETF data gathering
    month_ranges = generate_month_ranges('2022-01-01', '2023-01-01')

    # Gather ETF historical data
    etf_histories = gather_etf_data_for_weeks(tickers, month_ranges)

    # Map month keys to date ranges
    month_key_mapping = {f"{i+1}m": month_range for i, month_range in enumerate(etf_histories.keys())}

    # Align top ETFs with historical data
    aligned_top_etfs_monthly = {}
    for month_key, df_scores in monthly_scores_dfs.items():
        forecast_period = month_key_mapping.get(month_key, None)
        if forecast_period:
            aligned_top_etfs_monthly[forecast_period] = select_top_etfs_weekly(df_scores, forecast_period)

    # Initialize and manage portfolio
    ticker_shares = {}
    ticker_shares_per_month = {}
    for i, month_range in enumerate(etf_histories.keys()):
        current_month_key = month_range
        if i == 0:
            # Initialize shares for the first month
            ticker_shares = initialize_shares_for_first_week(
                aligned_top_etfs_monthly[current_month_key],
                etf_histories,
                current_month_key
            )
            ticker_shares_per_month[current_month_key] = ticker_shares.copy()
        else:
            # Manage portfolio for subsequent months
            prev_month_key = list(etf_histories.keys())[i - 1]
            ticker_shares = manage_etf_portfolio_monthly(
                aligned_top_etfs_monthly[prev_month_key],
                aligned_top_etfs_monthly[current_month_key],
                prev_month_key,
                current_month_key,
                ticker_shares,
                etf_histories
            )
            ticker_shares_per_month[current_month_key] = ticker_shares.copy()

    # Define the first trading day of the 13th month
    first_trading_day_13m = '2023-01-01'

    # Identify the 12th and 13th month date ranges
    month_12_range = list(ticker_shares_per_month.keys())[-1]
    month_13_start = first_trading_day_13m
    print(f"Using data for the 12th month: {month_12_range}")
    print(f"Fetching data starting from the first trading day of the 13th month: {month_13_start}")

    month_13_end = '2023-01-06'
    etf_values_13m = {}

    # Ensure 12th month data exists
    if month_12_range in ticker_shares_per_month:
        ticker_shares_12m = ticker_shares_per_month[month_12_range]

        # Fetch the first trading day price of the 13th month for each ETF
        for ticker, shares in ticker_shares_12m.items():
            print(f"Fetching data for ticker {ticker} starting from {month_13_start}...")
            data = yf.download(ticker, start=month_13_start, end=month_13_end)

            if not data.empty:
                closing_price_13m = data['Close'].iloc[0]
                total_value = shares * closing_price_13m
                etf_values_13m[ticker] = total_value
                print(f"{ticker}: {shares:.2f} shares at ${closing_price_13m:.2f} each, total value: ${total_value:.2f}")
            else:
                print(f"{ticker}: No data available for the 13th month's first trading day.")
    else:
        print(f"No data available in ticker_shares_per_month for the 12th month: {month_12_range}")

    # Display total portfolio value for the 13th month's first trading day
    if etf_values_13m:
        print("\nETF values on the 13th month's first trading day:")
        total_value = sum(etf_values_13m.values())
        print(f"Total portfolio value: {total_value:.2f}")
        for ticker, value in etf_values_13m.items():
            print(f"{ticker}: {value:.2f}")
    else:
        print("No values could be calculated for the 13th month's first trading day.")

    return ticker_shares_per_month



In [15]:
# Now call the weekly workflow using the 'results' as etf_dict
ticker_shares_per_month_wo_smoothing = main_monthly(tickers, results,False)


Fetching full-year data for all tickers...
Downloading data for SMH for the entire year...
Downloading data for SOXX for the entire year...
Downloading data for PSI for the entire year...
Downloading data for XSD for the entire year...
Downloading data for IYW for the entire year...
Downloading data for XLK for the entire year...
Downloading data for VGT for the entire year...
Downloading data for FTEC for the entire year...
Downloading data for IGM for the entire year...
Downloading data for IXN for the entire year...


[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed

Processing scores for 2022-01-01 to 2022-01-31:
    ETF Week  RiskPercentage     Score
0   SMH   1m             0.1  1.909478
1  SOXX   1m             0.1 -4.549693
2   PSI   1m             0.1 -0.901228
3   XSD   1m             0.1 -8.186279
4   IYW   1m             0.1  1.720864
Top ETFs for 2022-01-01 to 2022-01-31: ['XLK', 'FTEC']
Processing scores for 2022-02-01 to 2022-02-28:
    ETF Week  RiskPercentage     Score
0   SMH   2m             0.1  0.219017
1  SOXX   2m             0.1  1.046213
2   PSI   2m             0.1  4.943786
3   XSD   2m             0.1  1.112340
4   IYW   2m             0.1 -0.255701
Top ETFs for 2022-02-01 to 2022-02-28: ['PSI', 'FTEC']
Processing scores for 2022-03-01 to 2022-03-31:
    ETF Week  RiskPercentage     Score
0   SMH   3m             0.1 -4.629103
1  SOXX   3m             0.1 -6.031621
2   PSI   3m             0.1 -1.676497
3   XSD   3m             0.1 -4.457733
4   IYW   3m             0.1  5.175039
Top ETFs for 2022-03-01 to 2022-03-31: ['IYW




In [16]:
# Now call the weekly workflow using the 'results' as etf_dict
ticker_shares_per_month_with_smoothing = main_monthly(tickers, results,True)


Fetching full-year data for all tickers...
Downloading data for SMH for the entire year...


[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed

Downloading data for SOXX for the entire year...
Downloading data for PSI for the entire year...
Downloading data for XSD for the entire year...
Downloading data for IYW for the entire year...
Downloading data for XLK for the entire year...
Downloading data for VGT for the entire year...
Downloading data for FTEC for the entire year...
Downloading data for IGM for the entire year...
Downloading data for IXN for the entire year...
Processing scores for 2022-01-01 to 2022-01-31:
    ETF Week  RiskPercentage     Score
0   SMH   1m             0.1  3.305628
1  SOXX   1m             0.1  0.564744
2   PSI   1m             0.1 -0.193390
3   XSD   1m             0.1 -1.890366
4   IYW   1m             0.1 -2.035576
Top ETFs for 2022-01-01 to 2022-01-31: ['SMH', 'XLK']
Processing scores for 2022-02-01 to 2022-02-28:
    ETF Week  RiskPercentage     Score
0   SMH   2m             0.1  0.132512
1  SOXX   2m             0.1  1.073286
2   PSI   2m             0.1  5.654588
3   XSD   2m             0


