In [113]:
import pandas as pd
from darts import TimeSeries
from darts.models import LightGBMModel
from darts.dataprocessing.transformers import Scaler
from darts.utils.timeseries_generation import datetime_attribute_timeseries

def allocate_stock_weights_rank_based(predicted_returns, list_stocks, N=10, max_weight=0.10):
    """
    Allocates stock weights based on a 1/rank allocation strategy, giving higher weight to higher-ranked stocks.
    
    Parameters:
        predicted_returns (dict): A dictionary of stock symbols and their predicted returns.
        list_stocks (list): A list of all stock symbols.
        N (int): The number of top stocks to allocate weights to (default is 10).
        max_weight (float): Maximum allowable weight per stock (default is 10%).
    
    Returns:
        list: A list of weights corresponding to the stocks in list_stocks.
    """
    # Sort the stocks by their predicted returns
    sorted_stocks = sorted(predicted_returns, key=predicted_returns.get, reverse=True)
    
    # Select the top N stocks with the highest predicted returns
    top_stocks = sorted_stocks[:N]
    
    # Compute weights using the 1/rank strategy, ensuring that no stock exceeds the max weight
    total_rank_weight = sum(1 / (rank + 1) for rank in range(len(top_stocks)))  # Normalize total rank weight
    
    stock_weights = {}
    for rank, stock in enumerate(top_stocks):
        # Compute weight based on 1/rank and normalize it so the sum of weights does not exceed 1
        weight = (1 / (rank + 1)) / total_rank_weight * min(max_weight, 1)
        stock_weights[stock] = min(weight, max_weight)
    
    # Create a list of weights corresponding to the stocks in list_stocks
    weights = [stock_weights.get(stock, 0) for stock in list_stocks]
    
    return weights



# Function to prepare the data and fit a multivariate LightGBM model
def predict(df_test, list_stocks, df_returns, df_weights):
    
    # Convert the returns dataframe into TimeSeries format for Darts
    # Create a multivariate time series by combining all stock series
    df_returns['month_end'] = pd.to_datetime(df_returns['month_end'])
    df_test['month_end'] = pd.to_datetime(df_test['month_end'])


    multivariate_series = TimeSeries.from_dataframe(df_returns, 'month_end', list_stocks)

    # Normalize the data (optional, but useful for LightGBM)
    scaler = Scaler()
    multivariate_series = scaler.fit_transform(multivariate_series)
    
    # Create the LightGBM model and train on the multivariate series
    model = LightGBMModel(lags=12)  # adjust lags for the time horizon you're interested in
    model.fit(multivariate_series)
    
    # Initialize predicted weights dataframe
    predicted_returns = {}

    # Loop through the test set
    for i in range(len(df_test)):
        # Get the data up to the current month in the test set
        df_latest = df_returns[df_returns['month_end'] < df_test.loc[i, 'month_end']]
        
        # Forecast the next returns for all stocks (multivariate prediction)
        forecast = model.predict(n=1)

        # Reverse scaling transformation (if used)
        forecast = scaler.inverse_transform(forecast)
        
        # Store predicted returns in a dictionary for sorting
        for idx, stock in enumerate(list_stocks):
            predicted_returns[stock] = forecast[stock].values()[0][0]  # Extract the forecasted value

        weights = allocate_stock_weights_rank_based(predicted_returns, list_stocks, N=10, max_weight=0.10)

        
        # Add the current weights to the DataFrame
        df_this = pd.DataFrame(data=[[df_test.loc[i, 'month_end']] + weights], 
                               columns=['month_end'] + list_stocks)
        df_weights = pd.concat([df_weights, df_this], ignore_index=True)

        for i in range(len(df_test)):

            # latest data at this point
            df_latest = df_returns[(df_returns['month_end'] < df_test.loc[i, 'month_end'])]
                    
            # vol calc
            df_w = pd.DataFrame()
            df_w['vol'] = df_latest.std(numeric_only=True)          # calculate stock volatility
            df_w['inv_vol'] = 1/df_w['vol']                         # calculate the inverse volatility
            df_w['tot_inv_vol'] = df_w['inv_vol'].sum()             # calculate the total inverse volatility
            df_w['weight'] = df_w['inv_vol']/df_w['tot_inv_vol']    # calculate weight based on inverse volatility
            df_w.reset_index(inplace=True, names='name')

            # add to all weights
            df_this = pd.DataFrame(data=[[df_test.loc[i, 'month_end']] + df_w['weight'].to_list()], columns=df_latest.columns)
            df_weights = pd.concat(objs=[df_weights, df_this], ignore_index=True)
        
    
    return df_weights


In [114]:
# %%
import numpy as np
import pandas as pd
import datetime
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error
from concurrent.futures import ProcessPoolExecutor
import plotly.express as px
import concurrent.futures
import darts
from statsmodels.tsa.holtwinters import ExponentialSmoothing

# ...



print('---Python script Start---', str(datetime.datetime.now()))

# %%

# data reads
df_returns_train = pd.read_csv('data/returns_train.csv')
df_returns_test = pd.read_csv('data/returns_test.csv')
df_returns_train['month_end'] = pd.to_datetime(arg=df_returns_train['month_end']).apply(lambda d: d.date())
df_returns_test['month_end'] = pd.to_datetime(arg=df_returns_test['month_end']).apply(lambda d: d.date())

# %%

def equalise_weights(df: pd.DataFrame):

    '''
        Function to generate the equal weights, i.e. 1/p for each active stock within a month

        Args:
            df: A return data frame. First column is month end and remaining columns are stocks

        Returns:
            A dataframe of the same dimension but with values 1/p on active funds within a month

    '''

    # create df to house weights
    n_length = len(df)
    df_returns = df
    df_weights = df_returns[:n_length].copy()
    df_weights.set_index('month_end', inplace=True)

    # list of stock names
    list_stocks = list(df_returns.columns)
    list_stocks.remove('month_end')

    # assign 1/p
    df_weights[list_stocks] = 1/len(list_stocks)

    return df_weights


# %%

def generate_portfolio(df_train: pd.DataFrame, df_test: pd.DataFrame):

    '''
        Function to generate stocks weight allocation for time t+1 using historic data. Initial weights generated as 1/p for active stock within a month

        Args:
            df_train: The training set of returns. First column is month end and remaining columns are stocks
            df_test: The testing set of returns. First column is month end and remaining columns are stocks

        Returns:
            The returns dataframe and the weights
    '''

    print('---> training set spans', df_train['month_end'].min(), df_train['month_end'].max())
    print('---> training set spans', df_test['month_end'].min(), df_test['month_end'].max())

    # initialise data
    n_train = len(df_train)
    df_returns = pd.concat(objs=[df_train, df_test], ignore_index=True)

    df_weights = equalise_weights(df_returns[:n_train]) # df to store weights and create initial

    # list of stock names
    list_stocks = list(df_returns.columns)
    list_stocks.remove('month_end')

    # <<--------------------- YOUR CODE GOES BELOW THIS LINE --------------------->>
    
    


    df_weights = predict(df_test, list_stocks, df_returns, df_weights)

    print(f"AFTER: {len(df_weights)}")
    
    # <<--------------------- YOUR CODE GOES ABOVE THIS LINE --------------------->>
    
    # 10% limit check
    if len(np.array(df_weights[list_stocks])[np.array(df_weights[list_stocks]) > 0.101]):

        raise Exception(r'---> 10% limit exceeded')

    return df_returns, df_weights


# %%


def plot_total_return(df_returns: pd.DataFrame, df_weights_index: pd.DataFrame, df_weights_portfolio: pd.DataFrame):

    '''
        Function to generate the two total return indices.

        Args:
            df_returns: Ascending date ordered combined training and test returns data.
            df_weights_index: Index weights. Equally weighted
            df_weights_index: Portfolio weights. Your portfolio should use equally weighted for the training date range. If blank will be ignored

        Returns:
            A plot of the two total return indices and the total return indices as a dataframe
    '''

    # list of stock names
    list_stocks = list(df_returns.columns)
    list_stocks.remove('month_end')

    # replace nans with 0 in return array
    ar_returns = np.array(df_returns[list_stocks])
    np.nan_to_num(x=ar_returns, copy=False, nan=0)

    # calc index
    ar_rtn_index = np.array(df_weights_index[list_stocks])*ar_returns
    ar_rtn_port = np.array(df_weights_portfolio[list_stocks])*ar_returns

    v_rtn_index = np.sum(ar_rtn_index, axis=1)
    v_rtn_port = np.sum(ar_rtn_port, axis=1)

    # add return series to dataframe
    df_rtn = pd.DataFrame(data=df_returns['month_end'], columns=['month_end'])
    df_rtn['index'] = v_rtn_index
    df_rtn['portfolio'] = v_rtn_port
    df_rtn

    # create total return
    base_price = 100
    df_rtn.sort_values(by = 'month_end', inplace = True)
    df_rtn['index_tr'] = ((1 + df_rtn['index']).cumprod()) * base_price
    df_rtn['portfolio_tr'] = ((1 + df_rtn['portfolio']).cumprod()) * base_price
    df_rtn

    df_rtn_long = df_rtn[['month_end', 'index_tr', 'portfolio_tr']].melt(id_vars='month_end', var_name='series', value_name='Total Return')

    # plot
    fig1 = px.line(data_frame=df_rtn_long, x='month_end', y='Total Return', color='series')

    return fig1, df_rtn

# %%





# running solution
df_returns = pd.concat(objs=[df_returns_train, df_returns_test], ignore_index=True)
df_weights_index = equalise_weights(df_returns)
df_returns, df_weights_portfolio = generate_portfolio(df_returns_train, df_returns_test)
fig1, df_rtn = plot_total_return(df_returns, df_weights_index=df_weights_index, df_weights_portfolio=df_weights_portfolio)
fig1


---Python script Start--- 2024-09-09 09:53:54.144093
---> training set spans 2010-01-31 2017-08-31
---> training set spans 2017-09-30 2022-09-30
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 31078
[LightGBM] [Info] Number of data points in the train set: 141, number of used features: 648
[LightGBM] [Info] Start training from score 0.415383
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 31078
[LightGBM] [Info] Number of data points in the train set: 141, number of used features: 648
[LightGBM] [Info] Start training from score 0.492669
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 31078
[LightGBM] [Info] Number of data points in the train set: 141, number of used features: 648
[LightGBM] [Info] Start training from score 0.426048
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 31078
[LightGBM] [Info] Number of data points in the t

ValueError: operands could not be broadcast together with shapes (3874,54) (153,54) 

KeyError: 1