In [None]:
import warnings, gc
from math import ceil, sqrt
from decimal import ROUND_HALF_UP, Decimal
from datetime import datetime, timedelta
import pickle

import numpy as np 
import pandas as pd

from tqdm import tqdm

import matplotlib.colors
import matplotlib.pyplot as plt

from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_error

from xgboost import XGBRegressor

import jpx_tokyo_market_prediction

# Silence every Python warning for the rest of the notebook (this also hides
# pandas chained-assignment / deprecation warnings raised by the cells below).
warnings.filterwarnings("ignore")

# Notebook display limits: show up to 50 rows and never truncate columns.
pd.options.display.max_rows = 50
pd.options.display.min_rows = 50
pd.options.display.max_columns = None

# Register DataFrame/GroupBy.progress_apply, which the pipeline functions below rely on.
tqdm.pandas()

# Test config
# NOTE(review): none of the functions visible in this part of the notebook read
# these keys; they are presumably consumed by a driver cell further down - confirm.
test_config = {
    # Per-step switches: set a step to False to skip it and shorten a test run
    'load_data': True,
    'calc_features': True,
    'train_cv': False,
    'predict_cv': False,
    'train_sector': True,
    'predict_sector': True,
    
    # Specify if we use XGBoost with GPU or not
    # NOTE(review): xgb_submit_params below hard-codes tree_method 'hist';
    # presumably the training code switches it when use_gpu is True - confirm.
    'use_gpu': True,
    'early_stopping_rounds': 5,
    'verbose': 25,
    'ignore_count': 0
}

# XGBoost training parameters
# (keyword arguments for XGBRegressor - presumably passed as XGBRegressor(**xgb_submit_params); verify in the training cell)
xgb_submit_params = {
    'verbosity': 1,
    'objective': 'reg:squarederror',
    'n_estimators': 10_000,
    'learning_rate': 0.02,
    'max_depth': 14,
    'random_state': 21,
    'tree_method': 'hist'
}

In [None]:
# Read csv files
def load_csv(test_config):
    """Read the competition CSV files and return them as a dict of DataFrames.

    For every data set listed in ``inputs`` the train file and the supplemental
    file are read and concatenated. The price and financial frames are then
    enriched with ``Name``, ``17SectorCode`` and ``33SectorCode`` from
    ``stock_list.csv`` and sorted by ``SecuritiesCode`` and ``Date``.

    Parameters
    ----------
    test_config : dict
        Accepted for signature compatibility; not read by this function.

    Returns
    -------
    dict
        Maps the data set name ('financials', 'secondary_stock_prices',
        'stock_prices') to its combined DataFrame.
    """
    print('load_csv()')

    base_dir = "../input/jpx-tokyo-stock-exchange-prediction"
    dataset_dir = f"{base_dir}/train_files"
    supplemental_dataset_dir = f"{base_dir}/supplemental_files"

    df_stock_list = pd.read_csv(f"{base_dir}/stock_list.csv")

    # Normalise the free-text columns ("  FOO BAR " -> "Foo bar").
    # The .str accessor leaves missing values as NaN instead of raising AttributeError.
    for column in ['Name', '17SectorName', '33SectorName']:
        df_stock_list[column] = df_stock_list[column].str.rstrip().str.lower().str.capitalize()

    # Sector codes use '-' (or are empty) when no sector applies; map both to '0'
    # so the columns can be cast to int after the merge below.
    for column in ['17SectorCode', '33SectorCode']:
        df_stock_list[column] = df_stock_list[column].replace('-', '0').fillna('0')

    # List of data files
    inputs = {
        "financials": [f"{dataset_dir}/financials.csv", f"{supplemental_dataset_dir}/financials.csv"],
        #"options": [f"{dataset_dir}/options.csv", f"{supplemental_dataset_dir}/options.csv"],
        "secondary_stock_prices": [f"{dataset_dir}/secondary_stock_prices.csv", f"{supplemental_dataset_dir}/secondary_stock_prices.csv"],
        "stock_prices": [f"{dataset_dir}/stock_prices.csv", f"{supplemental_dataset_dir}/stock_prices.csv"],
        #"trades": [f"{dataset_dir}/trades.csv", f"{supplemental_dataset_dir}/trades.csv"]
    }
    stock_columns = ['SecuritiesCode', 'Name', '17SectorCode', '33SectorCode']

    # dict_input is a dict of raw input dataframes
    dict_input = dict()

    for k, v in tqdm(inputs.items()):
        frames = []
        for file_name in v:
            # trades.csv carries its timestamp in 'PublishedDate'; all other files use 'Date'
            date_column = 'PublishedDate' if 'trades.csv' in file_name else 'Date'
            frames.append(pd.read_csv(file_name, parse_dates = [date_column]))

        df = pd.concat(frames, ignore_index = True)

        if k in ['financials', 'secondary_stock_prices', 'stock_prices']:
            df = df.merge(df_stock_list[stock_columns], on = 'SecuritiesCode', how = 'left')
            # Rows without a match in the stock list get code/sector 0
            df['SecuritiesCode'] = df['SecuritiesCode'].fillna(0).astype(int)
            df['17SectorCode'] = df['17SectorCode'].fillna('0').astype(int)
            df['33SectorCode'] = df['33SectorCode'].fillna('0').astype(int)
            df = df.sort_values(['SecuritiesCode', 'Date'])
        else:
            df = df.sort_values('Date')

        dict_input[k] = df

    for k, v in dict_input.items():
        print(f'{k}.info()')
        # DataFrame.info() prints by itself and returns None, so it must not be wrapped in print()
        v.info()

    return dict_input

In [None]:
def adjust_price(df_price):
    """Add adjusted OHLCV columns per security and fill gaps in the price data.

    For each ``SecuritiesCode`` group the nested helper adds
    ``CumulativeAdjustmentFactor`` and ``Adjusted_{Close,Open,High,Low,Volume}``
    and fills missing values (see the helper for the exact rules).

    Requires ``tqdm.pandas()`` to have been called, because the groups are
    processed with ``progress_apply``.

    Parameters
    ----------
    df_price : pd.DataFrame
        Must contain ``SecuritiesCode``, ``AdjustmentFactor``, ``Close``,
        ``Open``, ``High``, ``Low`` and ``Volume``.
        NOTE(review): the cumulative factor is built in row order, so rows are
        presumably expected to be in ascending date order within each
        security - confirm against the caller.

    Returns
    -------
    pd.DataFrame
        The processed groups with a fresh 0..n-1 index; rows in which Close,
        Open, High and Low are all NaN are dropped.
    """
    print('adjust_price()')
    def generate_adjusted_features(df):
        """Compute adjusted columns and fill NaNs for one security's rows."""
        # NOTE: this is an alias, not a copy - the group frame handed in by groupby is modified in place.
        df_copy = df
        
        # generate CumulativeAdjustmentFactor
        # Running product of the factors, shifted down one row so that a factor
        # only affects the rows after it; the first row gets 1.0.
        df_copy.loc[:, "CumulativeAdjustmentFactor"] = df_copy["AdjustmentFactor"].cumprod().shift(1).fillna(1.0)
        
        # generate Adjusted prices
        # "Close" must stay first in this list: the Open/High/Low fill below reads the already-filled Close columns.
        for column in ["Close", "Open", "High", "Low", "Volume"]:
            # Adjusted columns
            # Prices are divided by the cumulative factor, volume is multiplied by it;
            # the result is rounded half-up to one decimal place via Decimal.
            df_copy.loc[:, f'Adjusted_{column}'] = (df_copy[column] / df_copy["CumulativeAdjustmentFactor"] if column != "Volume" else df_copy[column] * df_copy["CumulativeAdjustmentFactor"]).map(lambda x: float(Decimal(str(x)).quantize(Decimal('0.1'), rounding = ROUND_HALF_UP)))
            # An adjusted value of exactly 0 is treated as missing
            df_copy.loc[df_copy[f'Adjusted_{column}'] == 0, f'Adjusted_{column}'] = np.nan
            
            # NaN handling
            if column == "Close":
                # First, forward fill for Close
                df_copy.loc[:, column] = df_copy.loc[:, column].ffill()
                df_copy.loc[:, f'Adjusted_{column}'] = df_copy.loc[:, f'Adjusted_{column}'].ffill()
            elif column == "High" or column == "Low" or column == "Open":
                # High, Low and Open should be the same as Close
                df_copy.loc[np.isnan(df_copy[column]), column] = df_copy.loc[np.isnan(df_copy[column]), 'Close']
                df_copy.loc[np.isnan(df_copy[f'Adjusted_{column}']), f'Adjusted_{column}'] = df_copy.loc[np.isnan(df_copy[f'Adjusted_{column}']), 'Adjusted_Close']
            else:
                # Volume should be 0
                df_copy.loc[np.isnan(df_copy[column]), column] = 0
                df_copy.loc[np.isnan(df_copy[f'Adjusted_{column}']), f'Adjusted_{column}'] = 0
        return df_copy
    
    print('Adjust OHLCV by adjustment factor')
    df_price = df_price.groupby("SecuritiesCode").progress_apply(generate_adjusted_features).reset_index(drop = True)
    # how='all': only rows where every one of the four raw prices is still NaN are removed
    # (e.g. leading rows of a security that forward fill could not reach).
    return df_price.dropna(subset = ['Close', 'Open', 'High', 'Low'], how = 'all')


def adjust_financial(df_financial):
    """Derive per-disclosure fundamental features from the raw financials frame.

    Steps:

    1. Keep rows that have a ``DisclosedTime`` and whose ``TypeOfDocument``
       contains 'FinancialStatements'.
    2. Move ``Date`` to the next calendar day when the disclosure happened at
       15:00 Asia/Tokyo or later (too late to be used on the same day).
    3. Coerce ``NetSales``, ``Profit``, ``ForecastProfit`` and
       ``AverageNumberOfShares`` to numbers (unparsable values become 0).
    4. Compute ``EPS`` and ``ProfitMargin``; division by zero yields 0.
    5. Compute the relative change of EPS / NetSales / Profit / ForecastProfit
       versus the previous row of the same
       (``SecuritiesCode``, ``TypeOfCurrentPeriod``) group; undefined changes
       (first row of a group, previous value 0) become 0.

    The input frame is not modified.

    Parameters
    ----------
    df_financial : pd.DataFrame
        Raw financial disclosures with a datetime ``Date`` column.

    Returns
    -------
    pd.DataFrame
        Columns ``Date``, ``SecuritiesCode``, ``EPS``, ``ProfitMargin``,
        ``EPS_pct_change``, ``NetSales_pct_change``, ``Profit_pct_change``,
        ``ForecastProfit_pct_change``; rows keep their original index labels.
    """
    print('adjust_financial()')

    def zero_if_undefined(series):
        # Assign-back form instead of `.replace(..., inplace=True)` on a
        # selected column: the inplace call operates on a temporary and is a
        # silent no-op under pandas Copy-on-Write.
        return series.replace([np.inf, -np.inf], np.nan).fillna(0)

    # Drop invalid rows and extract only financial statements.
    # na=False: a missing TypeOfDocument is treated as "not a statement" instead
    # of producing a NaN mask that boolean indexing rejects.
    has_disclosed_time = df_financial['DisclosedTime'].notna()
    is_statement = df_financial['TypeOfDocument'].str.contains('FinancialStatements', na = False)
    # Explicit copy so the new columns below never write into the caller's frame
    df_financial = df_financial[has_disclosed_time & is_statement].copy()

    # Rows with a missing group key cannot be compared with a previous period
    # (groupby would silently discard them), so drop them up front.
    df_financial = df_financial.dropna(subset = ['SecuritiesCode', 'TypeOfCurrentPeriod'])

    # If disclosure time >= 15:00, we can't use the information on the day
    disclosed_jst = pd.to_datetime(df_financial['DisclosedUnixTime'], unit = 's', utc = True).dt.tz_convert('Asia/Tokyo')
    df_financial['DisclosedDateTime'] = disclosed_jst
    after_close = (disclosed_jst.dt.hour >= 15).astype(int)
    df_financial['Date'] = df_financial['Date'] + pd.to_timedelta(after_close, unit = 'D')

    # ToDo: Reflect financial revisions

    # Convert key results to float
    for column in ['NetSales', 'Profit', 'ForecastProfit', 'AverageNumberOfShares']:
        df_financial[column] = pd.to_numeric(df_financial[column], errors = 'coerce').fillna(0)

    # Calc EPS
    df_financial['EPS'] = zero_if_undefined(df_financial['Profit'] / df_financial['AverageNumberOfShares'])

    # Calc Profit Margin
    df_financial['ProfitMargin'] = zero_if_undefined(df_financial['Profit'] / df_financial['NetSales'])

    # Calc change from the previous financial period
    print('Calc q2q financial pct_change')
    by_period = df_financial.groupby(['SecuritiesCode', 'TypeOfCurrentPeriod'], sort = False)
    for column in ['EPS', 'NetSales', 'Profit', 'ForecastProfit']:
        # (current - previous) / |previous| within each group
        previous = by_period[column].shift(1)
        df_financial[f'{column}_pct_change'] = zero_if_undefined((df_financial[column] - previous) / previous.abs())

    return df_financial[['Date', 'SecuritiesCode', 'EPS', 'ProfitMargin', 'EPS_pct_change', 'NetSales_pct_change', 'Profit_pct_change', 'ForecastProfit_pct_change']]

def load_data(test_config):
    """Build the adjusted price table joined with fundamentals and pickle it.

    Pipeline: read the CSV files, derive financial features, adjust prices for
    the cumulative adjustment factor, attach to every price row the most recent
    financial disclosure of the same security (as-of join on ``Date``), add the
    ``ResidualTarget`` column and write the result to ``df_adjusted_price.pkl``.

    Security 1306 (taken from the secondary price list) is carried through the
    adjustment step as a market proxy and removed again before the frame is
    written. Rows containing any NaN are dropped before pickling.

    Parameters
    ----------
    test_config : dict
        Passed through to ``load_csv``.

    Returns
    -------
    None
        The result is written to ``df_adjusted_price.pkl`` in the working directory.
    """
    print('load_data()')

    dict_input = load_csv(test_config)

    # Adjust financial data
    df_adjusted_financial = adjust_financial(dict_input['financials'])
    gc.collect()
    df_adjusted_financial = df_adjusted_financial.sort_values(['Date', 'SecuritiesCode'], ascending = True).reset_index(drop = True)

    # Use 1306 as a market return (Use 1306 TOPIX ETF as a proxy of market return)
    secondary_prices = dict_input['secondary_stock_prices']
    market_proxy = secondary_prices.loc[secondary_prices['SecuritiesCode'] == 1306, :]
    df_adjusted_price = pd.concat([dict_input['stock_prices'], market_proxy])

    # Reflect AdjustmentFactor to price, etc...
    df_adjusted_price = adjust_price(df_adjusted_price)
    gc.collect()
    # merge_asof below requires both frames to be sorted by the 'on' key
    df_adjusted_price = df_adjusted_price.sort_values('Date', ascending = True).reset_index(drop = True)
    df_adjusted_price = df_adjusted_price.drop(['RowId', 'AdjustmentFactor', 'CumulativeAdjustmentFactor', 'ExpectedDividend'], axis = 1)

    # Attach the latest financial disclosure on or before each price date
    print('Merge price data and financial data')
    financial_columns = ['Date', 'SecuritiesCode', 'EPS', 'ProfitMargin', 'EPS_pct_change', 'NetSales_pct_change', 'Profit_pct_change', 'ForecastProfit_pct_change']
    df_adjusted_price = pd.merge_asof(df_adjusted_price, df_adjusted_financial[financial_columns], by = 'SecuritiesCode', on = 'Date', direction = 'backward')
    df_adjusted_price = df_adjusted_price.reset_index(drop = True)
    gc.collect()

    # Residual return.
    # The market-neutral variant (Target minus the same-day Target of 1306) is
    # currently disabled, so ResidualTarget is simply a copy of Target. Assigning
    # the column directly avoids a per-date groupby.apply whose market-return
    # lookup (`.values[0]`) raised IndexError on any date without a 1306 row even
    # though the looked-up value was never used.
    df_adjusted_price['ResidualTarget'] = df_adjusted_price['Target']

    # Dump to pickle file
    df_adjusted_price.loc[df_adjusted_price['SecuritiesCode'] != 1306, :].dropna().to_pickle("df_adjusted_price.pkl")

In [None]:
# Features engineering
def ror(pivots, window):
    """Rate of return of the close over ``window`` rows, in long format.

    ``pivots`` is the 5-tuple (closes, opens, highs, lows, volumes) of wide
    frames. The wide result is melted with the index kept, the index is turned
    back into a column, rows containing NaN are removed and the value column is
    named ``ror_{window}``.
    """
    close_px, _open_px, _high_px, _low_px, _volume = pivots
    returns = close_px.pct_change(window)
    long_form = returns.melt(ignore_index = False).reset_index()
    return long_form.dropna().rename(columns = {"value": f"ror_{window}"})

def qvol(pivots, window):
    """Rolling sum of quote volume over ``window`` rows, in long format.

    Quote volume per row is volume times the mean of close and open. Rows with
    NaN (incomplete windows) are removed; the value column is ``qvol_{window}``.
    """
    close_px, open_px, _high_px, _low_px, volume = pivots
    per_row_quote_volume = volume * (close_px + open_px) / 2
    summed = per_row_quote_volume.rolling(window).sum()
    long_form = summed.melt(ignore_index = False).reset_index()
    return long_form.dropna().rename(columns = {"value": f"qvol_{window}"})

def atr(pivots, window):
    """Window range feature: largest of three range measures, in long format.

    For every row the element-wise maximum is taken of
    (a) rolling high minus rolling low,
    (b) |rolling high - close shifted by ``window + 1`` rows| and
    (c) |rolling low - close shifted by ``window + 1`` rows|.
    A NaN in any of the three propagates, and such rows are removed. The value
    column is ``atr_{window}``.

    NOTE(review): a conventional true range compares with the close just before
    the window (``shift(window)``); ``window + 1`` looks like a possible
    off-by-one - confirm the intent before changing it.
    """
    close_px, _open_px, high_px, low_px, _volume = pivots
    window_high = high_px.rolling(window).max()
    window_low = low_px.rolling(window).min()
    reference_close = close_px.shift(window + 1)

    full_range = window_high - window_low
    gap_to_high = (window_high - reference_close).abs()
    gap_to_low = (window_low - reference_close).abs()

    stacked = np.stack([full_range.to_numpy(), gap_to_high.to_numpy(), gap_to_low.to_numpy()])
    widest = pd.DataFrame(stacked.max(axis = 0), index = full_range.index, columns = full_range.columns)
    long_form = widest.melt(ignore_index = False).reset_index()
    return long_form.dropna().rename(columns = {"value": f"atr_{window}"})

def gaphigh(pivots, window):
    """Absolute gap between the rolling high and an earlier close, long format.

    Computes |max of highs over ``window`` rows - close shifted by
    ``window + 1`` rows|, drops rows with NaN and names the value column
    ``gaphigh_{window}``.
    """
    close_px, _open_px, high_px, _low_px, _volume = pivots
    window_high = high_px.rolling(window).max()
    reference_close = close_px.shift(window + 1)
    gap = (window_high - reference_close).abs()
    long_form = gap.melt(ignore_index = False).reset_index()
    return long_form.dropna().rename(columns = {"value": f"gaphigh_{window}"})

def gaplow(pivots, window):
    """Absolute gap between the rolling low and an earlier close, long format.

    Computes |min of lows over ``window`` rows - close shifted by
    ``window + 1`` rows|, drops rows with NaN and names the value column
    ``gaplow_{window}``.
    """
    close_px, _open_px, _high_px, low_px, _volume = pivots
    window_low = low_px.rolling(window).min()
    reference_close = close_px.shift(window + 1)
    gap = (window_low - reference_close).abs()
    long_form = gap.melt(ignore_index = False).reset_index()
    return long_form.dropna().rename(columns = {"value": f"gaplow_{window}"})

def vola(pivots, window):
    """Rolling standard deviation of row-to-row close differences, long format.

    The close is differenced once, the standard deviation is taken over
    ``window`` rows, rows with NaN are removed and the value column is named
    ``vola_{window}``.
    """
    close_px, _open_px, _high_px, _low_px, _volume = pivots
    close_moves = close_px.diff()
    dispersion = close_moves.rolling(window).std()
    long_form = dispersion.melt(ignore_index = False).reset_index()
    return long_form.dropna().rename(columns = {"value": f"vola_{window}"})

def HL(pivots, window):
    """High-low range over ``window`` rows, in long format.

    Rolling maximum of the highs minus rolling minimum of the lows; rows with
    NaN are removed and the value column is named ``hl_{window}``.
    """
    _close_px, _open_px, high_px, low_px, _volume = pivots
    window_high = high_px.rolling(window).max()
    window_low = low_px.rolling(window).min()
    long_form = (window_high - window_low).melt(ignore_index = False).reset_index()
    return long_form.dropna().rename(columns = {"value": f"hl_{window}"})

def market_impact(pivots, window):
    """Close change per unit of traded volume over ``window`` rows, long format.

    Divides the ``window``-row close difference by the rolling volume sum of
    the same length; rows with NaN are removed and the value column is named
    ``mi_{window}``.
    """
    close_px, _open_px, _high_px, _low_px, volume = pivots
    price_move = close_px.diff(window)
    traded = volume.rolling(window).sum()
    long_form = (price_move / traded).melt(ignore_index = False).reset_index()
    return long_form.dropna().rename(columns = {"value": f"mi_{window}"})

def ma(pivots, window):
    """Simple moving average of the close over ``window`` rows, long format.

    Rows with NaN (incomplete windows) are removed and the value column is
    named ``ma_{window}``.
    """
    close_px, _open_px, _high_px, _low_px, _volume = pivots
    average = close_px.rolling(window).mean()
    long_form = average.melt(ignore_index = False).reset_index()
    return long_form.dropna().rename(columns = {"value": f"ma_{window}"})

def _relative_change(current, reference):
    """Return (current - reference) / |reference| with inf and NaN replaced by 0."""
    change = (current - reference) / reference.abs()
    return change.replace([np.inf, -np.inf], np.nan).fillna(0)


def calc_features(test_config):
    """Compute technical features and write them to ``df_features_target.pkl``.

    Reads ``df_adjusted_price.pkl``, builds wide (Date x SecuritiesCode) pivots
    of the adjusted OHLCV columns, merges the rolling features produced by
    ``vola``, ``ror``, ``ma``, ``qvol``, ``atr``, ``gaphigh``, ``gaplow`` and
    ``market_impact`` back onto the long frame, and replaces every raw feature
    by two kinds of relative change:

    * ``{name}_{window}_pct_change``: change versus the previous row of the
      same security;
    * ``{name}_{base}_{window}_pct_change``: shortest-window value versus the
      ``window`` value on the same row (base is 1, or 2 for ``vola``).

    Undefined changes (no previous row, division by zero) are set to 0. A
    ``dateofweek`` column (Monday = 0) is added as well.

    Parameters
    ----------
    test_config : dict
        Accepted for signature compatibility; not read by this function.

    Returns
    -------
    None
        The feature frame is written to ``df_features_target.pkl``.
    """
    print('calc_features()')

    df_adjusted_price = pd.read_pickle('df_adjusted_price.pkl')

    # Calc technical features
    def wide(value_column):
        # One column per security, one row per date; gaps are forward filled
        return pd.pivot_table(df_adjusted_price, values = value_column, index = "Date", columns = "SecuritiesCode").ffill()

    closes = wide("Adjusted_Close")
    opens = wide("Adjusted_Open")
    highs = wide("Adjusted_High")
    lows = wide("Adjusted_Low")
    volumes = wide("Adjusted_Volume")

    pivots = (closes, opens, highs, lows, volumes)

    # Feature functions in merge order, each with the windows it is evaluated on.
    # HL (column prefix 'hl') is intentionally left out.
    short_windows = [1, 5, 10, 20]
    vola_windows = [2, 5, 10, 20]  # a standard deviation needs at least two differences
    feature_functions = [(vola, vola_windows)] + [(func, short_windows) for func in [ror, ma, qvol, atr, gaphigh, gaplow, market_impact]]

    # Prepare features columns
    print('Calc basic features')
    for func, windows in feature_functions:
        for window in tqdm(windows):
            df_adjusted_price = pd.merge(df_adjusted_price, func(pivots, window), on = ["Date","SecuritiesCode"], how = "left")
            gc.collect()

    del pivots, closes, opens, highs, lows, volumes
    gc.collect()

    # Add day of week as a feature (Monday = 0). Taken from the timestamp itself;
    # datetime.fromtimestamp() would convert through the machine's local time
    # zone and can return the previous day on hosts west of UTC.
    df_adjusted_price['dateofweek'] = df_adjusted_price['Date'].dt.dayofweek.astype('int64')

    # Calc pct_change features
    print('Calc pct_change features')
    # Column prefixes in output order, with their windows
    pct_change_families = [(prefix, short_windows) for prefix in ['ror', 'ma', 'atr', 'gaphigh', 'gaplow', 'qvol', 'mi']] + [('vola', vola_windows)]
    by_stock = df_adjusted_price.groupby('SecuritiesCode', sort = False)

    new_columns = {}
    for prefix, windows in pct_change_families:
        base_window = windows[0]
        base_values = df_adjusted_price[f'{prefix}_{base_window}']
        for window in windows:
            values = df_adjusted_price[f'{prefix}_{window}']

            # Change versus the previous row of the same security
            previous = by_stock[f'{prefix}_{window}'].shift(1)
            new_columns[f'{prefix}_{window}_pct_change'] = _relative_change(values, previous)

            # Shortest window versus this window, on the same row
            if window > base_window:
                new_columns[f'{prefix}_{base_window}_{window}_pct_change'] = _relative_change(base_values, values)

    # Attach all new columns at once to avoid fragmenting the frame
    df_adjusted_price = pd.concat([df_adjusted_price, pd.DataFrame(new_columns, index = df_adjusted_price.index)], axis = 1)
    del new_columns
    gc.collect()

    # Drop unnecessary features (the raw rolling values; only their changes are kept)
    print('Drop unnecessary features')
    drop_columns = [f'{prefix}_{window}' for prefix, windows in pct_change_families for window in windows]
    df_adjusted_price = df_adjusted_price.drop(drop_columns, axis = 1)

    # Feature calculation completed
    print('df_adjusted_price.info()')
    # DataFrame.info() prints by itself and returns None, so it must not be wrapped in print()
    df_adjusted_price.info()

    # Dump to file and free memory
    df_adjusted_price.to_pickle('df_features_target.pkl')

In [None]:
# Mostly copied from https://www.kaggle.com/treename/janestreet-cv-method-combpurgedkfoldcv
import itertools as itt
import numbers
import numpy as np
import pandas as pd

from abc import abstractmethod
from typing import Iterable, Tuple, List

class BaseTimeSeriesCrossValidator:
    """
    Common base for the time series cross-validators in this notebook.

    Every sample carries two times: pred_time, when its features are used to make a prediction, and eval_time, when
    the response is known and the error can be computed. The samples X, the response y, pred_times and eval_times
    therefore have to be pandas objects sharing one index. Samples are assumed to be ordered by prediction time
    (pred_times non-decreasing); this class does not verify that.

    Parameters
    ----------
    n_splits : int, default=10
        Number of folds. Must be at least 2.
    """
    def __init__(self, n_splits=10):
        if not isinstance(n_splits, numbers.Integral):
            raise ValueError(f"The number of folds must be of Integral type. {n_splits} of type {type(n_splits)}"
                             f" was passed.")
        n_splits = int(n_splits)
        if n_splits < 2:
            raise ValueError(f"K-fold cross-validation requires at least one train/test split by setting n_splits = 2 "
                             f"or more, got n_splits = {n_splits}.")
        self.n_splits = n_splits
        # Filled in by split()
        self.pred_times = None
        self.eval_times = None
        self.indices = None

    @abstractmethod
    def split(self, X: pd.DataFrame, y: pd.Series = None,
              pred_times: pd.Series = None, eval_times: pd.Series = None):
        """
        Validate the inputs and remember the times and the position indices 0..n_samples-1.

        Raises ValueError when an argument has the wrong type or when y, pred_times or eval_times do not share X's
        index. Subclasses call this first and then generate the actual splits.
        """
        if not isinstance(X, (pd.DataFrame, pd.Series)):
            raise ValueError('X should be a pandas DataFrame/Series.')
        if y is not None and not isinstance(y, pd.Series):
            raise ValueError('y should be a pandas Series.')

        time_series = (('pred_times', pred_times), ('eval_times', eval_times))
        for label, times in time_series:
            if not isinstance(times, pd.Series):
                raise ValueError(f'{label} should be a pandas Series.')

        # Element-wise index comparison: every label has to match X's label at the same position
        if y is not None and (X.index == y.index).sum() != len(y):
            raise ValueError('X and y must have the same index')
        for label, times in time_series:
            if (X.index == times.index).sum() != len(times):
                raise ValueError(f'X and {label} must have the same index')

        self.pred_times = pred_times
        self.eval_times = eval_times
        self.indices = np.arange(X.shape[0])


class CombPurgedKFoldCV(BaseTimeSeriesCrossValidator):
    """
    Combinatorial cross-validation with purging and embargo.
    Method from Advances in financial machine learning, Marcos Lopez de Prado, 2018.

    The samples are cut, without shuffling, into n_splits consecutive folds of (nearly) equal size. Each round uses
    n_test_splits of these folds as the test set and the remaining folds as the train set; one round is produced for
    every combination of n_test_splits folds out of n_splits.

    Each sample is tagged with a prediction time pred_time and an evaluation time eval_time. Train samples whose
    [pred_time, eval_time] interval overlaps the test set are dropped ("purged"). In addition, an "embargo" sets the
    minimal distance between an evaluation time in the test set and a prediction time of a later train sample, which
    limits contamination of the test set by the train set when the data are temporally correlated.

    Parameters
    ----------
    n_splits : int, default=10
        Number of folds. Must be at least 2.
    n_test_splits : int, default=2
        Number of folds used in the test set. Must be between 1 and n_splits - 1.
    embargo_td : default=0
        Embargo period (see above). The constructor compares it with 0, and it is added to evaluation times, so it
        has to be expressed in the same units as pred_times / eval_times.
    """
    def __init__(self, n_splits=10, n_test_splits=2, embargo_td=0):
        super().__init__(n_splits)
        if not isinstance(n_test_splits, numbers.Integral):
            raise ValueError(f"The number of test folds must be of Integral type. {n_test_splits} of type "
                             f"{type(n_test_splits)} was passed.")
        n_test_splits = int(n_test_splits)
        if not 1 <= n_test_splits <= self.n_splits - 1:
            raise ValueError(f"K-fold cross-validation requires at least one train/test split by setting "
                             f"n_test_splits between 1 and n_splits - 1, got n_test_splits = {n_test_splits}.")
        self.n_test_splits = n_test_splits

        if embargo_td < 0:
            raise ValueError(f"The embargo time should be positive, got embargo = {embargo_td}.")
        self.embargo_td = embargo_td

    def split(self, X: pd.DataFrame, y: pd.Series = None,
              pred_times: pd.Series = None, eval_times: pd.Series = None) -> Iterable[Tuple[np.ndarray, np.ndarray]]:
        """
        Generate (train_indices, test_indices) pairs, one per combination of test folds.

        The yielded arrays hold positions (0..n_samples-1), not index labels, even though the inputs are pandas
        objects. The combinations are visited in reverse order, so the first round tests on the folds at the very
        end of the data.

        Parameters
        ----------
        X : pd.DataFrame, shape (n_samples, n_features), required
            Samples. Only its length and index are used.
        y : pd.Series, optional
            Only validated, never used for splitting.
        pred_times : pd.Series, shape (n_samples,), required
            Prediction times; the index has to match X.index.
        eval_times : pd.Series, shape (n_samples,), required
            Times at which the response becomes known; the index has to match X.index.

        Yields
        ------
        train_indices : np.ndarray
            Positions of the train samples after purging and embargo.
        test_indices : np.ndarray
            Positions of the test samples.
        """
        super().split(X, y, pred_times, eval_times)

        # Half-open (start, end) position bounds of each fold
        folds = np.array_split(self.indices, self.n_splits)
        fold_bounds = [(fold[0], fold[-1] + 1) for fold in folds]

        # Every way of picking n_test_splits folds as the test set, last combination first
        test_combinations = list(itt.combinations(fold_bounds, self.n_test_splits))
        for fold_bound_list in reversed(test_combinations):
            test_fold_bounds, test_indices = self.compute_test_set(fold_bound_list)
            train_indices = self.compute_train_set(test_fold_bounds, test_indices)
            yield train_indices, test_indices

    def compute_train_set(self, test_fold_bounds: List[Tuple[int, int]], test_indices: np.ndarray) -> np.ndarray:
        """
        Return the positions of the train samples for one round.

        Parameters
        ----------
        test_fold_bounds : List of tuples of position indices
            (start, end) bounds of each contiguous block of test samples.
        test_indices : np.ndarray
            All positions in the test set.

        Returns
        -------
        train_indices: np.ndarray
            Complement of the test set, purged and embargoed around every test block.
        """
        # Start from everything that is not a test sample ...
        remaining = np.setdiff1d(self.indices, test_indices)
        # ... then, block by block, purge first and apply the embargo to what is left
        for block_start, block_end in test_fold_bounds:
            purged = purge(self, remaining, block_start, block_end)
            remaining = embargo(self, purged, test_indices, block_end)
        return remaining

    def compute_test_set(self, fold_bound_list: List[Tuple[int, int]]) -> Tuple[List[Tuple[int, int]], np.ndarray]:
        """
        Return the merged test block bounds and the test positions for one round.

        Parameters
        ----------
        fold_bound_list: List of tuples of position indices
            (start, end) bounds of the folds selected for the test set.

        Returns
        -------
        test_fold_bounds: List of tuples of position indices
            The same bounds with adjacent folds merged into one block.
        test_indices: np.ndarray
            All positions in the test set.
        """
        merged_bounds = []
        test_indices = np.empty(0)
        for fold_start, fold_end in fold_bound_list:
            if merged_bounds and merged_bounds[-1][-1] == fold_start:
                # This fold starts where the previous block ends: extend that block
                block_start = merged_bounds[-1][0]
                merged_bounds[-1] = (block_start, fold_end)
            else:
                merged_bounds.append((fold_start, fold_end))
            fold_positions = self.indices[fold_start:fold_end]
            test_indices = np.union1d(test_indices, fold_positions).astype(int)
        return merged_bounds, test_indices


def compute_fold_bounds(cv: BaseTimeSeriesCrossValidator, split_by_time: bool) -> List[int]:
    """
    Return the left boundary position of every fold.

    Parameters
    ----------
    cv: BaseTimeSeriesCrossValidator
        Cross-validation object providing n_splits, indices and (for time based splitting) pred_times.
    split_by_time: bool
        False: folds hold an (approximately) equal number of samples. True: folds cover equal time spans, and the
        boundaries are located in cv.pred_times with searchsorted.
    """
    if not split_by_time:
        return [fold[0] for fold in np.array_split(cv.indices, cv.n_splits)]

    times = cv.pred_times
    span_per_fold = (times.max() - times.min()) / cv.n_splits
    # Boundaries are measured from the first prediction time
    boundary_times = [times.iloc[0] + span_per_fold * n for n in range(cv.n_splits)]
    return times.searchsorted(boundary_times)


def embargo(cv: BaseTimeSeriesCrossValidator, train_indices: np.ndarray,
            test_indices: np.ndarray, test_fold_end: int) -> np.ndarray:
    """
    Apply the embargo procedure to part of the train set.
    This drops the train samples located after test_fold_end whose prediction time falls within cv.embargo_td of the
    latest evaluation time seen up to test_fold_end. Train samples located before test_fold_end are never touched
    here.
    Parameters
    ----------
    cv: Cross-validation class
        Needs to have the attributes cv.pred_times, cv.eval_times, cv.embargo_td and cv.indices.
    train_indices: np.ndarray
        A numpy array containing all the indices of the samples currently included in the train set.
    test_indices : np.ndarray
        A numpy array containing all the indices of the samples in the test set. Kept for interface compatibility;
        the reference evaluation time is taken over all positions before test_fold_end.
    test_fold_end : int
        Index corresponding to the end of a test set block.
    Returns
    -------
    train_indices: np.ndarray
        The train indices with the ones subject to embargo removed.
    Raises
    ------
    ValueError
        If cv has no embargo_td attribute.
    """
    if not hasattr(cv, 'embargo_td'):
        raise ValueError("The passed cross-validation object should have a member cv.embargo_td defining the embargo "
                         "time.")
    last_test_eval_time = cv.eval_times.iloc[cv.indices[:test_fold_end]].max()
    # pred_times is non-decreasing, so the number of samples at or before the cut-off is also the position of the
    # first sample that may be used for training again.
    min_train_index = len(cv.pred_times[cv.pred_times <= last_test_eval_time + cv.embargo_td])
    # Always apply the restriction. When every remaining sample lies inside the embargo window, min_train_index
    # equals the number of samples and the tail slice is empty, so all train samples after the test block are
    # dropped. (Skipping the embargo in that case would keep exactly the samples that leak.)
    allowed_indices = np.concatenate((cv.indices[:test_fold_end], cv.indices[min_train_index:]))
    return np.intersect1d(train_indices, allowed_indices)


def purge(cv: BaseTimeSeriesCrossValidator, train_indices: np.ndarray,
          test_fold_start: int, test_fold_end: int) -> np.ndarray:
    """
    Remove from the train set the samples that overlap the start of a test block.

    A train sample located before the test block is kept only when its evaluation time is strictly earlier than
    the prediction time of the sample at test_fold_start. Train samples at positions test_fold_end and beyond are
    kept unchanged.
    Parameters
    ----------
    cv: Cross-validation class
        Needs to have the attributes cv.pred_times, cv.eval_times and cv.indices.
    train_indices: np.ndarray
        Indices of the samples currently included in the train set.
    test_fold_start : int
        Index corresponding to the start of a test set block.
    test_fold_end : int
        Index corresponding to the end of the same test set block.
    Returns
    -------
    train_indices: np.ndarray
        The purged train indices: the kept part before the block followed by the part after it.
    """
    fold_start_time = cv.pred_times.iloc[test_fold_start]

    # Samples whose evaluation window closes before the test block begins.
    finished_before_fold = cv.eval_times < fold_start_time
    kept_before_fold = np.intersect1d(train_indices, cv.indices[finished_before_fold])

    # Samples positioned at or after the end of the test block.
    kept_after_fold = np.intersect1d(train_indices, cv.indices[test_fold_end:])

    return np.concatenate((kept_before_fold, kept_after_fold))

def cpcv_split(df = None, n_splits = 6, n_test_splits = 2, time_gap = 100, embargo_td = 0):
    """
    Build combinatorial purged cross-validation folds over the rows of df.

    Row positions are used as the time axis: the prediction time of row i is i - time_gap (0 for the first
    time_gap rows) and its evaluation time is i + time_gap (1e12 for the last time_gap rows).

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose index is exactly 0, 1, 2, ... (integer, consecutive, starting at 0).
    n_splits : int
        Passed to CombPurgedKFoldCV as n_splits.
    n_test_splits : int
        Passed to CombPurgedKFoldCV as n_test_splits.
    time_gap : int
        Number of rows by which prediction/evaluation times are shifted around each row.
    embargo_td : int
        Passed to CombPurgedKFoldCV as embargo_td.

    Returns
    -------
    list
        The materialised output of CombPurgedKFoldCV.split().

    Raises
    ------
    ValueError
        If df is missing, empty, or its index is not 0, 1, 2, ...
    """
    if df is None:
        raise ValueError('df is required: pass a DataFrame with a consecutive integer index starting from 0')
    assert df.index.inferred_type == 'integer', 'Use integer index'

    # Every step between neighbouring index values must be exactly 1; checking only "> 1" would let
    # duplicated or decreasing indices through.
    index_steps = pd.Series(df.index).diff().dropna()
    if len(df.index) == 0 or df.index[0] != 0 or (index_steps != 1).any():
        raise ValueError('Use int index starts from 0 and consecutive')

    cpkf = CombPurgedKFoldCV(n_splits = n_splits, n_test_splits = n_test_splits, embargo_td = embargo_td)
    t1_ = df.index
    t1 = pd.Series(t1_).shift(time_gap).fillna(0).astype(int)
    t2 = pd.Series(t1_).shift(-time_gap).fillna(1e12).astype(int)

    return list(cpkf.split(df, pred_times=t1, eval_times=t2))

In [None]:
# Columns removed from df_features_target before it is handed to the models.
drop_from_df_features_target = [
    # Raw quotes and metadata
    'Open', 'High', 'Low', 'Close', 'Volume', 'SupervisionFlag', 'Name',
    # Adjusted quotes
    'Adjusted_Close', 'Adjusted_Open', 'Adjusted_High', 'Adjusted_Low', 'Adjusted_Volume',
]

def run_train_cv(test_config):
    """
    Train one XGBoost regressor per combinatorial purged CV fold and pickle the resulting list.

    Reads 'df_features_target.pkl', keeps the rows dated before 2021-12-06 whose ResidualTarget lies between
    the 0.3 and 99.7 percentiles, splits the unique dates with cpcv_split() and fits one model per fold with
    the fold's validation dates as eval_set. The models are written to 'xgb_models.pkl'.

    Parameters
    ----------
    test_config : dict
        Reads the keys 'use_gpu', 'verbose' and 'early_stopping_rounds'.
    """
    print('run_train_cv()')

    train_end_date = '2021-12-06'  # rows dated on/after this day are not used here

    df_features_target = pd.read_pickle('df_features_target.pkl')
    df_features_target = df_features_target.drop(drop_from_df_features_target, axis = 1)
    gc.collect()

    in_train_period = df_features_target['Date'] < train_end_date

    # CPCV split over the unique dates; each fold is turned from date positions into the dates themselves
    df_features_target_uniquedate = pd.DataFrame(df_features_target.loc[in_train_period, 'Date'].unique(), columns = ['Date'])
    folds = cpcv_split(df_features_target_uniquedate, 6, 2, 5, 5)
    unique_dates = df_features_target_uniquedate['Date'].values
    folds_list = [[[unique_dates[i] for i in positions] for positions in fold] for fold in folds]
    total_folds = len(folds_list)

    # Exclude outliers
    residual_in_period = df_features_target.loc[in_train_period, 'ResidualTarget']
    lower_bound = np.percentile(residual_in_period, 0.3)
    higher_bound = np.percentile(residual_in_period, 99.7)
    keep_rows = (in_train_period
                 & (df_features_target['ResidualTarget'] >= lower_bound)
                 & (df_features_target['ResidualTarget'] <= higher_bound))

    # A list (not a set) keeps the feature order deterministic and identical to the order obtained by
    # dropping the same columns at prediction time; set indexers are also rejected by recent pandas.
    feature_columns = [col for col in df_features_target.columns if col not in ('Target', 'ResidualTarget')]
    df_X = df_features_target.loc[keep_rows, feature_columns]
    df_y = df_features_target.loc[keep_rows, ['Date', 'ResidualTarget']]

    # DataFrame.info() prints by itself and returns None, so it is not wrapped in display()
    print('df_X.info()')
    df_X.info()
    print('df_y.info()')
    df_y.info()

    # The device choice does not depend on the fold, so it is made once
    if test_config['use_gpu'] == True:
        xgb_submit_params['tree_method'] = 'gpu_hist'
        print('run_train_cv() : Use GPU')
    else:
        xgb_submit_params['tree_method'] = 'hist'
        print('run_train_cv() : Use CPU')

    models = []

    fit_start_datetime = datetime.now()
    print(f'Fit start time : {fit_start_datetime}')
    for i, fold in enumerate(folds_list):
        print(f"\n========================== Fold {i + 1} / {total_folds} ==========================")
        train_dates, valid_dates = fold[0], fold[1]
        X_train = df_X.loc[df_X['Date'].isin(train_dates), :]
        X_valid = df_X.loc[df_X['Date'].isin(valid_dates), :]
        y_train = df_y.loc[df_y['Date'].isin(train_dates), :]
        y_valid = df_y.loc[df_y['Date'].isin(valid_dates), :]

        print(f"Train Date min: {X_train['Date'].min()} max: {X_train['Date'].max()}")
        print(f"Valid Date min: {X_valid['Date'].min()} max: {X_valid['Date'].max()}")

        X_train_nodate = X_train.drop('Date', axis = 1)
        X_valid_nodate = X_valid.drop('Date', axis = 1)
        y_train_nodate = y_train.drop('Date', axis = 1)
        y_valid_nodate = y_valid.drop('Date', axis = 1)

        # Training
        # NOTE(review): newer xgboost releases take early_stopping_rounds in the constructor rather
        # than in fit() - confirm against the installed version.
        model = XGBRegressor(**xgb_submit_params).fit(X_train_nodate, y_train_nodate, eval_set=[(X_valid_nodate, y_valid_nodate)], verbose = test_config['verbose'], early_stopping_rounds = test_config['early_stopping_rounds'])
        models.append(model)

        # Free memory
        del X_train, y_train,  X_valid, y_valid, X_train_nodate, X_valid_nodate, y_train_nodate, y_valid_nodate
        gc.collect()
    fit_end_datetime = datetime.now()
    print(f'Fit end time : {fit_end_datetime}')
    print(f'Elapsed time : {fit_end_datetime - fit_start_datetime}')

    # Context manager so the file is flushed and closed even if pickling fails
    with open('xgb_models.pkl', 'wb') as model_file:
        pickle.dump(models, model_file)


In [None]:
def run_train_sector(test_config):
    """
    Train one XGBoost regressor per '17SectorCode' value and pickle them as a dict.

    Reads 'df_features_target.pkl'. Training rows are those dated before 2021-12-06 whose ResidualTarget lies
    between the 0.3 and 99.7 percentiles; rows dated on/after 2021-12-06 form the test set. For every sector
    present in the training rows a model is fitted with eval_set = [train, test]. The dict
    {sector_code: model} is written to 'xgb_sector_models.pkl'.

    Parameters
    ----------
    test_config : dict
        Reads the keys 'use_gpu', 'verbose' and 'early_stopping_rounds'.
    """
    print('run_train_sector()')

    train_end_date = '2021-12-06'  # first day of the held-out test period

    df_features_target = pd.read_pickle('df_features_target.pkl')
    df_features_target = df_features_target.drop(drop_from_df_features_target, axis = 1)
    gc.collect()

    in_train_period = df_features_target['Date'] < train_end_date
    in_test_period = df_features_target['Date'] >= train_end_date

    # Exclude outliers
    residual_in_period = df_features_target.loc[in_train_period, 'ResidualTarget']
    lower_bound = np.percentile(residual_in_period, 0.3)
    higher_bound = np.percentile(residual_in_period, 99.7)
    keep_rows = (in_train_period
                 & (df_features_target['ResidualTarget'] >= lower_bound)
                 & (df_features_target['ResidualTarget'] <= higher_bound))

    # A list (not a set) keeps the feature order deterministic and identical to the order obtained by
    # dropping the same columns at prediction time; set indexers are also rejected by recent pandas.
    feature_columns = [col for col in df_features_target.columns if col not in ('Target', 'ResidualTarget')]

    # Features and labels are selected with the same boolean masks, so they stay row-aligned
    X_train = df_features_target.loc[keep_rows, feature_columns]
    y_train = df_features_target.loc[keep_rows, ['Date', 'ResidualTarget']]

    X_test = df_features_target.loc[in_test_period, feature_columns]
    y_test = df_features_target.loc[in_test_period, ['Date', 'ResidualTarget']]

    print(f"Train Date range: {X_train.Date.min()} to {X_train.Date.max()}")
    print(f"Test Date range: {X_test.Date.min()} to {X_test.Date.max()}")

    if test_config['use_gpu'] == True:
        xgb_submit_params['tree_method'] = 'gpu_hist'
        print('run_train_sector() : Use GPU')
    else:
        xgb_submit_params['tree_method'] = 'hist'
        print('run_train_sector() : Use CPU')

    sector_models = dict()

    for sector_code in tqdm(X_train['17SectorCode'].unique()):
        train_in_sector = X_train['17SectorCode'] == sector_code
        test_in_sector = X_test['17SectorCode'] == sector_code

        X_train_nodate = X_train.loc[train_in_sector, :].drop('Date', axis = 1)
        X_test_nodate = X_test.loc[test_in_sector, :].drop('Date', axis = 1)
        y_train_nodate = y_train.loc[train_in_sector, ['ResidualTarget']]
        y_test_nodate = y_test.loc[test_in_sector, ['ResidualTarget']]

        fit_start_datetime = datetime.now()
        print(f'Fit start time : {fit_start_datetime}')
        # NOTE(review): the test period is the last eval_set entry, so early stopping is presumably
        # driven by test-period error - confirm this is intended.
        model = XGBRegressor(**xgb_submit_params).fit(X_train_nodate, y_train_nodate, eval_set=[(X_train_nodate, y_train_nodate), (X_test_nodate, y_test_nodate)], verbose = test_config['verbose'], early_stopping_rounds = test_config['early_stopping_rounds'])
        sector_models[sector_code] = model
        fit_end_datetime = datetime.now()
        print(f'Fit end time : {fit_end_datetime}')
        print(f'Elapsed time : {fit_end_datetime - fit_start_datetime}')

    # Context manager so the file is flushed and closed even if pickling fails
    with open('xgb_sector_models.pkl', 'wb') as model_file:
        pickle.dump(sector_models, model_file)
    gc.collect()

In [None]:
def calc_spread_return_sharpe(df: pd.DataFrame, portfolio_size: int = 200, toprank_weight_ratio: float = 2) -> float:
    """
    Sharpe ratio of the daily long/short spread return implied by the 'Rank' column.

    Args:
        df (pd.DataFrame): predicted results with 'Date', 'Rank' and 'Target' columns
        portfolio_size (int): # of equities to buy/sell
        toprank_weight_ratio (float): the relative weight of the most highly ranked stock compared to the least.
    Returns:
        (float): mean of the daily spread returns divided by their standard deviation
    """
    def _daily_spread(day_df, size, top_ratio):
        """Weighted return of the `size` best-ranked rows minus that of the `size` worst-ranked rows."""
        day_ranks = day_df['Rank']
        # Ranks of one day are expected to run from 0 to n - 1
        assert day_ranks.min() == 0
        assert day_ranks.max() == len(day_ranks) - 1

        # Linearly decreasing weights: top_ratio for the first pick down to 1 for the last
        leg_weights = np.linspace(start = top_ratio, stop = 1, num = size)

        best_first = day_df.sort_values(by='Rank')
        worst_first = day_df.sort_values(by='Rank', ascending=False)

        long_leg = (best_first['Target'][:size] * leg_weights).sum() / leg_weights.mean()
        short_leg = (worst_first['Target'][:size] * leg_weights).sum() / leg_weights.mean()
        return long_leg - short_leg

    daily_spreads = df.groupby('Date').apply(_daily_spread, portfolio_size, toprank_weight_ratio)
    return daily_spreads.mean() / daily_spreads.std()

In [None]:
# A utility function to plot true values against predicted values
def display_true_vs_pred(series_true, series_pred, subtitle):
    """
    Draw a truth-vs-prediction scatter plot with marginal histograms.

    The main panel shows every (true, pred) pair, a red step line with the mean prediction per half-standard-
    deviation bin of the true value, and the correlation coefficient in the title. The bottom and right panels
    show histograms of the true and predicted values. All axes span mean +/- 6 standard deviations.

    Args:
        series_true: true values
        series_pred: predicted values, paired with series_true
        subtitle (str): text placed in front of the IC value in the title
    """
    paired = pd.DataFrame({'true': series_true, 'pred': series_pred}).dropna()
    truth, preds = paired['true'], paired['pred']

    corr = np.corrcoef(truth, preds)
    pred_mean, pred_std = preds.mean(), preds.std()
    true_mean, true_std = truth.mean(), truth.std()

    # Plot window: mean +/- std_range standard deviations on each axis
    std_range = 6
    pred_max = pred_mean + std_range * pred_std
    pred_min = pred_mean - std_range * pred_std
    true_max = true_mean + std_range * true_std
    true_min = true_mean - std_range * true_std

    fig, ax = plt.subplots(2, 2, sharex='col', sharey = 'row',
                           gridspec_kw={'width_ratios': [2, 0.5], 'height_ratios': [2, 0.5]}, figsize = (8, 8))
    scatter_ax, pred_hist_ax, true_hist_ax = ax[0, 0], ax[0, 1], ax[1, 0]

    # Mean prediction inside each half-std wide slice of the true value, to check pred follows true
    n_sections = std_range * 4 + 1
    true_sections = [true_min + 0.5 * true_std * k for k in range(n_sections)]
    pred_means = [
        paired.loc[(truth >= true_min + 0.5 * true_std * k) & (truth < true_min + 0.5 * true_std * (k + 1)), 'pred'].mean()
        for k in range(n_sections)
    ]

    # Scatterplot
    scatter_ax.scatter(truth, preds, s = 1)
    scatter_ax.step(true_sections, pred_means, 'red', where='post')
    scatter_ax.set_title(subtitle + f' (IC={corr[0][1]:0.4f})')
    scatter_ax.set_xlabel("truth")
    scatter_ax.set_ylabel("prediction")
    scatter_ax.set_xlim([true_min, true_max])
    scatter_ax.set_ylim([pred_min, pred_max])
    scatter_ax.set_xticks([true_mean, true_mean - 2*true_std, true_mean - 4*true_std, true_mean + 2*true_std, true_mean + 4*true_std])
    scatter_ax.set_yticks([pred_mean, pred_mean - 2*pred_std, pred_mean - 4*pred_std, pred_mean + 2*pred_std, pred_mean + 4*pred_std])
    scatter_ax.grid(axis = 'both')
    scatter_ax.axvline(0, color='red', linestyle='dotted', linewidth=1)
    scatter_ax.axhline(0, color='red', linestyle='dotted', linewidth=1)

    # Both histograms leave out the rows whose true value is exactly 0 to keep the bars readable
    nonzero_truth = truth != 0

    true_hist_ax.hist(paired.loc[nonzero_truth, 'true'], bins=50, range=[true_min, true_max])
    true_hist_ax.grid(axis = 'both')
    true_hist_ax.axvline(0, color='red', linestyle='dotted', linewidth=1)

    pred_hist_ax.hist(paired.loc[nonzero_truth, 'pred'], bins=50, orientation='horizontal', range=[pred_min, pred_max])
    pred_hist_ax.grid(axis = 'both')
    pred_hist_ax.axhline(0, color='red', linestyle='dotted', linewidth=1)

    # The bottom-right cell of the grid is not used
    ax[1, 1].remove()

    fig.show()

def display_feature_importance(df_feature_importance):
    """
    Draw a horizontal bar chart of feature importances.

    Args:
        df_feature_importance (pd.DataFrame): frame with 'Name' and 'Importance' columns; rows containing
            missing values are left out, and the figure height grows with the number of remaining rows.
    """
    complete_rows = df_feature_importance.dropna(axis = 0)
    feature_names = complete_rows['Name']

    # 0.3 inch of height per bar
    fig, ax = plt.subplots(1, 1, figsize = (8, 0.3 * len(feature_names)))
    ax.barh(feature_names, complete_rows['Importance'])
    ax.set_title('Feature importance')
    fig.show()

In [None]:
def predict_cv(test_config, models):
    """
    Evaluate every cross-validation model on the rows dated on/after 2021-12-06.

    For each model this prints RMSE / MAE against ResidualTarget, plots prediction vs truth, ranks the
    predictions within each date, prints the spread-return Sharpe ratio of that ranking against Target and
    plots the feature importances. The average Sharpe over all models is printed at the end.

    Parameters
    ----------
    test_config : dict
        Reads 'ignore_count': number of highest and lowest predictions per date that are replaced by the
        date's median prediction before ranking (0 disables the adjustment).
    models : iterable
        Fitted models exposing predict() and feature_importances_.
    """
    def _rank_within_dates(dates, preds, ignore_count):
        """Return 0-based descending ranks of preds, computed separately per date, in input row order."""
        # float64 so that writing the median back never changes the dtype
        pred_series = pd.Series(np.asarray(preds, dtype = float))
        date_keys = np.asarray(dates)

        # "> 0" rather than "> 1": a count of 1 is a valid request; only 0 must be skipped because
        # the slice [-0:] would select every row.
        if ignore_count > 0:
            def _neutralise_extremes(day_preds):
                by_pred_desc = day_preds.sort_values(ascending = False).index
                median_pred = day_preds.median()
                adjusted = day_preds.copy()
                adjusted.loc[by_pred_desc[:ignore_count]] = median_pred
                adjusted.loc[by_pred_desc[-ignore_count:]] = median_pred
                return adjusted
            pred_series = pred_series.groupby(date_keys).transform(_neutralise_extremes)

        # Grouped rank keeps each rank on its own row, whatever the row order of the dates is
        ranks = pred_series.groupby(date_keys).rank(method = 'first', ascending = False) - 1
        return ranks.astype(int).to_numpy()

    print('predict_cv()')

    test_start_date = '2021-12-06'

    # Load data
    df_features_target = pd.read_pickle('df_features_target.pkl')

    # Keep only the held-out test period
    df_test = df_features_target[df_features_target['Date'] >= test_start_date]
    X_test = df_test.drop(drop_from_df_features_target + ['Target', 'ResidualTarget'], axis = 1)

    print(f"Test Date range: {X_test.Date.min()} to {X_test.Date.max()}")

    X_test_nodate = X_test.drop('Date', axis = 1)
    y_test_nodate = df_test['ResidualTarget']
    test_dates = X_test['Date'].to_numpy()
    test_targets = df_test['Target'].to_numpy()
    ignore_count = test_config['ignore_count']

    total_models = len(models)
    sharpes = []

    # Run against test period
    for model_number, model in enumerate(models, start = 1):
        print(f"\n========================== Model {model_number} / {total_models} ==========================")
        gc.collect()

        # One prediction pass serves the error metrics, the plot and the ranking
        y_pred = model.predict(X_test_nodate)
        rmse = np.sqrt(mean_squared_error(y_test_nodate, y_pred))
        mae = mean_absolute_error(y_test_nodate, y_pred)

        # Scatterplot pred vs truth
        display_true_vs_pred(y_test_nodate, y_pred, 'Prediction vs Truth')

        # Only the columns read by calc_spread_return_sharpe() are needed
        df = pd.DataFrame({
            'Date': test_dates,
            'Rank': _rank_within_dates(test_dates, y_pred, ignore_count),
            'Target': test_targets,
        })

        sharpe = calc_spread_return_sharpe(df)
        print(f"Validation sharpe: {sharpe}, RMSE: {rmse}, MAE: {mae}")
        print(f"Validation annualized sharpe: {sharpe * sqrt(252)}")
        sharpes.append(sharpe)

        df_feature_importance = pd.DataFrame({
            'Name': X_test_nodate.columns.values,
            'Importance': model.feature_importances_,
        }).sort_values('Importance', ascending = True)

        display_feature_importance(df_feature_importance)
    print(f'Sharpe average: {np.average(sharpes)}')
    print(f'Annualized sharpe average: {np.average(sharpes) * sqrt(252)}')

In [None]:
def predict_sector(test_config, models):
    """
    Evaluate every per-sector model on the rows dated on/after 2021-12-06.

    For each (sector_code, model) pair this prints RMSE / MAE against ResidualTarget on the rows of that
    sector, plots prediction vs truth for those rows, then ranks the model's predictions for all test rows
    within each date and prints the spread-return Sharpe ratio of that ranking against Target. Feature
    importances are plotted per model and the average Sharpe is printed at the end.

    Parameters
    ----------
    test_config : dict
        Reads 'ignore_count': number of highest and lowest predictions per date that are replaced by the
        date's median prediction before ranking (0 disables the adjustment).
    models : dict
        Maps a '17SectorCode' value to a fitted model exposing predict() and feature_importances_.
    """
    def _rank_within_dates(dates, preds, ignore_count):
        """Return 0-based descending ranks of preds, computed separately per date, in input row order."""
        # float64 so that writing the median back never changes the dtype
        pred_series = pd.Series(np.asarray(preds, dtype = float))
        date_keys = np.asarray(dates)

        # "> 0" rather than "> 1": a count of 1 is a valid request; only 0 must be skipped because
        # the slice [-0:] would select every row.
        if ignore_count > 0:
            def _neutralise_extremes(day_preds):
                by_pred_desc = day_preds.sort_values(ascending = False).index
                median_pred = day_preds.median()
                adjusted = day_preds.copy()
                adjusted.loc[by_pred_desc[:ignore_count]] = median_pred
                adjusted.loc[by_pred_desc[-ignore_count:]] = median_pred
                return adjusted
            pred_series = pred_series.groupby(date_keys).transform(_neutralise_extremes)

        # Grouped rank keeps each rank on its own row, whatever the row order of the dates is
        ranks = pred_series.groupby(date_keys).rank(method = 'first', ascending = False) - 1
        return ranks.astype(int).to_numpy()

    print('predict_sector()')

    test_start_date = '2021-12-06'

    # Load data
    df_features_target = pd.read_pickle('df_features_target.pkl')

    # Keep only the held-out test period
    df_test = df_features_target[df_features_target['Date'] >= test_start_date]
    X_test = df_test.drop(drop_from_df_features_target + ['Target', 'ResidualTarget'], axis = 1)

    print(f"Test Date range: {X_test.Date.min()} to {X_test.Date.max()}")

    X_test_all_nodate = X_test.drop('Date', axis = 1)
    residual_target = df_test['ResidualTarget']
    test_dates = X_test['Date'].to_numpy()
    test_targets = df_test['Target'].to_numpy()
    ignore_count = test_config['ignore_count']

    sharpes = []

    # Run against test period
    for sector_code, model in models.items():
        print(f"\n========================== Model {sector_code} ==========================")
        gc.collect()

        in_sector = (X_test['17SectorCode'] == sector_code).to_numpy()
        X_test_nodate = X_test_all_nodate.loc[in_sector, :]
        y_test_nodate = residual_target[in_sector]

        # One pass over every test row; the sector's own rows are sliced out for the error metrics
        all_pred = model.predict(X_test_all_nodate)
        y_pred = all_pred[in_sector]

        rmse = np.sqrt(mean_squared_error(y_test_nodate, y_pred))
        mae = mean_absolute_error(y_test_nodate, y_pred)

        # Scatterplot pred vs truth
        display_true_vs_pred(y_test_nodate, y_pred, 'Prediction vs Truth')

        # NOTE(review): as in the previous revision, the ranking and Sharpe use this sector model's
        # predictions for ALL stocks, not only the stocks of its sector - confirm this is intended.
        df = pd.DataFrame({
            'Date': test_dates,
            'Rank': _rank_within_dates(test_dates, all_pred, ignore_count),
            'Target': test_targets,
        })

        sharpe = calc_spread_return_sharpe(df)
        print(f"Validation sharpe: {sharpe}, RMSE: {rmse}, MAE: {mae}")
        print(f"Validation annualized sharpe: {sharpe * sqrt(252)}")
        sharpes.append(sharpe)

        df_feature_importance = pd.DataFrame({
            'Name': X_test_nodate.columns.values,
            'Importance': model.feature_importances_,
        }).sort_values('Importance', ascending = True)

        display_feature_importance(df_feature_importance)
    print(f'Sharpe average: {np.average(sharpes)}')
    print(f'Annualized sharpe average: {np.average(sharpes) * sqrt(252)}')

In [None]:
# Pipeline driver: every step is switched on or off through test_config.

# Read data from CSV file and save to pickle file
if test_config['load_data']:
    load_data(test_config)
    gc.collect()

if test_config['calc_features']:
    calc_features(test_config)
    gc.collect()

# Training: the CV run takes precedence over the per-sector run
if test_config['train_cv']:
    run_train_cv(test_config)
    gc.collect()
elif test_config['train_sector']:
    run_train_sector(test_config)
    gc.collect()

# Evaluation: same precedence as for training.
# NOTE: pickle.load can execute arbitrary code - only load model files written by this notebook.
if test_config['predict_cv']:
    with open('xgb_models.pkl', 'rb') as model_file:
        models = pickle.load(model_file)
    # The CV evaluation function is predict_cv(); the bare name predict is not defined in this part of the notebook
    predict_cv(test_config, models)
    gc.collect()
elif test_config['predict_sector']:
    with open('xgb_sector_models.pkl', 'rb') as model_file:
        models = pickle.load(model_file)
    predict_sector(test_config, models)
    gc.collect()