In [1]:
import datetime
import numpy as np
import os
import pandas as pd

In [2]:
# Load every feature table and every price table found on disk.
# Only '*.csv' entries are considered, so stray directory entries (e.g. '.ipynb_checkpoints',
# '.DS_Store') are ignored instead of being truncated into bogus names.
# os.listdir returns entries in arbitrary order, so the names are sorted to keep runs reproducible.
feature_list = sorted(os.path.splitext(file)[0]
                      for file in os.listdir('preprocessing/feature')
                      if file.endswith('.csv'))
feature_dict = {}
for feature in feature_list:
    # 'Date' is kept as a regular column (cast to datetime) for the feature tables.
    feature_dict[feature] = pd.read_csv('preprocessing/feature/{}.csv'.format(feature)).astype({'Date':'datetime64[ns]'})

stock_list = sorted(os.path.splitext(file)[0]
                    for file in os.listdir('preprocessing/price')
                    if file.endswith('.csv'))
price_dict = {}
for stock in stock_list:
    # Price tables use the parsed 'Date' column as their index.
    price_dict[stock] = pd.read_csv('preprocessing/price/{}.csv'.format(stock), index_col='Date', parse_dates=['Date'])

In [3]:
def missing_portion(df, start=(2012, 12, 31), end=(2023, 12, 31)):
    """Return the fraction of missing values per column inside a date window.

    Parameters
    ----------
    df : DataFrame with a 'Date' column; every other column is measured.
    start, end : (year, month, day) tuples; both bounds are inclusive.

    Returns
    -------
    Series indexed by the non-'Date' column names, holding the number of NaN
    rows divided by the number of rows that fall inside the window.
    """
    lower_bound = datetime.datetime(*start)
    upper_bound = datetime.datetime(*end)
    inside_window = (df['Date'] >= lower_bound) & (df['Date'] <= upper_bound)
    windowed = df[inside_window].set_index('Date')
    return windowed.isna().sum(axis=0) / len(windowed)

The following code may not be correct if more features are included!

In [4]:
# Infer each feature's sampling frequency from the mean gap between observations of
# its most complete column. The bounds are checked in ascending order and the first
# one exceeding the mean gap wins; a feature matching no bound gets no entry.
frequency_bounds = (
    (pd.Timedelta(days=2), 'D'),
    (pd.Timedelta(days=8), 'W'),
    (pd.Timedelta(days=16), '2W'),
    (pd.Timedelta(days=93), 'QE'),
)

freq_dict = {}
for feature in feature_list:
    raw_feature = feature_dict[feature]
    # Column with the smallest missing proportion inside the default window.
    densest_column = missing_portion(raw_feature).idxmin()
    observed = raw_feature[['Date', densest_column]].dropna()
    mean_gap = observed['Date'].diff().mean()
    for upper_bound, alias in frequency_bounds:
        if mean_gap < upper_bound:
            freq_dict[feature] = alias
            break

Fill in the missing values of each feature (to do)

References:
1. https://medium.com/@aaabulkhair/data-imputation-demystified-time-series-data-69bc9c798cb7
2. https://scikit-learn.org/stable/modules/impute.html

In [5]:
# NOTE(review): this cell is entirely commented out. It presumably is a draft of the
# "fill in the missing values" step: forward-fill each feature, then resample it at the
# frequency stored in freq_dict and keep the last value per period. Confirm before enabling.
# for feature in feature_list:
#     feature_df = feature_dict[feature].set_index('Date').ffill().resample(freq_dict[feature]).last() # bi-weekly data may return lagged data
#     feature_dict[feature] = feature_df.reset_index()

Feature Engineering (to do)

Resample the feature to a monthly dataframe

In [6]:
# Collapse every feature table to one row per month-end ('ME') period.
for feature in feature_list:
    dated = feature_dict[feature].set_index('Date')
    # Forward-fill first so the last row of each month carries the latest known value,
    # take that last row per month, then forward-fill the monthly table once more.
    month_end = dated.ffill().resample('ME').last()
    feature_dict[feature] = month_end.ffill().reset_index()

Remove any stock whose missing proportion is $\ge 0.02$ (to do)

In [7]:
# Collect, in first-seen order and without duplicates, every column whose missing
# proportion reaches 0.02 in at least one feature table.
flagged_columns = {}
for feature in feature_list:
    portions = missing_portion(feature_dict[feature])
    for column in portions[portions.ge(0.02)].index:
        flagged_columns.setdefault(column, None)
removed_stock_list = list(flagged_columns)

In [8]:
# Drop the flagged stocks from stock_list.
# A filter is used instead of list.remove so the cell can be re-run safely and does not
# raise ValueError when a flagged column has no matching entry in stock_list.
removed_stocks = set(removed_stock_list)
stock_list = [stock for stock in stock_list if stock not in removed_stocks]

In [9]:
# Keep only 'Date' plus the surviving stocks in each feature table, then drop any
# row that still contains a missing value.
kept_columns = ['Date'] + stock_list
for feature in feature_list:
    feature_dict[feature] = feature_dict[feature].loc[:, kept_columns].dropna()

Resample stock price to a monthly dataframe and merge it to the features

In [10]:
# Build one monthly table per stock: aggregated prices, returns, and every feature.
stock_dict = {}
for stock in stock_list:
    monthly = price_dict[stock].resample('ME')
    # Price columns are picked by position; the labels below follow the original annotations.
    price_parts = [
        monthly.first().iloc[:, 0],   # open
        monthly.max().iloc[:, 1],     # high
        monthly.min().iloc[:, 2],     # low
        monthly.last().iloc[:, 3:5],  # close, adj close
        monthly.mean().iloc[:, 5],    # volume
    ]
    monthly_price = pd.concat(price_parts, axis=1).reset_index()

    # Month-over-month return on 'Adj Close' and its compounded running total.
    monthly_price['Return'] = monthly_price['Adj Close'].pct_change()
    monthly_price['Acc Return'] = np.cumprod(1 + monthly_price['Return']) - 1

    # One two-column frame ('Date', <feature name>) per feature for this stock.
    per_feature = [
        feature_dict[feature][['Date', stock]].rename(columns={stock: feature})
        for feature in feature_list
    ]
    stock_features = per_feature[0]
    for feature_frame in per_feature[1:]:
        stock_features = stock_features.merge(feature_frame, on='Date', how='inner')

    stock_dict[stock] = monthly_price.merge(stock_features, on='Date').set_index('Date')

In [11]:
# Ensure the output directory tree exists.
# makedirs creates 'data' and 'data/full' as needed, so an existing 'data' folder
# without the 'full' subfolder no longer leaves 'data/full' missing.
os.makedirs('data/full', exist_ok=True)

In [12]:
# Write each stock's merged monthly table to its own CSV under data/full.
for stock in stock_list:
    output_path = 'data/full/{}_full.csv'.format(stock)
    stock_dict[stock].to_csv(output_path)

In [13]:
# Save the surviving stock codes as a single 'Code' column, without the row index.
# index=False is the documented boolean form (the previous None only worked by being falsy).
pd.Series(stock_list, name='Code').to_csv('data/selected_stock_list.csv', index=False)

In [ ]:
def backtest(stockHistory, signal, shortsell, name='capital', trackingDays=42, startDate=None, initialCapital=100, transactionCost=0.0004):
    """Simulate the capital curve obtained by following a -1/0/1 position signal.

    Parameters
    ----------
    stockHistory : pandas Series of prices, read positionally with ``.iloc``.
    signal : pandas Series of target positions (1 long, 0 flat, -1 short), cast to int32.
    shortsell : when falsy, every -1 in ``signal`` is replaced by 0.
    name : name given to the returned capital Series.
    trackingDays : number of rows kept when ``stockHistory`` is longer than this value.
    startDate : label where tracking starts; defaults to the first label of ``signal``.
    initialCapital : capital held before the first trade.
    transactionCost : proportional cost deducted whenever a new position is opened.

    Returns
    -------
    (capital, stockHistory) : the capital Series, indexed like the (possibly sliced)
    price history, and that price history itself. If capital drops to zero or below,
    the simulation stops and the remaining entries stay at 0.
    """
    if startDate is None:  # starting point of tracking
        startDate = signal.index[0]

    # Fall back to the beginning when fewer than trackingDays rows remain after startDate.
    if len(signal.loc[startDate:]) < trackingDays:
        start = 0
    else:
        start = signal.index.get_loc(signal[startDate:].index[0])

    capital = initialCapital

    if trackingDays < len(stockHistory):
        stockHistory = stockHistory.iloc[start:start + trackingDays]
        signal = signal.iloc[start:start + trackingDays]

    # numpy is imported as `np` in this notebook; the bare name `numpy` raised NameError.
    signal = np.array(signal, dtype='int32')

    if not shortsell:
        signal[signal == -1] = 0

    position = 0  # position held coming into the current row
    price = 0     # entry price of the current trade (used to value short positions)
    share = 0     # number of shares held (long) or sold short

    capitalList = np.zeros(len(stockHistory))
    for num in range(len(stockHistory)):
        if position == 0:
            if signal[num] != 0:
                # Open a new long or short position at the current price.
                price = stockHistory.iloc[num]
                share = (1 - transactionCost) * capital / price
        elif position == 1:
            if signal[num] != 1:
                # Close the long; if flipping to short, re-enter at the same price.
                price = stockHistory.iloc[num]
                capital = share * price
                if signal[num] == -1:
                    share = (1 - transactionCost) * capital / price
        elif position == -1:
            if signal[num] != -1:
                # Close the short: value is share * (entry + (entry - current)).
                capital = share * (2 * price - stockHistory.iloc[num])
                if signal[num] == 1:
                    share = (1 - transactionCost) * capital / stockHistory.iloc[num]
        position = signal[num]

        # Mark the open position to market.
        if position == 1:
            capital = share * stockHistory.iloc[num]
        elif position == -1:
            capital = share * (2 * price - stockHistory.iloc[num])
        capitalList[num] = capital

        if capital <= 0:
            break

    return pd.Series(capitalList, name=name, index=stockHistory.index), stockHistory