**Imports**

In [None]:
import gc
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import tqdm
import lightgbm
import time

from tqdm import tqdm
from sklearn.neighbors import NearestNeighbors

**Settings and functions**

In [None]:
# Let pandas render up to 1000 rows and 1000 columns before truncating output.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 1000)

def draw(image: np.ndarray, ro=True):
    """Plot ``image`` on a new 20x12 matplotlib figure.

    Parameters
    ----------
    image : np.ndarray
        Data handed straight to ``plt.plot``.
    ro : bool, default True
        When true, the ``'ro'`` format string is passed to ``plt.plot``;
        otherwise ``plt.plot`` is called with the data only.
    """
    plt.figure(figsize=(20, 12))
    # An empty tuple means "no format string", i.e. matplotlib's default style.
    style_args = ('ro',) if ro else ()
    plt.plot(image, *style_args)

**Abbreviations**

In [None]:
# Short aliases for the column names used throughout the notebook.
D, V, C, T, P = 'Date', 'Volume', 'Close', 'Target', 'Prediction'

SC, AF, ED, SF = (
    'SecuritiesCode',
    'AdjustmentFactor',
    'ExpectedDividend',
    'SupervisionFlag',
)

**Input paths**

In [None]:
# Directories of the competition dataset.
TRAIN_FILES = '../input/jpx-tokyo-stock-exchange-prediction/train_files'
SUPPLEMENTAL_FILES = '../input/jpx-tokyo-stock-exchange-prediction/supplemental_files'
EXAMPLE_TEST_FILES = '../input/jpx-tokyo-stock-exchange-prediction/example_test_files'
DATA_SPECIFICATIONS = '../input/jpx-tokyo-stock-exchange-prediction/data_specifications'
# Original (misspelled) name, kept as an alias so existing references keep working.
DATA_SPECIFTICATIONS = DATA_SPECIFICATIONS

# CSV file names.
STOCK_PRICES = 'stock_prices.csv'
SECONDARY_STOCK_PRICES = 'secondary_stock_prices.csv'
TRADES = 'trades.csv'
OPTIONS = 'options.csv'
FINANCIALS = 'financials.csv'

**Neighbors**

In [None]:
class TimeNeighbors:
    """Nearest-neighbour lookup between the rows of a pivot table.

    Each row of ``pivot`` is one point. For every query row the positions of
    all rows, in the order returned by ``NearestNeighbors.kneighbors``
    (presumably nearest first -- the ``[:count]`` slice below relies on it),
    are stored in ``self.neighbors``.

    Attributes set by ``__init__``: ``name``, ``metric``, ``p``, ``dates``
    (row labels of the pivot), ``stocks`` (column labels of the pivot),
    ``neighbors`` (one row of positions per query) and ``neighbors_index``
    (the pivot row position each query corresponds to).
    """

    def __init__(self, 
                name: str,
                pivot: pd.DataFrame,
                metric: str,
                p: float,
                test: bool = False):
        """Fit the neighbour index.

        Parameters
        ----------
        name : str
            Label stored on the instance (callers use it as a feature-name prefix).
        pivot : pd.DataFrame
            Table whose rows are compared with each other.
        metric : str
            Distance metric forwarded to ``NearestNeighbors``.
        p : float
            ``p`` parameter forwarded to ``NearestNeighbors``.
        test : bool, default False
            If false, neighbours are computed for every row; if true, only
            for the last row of ``pivot``.
        """
        # Missing cells are replaced by their column mean before fitting.
        pivot = pivot.fillna(pivot.mean())

        self.name = name
        self.metric = metric
        self.p = p
        self.dates = pivot.index.values
        self.stocks = pivot.columns.values

        # n_neighbors equals the number of rows, so each query gets every row back.
        nrst = NearestNeighbors(n_neighbors=pivot.shape[0], p=p, metric=metric, metric_params=None)
        nrst.fit(pivot)
        if not test:
            queries = pivot
            self.neighbors_index = list(range(len(pivot)))
        else:
            queries = pivot.iloc[-1:]
            self.neighbors_index = [len(pivot) - 1]
        # Only neighbour positions are used, so distances are not requested.
        self.neighbors = nrst.kneighbors(queries, return_distance=False)

    def generate_neighbors_feature_mean(self, 
                                   pivot: pd.DataFrame,
                                   name: str,
                                   count: int):
        """Average ``pivot`` over the neighbouring rows of every query row.

        Parameters
        ----------
        pivot : pd.DataFrame
            Table indexed by the same row labels as the pivot used in
            ``__init__``; its rows are averaged column-wise.
        name : str
            Prefix of the produced column; the final column name is
            ``name + '=' + str(count) + 'mean'``.
        count : int
            Number of leading neighbour positions considered per query.

        Returns
        -------
        pd.DataFrame
            Long-format frame with columns ``[D, SC, <feature name>]``: one
            row per (query row label, ``pivot`` column).
        """
        name = name + '=' + str(count) + 'mean'
        columns = [D, SC, name]
        features = []
        for row_position, neighbors in zip(self.neighbors_index, self.neighbors):
            # policy: look only at the first `count` returned neighbours
            candidates = neighbors[:count]
            # filter: keep positions at or before the query row, or more than
            # three positions after it (the 1-3 rows right after are dropped)
            keep = (candidates <= row_position) | (candidates > row_position + 3)
            neighbor_dates = self.dates[candidates[keep]]

            day_feature = (
                pivot.loc[neighbor_dates].mean()
                .rename(name)
                # Name the index explicitly so the column produced by
                # reset_index is always SC, whatever the pivot's axis name is.
                .rename_axis(SC)
                .to_frame()
                .reset_index()
            )
            day_feature[D] = self.dates[row_position]
            features.append(day_feature)

        if not features:
            return pd.DataFrame(columns=columns)
        # Concatenate the per-query frames directly: seeding the concat with
        # an empty DataFrame is deprecated and can change the result dtypes.
        return pd.concat(features)[columns]
    
# Features for which a TimeNeighbors object is built (one per feature).
time_features = [
    'Close',
    'Close.diff7', 'Close.diff14', 'Close.diff28',
    'Close.rolling3', 'Close.rolling7', 'Close.rolling14', 'Close.rolling28',
    'Volume.rolling7', 'Volume.rolling14', 'Volume.rolling28',
]

# Neighbour counts for which a mean feature is generated.
neighbors_aggregation_counts = [3, 5, 8, 10, 12, 14, 16, 18, 22, 24]

In [None]:
# Load the prepared training table; its first column is discarded.
train_stock_prices = (
    pd.read_feather('../input/jpx-dataset/train_stock_prices_v2.f')
    .iloc[:, 1:]
)
# Drop the first character of each security code and parse the rest as an int.
train_stock_prices[SC] = train_stock_prices[SC].map(lambda code: int(code[1:]))

In [None]:
# Keep only the dates that have more than 1000 rows.
train_stock_prices = train_stock_prices.groupby(D).filter(lambda day_rows: len(day_rows) > 1000)

# Date / security / target columns, used for fold construction and scoring.
train_stock_prices_dates = train_stock_prices[[D, SC, T]]

# Target vector and feature matrix (every column from position 13 onwards), both float32.
y = train_stock_prices[T].astype('float32')
x = train_stock_prices.iloc[:, 13:].astype('float32')

In [None]:
# Run a garbage-collection pass now that the training matrices have been built.
gc.collect()

**Folds**

In [None]:
# Validation windows, listed from the most recent to the oldest.
# Both bounds are inclusive and each window's end equals the previous
# window's begin, so a boundary date belongs to two validation parts.
folds_val_begins = ['2021-11-15', '2021-10-28', '2021-10-10', '2021-09-22', '2021-09-04']
folds_val_ends = ['2021-12-03', '2021-11-15', '2021-10-28', '2021-10-10', '2021-09-22']

folds = []
fold_dates = train_stock_prices_dates[D]
for val_begin, val_end in zip(folds_val_begins, folds_val_ends):
    # Train on everything strictly before the window, validate inside it.
    is_train = fold_dates < val_begin
    is_val = (fold_dates >= val_begin) & (fold_dates <= val_end)
    train_part, val_part = np.where(is_train)[0], np.where(is_val)[0]
    folds.append((train_part, val_part))
    print(len(train_part), len(val_part))

**Model training**

In [None]:
def mse(y_true, y_pred):
    """Return the square root of the mean squared difference of the inputs.

    Despite the function's name the square root is applied, so the value is
    a root-mean-squared error.
    """
    residuals = y_true - y_pred
    return np.sqrt(np.mean(residuals * residuals))

def feval(preds, train_data):
    """Score ``preds`` against the labels of ``train_data`` using ``mse``.

    Returns a ``(name, value, flag)`` tuple: the label ``'MSE'``, the score
    rounded to five decimals, and ``False`` (presumably LightGBM's
    ``is_higher_better`` flag -- confirm against the caller).
    """
    score = mse(y_true=train_data.get_label(), y_pred=preds)
    return 'MSE', round(score, 5), False

# Training parameters handed to lightgbm.cv.
params = dict(
    objective='regression',
    verbose=0,
    metric='',
    reg_alpha=5,
    reg_lambda=5,
    min_data_in_leaf=1000,
    max_depth=-1,
    num_leaves=128,
    colsample_bytree=0.3,
    learning_rate=0.05,
)

# Wrap features and target for LightGBM.
dataset = lightgbm.Dataset(x, y)

# Cross-validate on the predefined folds and keep the per-fold boosters.
# NOTE(review): `feval` is defined in this cell but not passed here -- confirm
# whether that is intended.
# NOTE(review): `verbose_eval` is not accepted by every LightGBM release --
# verify against the installed version.
ret = lightgbm.cv(
    params,
    dataset,
    num_boost_round=240,
    folds=folds,
    stratified=False,
    return_cvbooster=True,
    verbose_eval=20,
)

**Feature importance**

In [None]:
# Gain importance of every feature: one row per fold booster, then averaged
# over the boosters and sorted from most to least important.
# (The importances are computed once; the original also computed and
# discarded them in a leading statement.)
booster_feature_names = ret['cvbooster'].boosters[0].feature_name()
fold_importances = ret['cvbooster'].feature_importance(importance_type='gain')
df = pd.DataFrame(data=fold_importances, columns=booster_feature_names)
df = df.mean().sort_values(ascending=False)
df

**Validation data eval**

In [None]:
def calc_spread_return_per_day(df, rank_by=P):
    """Weighted spread return of one day's rows.

    The 200 rows with the highest ``rank_by`` value and the 200 rows with the
    lowest are selected; their ``T`` values are weighted linearly from 2 down
    to 1 and the two weighted sums (each divided by the mean weight) are
    subtracted.

    Parameters
    ----------
    df : pd.DataFrame
        Rows of a single date; must contain ``T`` and ``rank_by``.
    rank_by : str, default ``P``
        Column used to rank the rows. Ranking by ``T`` itself gives the
        value reached by a perfect ranking.
    """
    top200 = df.sort_values(rank_by, ascending=False)[:200]
    bottom200 = df.sort_values(rank_by)[:200]

    linear_function_2_1 = np.linspace(2, 1, num=200)
    s_up = np.sum(top200[T].values * linear_function_2_1) / np.mean(linear_function_2_1)
    s_down = np.sum(bottom200[T].values * linear_function_2_1) / np.mean(linear_function_2_1)
    return s_up - s_down


def calc_top_accuracy_per_day(df):
    """Count how many predicted top-200 / bottom-200 securities are true ones.

    Returns ``(top_hits, bottom_hits)``: the sizes of the intersections
    between the 200 securities ranked first by ``P`` and by ``T``, for the
    descending and the ascending ordering respectively.
    """
    top200_prediction = df.sort_values(P, ascending=False)[:200][SC].values
    top200_target = df.sort_values(T, ascending=False)[:200][SC].values

    bot200_prediction = df.sort_values(P, ascending=True)[:200][SC].values
    bot200_target = df.sort_values(T, ascending=True)[:200][SC].values

    return len(np.intersect1d(top200_prediction, top200_target)), len(np.intersect1d(bot200_prediction, bot200_target))


for i in range(len(folds_val_begins)):
    val_fold = folds[i][1]
    # Only the booster of this fold is needed; predicting with all of them
    # and discarding the rest is wasted work.
    prediction = ret['cvbooster'].boosters[i].predict(x.iloc[val_fold])
    # Work on a copy so that adding a column does not write into a slice.
    main = train_stock_prices_dates.iloc[val_fold].copy()
    main[P] = prediction

    # Reference score: rank by the true target.
    spread_returns = main.groupby(D).apply(calc_spread_return_per_day, rank_by=T)
    spread_returns = np.mean(spread_returns) / np.std(spread_returns)
    print(f'fold targets {i}:', spread_returns)

    # Model score: rank by the prediction.
    spread_returns = main.groupby(D).apply(calc_spread_return_per_day, rank_by=P)
    spread_returns = np.mean(spread_returns) / np.std(spread_returns)
    print(f'fold {i}:', spread_returns)

    accuracy = main.groupby(D).apply(calc_top_accuracy_per_day)
    print(f'fold {i}:', 'mean', tuple(map(np.mean, zip(*accuracy))), 'std', tuple(map(np.std, zip(*accuracy))))
    print('')

In [None]:
# Run a garbage-collection pass before the test section starts.
gc.collect()

**Test**

In [None]:
# Last date (inclusive) of the supplemental data that is used as test data.
last_test_day = '2022-02-22'

# Build the path from the notebook's path constants instead of repeating the
# literal; the resulting string is identical.
supplemental_stock_prices = pd.read_csv(f'{SUPPLEMENTAL_FILES}/{STOCK_PRICES}')
supplemental_stock_prices = supplemental_stock_prices[supplemental_stock_prices[D] <= last_test_day]

In [None]:
def transform_1(stock_prices_last, test_day):
    """Compute the per-security close/volume features for ``test_day``.

    Works on a copy of ``stock_prices_last``. Differences, relative
    differences and rolling means are computed within each ``SC`` group in
    the row order of the input (assumes rows are in chronological order per
    security -- verify against the caller).

    Returns a frame with columns ``[D, SC]`` plus the feature columns,
    restricted to the rows whose ``D`` equals ``test_day``, with a fresh
    0..n-1 index.
    """
    frame = stock_prices_last.copy()
    per_stock = frame.groupby(SC)

    # feature column -> lag of the close-price difference
    diff_lags = {
        'Close.diff1': 1,
        'Close.diff3': 3,
        'Close.diff7': 7,
        'Close.diff14': 14,
        'Close.diff28': 28,
    }
    for column, lag in diff_lags.items():
        frame[column] = per_stock[C].diff(lag)

    # feature column -> (difference column, lag of the close used as denominator)
    relative_sources = {
        'Close.diff1relative': ('Close.diff1', 1),
        'Close.diff3relative': ('Close.diff3', 3),
        'Close.diff7relative': ('Close.diff7', 7),
    }
    for column, (diff_column, lag) in relative_sources.items():
        frame[column] = frame[diff_column] / per_stock[C].shift(lag)

    # feature column -> (source column, rolling window length)
    rolling_means = {
        'Close.rolling3': (C, 3),
        'Close.rolling7': (C, 7),
        'Close.rolling14': (C, 14),
        'Close.rolling28': (C, 28),
        'Volume.rolling7': (V, 7),
        'Volume.rolling14': (V, 14),
        'Volume.rolling28': (V, 28),
    }
    for column, (source, window) in rolling_means.items():
        rolled = per_stock[source].rolling(window).mean()
        # Drop the group level so the result lines up with the frame's own index.
        frame[column] = rolled.reset_index(0, drop=True).sort_index()

    close_features = list(diff_lags) + list(relative_sources) + list(rolling_means)

    on_test_day = frame[D] == test_day
    test_day_close_features = frame.loc[on_test_day, [D, SC] + close_features].copy().reset_index(drop=True)

    return test_day_close_features
    
def transform_2(test_stock_prices, test_day):
    """Build the neighbour-mean features for ``test_day``.

    For every entry of ``time_features`` a ``TimeNeighbors`` object is fitted
    (in test mode) on the train pivot followed by the test pivot of that
    feature. For every entry of ``neighbors_aggregation_counts`` the mean of
    ``'Close.diff1relative'`` over the neighbouring dates is then merged onto
    the ``[D, SC]`` rows of ``test_day``.

    Parameters
    ----------
    test_stock_prices : pd.DataFrame
        Test rows accumulated so far, including the ``time_features`` columns
        and ``'Close.diff1relative'``.
    test_day
        Value of ``D`` for which features are produced.

    Returns
    -------
    pd.DataFrame
        ``[D, SC]`` of the test day plus one column per
        (time feature, neighbour count) pair.
    """
    def _stacked_pivot(values):
        # Keyword arguments: DataFrame.pivot no longer accepts positional
        # index/columns/values in pandas >= 2.0.
        return pd.concat([
            train_stock_prices.pivot(index=D, columns=SC, values=values),
            test_stock_prices.pivot(index=D, columns=SC, values=values),
        ])

    test_day_neighbors_features = test_stock_prices.loc[test_stock_prices[D] == test_day, [D, SC]].copy()

    target_name = 'Close.diff1relative'
    target_pivot = _stacked_pivot(target_name)
    for time_feature in time_features:
        neighbors_object = TimeNeighbors(name='time_neighbors.' + time_feature, 
                                         pivot=_stacked_pivot(time_feature), 
                                         metric='canberra', 
                                         p=2, 
                                         test=True)
        for count in neighbors_aggregation_counts:
            column = neighbors_object.generate_neighbors_feature_mean(pivot=target_pivot, name=neighbors_object.name + '=' + target_name, count=count)
            test_day_neighbors_features = test_day_neighbors_features.merge(column, on=[D, SC], how='left')

    return test_day_neighbors_features

            
# Rolling window of recent rows used by transform_1; it starts with the last
# 30 training dates.
last_30_days = pd.unique(train_stock_prices[D])[-30:]
train_stock_prices_last = train_stock_prices.loc[train_stock_prices[D].isin(last_30_days)].copy()

test_day_frames = []  # one frame per processed test day (raw columns + close features)
total_parts = []      # one [D, SC, T, P] frame per processed test day
test_stock_prices = pd.DataFrame()
total = pd.DataFrame()
for test_day in pd.unique(supplemental_stock_prices[D]):
    started = time.time()
    test_day_stock_prices = supplemental_stock_prices.loc[supplemental_stock_prices[D] == test_day].copy()

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    train_stock_prices_last = pd.concat([train_stock_prices_last, test_day_stock_prices], ignore_index=True)
    test_day_close_features = transform_1(train_stock_prices_last, test_day)
    # Slide the window: drop the rows of the first date it contains.
    oldest_day = pd.unique(train_stock_prices_last[D])[0]
    train_stock_prices_last = train_stock_prices_last.loc[train_stock_prices_last[D] != oldest_day].reset_index(drop=True)

    test_day_stock_prices = test_day_stock_prices.merge(test_day_close_features, on=[D, SC], how='left')
    # transform_2 needs every test day seen so far, so the frame is rebuilt each day.
    test_day_frames.append(test_day_stock_prices)
    test_stock_prices = pd.concat(test_day_frames, ignore_index=True)

    test_day_neighbors_features = transform_2(test_stock_prices, test_day)
    test_day_stock_prices = test_day_stock_prices.merge(test_day_neighbors_features, on=[D, SC], how='left')

    # Use a dedicated name: the original reused `x` and thereby overwrote the
    # training feature matrix read by the validation cell.
    test_day_features = test_day_stock_prices.loc[:, booster_feature_names].copy()
    predictions = ret['cvbooster'].predict(test_day_features)
    # Average the predictions of the fold boosters.
    prediction = np.mean(predictions, axis=0)
    test_day_stock_prices[P] = prediction
    total_parts.append(test_day_stock_prices.loc[:, [D, SC, T, P]].copy())

    print(test_day)
    print('a1', time.time() - started)
    print('')

# Concatenate once instead of growing `total` inside the loop.
if total_parts:
    total = pd.concat(total_parts, ignore_index=True)
display(total)

**Test eval**

In [None]:
def calc_spread_return_per_day(df):
    """Weighted spread return of one day's rows, ranked by prediction.

    Takes the 200 rows with the highest ``P`` and the 200 with the lowest,
    weights their ``T`` values linearly from 2 down to 1, divides each
    weighted sum by the mean weight and returns the difference.
    """
    rank_weights = np.linspace(2, 1, num=200)

    predicted_best = df.sort_values(P, ascending=False)[:200]
    predicted_worst = df.sort_values(P)[:200]

    long_leg = np.sum(predicted_best[T].values * rank_weights) / np.mean(rank_weights)
    short_leg = np.sum(predicted_worst[T].values * rank_weights) / np.mean(rank_weights)
    return long_leg - short_leg

# Daily spread returns on the test period, then their mean / std ratio.
daily_spread_returns = total.groupby(D).apply(calc_spread_return_per_day)
mean_spread_return = np.mean(daily_spread_returns)
std_spread_return = np.std(daily_spread_returns)
print(mean_spread_return, std_spread_return)
spread_returns = mean_spread_return / std_spread_return
print('test:', spread_returns)
    
    
def calc_top_accuracy_per_day(df):
    """Count predicted top-200 / bottom-200 securities that are true ones.

    Returns ``(top_hits, bottom_hits)``: how many security codes the first
    200 rows ranked by ``P`` share with the first 200 ranked by ``T``, for
    the descending and the ascending ordering respectively.
    """
    def first_200_codes(column, ascending):
        return df.sort_values(column, ascending=ascending)[:200][SC].values

    top_overlap = np.intersect1d(first_200_codes(P, False), first_200_codes(T, False))
    bottom_overlap = np.intersect1d(first_200_codes(P, True), first_200_codes(T, True))
    return len(top_overlap), len(bottom_overlap)

# Per-day (top hits, bottom hits) on the test period, summarised by mean and std.
accuracy = total.groupby(D).apply(calc_top_accuracy_per_day)
mean_hits = tuple(map(np.mean, zip(*accuracy)))
std_hits = tuple(map(np.std, zip(*accuracy)))
print('test:', 'mean', mean_hits, 'std', std_hits)
print('')

**Random rank**

In [None]:
def calc_spread_return_per_day(df, mean_top=30, mean_bot=30, std_top=30, std_bot=30):
    """Spread return of a partly random ranking for one day's rows.

    A random number of securities is drawn from the true top 200 (by ``T``)
    and from the true bottom 200. Each side is then filled up to 200 with
    securities drawn at random from the remaining ones, shuffled, and scored
    with the same 2-to-1 linear weighting as the other spread-return helpers.
    Uses the global ``np.random`` state, so results vary between runs unless
    a seed is set.

    Parameters
    ----------
    df : pd.DataFrame
        Rows of a single date; must contain ``SC`` and ``T``.
    mean_top, std_top : float, default 30
        Mean and standard deviation of the normal draw for the number of
        true top-200 securities placed in the top list.
    mean_bot, std_bot : float, default 30
        Same for the bottom list.

    Returns
    -------
    tuple
        ``(r_day, top_random_number, bot_random_number)``.
    """
    # Clamp the draws to [1, 200]: an unclamped draw above 200 would request
    # more than 200 true picks and a negative number of filler picks.
    top_random_number = min(200, max(1, int(np.random.normal(loc=mean_top, scale=std_top))))
    bot_random_number = min(200, max(1, int(np.random.normal(loc=mean_bot, scale=std_bot))))

    top200_target = df.sort_values(T, ascending=False)[:200][SC].values
    bot200_target = df.sort_values(T, ascending=True)[:200][SC].values

    top200_target_choice = np.random.choice(top200_target, replace=False, size=top_random_number)
    bot200_target_choice = np.random.choice(bot200_target, replace=False, size=bot_random_number)

    # Top filler: anything outside the true top 200 and the chosen true bottoms.
    sc = pd.unique(df[SC])
    sc = np.setdiff1d(sc, top200_target)
    sc = np.setdiff1d(sc, bot200_target_choice)
    top_choice = np.random.choice(sc, replace=False, size=200-top_random_number)

    # Bottom filler: anything outside the true bottom 200 and everything
    # already placed in the top list.
    sc = pd.unique(df[SC])
    sc = np.setdiff1d(sc, bot200_target)
    sc = np.setdiff1d(sc, top200_target_choice)
    sc = np.setdiff1d(sc, top_choice)
    bot_choice = np.random.choice(sc, replace=False, size=200-bot_random_number)

    top_choice = np.union1d(top_choice, top200_target_choice)
    bot_choice = np.union1d(bot_choice, bot200_target_choice)
    # union1d returns sorted values; shuffle so the rank weights hit random securities.
    np.random.shuffle(top_choice)
    np.random.shuffle(bot_choice)
    df = df.set_index(SC, drop=True).copy()
    top200 = df.loc[top_choice]
    bottom200 = df.loc[bot_choice]

    linear_function_2_1 = np.linspace(2, 1, num=200)
    s_up = np.sum(top200[T].values * linear_function_2_1) / np.mean(linear_function_2_1)
    s_down = np.sum(bottom200[T].values * linear_function_2_1) / np.mean(linear_function_2_1)
    r_day = s_up - s_down
    return r_day, top_random_number, bot_random_number

# Per-day (spread return, top count, bottom count) tuples of the random ranking.
spread_returns = supplemental_stock_prices.groupby(D).apply(calc_spread_return_per_day)
display(spread_returns)

# Transpose the per-day tuples once and summarise each component.
per_component = list(zip(*spread_returns))
component_means = tuple(map(np.mean, per_component))
component_stds = tuple(map(np.std, per_component))
print('mean', component_means, 'std', component_stds)

# Mean / std ratio of the first component (the daily spread return).
spread_returns = component_means[0] / component_stds[0]
print('test:', spread_returns)

In [None]:
# Evaluate calc_spread_return_per_day on the rows of a single date (2021-12-06).
# NOTE(review): which definition runs depends on cell execution order -- with
# top-to-bottom execution it is the random-rank version.
calc_spread_return_per_day(supplemental_stock_prices[supplemental_stock_prices[D] == '2021-12-06'])

In [None]:
# Display the supplemental rows dated 2021-12-06.
supplemental_stock_prices[supplemental_stock_prices[D] == '2021-12-06']