In [1]:
# Load IPython's autoreload extension in mode 2 so that edited modules are
# re-imported automatically (see the IPython autoreload docs for the mode semantics).
%load_ext autoreload
%autoreload 2

# Basic

In [2]:
from typing import Tuple, Dict, Type

import seaborn as sns
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold, train_test_split

from pathlib import Path

import tqdm

In [3]:
# Input and output directories, both located under the current working directory.
DATA_PATH = Path.cwd().joinpath('data')
OUTPUTS_PATH = Path.cwd().joinpath('outputs')

# Hold-out split settings: fraction of rows kept for testing and the seed for the split.
TEST_FRACTION = 0.2
RANDOM_SPLIT_SEED = 42

# Read Data

In [4]:
# Load the training features (x_train.csv) and labels (y_train.csv); both are indexed by ID.
X, y = (
    pd.read_csv(DATA_PATH / csv_name, index_col='ID')
    for csv_name in ('x_train.csv', 'y_train.csv')
)

In [5]:
# Display the raw feature frame read from x_train.csv.
X

Unnamed: 0_level_0,DATE,STOCK,INDUSTRY,INDUSTRY_GROUP,SECTOR,SUB_INDUSTRY,RET_1,VOLUME_1,RET_2,VOLUME_2,...,RET_16,VOLUME_16,RET_17,VOLUME_17,RET_18,VOLUME_18,RET_19,VOLUME_19,RET_20,VOLUME_20
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,2,18,5,3,44,-0.015748,0.147931,-0.015504,0.179183,...,0.059459,0.630899,0.003254,-0.379412,0.008752,-0.110597,-0.012959,0.174521,-0.002155,-0.000937
1,0,3,43,15,6,104,0.003984,,-0.090580,,...,0.015413,,0.003774,,-0.018518,,-0.028777,,-0.034722,
2,0,4,57,20,8,142,0.000440,-0.096282,-0.058896,0.084771,...,0.008964,-0.010336,-0.017612,-0.354333,-0.006562,-0.519391,-0.012101,-0.356157,-0.006867,-0.308868
3,0,8,1,1,1,2,0.031298,-0.429540,0.007756,-0.089919,...,-0.031769,0.012105,0.033824,-0.290178,-0.001468,-0.663834,-0.013520,-0.562126,-0.036745,-0.631458
4,0,14,36,12,5,92,0.027273,-0.847155,-0.039302,-0.943033,...,-0.038461,-0.277083,-0.012659,0.139086,0.004237,-0.017547,0.004256,0.579510,-0.040817,0.802806
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
418590,223,5703,32,10,4,77,0.021843,-0.217823,-0.021703,-0.125333,...,0.003160,-0.161543,0.007785,-0.141487,0.008205,-0.268215,-0.058448,0.057577,0.031174,-0.163350
418591,223,5705,35,12,5,91,-0.006920,-0.375251,0.000000,-0.029437,...,0.007169,-0.955492,-0.016221,-0.171172,0.007819,1.540184,-0.015396,-0.502337,-0.011073,0.632885
418592,223,5709,2,1,1,5,0.021869,-0.978856,-0.005929,-1.026267,...,0.001110,-0.476550,0.029714,-0.385972,-0.069148,1.780169,-0.082927,1.581453,0.098607,6.956960
418593,223,5710,33,10,4,83,0.012248,-0.627169,0.010925,-0.842108,...,0.007450,-0.210079,0.023729,0.813948,0.026087,-0.868576,-0.051155,0.371052,0.041238,0.045695


# TRAINING Dataset preprocessing

In [6]:
# Put features and labels side by side (aligned on the index) and keep only rows without NaN.
df = pd.concat([X, y], axis=1).dropna()

# Report the number of distinct values of each categorical identifier column.
for category_col in ("INDUSTRY", "INDUSTRY_GROUP", "SECTOR", "SUB_INDUSTRY"):
    print(f'len(df["{category_col}"].unique())={len(df[category_col].unique())}')

df

len(df["INDUSTRY"].unique())=72
len(df["INDUSTRY_GROUP"].unique())=26
len(df["SECTOR"].unique())=12
len(df["SUB_INDUSTRY"].unique())=175


Unnamed: 0_level_0,DATE,STOCK,INDUSTRY,INDUSTRY_GROUP,SECTOR,SUB_INDUSTRY,RET_1,VOLUME_1,RET_2,VOLUME_2,...,VOLUME_16,RET_17,VOLUME_17,RET_18,VOLUME_18,RET_19,VOLUME_19,RET_20,VOLUME_20,RET
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,2,18,5,3,44,-0.015748,0.147931,-0.015504,0.179183,...,0.630899,0.003254,-0.379412,0.008752,-0.110597,-0.012959,0.174521,-0.002155,-0.000937,True
2,0,4,57,20,8,142,0.000440,-0.096282,-0.058896,0.084771,...,-0.010336,-0.017612,-0.354333,-0.006562,-0.519391,-0.012101,-0.356157,-0.006867,-0.308868,False
3,0,8,1,1,1,2,0.031298,-0.429540,0.007756,-0.089919,...,0.012105,0.033824,-0.290178,-0.001468,-0.663834,-0.013520,-0.562126,-0.036745,-0.631458,False
4,0,14,36,12,5,92,0.027273,-0.847155,-0.039302,-0.943033,...,-0.277083,-0.012659,0.139086,0.004237,-0.017547,0.004256,0.579510,-0.040817,0.802806,False
5,0,23,37,12,5,94,0.010938,-0.238878,0.021548,-0.322706,...,-0.062753,-0.004552,-0.097196,0.012677,-0.331521,0.032527,0.665084,0.008400,-0.037627,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
418590,223,5703,32,10,4,77,0.021843,-0.217823,-0.021703,-0.125333,...,-0.161543,0.007785,-0.141487,0.008205,-0.268215,-0.058448,0.057577,0.031174,-0.163350,False
418591,223,5705,35,12,5,91,-0.006920,-0.375251,0.000000,-0.029437,...,-0.955492,-0.016221,-0.171172,0.007819,1.540184,-0.015396,-0.502337,-0.011073,0.632885,False
418592,223,5709,2,1,1,5,0.021869,-0.978856,-0.005929,-1.026267,...,-0.476550,0.029714,-0.385972,-0.069148,1.780169,-0.082927,1.581453,0.098607,6.956960,True
418593,223,5710,33,10,4,83,0.012248,-0.627169,0.010925,-0.842108,...,-0.210079,0.023729,0.813948,0.026087,-0.868576,-0.051155,0.371052,0.041238,0.045695,True


## Add further features

In [7]:
# Select only the raw per-lag columns (VOLUME_<n> / RET_<n>). Requiring a numeric
# suffix keeps derived columns such as VOLUME_MEAN and RET_MEAN out of the lists,
# so re-running this cell produces the same features instead of feeding the
# previously computed means back in.
volume_cols = [col for col in df.columns if col.startswith("VOLUME_") and col[len("VOLUME_"):].isdigit()]
ret_cols    = [col for col in df.columns if col.startswith("RET_") and col[len("RET_"):].isdigit()]

# Row-wise means over the lag columns. DataFrame.mean(axis=1) is the vectorised
# equivalent of aggregating each row with a Python lambda.
df["VOLUME_MEAN"]   = df[volume_cols].mean(axis=1)
df["RET_MEAN"]      = df[ret_cols].mean(axis=1)

# Sum over lags of VOLUME_k * RET_k. Columns are paired by position, which relies on
# both lists being in the same lag order. Note that, despite the column name, the
# sum is not divided by anything.
df["WEIGHTED_RET_MEAN"] = sum(df[volume_col] * df[ret_col] for volume_col, ret_col in zip(volume_cols, ret_cols))
# Boolean flag: True where the volume-weighted sum is strictly positive.
df["WEIGHTED_RET_MEAN_SIGN"] = df["WEIGHTED_RET_MEAN"] > 0

In [8]:
# Display the training frame, now including the engineered feature columns.
df

Unnamed: 0_level_0,DATE,STOCK,INDUSTRY,INDUSTRY_GROUP,SECTOR,SUB_INDUSTRY,RET_1,VOLUME_1,RET_2,VOLUME_2,...,VOLUME_18,RET_19,VOLUME_19,RET_20,VOLUME_20,RET,VOLUME_MEAN,RET_MEAN,WEIGHTED_RET_MEAN,WEIGHTED_RET_MEAN_SIGN
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,2,18,5,3,44,-0.015748,0.147931,-0.015504,0.179183,...,-0.110597,-0.012959,0.174521,-0.002155,-0.000937,True,0.457365,0.016318,1.859505,True
2,0,4,57,20,8,142,0.000440,-0.096282,-0.058896,0.084771,...,-0.519391,-0.012101,-0.356157,-0.006867,-0.308868,False,-0.209642,-0.001043,0.042834,True
3,0,8,1,1,1,2,0.031298,-0.429540,0.007756,-0.089919,...,-0.663834,-0.013520,-0.562126,-0.036745,-0.631458,False,-0.539049,-0.003154,0.016518,True
4,0,14,36,12,5,92,0.027273,-0.847155,-0.039302,-0.943033,...,-0.017547,0.004256,0.579510,-0.040817,0.802806,False,0.606319,-0.003551,1.755262,True
5,0,23,37,12,5,94,0.010938,-0.238878,0.021548,-0.322706,...,-0.331521,0.032527,0.665084,0.008400,-0.037627,False,0.014658,0.001894,-0.076602,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
418590,223,5703,32,10,4,77,0.021843,-0.217823,-0.021703,-0.125333,...,-0.268215,-0.058448,0.057577,0.031174,-0.163350,False,-0.079617,0.002034,-0.021136,False
418591,223,5705,35,12,5,91,-0.006920,-0.375251,0.000000,-0.029437,...,1.540184,-0.015396,-0.502337,-0.011073,0.632885,False,-0.130105,-0.000310,0.014157,True
418592,223,5709,2,1,1,5,0.021869,-0.978856,-0.005929,-1.026267,...,1.780169,-0.082927,1.581453,0.098607,6.956960,True,0.297771,0.005549,0.419939,True
418593,223,5710,33,10,4,83,0.012248,-0.627169,0.010925,-0.842108,...,-0.868576,-0.051155,0.371052,0.041238,0.045695,True,-0.240608,0.009756,-0.019322,False


# Dataset split

For this first baseline, the following identifier and categorical columns are excluded from the features: DATE, STOCK, INDUSTRY, INDUSTRY_GROUP, SECTOR, SUB_INDUSTRY

In [9]:
LABEL_COLUMNS = ["RET"]
COLUMNS_TO_IGNORE = ["DATE", "STOCK", "INDUSTRY", "INDUSTRY_GROUP", "SECTOR", "SUB_INDUSTRY"]

# Features: every column that is neither ignored nor a label, as a float64 matrix.
X = df.drop(columns=[*COLUMNS_TO_IGNORE, *LABEL_COLUMNS]).to_numpy(dtype=np.float64)
# Labels: selected with a list, so the result keeps one column per label.
y = df[LABEL_COLUMNS].to_numpy(dtype=np.float64)

# Seeded random hold-out split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_FRACTION,
    random_state=RANDOM_SPLIT_SEED,
)

# Models

In [10]:
from models.base import BinaryClassificationModel
from models.constant import ConstantFalseModel, ConstantTrueModel
from models.nn import NeuralNetworkModel
from models.nn_advanced import AdvNN1, AdvNN2

In [11]:
# Accuracy on the held-out split, keyed by model name.
accuracy_results: Dict[str, float]                  = {}
# Model objects keyed by the same names; the loops below store instances, not classes.
models: Dict[str, BinaryClassificationModel]        = {}

In [12]:
# Baseline model classes keyed by the short name used in the results
# (presumably they always predict False / True respectively — confirm in models.constant).
ct_classes: Dict[str, Type[BinaryClassificationModel]] = dict(
    ct_f=ConstantFalseModel,
    ct_t=ConstantTrueModel,
)


In [13]:
# Instantiate each baseline, score it on the held-out split (no train() call is made),
# and register both the score and the instance under the baseline's name.
for baseline_name, baseline_cls in ct_classes.items():
    baseline = baseline_cls()
    accuracy_results[baseline_name] = baseline.evaluate(X_test, y_test)
    models[baseline_name] = baseline

In [None]:
# Neural-network model classes to train, keyed by the name used for results and output files.
nn_classes: Dict[str, Type[BinaryClassificationModel]] = dict(
    [
        ("nn-basic", NeuralNetworkModel),
        ("nn-AdvNN1", AdvNN1),
        ("nn-AdvNN2", AdvNN2),
    ]
)

In [15]:
# Settings shared by every network variant.
nn_settings = dict(hidden_dim=128, num_epochs=500)
# The first constructor argument is the number of columns in the training matrix.
n_features = X_train.shape[1]

# Train each network on the training split, score it on the held-out split,
# and keep the fitted instance for later prediction.
for net_name, net_cls in nn_classes.items():
    net = net_cls(n_features, **nn_settings)
    net.train(X_train, y_train)
    accuracy_results[net_name] = net.evaluate(X_test, y_test)
    models[net_name] = net

Epoch [100/500], Loss: 0.6915
Epoch [200/500], Loss: 0.6899
Epoch [300/500], Loss: 0.6881
Epoch [400/500], Loss: 0.6862
Epoch [500/500], Loss: 0.6845
Epoch [100/500], Loss: 0.6876
Epoch [200/500], Loss: 0.6773
Epoch [300/500], Loss: 0.6683
Epoch [400/500], Loss: 0.6643
Epoch [500/500], Loss: 0.6561
Epoch [100/500], Loss: 0.6822
Epoch [200/500], Loss: 0.6626
Epoch [300/500], Loss: 0.6315
Epoch [400/500], Loss: 0.5987
Epoch [500/500], Loss: 0.5723


In [16]:
# Print one "name<TAB>accuracy" line per evaluated model, with four decimals.
for label, score in accuracy_results.items():
    print(f"{label}:\t{score:.4f}")

ct_f:	0.5021
ct_t:	0.4979
nn-basic:	0.5195
nn-AdvNN1:	0.5138
nn-AdvNN2:	0.5060


# TEST Dataset preprocessing

In [17]:
# Load the features from x_test.csv, indexed by ID; predictions for these rows are written out below.
real_test_df = pd.read_csv(DATA_PATH / 'x_test.csv', index_col='ID')

In [18]:
def test_preprocessing(df: pd.DataFrame) -> Tuple[pd.Index, np.ndarray]:
    df.fillna(0, inplace=True)
    
    volume_cols = [col for col in df.columns if col.startswith("VOLUME_")]
    ret_cols    = [col for col in df.columns if col.startswith("RET_")]

    df["VOLUME_MEAN"]   = df[volume_cols].aggregate(lambda x: x.mean(), axis=1)
    df["RET_MEAN"]      = df[ret_cols].aggregate(lambda x: x.mean(), axis=1)
    df["WEIGHTED_RET_MEAN"] = sum(df[volume_col] * df[ret_col] for volume_col, ret_col in zip(volume_cols, ret_cols))
    df["WEIGHTED_RET_MEAN_SIGN"] = df["WEIGHTED_RET_MEAN"] > 0
    
    real_X_test = df.drop(columns=COLUMNS_TO_IGNORE).to_numpy(dtype=np.float64)
    
    return df.index, real_X_test

# Build the prediction inputs: the ID index used for the output files and the feature matrix.
result_index, real_X_test = test_preprocessing(real_test_df)



In [20]:
# Write one prediction CSV per trained network. Picking a single best model via
# max(accuracy_results, key=accuracy_results.get) was tried earlier and is disabled,
# so all three networks are exported.

# to_csv does not create missing directories, so make sure the target exists first.
OUTPUTS_PATH.mkdir(parents=True, exist_ok=True)

for model_name in ["nn-basic", "nn-AdvNN1", "nn-AdvNN2"]:
    # Cast the model output to booleans to match the RET label column.
    result = models[model_name].predict(real_X_test).astype(bool)
    result_df = pd.DataFrame(
        result,
        index=result_index,
        columns=["RET"]
    )
    result_df.to_csv(
        OUTPUTS_PATH / f"{model_name}-01.csv",
        index=True
    )
    