In [None]:
import pandas as pd
import numpy as np
from collections import Counter

In [2]:

import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_selection import SelectKBest, f_classif, VarianceThreshold

In [3]:
data_base_init = pd.read_csv('data/initial_data.csv', header=[0, 1], index_col=0, parse_dates=True).copy()
data_base_init.head()

# Sort by date index
data_base_init = data_base_init.sort_index()

In [4]:
# Need to shift Price and day_yield by -1 to align with features

symbol_list = data_base_init.columns.get_level_values(0).unique()

for symbol in symbol_list:
    data_base_init.loc[:, (symbol, 'Price')] = data_base_init[symbol]['Price'].shift(-1)
    data_base_init.loc[:, (symbol, 'Day_yield')] = data_base_init[symbol]['Day_yield'].shift(-1)

In [5]:
# Adding features : rolling mean and rolling std + binary features based on rolling mean

rolling_windows = [5, 10, 20, 50, 75, 100]
feature_list = ['Price', 'Volume', 'High_over_Low', 'Day_yield']
symbol_list = data_base_init.columns.get_level_values(0).unique()

new_columns = {}

for window in rolling_windows:
    for feature in feature_list:
        for symbol in symbol_list:
            col_name = (symbol, f'{feature}_rolling_mean_{window}')
            new_columns[col_name] = data_base_init[symbol][feature].rolling(window=window).mean()
            
            col_name_std = (symbol, f'{feature}_rolling_std_{window}')
            new_columns[col_name_std] = data_base_init[symbol][feature].rolling(window=window).std()
            
            # Binary feature: 1 if Price > rolling mean, else 0
            col_name_bin = (symbol, f'{feature}_above_rolling_mean_{window}')
            new_columns[col_name_bin] = (data_base_init[symbol][feature] > new_columns[col_name]).astype(int)

# Concatenate all new columns at once to avoid fragmentation
data_features = pd.concat([data_base_init, pd.DataFrame(new_columns)], axis=1).copy()

In [6]:
# Remove 100 first days due to rolling features and avoiding NaN values
n = 100+1
first_day = data_features.index[n]
data_features = data_features[data_features.index>first_day]

In [7]:
# Fill forward on any remaining NaN values or 0 if needed
data_features = data_features.ffill()
data_features = data_features.fillna(0)

In [15]:
# n = number of features for each symbol
n = data_features.shape[1] // len(symbol_list)
print(f'Total features per symbol: {n}')

Total features per symbol: 76


In [None]:
# Keep most interestsing features 

list_features = []

for symbol in symbol_list:
    feature_cols = [col for col in data_features[symbol].columns if col != 'Day_yield']
    X_symbol = data_features[symbol][feature_cols]
    y_symbol = (data_features[symbol]['Day_yield']>0).astype(int)

    # Variance Threshold to remove low variance features
    var_thresh = VarianceThreshold(threshold=0.00)
    X_var = var_thresh.fit_transform(X_symbol)

    # SelectKBest to select top 40 features
    selector = SelectKBest(score_func=f_classif, k=n // 6)
    X_new = selector.fit_transform(X_var, y_symbol)

    selected_features = X_symbol.columns[var_thresh.get_support()][selector.get_support()]
   
    list_features.extend(selected_features.tolist())

In [59]:
# Counting feature occurrences + Select features that were selected more than once
feature_counts = Counter(list_features)

final_features = [feature for feature, count in feature_counts.items() if count > 1]
final_features.append('Day_yield')
final_features = sorted(final_features)

In [60]:
# Keep only selected features in data_selected_features
data_selected_features = data_features[[(symbol, feature) for symbol in symbol_list for feature in final_features]]
data_selected_features.columns.names = ['Symbol', 'Feature']

In [94]:
# SVC model on one symbol for testing purpose
from sklearn.svm import SVC

symbol = 'ALJJ'

X = data_selected_features[symbol].drop(columns=['Day_yield'])
Y = (data_selected_features[symbol]['Day_yield'] > 0).astype(int).shift(1).fillna(0).astype(int)

mid_date = X.index[len(X)//2]
X_train = X[X.index <= mid_date]
Y_train = Y[Y.index <= mid_date]
X_test = X[X.index > mid_date]
Y_test = Y[Y.index > mid_date]

model = SVC(kernel='rbf', C=2.0, gamma='scale')
model.fit(X_train, Y_train)

0,1,2
,C,2.0
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


In [95]:
model.score(X_train, Y_train)

0.7193836171938361

In [96]:
model.score(X_test, Y_test)

0.6210953346855984