In [1]:
import pandas as pd
import numpy as np

In [44]:

import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_selection import SelectKBest, f_classif, VarianceThreshold

In [47]:
data_base_init = pd.read_csv('data/initial_data.csv', header=[0, 1], index_col=0, parse_dates=True).copy()
data_base_init.head()

# Sort by date index
data_base_init = data_base_init.sort_index()

In [48]:
# Need to shift Price and day_yield by -1 to align with features

symbol_list = data_base_init.columns.get_level_values(0).unique()

for symbol in symbol_list:
    data_base_init.loc[:, (symbol, 'Price')] = data_base_init[symbol]['Price'].shift(-1)
    data_base_init.loc[:, (symbol, 'Day_yield')] = data_base_init[symbol]['Day_yield'].shift(-1)

In [49]:
# Adding features : rolling mean and rolling std + binary features based on rolling mean

rolling_windows = [5, 10, 20, 50, 75, 100]
feature_list = ['Price', 'Volume', 'High_over_Low', 'Day_yield']
symbol_list = data_base_init.columns.get_level_values(0).unique()

new_columns = {}

for window in rolling_windows:
    for feature in feature_list:
        for symbol in symbol_list:
            col_name = (symbol, f'{feature}_rolling_mean_{window}')
            new_columns[col_name] = data_base_init[symbol][feature].rolling(window=window).mean()
            
            col_name_std = (symbol, f'{feature}_rolling_std_{window}')
            new_columns[col_name_std] = data_base_init[symbol][feature].rolling(window=window).std()
            
            # Binary feature: 1 if Price > rolling mean, else 0
            col_name_bin = (symbol, f'{feature}_above_rolling_mean_{window}')
            new_columns[col_name_bin] = (data_base_init[symbol][feature] > new_columns[col_name]).astype(int)

# Concatenate all new columns at once to avoid fragmentation
data_features = pd.concat([data_base_init, pd.DataFrame(new_columns)], axis=1).copy()

In [50]:
# Remove 100 first days due to rolling features and avoiding NaN values
n = 100+1
first_day = data_features.index[n]
data_features = data_features[data_features.index>first_day]

In [None]:
# Fill forward on any remaining NaN values or 0 if needed
data_features = data_features.ffill()
data_features = data_features.fillna(0)

In [None]:
#

In [63]:
# Keep most interestsing features 

list_features = []

for symbol in symbol_list:
    feature_cols = [col for col in data_features[symbol].columns if col != 'Day_yield']
    X_symbol = data_features[symbol][feature_cols]
    y_symbol = (data_features[symbol]['Day_yield']>0).astype(int)

    # Variance Threshold to remove low variance features
    var_thresh = VarianceThreshold(threshold=0.00)
    X_var = var_thresh.fit_transform(X_symbol)

    # SelectKBest to select top 40 features
    selector = SelectKBest(score_func=f_classif, k=40)
    X_new = selector.fit_transform(X_var, y_symbol)

    selected_features = X_symbol.columns[var_thresh.get_support()][selector.get_support()]
   
    list_features.extend(selected_features.tolist())

In [64]:
# Count per feature how many times it was selected across symbols
from collections import Counter
feature_counts = Counter(list_features)
# Select features that were selected more than once
feature_counts

Counter({'Day_yield_above_rolling_mean_5': 64,
         'Day_yield_above_rolling_mean_10': 64,
         'Day_yield_above_rolling_mean_20': 64,
         'Day_yield_above_rolling_mean_50': 64,
         'Day_yield_above_rolling_mean_75': 64,
         'Day_yield_above_rolling_mean_100': 64,
         'Price_above_rolling_mean_5': 63,
         'Price_above_rolling_mean_10': 63,
         'Price_above_rolling_mean_20': 63,
         'Day_yield_rolling_mean_5': 62,
         'Day_yield_rolling_mean_10': 62,
         'Day_yield_rolling_mean_20': 61,
         'Day_yield_rolling_mean_50': 61,
         'Price_above_rolling_mean_50': 60,
         'Price_above_rolling_mean_75': 59,
         'Day_yield_rolling_mean_75': 59,
         'Price_above_rolling_mean_100': 58,
         'Day_yield_rolling_mean_100': 57,
         'Price': 54,
         'Price_rolling_mean_5': 50,
         'Price_rolling_mean_10': 48,
         'Price_rolling_mean_50': 48,
         'Price_rolling_mean_20': 46,
         'Price_rolling