In [1]:
import pandas as pd
import numpy as np
from collections import Counter

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow.keras import layers, models, optimizers, callbacks
from tensorflow.keras.layers import Activation, Dense
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_selection import SelectKBest, f_classif, VarianceThreshold
from sklearn.metrics import accuracy_score

In [3]:
data_base_init = pd.read_csv('data/initial_data.csv', header=[0, 1], index_col=0, parse_dates=True).copy()
data_base_init.head()

# Sort by date index
data_base_init = data_base_init.sort_index()

In [4]:
# Adding features : rolling mean and rolling std + binary features based on rolling mean

rolling_windows = [5, 10, 20, 50, 75, 100]
feature_list = ['Price', 'Volume', 'High_over_Low', 'Day_yield']
symbol_list = data_base_init.columns.get_level_values(0).unique()

new_columns = {}

for window in rolling_windows:
    for feature in feature_list:
        for symbol in symbol_list:
            col_name = (symbol, f'{feature}_rolling_mean_{window}')
            new_columns[col_name] = data_base_init[symbol][feature].rolling(window=window).mean()
            
            col_name_std = (symbol, f'{feature}_rolling_std_{window}')
            new_columns[col_name_std] = data_base_init[symbol][feature].rolling(window=window).std()
            
            # Binary feature: 1 if Price > rolling mean, else 0
            col_name_bin = (symbol, f'{feature}_above_rolling_mean_{window}')
            new_columns[col_name_bin] = (data_base_init[symbol][feature] > new_columns[col_name]).astype(int)

# Concatenate all new columns at once to avoid fragmentation
data_features = pd.concat([data_base_init, pd.DataFrame(new_columns)], axis=1).copy()

In [5]:
# Remove 100 first days due to rolling features and avoiding NaN values
n = 100 + 1
first_day = data_features.index[n]
data_features = data_features[data_features.index>first_day]

In [6]:
# Fill forward on any remaining NaN values or 0 if needed
data_features = data_features.ffill()
data_features = data_features.fillna(0)

In [7]:
# n = number of features for each symbol
n = data_features.shape[1] // len(symbol_list)
print(f'Total features per symbol: {n}')

Total features per symbol: 76


In [8]:
# Keep most interestsing features 

list_features = []

for symbol in symbol_list:
    feature_cols = [col for col in data_features[symbol].columns if col != 'Day_yield']
    X_symbol = data_features[symbol][feature_cols]
    y_symbol = (data_features[symbol]['Day_yield']>0).astype(int)

    # Variance Threshold to remove low variance features
    var_thresh = VarianceThreshold(threshold=0.00)
    X_var = var_thresh.fit_transform(X_symbol)

    # SelectKBest to select top 40 features
    selector = SelectKBest(score_func=f_classif, k=n // 6)
    X_new = selector.fit_transform(X_var, y_symbol)

    selected_features = X_symbol.columns[var_thresh.get_support()][selector.get_support()]
   
    list_features.extend(selected_features.tolist())

In [9]:
# Counting feature occurrences + Select features that were selected more than once
feature_counts = Counter(list_features)

final_features = [feature for feature, count in feature_counts.items() if count > 1]
final_features.append('Day_yield')
final_features = sorted(final_features)

In [10]:
# Keep only selected features in data_selected_features
data_selected_features = data_features[[(symbol, feature) for symbol in symbol_list for feature in final_features]]
data_selected_features.columns.names = ['Symbol', 'Feature']

data_selected_features.to_csv('data/processed_data.csv')

In [11]:
data_selected_features.ALJJ.count()

Feature
Day_yield                               4931
Day_yield_above_rolling_mean_10         4931
Day_yield_above_rolling_mean_100        4931
Day_yield_above_rolling_mean_20         4931
Day_yield_above_rolling_mean_5          4931
Day_yield_above_rolling_mean_50         4931
Day_yield_above_rolling_mean_75         4931
Day_yield_rolling_mean_10               4931
Day_yield_rolling_mean_20               4931
Day_yield_rolling_mean_5                4931
High_over_Low                           4931
High_over_Low_above_rolling_mean_10     4931
High_over_Low_above_rolling_mean_100    4931
High_over_Low_above_rolling_mean_20     4931
High_over_Low_above_rolling_mean_5      4931
High_over_Low_above_rolling_mean_50     4931
High_over_Low_above_rolling_mean_75     4931
Price                                   4931
Price_above_rolling_mean_10             4931
Price_above_rolling_mean_20             4931
Price_above_rolling_mean_5              4931
Price_above_rolling_mean_50             4931
Pr

In [12]:
# Let try a light neural network on the few selected features

few_features = ['Day_yield', 'Day_yield_above_rolling_mean_10', 'High_over_Low_above_rolling_mean_10', 'Price_rolling_mean_50']

light_data = data_selected_features[[(symbol, feature) for symbol in symbol_list for feature in few_features]]
light_data.columns.names = ['Symbol', 'Feature']

# Save light data
light_data.to_csv('data/light_processed_data.csv')

In [13]:
# Split data into train and test sets and X and y for each symbol

# X is : for each stock => 'Day_yield_above_rolling_mean_10', 'High_over_Low_above_rolling_mean_10', 'Price_rolling_mean_50'
# Y is : 'Day_yield' shifted by -1 day (next day) and > 0 as binary classification in int format

X = pd.DataFrame()
Y = pd.DataFrame()

for symbol in symbol_list:
    X_symbol = light_data[symbol][['Day_yield_above_rolling_mean_10', 'High_over_Low_above_rolling_mean_10', 'Price_rolling_mean_50']]
    y_symbol = (light_data[symbol]['Day_yield'].shift(-1) > 0).astype(int)
    
    X_symbol.columns = pd.MultiIndex.from_tuples([(symbol, col) for col in X_symbol.columns])
    X = pd.concat([X, X_symbol], axis=1)
    Y = pd.concat([Y, y_symbol.rename(symbol)], axis=1)

In [14]:
# Flat X with symbols as prefix in column names
X_flat = pd.DataFrame()
for symbol in symbol_list:
    X_symbol = X[symbol]
    X_symbol.columns = [f'{symbol}_{col}' for col in X_symbol.columns]
    X_flat = pd.concat([X_flat, X_symbol], axis=1)

In [15]:
dates = sorted(X_flat.index)
end_train_date = dates[int(len(dates)*0.6)]
end_test_date = dates[int(len(dates)*0.8)] 



X_train = X_flat[X_flat.index <= end_train_date]
Y_train = Y[Y.index <= end_train_date]

# Standardize features
scaler = StandardScaler()
scaler.fit(X_train)

X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)

X_test = X_flat[X_flat.index > end_train_date]
Y_test = Y[Y.index > end_train_date]
X_test = X_test[X_test.index <= end_test_date]
Y_test = Y_test[Y_test.index <= end_test_date]

X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

X_validation = X_flat[X_flat.index > end_test_date]
Y_validation = Y[Y.index > end_test_date]

X_validation = pd.DataFrame(scaler.transform(X_validation), index=X_validation.index, columns=X_validation.columns) 


In [139]:
# Try a simple neural network model with TensorFlow

model = models.Sequential([
    layers.Input(shape=(X_train.shape[1],)),
    layers.Dense(128, activation='relu'),
    layers.Dense(128, activation='relu'),
    layers.Dense(64, activation='sigmoid'),
    layers.Dense(64, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(32, activation='sigmoid'),
    layers.Dense(32, activation='relu'),
    layers.Dense(32, activation='relu'),
    layers.Dense(16, activation='sigmoid'),
    layers.Dense(16, activation='relu'),
    layers.Dense(16, activation='relu'),
    layers.Dense(Y_train.shape[1], activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, Y_train, epochs=10, batch_size=16, validation_data=(X_test, Y_test))

Epoch 1/10
[1m185/185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - accuracy: 0.0223 - loss: 0.6785 - val_accuracy: 0.0000e+00 - val_loss: 0.6798
Epoch 2/10
[1m185/185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.0000e+00 - loss: 0.6693 - val_accuracy: 0.0000e+00 - val_loss: 0.6788
Epoch 3/10
[1m185/185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.0000e+00 - loss: 0.6687 - val_accuracy: 0.0000e+00 - val_loss: 0.6802
Epoch 4/10
[1m185/185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.0000e+00 - loss: 0.6679 - val_accuracy: 0.0000e+00 - val_loss: 0.6789
Epoch 5/10
[1m185/185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.0000e+00 - loss: 0.6670 - val_accuracy: 0.0000e+00 - val_loss: 0.6844
Epoch 6/10
[1m185/185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.0027 - loss: 0.6659 - val_accuracy: 0.0101 - val_loss:

<keras.src.callbacks.history.History at 0x17b07cce0d0>

In [None]:
result = model.predict(X_validation).round()
save = {}
for i in range(1, len(Y_validation)):
    save[Y_validation.index[i]] = accuracy_score(Y_validation.iloc[i].tolist(), result[i])

mean = np.mean(list(save.values()))
mean

[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 


In [16]:
# Application of neural network model on full data with all selected features

X_full = pd.DataFrame()
Y_full = pd.DataFrame()
for symbol in symbol_list:
    X_symbol = data_selected_features[symbol].drop(columns=['Day_yield'])
    y_symbol = (data_selected_features[symbol]['Day_yield'].shift(-1) > 0).astype(int)
    
    X_symbol.columns = pd.MultiIndex.from_tuples([(symbol, col) for col in X_symbol.columns])
    X_full = pd.concat([X_full, X_symbol], axis=1)
    Y_full = pd.concat([Y_full, y_symbol.rename(symbol)], axis=1)

In [17]:
# Flat X with symbols as prefix in column names
X_full_flat = pd.DataFrame()
for symbol in symbol_list:
    X_symbol = X_full[symbol]
    X_symbol.columns = [f'{symbol}_{col}' for col in X_symbol.columns]
    X_full_flat = pd.concat([X_full_flat, X_symbol], axis=1)

In [18]:
dates = sorted(X_full_flat.index)
end_train_date = dates[int(len(dates)*0.6)]
end_test_date = dates[int(len(dates)*0.8)] 

X_train = X_full_flat[X_full_flat.index <= end_train_date]
Y_train = Y_full[Y_full.index <= end_train_date]

# Standardize features
scaler = StandardScaler()
scaler.fit(X_train)

X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)

X_test = X_full_flat[X_full_flat.index > end_train_date]
Y_test = Y_full[Y_full.index > end_train_date]
X_test = X_test[X_test.index <= end_test_date]
Y_test = Y_test[Y_test.index <= end_test_date]

X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

X_validation = X_full_flat[X_full_flat.index > end_test_date]
Y_validation = Y_full[Y_full.index > end_test_date]

X_validation = pd.DataFrame(scaler.transform(X_validation), index=X_validation.index, columns=X_validation.columns) 


In [36]:
# Try a neural network model with TensorFlow

model_full = models.Sequential([
    layers.Input(shape=(X_train.shape[1],)),
    layers.BatchNormalization(),
    layers.Dense(128, activation='relu'),
    layers.Dense(128, activation='relu'),
    layers.Dense(64, activation='sigmoid'),
    layers.Dense(64, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(32, activation='sigmoid'),
    layers.Dense(32, activation='relu'),
    layers.Dense(32, activation='relu'),
    layers.Dense(16, activation='sigmoid'),
    layers.Dense(16, activation='relu'),
    layers.Dense(16, activation='relu'),
    layers.Dense(32, activation='sigmoid'),
    layers.Dense(32, activation='relu'),
    layers.Dense(32, activation='relu'),
    layers.Dense(64, activation='sigmoid'),
    layers.Dense(64, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(128, activation='relu'),
    layers.Dense(128, activation='relu'),
    layers.Dense(Y_train.shape[1], activation='sigmoid')
])

model_full.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model_full.fit(X_train, Y_train, epochs=10, batch_size=2, validation_data=(X_test, Y_test))

Epoch 1/10
[1m1480/1480[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 7ms/step - accuracy: 0.0000e+00 - loss: 0.6715 - val_accuracy: 0.0000e+00 - val_loss: 0.6819
Epoch 2/10
[1m1480/1480[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 7ms/step - accuracy: 0.0000e+00 - loss: 0.6701 - val_accuracy: 0.0000e+00 - val_loss: 0.6793
Epoch 3/10
[1m1480/1480[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 7ms/step - accuracy: 0.0000e+00 - loss: 0.6698 - val_accuracy: 0.0000e+00 - val_loss: 0.6776
Epoch 4/10
[1m1480/1480[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 6ms/step - accuracy: 0.0000e+00 - loss: 0.6697 - val_accuracy: 0.0000e+00 - val_loss: 0.6788
Epoch 5/10
[1m1480/1480[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 6ms/step - accuracy: 0.0000e+00 - loss: 0.6695 - val_accuracy: 0.0000e+00 - val_loss: 0.6806
Epoch 6/10
[1m1480/1480[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 7ms/step - accuracy: 0.0000e+00 - loss: 0.6696 - val_accu

<keras.src.callbacks.history.History at 0x196207cc380>

In [37]:
result = model_full.predict(X_validation).round()
save = {}
for i in range(1, len(Y_validation)):
    save[Y_validation.index[i]] = accuracy_score(Y_validation.iloc[i].tolist(), result[i])

mean = np.mean(list(save.values()))
mean

[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step


np.float64(0.542766497461929)

In [49]:
# Try a neural network model with TensorFlow

model_full = models.Sequential([
    layers.Input(shape=(1, X_train.shape[1])),
    
    layers.LSTM(256, return_sequences=True),
    #layers.LSTM(64, return_sequences=True),
    layers.LSTM(32),
    #layers.Dropout(0.05),
    #layers.Dense(64, activation='relu'),
    #layers.Dense(32, activation='relu'),
    #layers.Dense(16, activation='relu'),
    layers.Dropout(0.05),
    layers.Dense(Y_train.shape[1], activation='sigmoid')
])

model_full.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Reshape data for LSTM (add timestep dimension)
X_train_reshaped = X_train.values.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test_reshaped = X_test.values.reshape((X_test.shape[0], 1, X_test.shape[1]))
X_validation_reshaped = X_validation.values.reshape((X_validation.shape[0], 1, X_validation.shape[1]))

model_full.fit(X_train_reshaped, Y_train, epochs=10, batch_size=2, validation_data=(X_test_reshaped, Y_test))

result = model_full.predict(X_validation_reshaped).round()
save = {}
for i in range(1, len(Y_validation)):
    save[Y_validation.index[i]] = accuracy_score(Y_validation.iloc[i].tolist(), result[i])

mean = np.mean(list(save.values()))
mean

Epoch 1/10
[1m1480/1480[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 26ms/step - accuracy: 0.0014 - loss: 0.6696 - val_accuracy: 0.0000e+00 - val_loss: 0.6761
Epoch 2/10
[1m1480/1480[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 25ms/step - accuracy: 0.0068 - loss: 0.6535 - val_accuracy: 0.0233 - val_loss: 0.6795
Epoch 3/10
[1m1480/1480[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 26ms/step - accuracy: 0.0068 - loss: 0.6348 - val_accuracy: 0.0122 - val_loss: 0.6902
Epoch 4/10
[1m1480/1480[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 27ms/step - accuracy: 0.0145 - loss: 0.6105 - val_accuracy: 0.0162 - val_loss: 0.7059
Epoch 5/10
[1m1480/1480[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 27ms/step - accuracy: 0.0152 - loss: 0.5876 - val_accuracy: 0.0193 - val_loss: 0.7183
Epoch 6/10
[1m1480/1480[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 27ms/step - accuracy: 0.0179 - loss: 0.5682 - val_accuracy: 0.0071 - val_loss: 0.7383


np.float64(0.512357233502538)