In [408]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [409]:
df = pd.read_csv('ai_financial_market_daily_realistic_synthetic.csv')
df.head()

Unnamed: 0,Date,Company,R&D_Spending_USD_Mn,AI_Revenue_USD_Mn,AI_Revenue_Growth_%,Event,Stock_Impact_%
0,2015-01-01,OpenAI,5.92,0.63,-36.82,,-0.36
1,2015-01-02,OpenAI,5.41,1.81,80.59,,0.41
2,2015-01-03,OpenAI,4.5,0.61,-38.88,,0.23
3,2015-01-04,OpenAI,5.45,0.95,-5.34,,0.93
4,2015-01-05,OpenAI,3.4,1.48,48.45,,-0.09


In [410]:

# Keep only rows where company is "OpenAI"
df = df[df['Company'] == "OpenAI"]

df.head()

Unnamed: 0,Date,Company,R&D_Spending_USD_Mn,AI_Revenue_USD_Mn,AI_Revenue_Growth_%,Event,Stock_Impact_%
0,2015-01-01,OpenAI,5.92,0.63,-36.82,,-0.36
1,2015-01-02,OpenAI,5.41,1.81,80.59,,0.41
2,2015-01-03,OpenAI,4.5,0.61,-38.88,,0.23
3,2015-01-04,OpenAI,5.45,0.95,-5.34,,0.93
4,2015-01-05,OpenAI,3.4,1.48,48.45,,-0.09


In [411]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3653 entries, 0 to 3652
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Date                 3653 non-null   object 
 1   Company              3653 non-null   object 
 2   R&D_Spending_USD_Mn  3653 non-null   float64
 3   AI_Revenue_USD_Mn    3653 non-null   float64
 4   AI_Revenue_Growth_%  3653 non-null   float64
 5   Event                78 non-null     object 
 6   Stock_Impact_%       3653 non-null   float64
dtypes: float64(4), object(3)
memory usage: 228.3+ KB


In [412]:
df

Unnamed: 0,Date,Company,R&D_Spending_USD_Mn,AI_Revenue_USD_Mn,AI_Revenue_Growth_%,Event,Stock_Impact_%
0,2015-01-01,OpenAI,5.92,0.63,-36.82,,-0.36
1,2015-01-02,OpenAI,5.41,1.81,80.59,,0.41
2,2015-01-03,OpenAI,4.50,0.61,-38.88,,0.23
3,2015-01-04,OpenAI,5.45,0.95,-5.34,,0.93
4,2015-01-05,OpenAI,3.40,1.48,48.45,,-0.09
...,...,...,...,...,...,...,...
3648,2024-12-27,OpenAI,10.06,4.71,370.69,,0.93
3649,2024-12-28,OpenAI,9.67,5.32,432.15,,-0.25
3650,2024-12-29,OpenAI,9.17,5.46,445.74,,0.47
3651,2024-12-30,OpenAI,10.36,6.31,530.88,,0.69


In [413]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3653 entries, 0 to 3652
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Date                 3653 non-null   object 
 1   Company              3653 non-null   object 
 2   R&D_Spending_USD_Mn  3653 non-null   float64
 3   AI_Revenue_USD_Mn    3653 non-null   float64
 4   AI_Revenue_Growth_%  3653 non-null   float64
 5   Event                78 non-null     object 
 6   Stock_Impact_%       3653 non-null   float64
dtypes: float64(4), object(3)
memory usage: 228.3+ KB


In [414]:
df = df.drop(columns=['Date','Company'])

In [415]:
df["Stock_Impact_%"] = (df["Stock_Impact_%"] > 0).astype(int)


In [416]:
df


Unnamed: 0,R&D_Spending_USD_Mn,AI_Revenue_USD_Mn,AI_Revenue_Growth_%,Event,Stock_Impact_%
0,5.92,0.63,-36.82,,0
1,5.41,1.81,80.59,,1
2,4.50,0.61,-38.88,,1
3,5.45,0.95,-5.34,,1
4,3.40,1.48,48.45,,0
...,...,...,...,...,...
3648,10.06,4.71,370.69,,1
3649,9.67,5.32,432.15,,0
3650,9.17,5.46,445.74,,1
3651,10.36,6.31,530.88,,1


In [417]:
X = df[['R&D_Spending_USD_Mn','AI_Revenue_USD_Mn','AI_Revenue_Growth_%','Event']]
y= df['Stock_Impact_%']

In [418]:
X_numeric = df[['R&D_Spending_USD_Mn','AI_Revenue_USD_Mn','AI_Revenue_Growth_%']]


In [419]:
import numpy
print(numpy.__version__)


1.26.4


In [420]:
from sentence_transformers import SentenceTransformer

# Load a pretrained model (downloads automatically, no manual files)
model = SentenceTransformer('all-MiniLM-L6-v2')  

# Example: embed your events
sentences = df['Event'].fillna("").astype(str).tolist()
event_embeddings = model.encode(sentences, convert_to_numpy=True)

print(event_embeddings.shape)  # (num_events, 384)

(3653, 384)


In [422]:
print(X_numeric.shape)

(3653, 3)


In [423]:
from sklearn.decomposition import PCA

# Reduce event embeddings from 384 → 50
pca = PCA(n_components=50, random_state=42)
event_embeddings_reduced = pca.fit_transform(event_embeddings)

print("Original:", event_embeddings.shape)
print("Reduced:", event_embeddings_reduced.shape)


Original: (3653, 384)
Reduced: (3653, 50)


In [424]:
from sklearn.preprocessing import StandardScaler
import numpy as np

scaler = StandardScaler()
X_numeric_scaled = scaler.fit_transform(X_numeric)   # shape: (N_samples, 3)

In [425]:
def create_sequences(X_numeric, X_event, y, window=30):
    Xn_seq, Xe_seq, y_seq = [], [], []
    for i in range(len(X_numeric) - window):
        Xn_seq.append(X_numeric[i:i+window])
        Xe_seq.append(X_event[i:i+window])
        y_seq.append(y[i+window])   # predict next day impact
    return np.array(Xn_seq), np.array(Xe_seq), np.array(y_seq)

y = df['Stock_Impact_%'].values

Xn_seq, Xe_seq, y_seq = create_sequences(X_numeric_scaled, event_embeddings_reduced, y, window=30)

print(Xn_seq.shape)   # (N_samples, 30, 3)
# middle one is window size
print(Xe_seq.shape)   # (N_samples, 30, d)
print(y_seq.shape)    # (N_samples,)


(3623, 30, 3)
(3623, 30, 50)
(3623,)


In [426]:
Xe_seq

array([[[ 2.33695898e-02,  2.43170201e-04, -2.31901853e-04, ...,
         -2.40023201e-08, -6.24206606e-08, -2.36706068e-08],
        [ 2.33709645e-02,  2.48600467e-04, -2.37878456e-04, ...,
          4.15682528e-08,  4.50311388e-09,  2.18951008e-08],
        [ 2.33722143e-02,  2.45532923e-04, -2.35888248e-04, ...,
         -4.16307415e-08,  1.90637877e-07,  1.43410247e-07],
        ...,
        [ 2.33721659e-02,  2.45650590e-04, -2.35531028e-04, ...,
         -3.57998680e-08, -1.24720057e-08,  3.18314726e-08],
        [ 2.33724136e-02,  2.46136566e-04, -2.35830856e-04, ...,
         -1.71188308e-08,  9.65026903e-09, -1.36699541e-09],
        [ 2.33724713e-02,  2.45988515e-04, -2.35745145e-04, ...,
          1.35430875e-08,  2.75805045e-09,  4.46965664e-09]],

       [[ 2.33709645e-02,  2.48600467e-04, -2.37878456e-04, ...,
          4.15682528e-08,  4.50311388e-09,  2.18951008e-08],
        [ 2.33722143e-02,  2.45532923e-04, -2.35888248e-04, ...,
         -4.16307415e-08,  1.90637877e

In [427]:
Xn_seq

array([[[-0.70457346, -1.38434077, -1.3830652 ],
        [-0.97483057, -0.55109965, -0.55398911],
        [-1.45705404, -1.3984635 , -1.39761164],
        ...,
        [-1.65312292, -1.01714977, -1.01622675],
        [-1.37756665, -1.2360521 , -1.23378772],
        [-0.73636841, -0.95359748, -0.951827  ]],

       [[-0.97483057, -0.55109965, -0.55398911],
        [-1.45705404, -1.3984635 , -1.39761164],
        [-0.95363393, -1.15837708, -1.1607731 ],
        ...,
        [-1.37756665, -1.2360521 , -1.23378772],
        [-0.73636841, -0.95359748, -0.951827  ],
        [-0.70987261, -1.54675217, -1.54759525]],

       [[-1.45705404, -1.3984635 , -1.39761164],
        [-0.95363393, -1.15837708, -1.1607731 ],
        [-2.03996153, -0.78412471, -0.78094172],
        ...,
        [-0.73636841, -0.95359748, -0.951827  ],
        [-0.70987261, -1.54675217, -1.54759525],
        [-1.1126087 , -1.12307025, -1.12525438]],

       ...,

       [[ 1.32500445,  1.65910772,  1.65848106],
        [ 1

In [428]:
X

Unnamed: 0,R&D_Spending_USD_Mn,AI_Revenue_USD_Mn,AI_Revenue_Growth_%,Event
0,5.92,0.63,-36.82,
1,5.41,1.81,80.59,
2,4.50,0.61,-38.88,
3,5.45,0.95,-5.34,
4,3.40,1.48,48.45,
...,...,...,...,...
3648,10.06,4.71,370.69,
3649,9.67,5.32,432.15,
3650,9.17,5.46,445.74,
3651,10.36,6.31,530.88,


In [429]:
y

array([0, 1, 1, ..., 1, 1, 0])

In [430]:
import pandas as pd
pd.Series(y).value_counts(normalize=True)


1    0.500684
0    0.499316
Name: proportion, dtype: float64

In [431]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Concatenate, Dropout, BatchNormalization, Activation

In [432]:

# Shapes from your preprocessing
num_features = Xn_seq.shape[2]   # 3 numerical columns
event_dim = Xe_seq.shape[2]      # 384 from MiniLM

print(num_features)
print(event_dim)

3
50


In [433]:
# --- Numerical branch ---
numeric_input = Input(shape=(30, num_features), name="numeric_input")
x1 = LSTM(256, return_sequences=False)(numeric_input)   # (batch, 64)
x1 = Dropout(0.1)(x1)

In [434]:
# --- Event (text embeddings) branch ---
event_input = Input(shape=(30, event_dim), name="event_input")
x2 = LSTM(64, return_sequences=False)(event_input)    # (batch, 64)
x2 = Dropout(0.1)(x2)


In [435]:

from tensorflow.keras.regularizers import l2
# --- Concatenate both branches ---
merged = Concatenate()([x1, x2])   # (batch, 128)

# Dense layers on top

dense = Dense(128, activation="relu" )(merged)
dense = Dropout(0.3)(dense)
dense = Dense(64, activation="relu")(dense)
dense = Dropout(0.3)(dense)
output = Dense(1, activation="sigmoid")(dense)
  # Regression → stock impact %

In [436]:
from sklearn.model_selection import train_test_split

# Split into train/test (e.g., 80/20 split)
Xn_train, Xn_test, Xe_train, Xe_test, y_train, y_test = train_test_split(
    Xn_seq, Xe_seq, y_seq, test_size=0.2, shuffle=False   # shuffle=False keeps time order
)

print(Xn_train.shape, Xe_train.shape, y_train.shape)
print(Xn_test.shape, Xe_test.shape, y_test.shape)

(2898, 30, 3) (2898, 30, 50) (2898,)
(725, 30, 3) (725, 30, 50) (725,)


In [None]:
from tensorflow.keras.callbacks import EarlyStopping
es = EarlyStopping(patience=20, restore_best_weights=True)

# 1. Build + compile model (keep this part)
model = Model(inputs=[numeric_input, event_input], outputs=output)
model.compile(optimizer="Adam", loss="binary_crossentropy", metrics=["accuracy"])
model.summary()

# 2. Train
history = model.fit(
    [Xn_train, Xe_train], y_train,
    validation_data=([Xn_test, Xe_test], y_test),
    epochs=50,
    batch_size=32,
    # callbacks = [es]
)




Epoch 1/100
[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 28ms/step - accuracy: 0.5117 - loss: 0.6953 - val_accuracy: 0.4979 - val_loss: 0.7422
Epoch 2/100
[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 28ms/step - accuracy: 0.5083 - loss: 0.6951 - val_accuracy: 0.4979 - val_loss: 0.6950
Epoch 3/100
[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 27ms/step - accuracy: 0.5186 - loss: 0.6934 - val_accuracy: 0.5021 - val_loss: 0.7071
Epoch 4/100
[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 35ms/step - accuracy: 0.4841 - loss: 0.6974 - val_accuracy: 0.4869 - val_loss: 0.6937
Epoch 5/100
[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 26ms/step - accuracy: 0.5138 - loss: 0.6934 - val_accuracy: 0.4979 - val_loss: 0.6945
Epoch 6/100
[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 28ms/step - accuracy: 0.4985 - loss: 0.6932 - val_accuracy: 0.5048 - val_loss: 0.6937
Epoch 7/100
[1m91/91[0m [