In [18]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, LSTM, Concatenate, Conv1D, GlobalMaxPooling1D, Dropout, Attention, Reshape
from transformers import TFDistilBertModel, DistilBertTokenizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [19]:
prices = pd.read_csv('data/h1_2015_MSFT_prices_tech.csv')
prices = prices.iloc[4:]
prices['datetime'] = pd.to_datetime(prices['TIME'], utc=True)
prices['datetime'] = prices['datetime'].dt.tz_convert('America/New_York')

prices.head()

Unnamed: 0,TIME,PRICE,SIZE,returns,return_label,ticker,SP-EMA5,RDP-5,RDP-10,RDP-15,RDP-20,MACD,OBV,Volatility,RPD+5,datetime
4,2015-01-02 09:00:00,46.6516,43628,-0.0584,-1.0,MSFT,46.624651,,,,,0.034299,-43628,,-0.02558,2015-01-02 04:00:00-05:00
5,2015-01-02 10:00:00,47.014152,5102614,0.362552,1.0,MSFT,46.754485,1.105703,,,,0.069454,5058986,,-0.66559,2015-01-02 05:00:00-05:00
6,2015-01-02 11:00:00,47.064821,4597166,0.050669,1.0,MSFT,46.85793,,,,,0.098662,9656152,,-0.761642,2015-01-02 06:00:00-05:00
7,2015-01-02 12:00:00,46.70152,2944126,-0.363301,-1.0,MSFT,46.805793,,,,,0.08712,6712026,,-0.087627,2015-01-02 07:00:00-05:00
8,2015-01-02 13:00:00,46.763572,1970859,0.062053,1.0,MSFT,46.79172,0.114691,,,,0.08244,8682885,,0.108552,2015-01-02 08:00:00-05:00


In [20]:
tweets = pd.read_csv('data/h1_2015_msft_aggtweet.csv')
tweets['datetime'] = pd.to_datetime(tweets['datetime'], utc=True)
tweets['datetime'] = tweets['datetime'].dt.tz_convert('America/New_York')

merged = pd.merge(prices, tweets, left_on='datetime', right_on='datetime')
merged.head()

Unnamed: 0.1,TIME,PRICE,SIZE,returns,return_label,ticker,SP-EMA5,RDP-5,RDP-10,RDP-15,RDP-20,MACD,OBV,Volatility,RPD+5,datetime,Unnamed: 0,aggregated_tweet
0,2015-01-02 09:00:00,46.6516,43628,-0.0584,-1.0,MSFT,46.624651,,,,,0.034299,-43628,,-0.02558,2015-01-02 04:00:00-05:00,21,"Technology In 2014, Part 6: The Final Scorecar..."
1,2015-01-02 10:00:00,47.014152,5102614,0.362552,1.0,MSFT,46.754485,1.105703,,,,0.069454,5058986,,-0.66559,2015-01-02 05:00:00-05:00,22,"$MSFT, Active, +3.71, 1H, 12:00 PM EST, 23 Dec..."
2,2015-01-02 11:00:00,47.064821,4597166,0.050669,1.0,MSFT,46.85793,,,,,0.098662,9656152,,-0.761642,2015-01-02 06:00:00-05:00,23,Myths about risks in Options http://bit.ly/Ris...
3,2015-01-02 12:00:00,46.70152,2944126,-0.363301,-1.0,MSFT,46.805793,,,,,0.08712,6712026,,-0.087627,2015-01-02 07:00:00-05:00,24,@bsurveillance Replacing laid off $IBM & $MSFT...
4,2015-01-02 13:00:00,46.763572,1970859,0.062053,1.0,MSFT,46.79172,0.114691,,,,0.08244,8682885,,0.108552,2015-01-02 08:00:00-05:00,25,#Microsoft : Band Units To Replenish On Jan 3 ...


In [22]:
tech = prices[['returns','SP-EMA5','OBV','MACD']]
merged = merged.dropna(subset=['returns', 'PRICE'])
price_features = ['PRICE', 'SIZE']
tech_features = ['returns','SP-EMA5','OBV','MACD']
X_prices = merged[price_features].values
X_tech = merged[tech_features].values

y = merged['return_label'].shift(-1).dropna()
X_prices = X_prices[:-1]
X_tech = X_tech[:-1]
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(y)
onehot_encoder = OneHotEncoder(sparse_output=False)
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
y_encoded = onehot_encoder.fit_transform(integer_encoded)


In [23]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
max_seq_length = 128  
tweet_texts = merged['aggregated_tweet'].values 
tweet_encodings = tokenizer(list(tweet_texts[:-1]), truncation=True, padding=True, max_length=max_seq_length, return_tensors="tf")
input_ids = tweet_encodings['input_ids']
attention_mask = tweet_encodings['attention_mask']
#token_type_ids = tweet_encodings['token_type_ids']

In [24]:
X_prices_train, X_prices_test, X_tech_train, X_tech_test, y_train, y_test = train_test_split(
    X_prices, X_tech, y_encoded, test_size=0.2, random_state=42)

X_prices_train, X_prices_val, X_tech_train, X_tech_val, y_train, y_val = train_test_split(
    X_prices_train, X_tech_train, y_train, test_size=0.25, random_state=42)  

X_tweet_train = {k: v[:len(X_prices_train)] for k, v in tweet_encodings.items()}
X_tweet_val = {k: v[len(X_prices_train):len(X_prices_train) + len(X_prices_val)] for k, v in tweet_encodings.items()}
X_tweet_test = {k: v[len(X_prices_train) + len(X_prices_val):] for k, v in tweet_encodings.items()}

def create_model():
    tweet_input = Input(shape=(None,), dtype=tf.int32, name='tweet_input')
    bert_model = TFDistilBertModel.from_pretrained("distilbert-base-uncased")
    bert_embeddings = bert_model(tweet_input)[0]
    cnn_layer = Conv1D(64, kernel_size=3, activation='relu')(bert_embeddings)
    cnn_layer = GlobalMaxPooling1D()(cnn_layer)


    price_input = Input(shape=(len(price_features),), name='price_input')
    lstm_price = LSTM(64, return_sequences=True)(tf.expand_dims(price_input, axis=1))
    attention_price = Attention()([lstm_price, lstm_price])
    lstm_price_output = LSTM(64)(attention_price)

    tech_input = Input(shape=(len(tech_features),), name='tech_input')
    lstm_tech = LSTM(64, return_sequences=True)(tf.expand_dims(tech_input, axis=1))
    attention_tech = Attention()([lstm_tech, lstm_tech])
    lstm_tech_output = LSTM(64)(attention_tech)

    concatenated = Concatenate(axis=1)([cnn_layer, lstm_price_output, lstm_tech_output])
    dense1 = Dense(64, activation='relu')(concatenated)
    dropout = Dropout(0.5)(dense1)
    output = Dense(3, activation='softmax')(dropout)

    model = Model(inputs=[tweet_input, price_input, tech_input], outputs=output)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    return model


model = create_model()

model.summary()






Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


ValueError: Exception encountered when calling layer 'tf_distil_bert_model' (type TFDistilBertModel).

Data of type <class 'keras.src.backend.common.keras_tensor.KerasTensor'> is not allowed only (<class 'tensorflow.python.framework.tensor.Tensor'>, <class 'bool'>, <class 'int'>, <class 'transformers.utils.generic.ModelOutput'>, <class 'tuple'>, <class 'list'>, <class 'dict'>, <class 'numpy.ndarray'>) is accepted for input_ids.

Call arguments received by layer 'tf_distil_bert_model' (type TFDistilBertModel):
  • input_ids=<KerasTensor shape=(None, None), dtype=int32, sparse=None, name=tweet_input>
  • attention_mask=None
  • head_mask=None
  • inputs_embeds=None
  • output_attentions=None
  • output_hidden_states=None
  • return_dict=None
  • training=False

In [None]:
tw

In [None]:


history = model.fit(
    [X_tweet_train['input_ids'], X_prices_train, X_tech_train], y_train,
    epochs=50,
    batch_size=16,
    validation_data=([X_tweet_val['input_ids'], X_prices_val, X_tech_val], y_val)
)

loss, accuracy = model.evaluate([X_tweet_test['input_ids'], X_prices_test, X_tech_test], y_test)
print(f"Test Loss: {loss}, Test Accuracy: {accuracy}")

In [27]:
X_tweet_val['input_ids']

<tf.Tensor: shape=(266, 128), dtype=int32, numpy=
array([[  101, 10166, 17765, ...,  9499,  1011,   102],
       [  101,  2256, 10647, ...,  1012,  1012,   102],
       [  101, 12476,  2739, ...,  2078,  8299,   102],
       ...,
       [  101,  1045,  2428, ...,  1996, 20342,   102],
       [  101,  3087,  2842, ...,  6199,  1001,   102],
       [  101,  2065,  1002, ...,  1002,  2151,   102]])>