TO DO

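* Refactor the validation and test steps (they currently share one function switched by a flag).
* Decide whether the FinBERT encoder should also be in training mode (i.e. fine-tuned) while the classifier is trained.
* Decide whether to keep the LSTM as the classification head.
* Tune the hyperparameters.
* Revisit the chunking arguments: `get_text_split(text, length=200, overlap=50, max_chunks=4)`.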

# Imports

In [37]:
import warnings
# Ignore every warning category for the rest of the session. This keeps cell output
# short, but it also hides deprecation notices raised by the libraries used below.
warnings.filterwarnings("ignore")

In [38]:
# %pip install datasets
# %pip install transformers

In [39]:
import os
import pandas as pd
import yfinance as yf
from concurrent.futures import ThreadPoolExecutor
import datetime
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from datasets import load_dataset
import numpy as np
from statistics import mean
import pickle
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedShuffleSplit


In [40]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from transformers import AutoTokenizer, AutoModel, AdamW, get_linear_schedule_with_warmup
from torch.optim import SGD

In [41]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Selected device is {}'.format(device))

# Load the tokenizer and encoder from the "ProsusAI/finbert" checkpoint and move the
# encoder onto the selected device.
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
finbert = AutoModel.from_pretrained("ProsusAI/finbert").to(device)
# bert = AutoModel.from_pretrained("bert-base-uncased").to(device)

# tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
# distilbert = AutoModel.from_pretrained("distilbert-base-uncased").to(device)

Selected device is cpu


**SET THE MODEL**

In [42]:
# Encoder passed to the training and validation steps; rebind this name to swap the backbone.
model = finbert

In [43]:
# Directory (under the current working directory) that save_checkpoint writes into.
checkpoint_dir = os.path.join(os.getcwd(),'checkpoints')

# Data loading


In [70]:
# Read the news table from dataset.csv (path is relative to the working directory)
# and display it as the cell output.
dataset = pd.read_csv('dataset.csv')
dataset

Unnamed: 0,symbol,publish_time,title,body,returns,trend
0,A,2023-05-24,"Agilent (A) Q2 Earnings Match Estimates, Reven...",Agilent Technologies A reported second-quarter...,-5.620333,decrease
1,A,2023-06-02,Agilent Intelligent Mass Spectrometry Solution...,Self-aware solutions making mass spectrometry ...,1.995527,stable
2,A,2023-11-20,Agilent Reports Fourth-Quarter Fiscal Year 202...,Revenue at the high end of guidance and EPS ex...,7.344231,increase
3,AAL,2023-05-22,The Undoing of American and JetBlue’s Northeas...,American Airlines CEO Robert Isom (left) speak...,-3.171385,decrease
4,AAL,2023-05-25,American Airlines flags no earnings impact fro...,(Reuters) - American Airlines Group Inc's Chie...,4.420287,increase
...,...,...,...,...,...,...
3283,ZION,2023-08-07,Why Zions Bancorp Stock Surged 42% in July,Zions Bancorporation (NASDAQ: ZION) rewarded i...,-3.270037,decrease
3284,ZION,2023-11-14,Regional Bank Stocks Rise After Bond Yields Drop,One big winner from Tuesday’s surprise inflati...,8.101503,increase
3285,ZTS,2023-06-15,Zoetis Releases 2022 Sustainability Report to ...,$7.4 million in corporate giving invested in c...,3.972232,increase
3286,ZTS,2023-08-08,Zoetis (ZTS) Q2 2023 Earnings Call Transcript ...,"Hosting the call today is Steve Frank, vice pr...",5.119265,increase


In [72]:
# Keep the ticker symbol, headline, article body and the 'trend' target column.
df = dataset[['symbol', 'title', 'body', 'trend']]

In [47]:
def get_smaller_dataset(df, size=0.25):
    """Return a class-stratified random subsample of ``df``.

    Rows containing missing values are dropped first. One stratified shuffle
    split (fixed ``random_state=42``) is then drawn on the ``trend`` column and
    its test fold is returned.

    Args:
        df: DataFrame containing a ``trend`` column.
        size: value forwarded as ``test_size`` to ``StratifiedShuffleSplit``.

    Returns:
        A new DataFrame with a fresh 0..n-1 index and ``trend`` as its last column.
    """
    complete_rows = df.dropna()
    features = complete_rows.drop('trend', axis=1)
    target = complete_rows['trend']

    splitter = StratifiedShuffleSplit(n_splits=1, test_size=size, random_state=42)
    # n_splits=1, so the generator yields exactly one (train, test) pair of positions.
    _, sampled_positions = next(splitter.split(features, target))

    sampled = features.iloc[sampled_positions].assign(
        trend=target.iloc[sampled_positions].values
    )
    return sampled.reset_index(drop=True)

In [48]:
# Replace df with a stratified subsample (default size=0.25).
# Note: df is rebound, so re-running this cell subsamples the already reduced frame again.
df = get_smaller_dataset(df)

# Tokenize data

In [49]:
def wrap_tokenizer(tokenizer, padding=True, truncation=True, return_tensors='pt', max_length=None):
    """Bind tokenizer options once and return a ``tokenize(text)`` callable.

    Args:
        tokenizer: callable invoked as ``tokenizer(list_of_texts, **options)``
            whose result is indexable with ``'input_ids'``.
        padding, truncation, return_tensors, max_length: forwarded unchanged to
            ``tokenizer`` on every call.

    Returns:
        A function that takes an iterable of texts and returns only the
        ``'input_ids'`` entry of the tokenizer output. No attention mask is
        requested (``return_attention_mask=False``).
    """
    tokenizer_options = dict(
        padding=padding,
        return_attention_mask=False,
        truncation=truncation,
        max_length=max_length,
        return_tensors=return_tensors,
    )

    def tokenize(text):
        # Materialise the iterable (e.g. a pandas Series) into a plain list first.
        encoded = tokenizer(list(text), **tokenizer_options)
        return encoded['input_ids']

    return tokenize

In [50]:
# Tokenize every article body without padding or truncation so that the length of each
# id list is the body's full token count.
tokenize = wrap_tokenizer(tokenizer, padding=False, truncation = False, return_tensors=None)
tokens = tokenize(df['body'])
num_tokens = [len(x) for x in tokens]
# The Series is aligned on index labels; df has a fresh 0..n-1 index from get_smaller_dataset.
df['length'] = pd.Series(num_tokens)

Token indices sequence length is longer than the specified maximum sequence length for this model (752 > 512). Running this sequence through the model will result in indexing errors


In [51]:
# Longest article body, in tokens.
max_length = df['length'].max()
print("Max length of news body: ", max_length)
max_tokens = 512  # sequence-length limit reported in the tokenizer warning above
# Count the bodies that exceed each whole multiple of the limit. The upper bound is
# inclusive so that the last multiple the longest body still exceeds is reported too.
for i in range(1, (max_length // max_tokens) + 1):
    num = (df['length'] > i * max_tokens).sum()
    print(f"Number of text that have more than {i}*max_tokens is {num}")

Max length of news body:  16358
Number of text that have more than 1*max_tokens is 662
Number of text that have more than 2*max_tokens is 429
Number of text that have more than 3*max_tokens is 275
Number of text that have more than 4*max_tokens is 197
Number of text that have more than 5*max_tokens is 148
Number of text that have more than 6*max_tokens is 100
Number of text that have more than 7*max_tokens is 73
Number of text that have more than 8*max_tokens is 53
Number of text that have more than 9*max_tokens is 38
Number of text that have more than 10*max_tokens is 34
Number of text that have more than 11*max_tokens is 24
Number of text that have more than 12*max_tokens is 19
Number of text that have more than 13*max_tokens is 16
Number of text that have more than 14*max_tokens is 10
Number of text that have more than 15*max_tokens is 10
Number of text that have more than 16*max_tokens is 9
Number of text that have more than 17*max_tokens is 7
Number of text that have more than 18*

In [52]:
def get_text_split(text, length=200, overlap=50, max_chunks=4):
    """Split ``text`` into overlapping chunks of whitespace-separated words.

    Consecutive chunks start ``length - overlap`` words apart. A trailing
    partial chunk is kept only when it holds at least 75% of ``length`` words;
    a text shorter than ``length`` is always returned as a single chunk.

    Args:
        text: string to split.
        length: maximum number of words per chunk.
        overlap: number of words shared by two consecutive chunks.
        max_chunks: upper bound on the number of chunks (at least one chunk is
            always produced).

    Returns:
        List of chunk strings, each made of words joined by single spaces.

    Raises:
        ValueError: if ``length`` is not greater than ``overlap`` (the stride
            between chunks would be zero or negative).
    """
    if length <= overlap:
        raise ValueError("length must be greater than overlap")

    words = text.split()
    n_words = len(words)
    stride = length - overlap

    if n_words <= length:
        n_windows = 1
    else:
        # Ceiling division: also count the trailing window that is only partly
        # filled, so the 75% rule below can decide whether to keep it.
        n_windows = 1 + (n_words - length + stride - 1) // stride
    n_windows = max(1, min(max_chunks, n_windows))

    chunks = []
    for i in range(n_windows):
        start_idx = i * stride
        chunk_words = words[start_idx:start_idx + length]

        # Drop a last chunk that is too short, unless it is the only chunk.
        is_short_tail = (
            n_windows > 1
            and i == n_windows - 1
            and len(chunk_words) < 0.75 * length
        )
        if is_short_tail:
            continue

        chunks.append(" ".join(chunk_words))

    return chunks

**ENCODE LABELS**

In [53]:
label_encoder = LabelEncoder()

# Replace the string classes in 'trend' with integer codes.
df['trend'] = label_encoder.fit_transform(df['trend'])

# LabelEncoder assigns codes following the sorted order of the class names, so the
# display names are taken from the fitted encoder: labels[code] is then always the
# name of class `code` (a hand-written list can silently disagree with the codes).
labels = label_encoder.classes_.tolist()

# Train-val-test split

In [54]:
def split_df(df):
    """Cut ``df`` positionally into 80% / 10% / 10% train, validation and test frames.

    Rows are taken in their current order; nothing is shuffled here.

    Returns:
        Tuple ``(df_train, df_val, df_test)``.
    """
    total = len(df)
    train_end = int(0.8 * total)
    val_end = int(0.9 * total)

    # (start, stop) row positions of each subset; None means "open ended".
    bounds = [(None, train_end), (train_end, val_end), (val_end, None)]
    df_train, df_val, df_test = (df.iloc[lo:hi, :] for lo, hi in bounds)

    return df_train, df_val, df_test

In [55]:
n_rows = len(df)
dfs_train, dfs_val, dfs_test = [],[],[]
# Split every class separately so that each subset receives its share of every class.
gb = df.groupby('trend')
for x in gb.groups:
    group = gb.get_group(x)
    # split_df cuts this class's rows 80% / 10% / 10% in their current order.
    df_train, df_val, df_test = split_df(group)
    dfs_train.append(df_train)
    dfs_val.append(df_val)
    dfs_test.append(df_test)

# Stack the per-class pieces. Rows end up grouped by class inside each frame, and
# ignore_index=True gives each frame a fresh 0..n-1 index.
df_train = pd.concat(dfs_train, ignore_index=True)

df_val = pd.concat(dfs_val, ignore_index=True)

df_test = pd.concat(dfs_test, ignore_index=True)

print(f'Number of training examples: {len(df_train)}')
print(f'Number of validation examples: {len(df_val)}')
print(f'Number of test examples: {len(df_test)}')

Number of training examples: 652
Number of validation examples: 82
Number of test examples: 83


In [56]:
# Replace every article body (a string) with its list of overlapping word chunks.
# Note: this is not re-runnable; a second run would pass lists to get_text_split.
for _split_frame in (df_train, df_val, df_test):
    _split_frame['body'] = _split_frame['body'].apply(get_text_split)

In [57]:
# Record how many chunks each article was split into.
df_train['n_chunks'] = df_train['body'].apply(len)
df_val['n_chunks'] = df_val['body'].apply(len)
df_test['n_chunks'] = df_test['body'].apply(len)

In [58]:
df_train.head()

Unnamed: 0,symbol,title,body,trend,length,n_chunks
0,TTI,TETRA Technologies Insider Buyers Net US$98k D...,"[Insiders who bought TETRA Technologies, Inc. ...",0,799,3
1,RSG,"Republic Services, Inc. (NYSE:RSG) Delivered A...",[Many investors are still learning about the v...,0,1679,4
2,GSIT,"GSI Technology, Inc. (NASDAQ:GSIT) Q2 2024 Ear...","[GSI Technology, Inc. (NASDAQ:GSIT) Q2 2024 Ea...",0,2126,4
3,ARE,"Alexandria Real Estate Equities, Inc.'s Discip...","[PASADENA, Calif., May 24, 2023 /PRNewswire/ -...",0,1437,4
4,MO,Altria (MO) Gains on Pricing Power Amid Low Ci...,"[Altria Group, Inc. MO looks well-positioned d...",0,1328,4


# Model

In [59]:
def get_lr(optimizer):
    """Return the learning rate of the optimizer's first parameter group.

    Returns ``None`` when the optimizer has no parameter groups.
    """
    first_group = next(iter(optimizer.param_groups), None)
    return None if first_group is None else first_group['lr']

# def set_lr(optimizer, lr):
#     for param_group in optimizer.param_groups:
#         param_group['lr'] = lr


def save_to_disk(txt_path, values):
    """Pickle ``values`` to ``txt_path``, replacing any regular file already there.

    Args:
        txt_path: destination path (the content is a binary pickle, whatever the extension).
        values: any picklable object.
    """
    already_there = os.path.isfile(txt_path)
    if already_there:
        # Delete the previous file so the dump lands in a newly created one.
        os.remove(txt_path)

    with open(txt_path, "wb") as handle:
        pickle.dump(values, handle)


# def load_from_disk(txt_path):
#     with open(txt_path, "rb") as f:
#         values =  pickle.load(f)
#     return values


def save_checkpoint(model, classifier, optimizer, logs, epoch):
    """Write the training state for ``epoch`` into ``checkpoint_dir``.

    Two files are produced: ``checkpoint_{epoch}.pt`` with the state dicts of
    the encoder, the classifier and the optimizer, and ``logs.txt`` with the
    pickled ``logs`` object (overwritten on every call).

    Args:
        model: encoder module whose ``state_dict()`` is stored under ``'model'``.
        classifier: module stored under ``'classifier'``.
        optimizer: optimizer stored under ``'optimizer'``.
        logs: picklable object holding the training history.
        epoch: value interpolated into the checkpoint file name.
    """
    print('')
    print('Saving checkpoint...')

    # torch.save does not create missing parent directories, so make sure the
    # target directory exists before the first checkpoint is written.
    os.makedirs(checkpoint_dir, exist_ok=True)

    state_dict = {
        'model': model.state_dict(),
        'classifier':classifier.state_dict(),
        'optimizer': optimizer.state_dict(),
    }
    torch.save(state_dict, os.path.join(checkpoint_dir, 'checkpoint_{}.pt'.format(epoch)))
    save_to_disk(os.path.join(checkpoint_dir, 'logs.txt'),logs)
    print('Checkpoint saved!')


# def load_checkpoint(checkpoint_dir, epoch, xlmr, classifier, device, optimizer=None):
#     pretrained_dict = torch.load(os.path.join(checkpoint_dir,'checkpoint_{}.pt'.format(epoch)),map_location=torch.device(device))
#     classifier.load_state_dict(pretrained_dict['classifier'])
#     if optimizer is not None:
#         optimizer.load_state_dict(pretrained_dict['optimizer'])
#         return classifier, optimizer
#     return classifier

In [60]:
class MyDataset(Dataset):
    """Map-style dataset over a frame with ``body``, ``trend`` and ``n_chunks`` columns."""

    def __init__(self,df):
        # Chunk lists and chunk counts become plain Python lists; the labels stay a
        # pandas Series and are read by position in __getitem__.
        self.X = df['body'].to_list()
        self.Y = df['trend']
        self.n_chunks = df['n_chunks'].to_list()

    def __len__(self):
        """Number of rows of the source frame."""
        return len(self.X)

    def __getitem__(self,index):
        """Return ``(body, label, n_chunks)`` for the row at position ``index``."""
        label = self.Y.iloc[index]
        return self.X[index], label, self.n_chunks[index]

def collate_func(batch):
    """Regroup a list of ``(body, label, n_chunks)`` samples into three batch-level fields.

    Returns:
        ``[bodies, labels, chunk_counts]`` where ``bodies`` and ``chunk_counts``
        are Python lists and ``labels`` is a float tensor built with ``torch.Tensor``.
    """
    bodies, labels, chunk_counts = [], [], []
    for sample in batch:
        bodies.append(sample[0])
        labels.append(sample[1])
        chunk_counts.append(sample[2])
    return [bodies, torch.Tensor(labels), chunk_counts]

In [61]:
class Classifier(nn.Module):
    """LSTM over the chunk embeddings of a document, followed by a linear output layer.

    Args:
        lstm_size: hidden size of the LSTM.
        emb_dim: size of every chunk embedding fed to the LSTM.
        out_dim: number of output logits.
        dropout: dropout probability applied before the linear layer.
    """

    def __init__(self, lstm_size, emb_dim, out_dim, dropout):
        super().__init__()

        self.lstm = nn.LSTM(input_size=emb_dim, hidden_size=lstm_size, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(in_features=lstm_size, out_features=out_dim)

    def forward(self, x, n_chunks):
        """Classify a batch of variable-length chunk sequences.

        Args:
            x: list with one ``(n_chunks_i, emb_dim)`` tensor per document.
            n_chunks: number of chunks of each document, in the same order as ``x``.

        Returns:
            Tensor of shape ``(len(x), out_dim)`` holding the logits.
        """
        padded = pad_sequence(x, batch_first=True, padding_value=0)
        packed = pack_padded_sequence(input=padded, lengths=n_chunks, batch_first=True, enforce_sorted=False)

        # Use the final hidden state instead of the last padded time step: after
        # unpacking, position -1 is zero padding for every document shorter than the
        # longest one in the batch, which would reduce its logits to the bias alone.
        # h_n holds, per document, the hidden state at its own last real chunk.
        _, (h_n, _) = self.lstm(packed)
        last_hidden = h_n[-1]

        last_hidden = self.dropout(last_hidden)
        logit = self.linear(last_hidden)
        return logit

In [62]:
# class Classifier(nn.Module):
#     def __init__(self, bert_model, out_dim_lin, dropout):
#         super().__init__()

#         self.bert = bert_model
#         self.dropout = nn.Dropout(dropout)
#         self.linear = nn.Linear(in_features=self.bert.config.hidden_size, out_features=out_dim_lin)

#     def forward(self, x, n_chunks):

#         x = pad_sequence(x, batch_first=True, padding_value=0)

#         with torch.no_grad():
#             bert_outputs = self.bert(x)

#         bert_last_hidden_state = bert_outputs.last_hidden_state

#         x = self.dropout(bert_last_hidden_state)

#         logit = self.linear(x[:, 0, :])

#         return logit

In [63]:
def train(train_loader, tokenize, model, classifier, optimizer, scheduler, logs, dropout=0.0):
    """Run one training epoch and return the per-batch accuracies and losses.

    The encoder (``model``) is put in training mode and back-propagated through
    only when at least one of its parameters is registered with ``optimizer``.
    Otherwise it is used as a frozen feature extractor: eval mode (its internal
    dropout is off) and no autograd graph, so no gradients are computed and
    accumulated for weights that are never updated.

    Args:
        train_loader: iterable of ``(text, target, n_chunks)`` batches, where
            ``text`` is a list of per-document chunk lists, ``target`` a tensor
            of class ids and ``n_chunks`` the chunk count of each document.
        tokenize: callable mapping a flat list of chunk strings to a tensor of token ids.
        model: encoder whose output exposes ``last_hidden_state``.
        classifier: module called as ``classifier(per_document_embeddings, n_chunks)``.
        optimizer: optimizer stepped once per batch.
        scheduler: learning-rate scheduler stepped once per batch.
        logs: dict; the current learning rate is appended to ``logs['lr']`` for every batch.
        dropout: dropout probability applied to the pooled chunk embeddings.

    Returns:
        ``(accuracy, losses)``: two lists with one float per batch.
    """
    classifier.train()

    # Fine-tune the encoder only if the optimizer actually holds some of its weights.
    optimized_ids = {id(p) for group in optimizer.param_groups for p in group['params']}
    finetune_encoder = any(id(p) in optimized_ids for p in model.parameters())
    model.train(finetune_encoder)

    accuracy = []
    losses = []

    for text, target, n_chunks in train_loader:
        logs['lr'].append(get_lr(optimizer))

        target = target.long().to(device)

        # Flatten the per-document chunk lists so that every chunk is one encoder input.
        flat_text = [item for sublist in text for item in sublist]
        tokens = tokenize(flat_text)
        tokens = tokens.to(device)

        with torch.set_grad_enabled(finetune_encoder):
            outputs = model(tokens)
        embeddings = outputs.last_hidden_state

        # Mean over the token axis gives one vector per chunk.
        # NOTE(review): only token ids are passed to the encoder, so any padding
        # positions take part in this mean as well; confirm that this is intended.
        pooled_emb = torch.mean(embeddings, axis=1)
        pooled_emb = nn.Dropout(dropout)(pooled_emb)
        # Regroup the chunk vectors by document.
        x = [s for s in torch.split(pooled_emb, n_chunks, dim=0)]

        logits = classifier(x, n_chunks)

        loss = nn.CrossEntropyLoss()(input=logits, target=target)

        # Clear gradients right before backward so that nothing left over from an
        # interrupted previous step is added to this batch's gradients.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

        _, predictions = torch.max(logits, 1)
        correct_predictions = torch.sum(predictions == target).item()
        acc = correct_predictions / target.size(0)
        accuracy.append(acc)
        losses.append(loss.item())

    return accuracy, losses

In [64]:
def val_step(val_loader, tokenize, model, classifier, device, is_test=False):
    """Evaluate encoder + classifier on ``val_loader`` without computing gradients.

    Args:
        val_loader: iterable of ``(text, target, n_chunks)`` batches.
        tokenize: callable mapping a flat list of chunk strings to a tensor of token ids.
        model: encoder whose output exposes ``last_hidden_state``.
        classifier: module called as ``classifier(per_document_embeddings, n_chunks)``.
        device: device the token ids and targets are moved to.
        is_test: when true, the predictions and targets of all batches are returned too.

    Returns:
        ``(accuracy, losses)`` with one float per batch, or
        ``(accuracy, losses, all_predictions, all_targets)`` when ``is_test`` is
        true, the last two being NumPy arrays concatenated over the batches.
    """
    classifier.eval()
    model.eval()

    criterion = nn.CrossEntropyLoss()
    batch_accuracies, batch_losses = [], []
    prediction_batches, target_batches = [], []

    with torch.no_grad():
        for documents, labels, chunk_counts in val_loader:
            labels = labels.long().to(device)

            # One encoder input per chunk, whatever document it belongs to.
            chunk_texts = [chunk for document in documents for chunk in document]
            token_ids = tokenize(chunk_texts).to(device)

            hidden_states = model(token_ids).last_hidden_state

            # Mean over the token axis gives one vector per chunk.
            # NOTE(review): only token ids are passed to the encoder, so any padding
            # positions take part in this mean as well; confirm that this is intended.
            chunk_vectors = hidden_states.mean(dim=1)
            per_document = list(torch.split(chunk_vectors, chunk_counts, dim=0))

            logits = classifier(per_document, chunk_counts)
            batch_loss = criterion(input=logits, target=labels)

            predicted = logits.max(dim=1).indices
            n_correct = (predicted == labels).sum().item()
            batch_accuracies.append(n_correct / labels.size(0))
            batch_losses.append(batch_loss.item())

            if is_test:
                prediction_batches.append(predicted.cpu())
                target_batches.append(labels.cpu())

    if not is_test:
        return batch_accuracies, batch_losses

    all_predictions = torch.cat(prediction_batches).numpy()
    all_targets = torch.cat(target_batches).numpy()
    return batch_accuracies, batch_losses, all_predictions, all_targets

# Sentiment Analysis

**HYPERPARAMETERS**

In [65]:
BATCH_SIZE = 32  # documents per batch in all three data loaders
EPOCHS = 5  # passes over the training set; also sizes the LR schedule
LR = 1e-7  # NOTE(review): not read by the optimizer cell below, which hard-codes lr=5e-5; confirm which value is intended
EMBEDDING_DIM = 768 # 768 for base and 1024 for large
LSTM_SIZE = 128  # hidden size of the classifier's LSTM
DROPOUT = 0.2  # dropout inside the classifier, before its linear layer
POOLED_EMB_DO = 0.3  # dropout applied to the pooled chunk embeddings during training
OUT_DIM_LIN = 3  # number of output logits

In [66]:
train_dataset = MyDataset(df_train)
# df_train was built by concatenating the per-class splits, so its rows are grouped
# by class. Shuffle the training batches; otherwise each batch contains (almost)
# a single label and the classes are seen one after the other.
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_func)

# Evaluation order does not influence the metrics, so these loaders stay sequential.
val_dataset = MyDataset(df_val)
val_loader = DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE, collate_fn=collate_func)

test_dataset = MyDataset(df_test)
test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, collate_fn=collate_func)

In [67]:
# Classification head that consumes the per-chunk encoder embeddings.
classifier = Classifier(lstm_size=LSTM_SIZE,
                        emb_dim=EMBEDDING_DIM,
                        out_dim=OUT_DIM_LIN,
                        dropout= DROPOUT).to(device)

# Rebind tokenize with the wrapper defaults (padding, truncation, 'pt' tensors);
# the earlier binding returned unpadded Python lists for the length statistics.
tokenize = wrap_tokenizer(tokenizer)

# If fine-tuning the model (FinBert)
#params = list(model.parameters()) + list(classifier.parameters())

# If using the base FinBert
params = list(classifier.parameters())

# NOTE(review): the learning rate is hard-coded here and the LR constant is not used; confirm.
optimizer = AdamW(params, lr=5e-5)

# Linear warm-up over the first epoch's worth of steps, then linear decay over all
# EPOCHS * len(train_loader) optimizer steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=len(train_loader) * 1, 
    num_training_steps=len(train_loader) * EPOCHS)


**TRAINING & VALIDATION**

In [None]:
# Per-batch history: accuracies and losses are stored as returned by train/val_step
# (accuracies are fractions in [0, 1]); 'lr' is filled inside train().
logs = {'train_acc':[],
        'train_loss':[],
        'val_acc':[],
        'val_loss':[],
        'lr': []}

for epoch in range(EPOCHS):

    train_acc, train_loss = train(train_loader=train_loader,
                                  tokenize=tokenize,
                                  model=model,
                                  classifier=classifier,
                                  optimizer=optimizer,
                                  scheduler=scheduler,
                                  logs=logs,
                                  dropout=POOLED_EMB_DO)

    val_acc, val_loss = val_step(val_loader=val_loader,
                                 tokenize=tokenize,
                                 model = model,
                                 classifier = classifier,
                                 device=device
                                )

    logs['train_acc'] += train_acc
    logs['train_loss'] += train_loss
    logs['val_acc'] += val_acc
    logs['val_loss'] += val_loss

    # Save checkpoint after each epoch
    save_checkpoint(model = model, classifier = classifier, optimizer=optimizer, logs=logs, epoch=epoch)

    # The accuracies are fractions, so the '%' presentation type is used to print real
    # percentages. Adjacent f-strings replace the backslash continuations, which
    # embedded the next line's indentation in the printed message.
    print(
        f"Epoch {epoch} --> "
        f"train_loss: {mean(train_loss):.4f}, "
        f"train_acc: {mean(train_acc):.2%}, "
        f"val_loss: {mean(val_loss):.4f}, "
        f"val_acc: {mean(val_acc):.2%}"
    )