TO DO

* probably need the finbert model in training mode too when training?
* needs some hyperparameter tuning
* modify any of the args: get_text_split(text, length=200, overlap=50, max_chunks=4)?

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Imports

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
%pip install datasets
%pip install transformers

In [4]:
import os
import pandas as pd
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from datasets import load_dataset
import numpy as np
from statistics import mean
import pickle
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedShuffleSplit


In [5]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from transformers import AutoTokenizer, AutoModel, AdamW, get_linear_schedule_with_warmup
import torch.nn.functional as F

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Selected device is {}'.format(device))

tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
finbert = AutoModel.from_pretrained("ProsusAI/finbert").to(device)
# bert = AutoModel.from_pretrained("bert-base-uncased").to(device)

# tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
# distilbert = AutoModel.from_pretrained("distilbert-base-uncased").to(device)

**SET THE MODEL**

In [7]:
model = finbert

In [8]:
# Change/Create the checkpoints dir 
checkpoint_dir = '/content/drive/MyDrive/COMP0087/checkpoints_newdataset_new'

# Data loading


In [69]:
dataset = pd.read_csv('/content/drive/MyDrive/COMP0087/dataset2.csv')
dataset.dropna(inplace=True)
dataset.reset_index(inplace=True, drop=True)
dataset

In [87]:
# For news headlines
# df = dataset[['title', 'trend']]
# df.rename(columns={'title' : 'body'}, inplace=True)

# For news body
df = dataset[['body', 'trend']]

df.head()

Unnamed: 0,body,trend
0,Key Takeaways: FWD has switched its focus from...,stable
1,What if you could buy shares today of hugely s...,stable
2,"SHENZHEN, China, March 16, 2023 (GLOBE NEWSWIR...",stable
3,"SHENZHEN, China, Nov. 14, 2022 (GLOBE NEWSWIRE...",stable
4,"HanAll will invest in Interon Laboratories, a ...",stable


# Analyse data

In [None]:
# def get_smaller_dataset(df, size=0.25):

#   df = df.dropna()
#   sss = StratifiedShuffleSplit(n_splits=1, test_size=size, random_state=42)
#   X = df.drop('trend', axis=1)
#   y = df['trend']

#   for train_index, test_index in sss.split(X, y):
#       X_sample, y_sample = X.iloc[test_index], y.iloc[test_index]

#   sample_df = X_sample.assign(trend=y_sample.values)
#   sample_df.reset_index(inplace=True, drop=True)

#   return sample_df

In [88]:
def get_news_bodies_lengths(tokenizer):
    def tokenize(text):
        text = list(text)
        tokens = tokenizer(
            text,
            padding=False,
            return_attention_mask=False,
            truncation=False,
            max_length=None,
            return_tensors=None
            )['input_ids']
        return tokens
    return tokenize

In [89]:
tokenize = get_news_bodies_lengths(tokenizer)
tokens = tokenize(df['body'])
num_tokens = [len(x) for x in tokens]
df['length'] = pd.Series(num_tokens)

In [90]:
max_length = df['length'].unique().max()
print("Max length of news body: ", max_length)
max_tokens = 512

for i in range(1,(max_length//max_tokens)):
    num = sum(df['length']>i*max_tokens)
    print(f"Number of text that have more than {i}*max_tokens is {num}")

Max length of news body:  4675
Number of text that have more than 1*max_tokens is 571
Number of text that have more than 2*max_tokens is 168
Number of text that have more than 3*max_tokens is 69
Number of text that have more than 4*max_tokens is 27
Number of text that have more than 5*max_tokens is 13
Number of text that have more than 6*max_tokens is 8
Number of text that have more than 7*max_tokens is 4
Number of text that have more than 8*max_tokens is 4


# Generate chunks

In [86]:
def get_text_split(text, length=200, overlap=50, max_chunks=5):
    chunks = []

    words = text.split()
    n_words = len(words)

    n = max(1, min(max_chunks, (n_words - length) // (length - overlap) + 1))

    for i in range(n):
        start_idx = i * (length - overlap)
        end_idx = min(start_idx + length, n_words)

        chunk_words = words[start_idx:end_idx]

        chunk_text = " ".join(chunk_words)

        # If it's the last chunk and its length is less than 75% of the desired length, skip
        if i == n - 1 and len(chunk_words) < 0.75 * length and n > 1:
            continue

        chunks.append(chunk_text)

    return chunks

# Train-val-test split

**ENCODE LABELS**

In [None]:
labels = ["increase", "stable", "decrease"]

label_encoder = LabelEncoder()

df['trend'] = label_encoder.fit_transform(df['trend'])

In [92]:
def split_df(df):
    n_rows = len(df)

    df_train = df.iloc[:int(0.8*n_rows),:]
    df_val = df.iloc[int(0.8*n_rows):int(0.9*n_rows),:]
    df_test = df.iloc[int(0.9*n_rows):,:]

    return df_train, df_val, df_test

In [93]:
n_rows = len(df)
dfs_train, dfs_val, dfs_test = [],[],[]
gb = df.groupby('trend')
for x in gb.groups:
    group = gb.get_group(x)
    df_train, df_val, df_test = split_df(group)
    dfs_train.append(df_train)
    dfs_val.append(df_val)
    dfs_test.append(df_test)

df_train = pd.concat(dfs_train, ignore_index=True)

df_val = pd.concat(dfs_val, ignore_index=True)

df_test = pd.concat(dfs_test, ignore_index=True)

print(f'Number of training examples: {len(df_train)}')
print(f'Number of validation examples: {len(df_val)}')
print(f'Number of test examples: {len(df_test)}')

Number of training examples: 22083
Number of validation examples: 2760
Number of test examples: 2762


In [94]:
df_train.body = df_train.body.apply(lambda x: get_text_split(x))
df_val.body = df_val.body.apply(lambda x: get_text_split(x))
df_test.body = df_test.body.apply(lambda x: get_text_split(x))

In [95]:
df_train['n_chunks'] = df_train.body.apply(lambda x: len(x))
df_val['n_chunks'] = df_val.body.apply(lambda x: len(x))
df_test['n_chunks'] = df_test.body.apply(lambda x: len(x))

In [96]:
df_train.head()

Unnamed: 0,body,trend,length,n_chunks
0,[Key Takeaways: Investment banking majors Citi...,0,118,1
1,[Wall Street stocks struggled to advance on Mo...,0,212,1
2,[Key Takeaways: Key factors in a JD share-pric...,0,111,1
3,[Reuters exclusively revealed that Nissan Moto...,0,65,1
4,[By Doug Young A funny thing happened on the w...,0,101,1


# Model

In [97]:
def get_lr(optimizer):
    for param_group in optimizer.param_groups:
        return param_group['lr']

def set_lr(optimizer, lr):
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

def save_to_disk(txt_path, values):
    if os.path.isfile(txt_path):
        os.remove(txt_path)
    with open(txt_path, "wb") as fp:
        pickle.dump(values, fp)
    return

def load_from_disk(txt_path):
    with open(txt_path, "rb") as f:
        values =  pickle.load(f)
    return values

def save_checkpoint(model, classifier, optimizer, logs, epoch):
    print('')
    print('Saving checkpoint...')
    state_dict = {
        'model': model.state_dict(),
        'classifier':classifier.state_dict(),
        'optimizer': optimizer.state_dict(),
    }
    torch.save(state_dict, os.path.join(checkpoint_dir, 'checkpoint_{}.pt'.format(epoch)))
    save_to_disk(os.path.join(checkpoint_dir, 'logs.txt'),logs)
    print(f'Checkpoint saved!')

def load_checkpoint(path, model, classifier, device, optimizer=None):
    pretrained_dict = torch.load(path ,map_location=torch.device(device))
    classifier.load_state_dict(pretrained_dict['classifier'])
    model.load_state_dict(pretrained_dict['model'])
    if optimizer is not None:
        optimizer.load_state_dict(pretrained_dict['optimizer'])
        return model, classifier, optimizer
    return model, classifier

In [98]:
def wrap_tokenizer(tokenizer, padding=True, truncation=True, return_tensors='pt', max_length = None, return_attention_mask=True):
    def tokenize(text):
        text = list(text)
        tokens = tokenizer(
            text,
            padding=padding,
            return_attention_mask=return_attention_mask,
            truncation=truncation,
            max_length = max_length,
            return_tensors=return_tensors)
        return tokens
    return tokenize

In [99]:
class MyDataset(Dataset):
    def __init__(self,df):
        self.n_chunks = df['n_chunks'].to_list()
        self.X = df['body'].to_list()
        self.Y = df['trend']

    def __len__(self):
        return len(self.X)

    def __getitem__(self,index):
        return self.X[index], self.Y.iloc[index], self.n_chunks[index]

def collate_func(batch):
    X = [x[0] for x in batch]
    Y = torch.Tensor([x[1] for x in batch])
    c = [x[2] for x in batch]
    return [X,Y,c]

In [102]:
class Classifier(nn.Module):
    def __init__(self, lstm_size, emb_dim, out_dim, dropout):
        super().__init__()

        self.lstm_chunk = nn.LSTM(input_size=emb_dim, hidden_size=lstm_size, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(in_features=lstm_size, out_features=out_dim)

    def forward(self, x, n_chunks):
        x = pad_sequence(x, batch_first=True, padding_value=0)
        x = pack_padded_sequence(input=x, lengths=n_chunks, batch_first=True, enforce_sorted=False)
        x, _ = self.lstm_chunk(x)
        x, _ = pad_packed_sequence(x, batch_first=True)

        x, _ = torch.max(x, dim=1) # Aggregate across chunks

        x = self.dropout(x)
        x = self.linear(x)

        return x
        # return F.softmax(x, dim=-1)

In [103]:
def train(train_loader, tokenize, model, classifier, optimizer, scheduler, logs):
    classifier.train()
    #model.train()
    model.eval()

    accuracy = []
    losses = []

    for text, target, n_chunks in train_loader:
        logs['lr'].append(get_lr(optimizer))

        target = target.long().to(device)

        flat_text = [item for sublist in text for item in sublist]

        tokens = tokenize(flat_text)
        tokens['input_ids'] = tokens['input_ids'].to(device)
        tokens['attention_mask'] = tokens['attention_mask'].to(device)

        with torch.no_grad():
          outputs = model(input_ids=tokens['input_ids'],
                          attention_mask=tokens['attention_mask'])

        # outputs = model(input_ids=tokens['input_ids'],
        #                 attention_mask=tokens['attention_mask'])

        embeddings = outputs.last_hidden_state

        pooled_emb = (embeddings * tokens['attention_mask'][:, :, None]).sum(dim=1) / tokens['attention_mask'][:, :, None].sum(dim=1)

        x = [s for s in torch.split(pooled_emb, n_chunks, dim=0)]

        logits = classifier(x, n_chunks)

        loss = nn.CrossEntropyLoss()(input=logits, target=target)

        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        _, predictions = torch.max(logits, 1)
        correct_predictions = torch.sum(predictions == target).item()
        acc = correct_predictions / target.size(0)
        accuracy.append(acc)
        losses.append(loss.item())

    return accuracy, losses

In [104]:
def val_step(val_loader, tokenize, model, classifier, device, is_test=False):

    classifier.eval()
    model.eval()

    accuracy = []
    losses = []

    all_predictions = []
    all_targets = []

    with torch.no_grad():
        for text, target, n_chunks in val_loader:
            target = target.long().to(device)

            flat_text = [item for sublist in text for item in sublist]

            tokens = tokenize(flat_text)
            tokens['input_ids'] = tokens['input_ids'].to(device)
            tokens['attention_mask'] = tokens['attention_mask'].to(device)

            outputs = model(input_ids=tokens['input_ids'],
                            attention_mask=tokens['attention_mask'])

            embeddings = outputs.last_hidden_state

            pooled_emb = (embeddings * tokens['attention_mask'][:, :, None]).sum(dim=1) / tokens['attention_mask'][:, :, None].sum(dim=1)

            x = [s for s in torch.split(pooled_emb, n_chunks, dim=0)]

            logits = classifier(x, n_chunks)

            loss = nn.CrossEntropyLoss()(input=logits, target=target)

            _, predictions = torch.max(logits, 1)
            correct_predictions = torch.sum(predictions == target).item()
            acc = correct_predictions / target.size(0)
            accuracy.append(acc)
            losses.append(loss.item())

            if is_test:
                all_predictions.append(predictions.cpu())
                all_targets.append(target.cpu())

    if is_test:
        all_predictions = torch.cat(all_predictions).numpy()
        all_targets = torch.cat(all_targets).numpy()
        return accuracy, losses, all_predictions, all_targets

    return accuracy, losses

# Sentiment Analysis

**HYPERPARAMETERS**

In [105]:
# Can be fine tuned
BATCH_SIZE = 2
EPOCHS = 20
LSTM_SIZE = 128 
DROPOUT = 0.2

# Fixed
EMBEDDING_DIM = 768 # 768 for base and 1024 for large
OUT_DIM_LIN = 3

In [106]:
train_dataset = MyDataset(df_train)
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_func)

val_dataset = MyDataset(df_val)
val_loader = DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_func)

In [107]:
classifier = Classifier(lstm_size=LSTM_SIZE,
                        emb_dim=EMBEDDING_DIM,
                        out_dim=OUT_DIM_LIN,
                        dropout= DROPOUT).to(device)

tokenize = wrap_tokenizer(tokenizer, return_attention_mask=True)

# If fine-tuning the model (FinBert)
#params = list(model.parameters()) + list(classifier.parameters())

# If using the base FinBert
params = list(classifier.parameters())

optimizer = AdamW(params, lr=0.0001)

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=len(train_loader) * 1,
    num_training_steps=len(train_loader) * EPOCHS)


**TRAINING & VALIDATION**

In [None]:
logs = {'train_acc':[],
        'train_loss':[],
        'val_acc':[],
        'val_loss':[],
        'lr': []}

best_val_loss = float('inf')
for epoch in range(EPOCHS):

    train_acc, train_loss = train(train_loader=train_loader,
                                  tokenize=tokenize,
                                  model=model,
                                  classifier=classifier,
                                  optimizer=optimizer,
                                  scheduler=scheduler,
                                  logs=logs
                                  )

    val_acc, val_loss = val_step(val_loader=val_loader,
                                 tokenize=tokenize,
                                 model = model,
                                 classifier = classifier,
                                 device=device
                                )

    if mean(val_loss) < best_val_loss:
        best_val_loss = mean(val_loss)
        save_checkpoint(model = model, classifier = classifier, optimizer=optimizer, logs=logs, epoch=epoch)


    logs['train_acc'] += train_acc
    logs['train_loss'] += train_loss
    logs['val_acc'] += val_acc
    logs['val_loss'] += val_loss

    # Save checkpoint after each epoch
    #save_checkpoint(model = model, classifier = classifier, optimizer=optimizer, logs=logs, epoch=epoch)

    print(f"Epoch {epoch} --> train_loss:{mean(train_loss):.4f},\
                              train_acc:{mean(train_acc): .2f}%, \
                              val_loss:{mean(val_loss): .4f}, \
                              val_acc:{mean(val_acc): .2f}%")


**TESTING**

In [32]:
path = '/content/drive/MyDrive/COMP0087/checkpoints_newdataset_bodies/checkpoint_5.pt'

In [None]:
model, classifier = load_checkpoint(path, model, classifier, device)

In [39]:
test_dataset = MyDataset(df_test)
test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_func)

In [40]:
test_accuracy, test_loss, all_predictions, all_targets = val_step(val_loader=test_loader,
                                                          tokenize=tokenize,
                                                          model = model,
                                                          classifier = classifier,
                                                          device=device,
                                                          is_test = True
                                                          )

In [41]:
print(f"Test loss:{mean(test_loss):.4f}\n test acc:{mean(test_accuracy):.2f}%")

Test loss:0.9892
 test acc:0.53%
