# Fine-tuning the cross classification-regression model

In [1]:
import pandas as pd
import numpy as np
np.random.seed(42)

# prepare the df
df = pd.read_csv("training.csv")

# MPS device
import torch
device = torch.device("cpu")

# typical row:
# "@PharmaNews: Pfizer faces backlash over possible closure of regional office. #PharmaNews #RegionalOffice",0.000000,0.000000,0.000000,-0.029512,0.000000


## Data Preparation

In [2]:
import torch
from transformers import BertTokenizerFast
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np

# Load the tokenizer
tokenizer = BertTokenizerFast.from_pretrained('prajjwal1/bert-mini')

# Define the stocks and sentiments
stocks = ['None', 'NVDA', 'ING', 'SAN', 'PFE', 'CSCO']
# None can only have None sentiment
sentiments = ['None', '_pos', '_neg']
# 3*5 + 1 = 16 classes in total and 1 regression head with log_returns

# df has ["SocialMediaFeed", 'NVDA', 'ING', 'SAN', 'PFE', 'CSCO'] columns
X_texts = df['SocialMediaFeed'].tolist()

# Calculate which stock is affected by the news and get the log return value
y_stock = []
y_log_return = []
for i, row in df.iterrows():
    affected_stock = "None"
    log_return_val = 0.0
    for stock in stocks[1:]:
        if row[stock] != 0:
            affected_stock = stock
            log_return_val = row[stock]
            break
    y_stock.append(affected_stock)
    y_log_return.append(log_return_val)

y_stock = [stocks.index(s) for s in y_stock]

X_train_texts, X_val_texts, y_train_stock, y_val_stock, y_train_val, y_val_val = train_test_split(X_texts, y_stock, y_log_return, test_size=.2)

# Scale regression target (log returns) with StandardScaler
scaler = StandardScaler()
y_train_val_scaled = scaler.fit_transform(np.array(y_train_val).reshape(-1, 1)).flatten().astype('float32')
y_val_val_scaled = scaler.transform(np.array(y_val_val).reshape(-1, 1)).flatten().astype('float32')

# Set up labels for sentiment analysis
train_sentiments = [sentiments.index("_pos") if val > 0.000001 else sentiments.index("_neg") if val < -0.000001 else sentiments.index("None") for val in y_train_val]
val_sentiments = [sentiments.index("_pos") if val > 0.000001 else sentiments.index("_neg") if val < -0.000001 else sentiments.index("None") for val in y_val_val]

# Next, tokenize the texts
train_encodings = tokenizer(X_train_texts, truncation=True, padding=True)
val_encodings = tokenizer(X_val_texts, truncation=True, padding=True)

# Define the custom dataset
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, stock_labels, sentiment_labels, values):
        self.encodings = {key: torch.tensor(val, device=device) for key, val in encodings.items()}
        self.stock_labels = torch.tensor(stock_labels).to(device)
        self.sentiment_labels = torch.tensor(sentiment_labels).to(device)
        self.values = torch.tensor(values).to(device)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item["stock_labels"] = self.stock_labels[idx]
        item["sentiment_labels"] = self.sentiment_labels[idx]
        item["values"] = self.values[idx]
        return item

    def __len__(self):
        return len(self.stock_labels)

# Create the custom dataset
train_dataset = CustomDataset(train_encodings, y_train_stock, train_sentiments, y_train_val_scaled)
val_dataset = CustomDataset(val_encodings, y_val_stock, val_sentiments, y_val_val_scaled)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [3]:
# export scaler
import pickle

with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

In [4]:



train_dataset[0]

# typical output:
# {'input_ids': tensor([  101,  1030,  6887, 27292,  4887, 17299,  3686,  1024,  1052,  8873,
#           6290,  1005,  1055,  1053,  2475, 16565,  2453,  2991,  2917, 10908,
#           1012,  3422,  2041,  2005,  4518, 28892,  1012,  1001, 16565,  6279,
#          13701,  1001,  6887, 27292,  7231,  9333,   102,     0,     0,     0,
#              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
#              0,     0,     0,     0,     0]),
#  'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
#          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
#          0, 0, 0, 0, 0, 0, 0]),
#  'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
#          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
#          0, 0, 0, 0, 0, 0, 0]),
#  'stock_labels': tensor(0),
#  'sentiment_labels': tensor(0),
#  'values': tensor(-0.0054)}

{'input_ids': tensor([  101,  1030,  6887, 27292,  4887, 17299,  3686,  1024,  1052,  8873,
          6290,  1005,  1055,  1053,  2475, 16565,  2453,  2991,  2917, 10908,
          1012,  3422,  2041,  2005,  4518, 28892,  1012,  1001, 16565,  6279,
         13701,  1001,  6887, 27292,  7231,  9333,   102,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0]),
 'stock_labels': tensor(0),
 'sentiment_labels': tensor(0),
 'values': tensor(-0.0054)}

## Model Definition

In [29]:
from transformers import BertModel
import torch.nn as nn

class CustomBERTModel(nn.Module):
    def __init__(self):
        super(CustomBERTModel, self).__init__()
        self.bert = BertModel.from_pretrained('prajjwal1/bert-mini')
        self.dropout = nn.Dropout(0.15)
        # self.dense = nn.Linear(self.bert.config.hidden_size, self.bert.config.hidden_size, device=device)
        self.classifier_stock = nn.Linear(self.bert.config.hidden_size, 6) # stock classification head
        self.classifier_sentiment = nn.Linear(self.bert.config.hidden_size, 3) # sentiment classification head
        self.regression = nn.Linear(self.bert.config.hidden_size, 1) # regression head
        
    def forward(self, input_ids, attention_mask, token_type_ids):
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        pooled_output = outputs[1]
        # pooled_output = self.dense(pooled_output)
        pooled_output = self.dropout(pooled_output)
        stock_labels = self.classifier_stock(pooled_output)
        sentiment_labels = self.classifier_sentiment(pooled_output)
        regression_values = self.regression(pooled_output)
        
        return stock_labels, sentiment_labels, regression_values

In [30]:
print(len(BertModel.from_pretrained('prajjwal1/bert-mini').encoder.layer))

4


## Training

In [31]:
from torch.nn import CrossEntropyLoss, MSELoss

l1_lambda = 0.01
stock_loss_weight = 1
sentiment_loss_weight = 3
regression_loss_weight = 2

def train(dataloader, model, optimizer):
    l1_norm = sum(p.abs().sum() for p in model.parameters()).detach()
    
    model.train()
    total_loss = 0    # initialize total loss to 0
    for batch in dataloader:
        # clear the gradients
        optimizer.zero_grad()
        
        # get the inputs and labels
        inputs = {key: val.to(device) for key, val in batch.items()}
        stock_labels = inputs.pop('stock_labels').to(device)
        sentiment_labels = inputs.pop('sentiment_labels').to(device)
        values = inputs.pop('values').to(device)
        
        # forward pass
        stock_labels_pred, sentiment_labels_pred, regression_values_pred = model(**inputs)
        
        sentiment_weights = torch.Tensor([1, 2, 2]).to(device)  
        stock_weights = torch.Tensor([1, 3, 4, 4, 3, 2]).to(device)
        
        # compute the loss
        stock_loss = CrossEntropyLoss(weight=stock_weights)(stock_labels_pred.view(-1, 6), stock_labels.view(-1))
        sentiment_loss = CrossEntropyLoss(weight=sentiment_weights)(sentiment_labels_pred.view(-1, 3), sentiment_labels.view(-1))
        regression_loss = MSELoss()(regression_values_pred.view(-1), values.view(-1))
        
        total_loss = stock_loss_weight*stock_loss + sentiment_loss_weight*sentiment_loss + regression_loss_weight*regression_loss #+ l1_lambda * l1_norm

        # backward pass
        total_loss.backward()
        
        # update the weights
        optimizer.step()
        
    return total_loss / len(dataloader)

def validate(dataloader, model):
    l1_norm = sum(p.abs().sum() for p in model.parameters()).detach()
    
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in dataloader:
            inputs = {key: val for key, val in batch.items()}
            stock_labels = inputs.pop('stock_labels')
            sentiment_labels = inputs.pop('sentiment_labels')
            values = inputs.pop('values')
            
            stock_labels_pred, sentiment_labels_pred, regression_values_pred = model(**inputs)           

            stock_loss = CrossEntropyLoss()(stock_labels_pred.view(-1, 6), stock_labels)
            sentiment_loss = CrossEntropyLoss()(sentiment_labels_pred.view(-1, 3), sentiment_labels)
            regression_loss = MSELoss()(regression_values_pred.view(-1), values.view(-1))
            
            total_loss += stock_loss_weight*stock_loss + sentiment_loss_weight*sentiment_loss + regression_loss_weight*regression_loss #+ l1_lambda * l1_norm

    return total_loss / len(dataloader)

In [32]:
from torch.utils.data import DataLoader
from transformers import AdamW
from torch.nn.utils import clip_grad_norm_
import copy

model = CustomBERTModel().to(device)
# for param in model.bert.encoder.layer[-2:].parameters():
#     param.requires_grad = True

optimizer = AdamW(model.parameters(), lr=1e-5)

epochs = 50
grad_clip = 1.0
best_loss = None
patience = 5
no_improve = 0
early_stopping = False


train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

for epoch in range(epochs):
    print(f'Epoch {epoch + 1}/{epochs}')
    print('-' * 10)

    train_loss = train(train_loader, model, optimizer)

    # Gradient clipping
    clip_grad_norm_(model.parameters(), grad_clip)

    print(f'Train loss {train_loss}')

    val_loss = validate(val_loader, model)
    print(f'Validation loss {val_loss}')

    # Save the model if validation loss decreases
    if best_loss is None or val_loss < best_loss:
        print(f'Validation loss decreased from {None if best_loss is None else best_loss} to {val_loss}.')
        no_improve = 0
        best_loss = val_loss
        if epoch > 10:
            print(f'Saving model...')
            best_model = copy.deepcopy(model.state_dict())
    else:
        no_improve += 1
    if no_improve >= patience:
        print(f'No improvement in validation loss for {patience} epochs. Stopping...')
        early_stopping = True
        break

if early_stopping:
    # Load the best model weights
    model.load_state_dict(best_model)

print('Training complete')



Epoch 1/50
----------
Train loss 0.06904063373804092
Validation loss 6.766324520111084
Validation loss decreased from None to 6.766324520111084.
Epoch 2/50
----------
Train loss 0.06516032665967941
Validation loss 6.411446571350098
Validation loss decreased from 6.766324520111084 to 6.411446571350098.
Epoch 3/50
----------
Train loss 0.1812034249305725
Validation loss 6.256017208099365
Validation loss decreased from 6.411446571350098 to 6.256017208099365.
Epoch 4/50
----------
Train loss 0.03919582441449165
Validation loss 5.709074020385742
Validation loss decreased from 6.256017208099365 to 5.709074020385742.
Epoch 5/50
----------
Train loss 0.05612219497561455
Validation loss 5.268950462341309
Validation loss decreased from 5.709074020385742 to 5.268950462341309.
Epoch 6/50
----------
Train loss 0.2042253315448761
Validation loss 5.110407829284668
Validation loss decreased from 5.268950462341309 to 5.110407829284668.
Epoch 7/50
----------
Train loss 0.11633892357349396
Validation los

## Evaluation

In [38]:
def validate(dataloader, model):
    model.eval()
    total_loss = 0
    stock_outputs, sentiment_outputs, value_outputs = [], [], []
    stock_labels, sentiment_labels, values = [], [], []

    with torch.no_grad():
        for batch in dataloader:
            inputs = {key: val.to(device) for key, val in batch.items()}
            batch_stock_labels = inputs.pop('stock_labels').to(device)
            batch_sentiment_labels = inputs.pop('sentiment_labels').to(device)
            batch_values = inputs.pop('values').to(device)
            
            stock_labels_pred, sentiment_labels_pred, regression_values_pred = model(**inputs)
            
            stock_loss = CrossEntropyLoss()(stock_labels_pred.view(-1, 6), batch_stock_labels.view(-1))
            sentiment_loss = CrossEntropyLoss()(sentiment_labels_pred.view(-1, 3), batch_sentiment_labels.view(-1))
            regression_loss = MSELoss()(regression_values_pred.view(-1), batch_values.view(-1))
            
            total_loss += stock_loss + sentiment_loss + regression_loss

            # store predictions and true labels for metric computation
            stock_outputs.extend(torch.argmax(stock_labels_pred, dim=1).tolist())
            sentiment_outputs.extend(torch.argmax(sentiment_labels_pred, dim=1).tolist())
            value_outputs.extend(regression_values_pred.tolist())

            stock_labels.extend(batch_stock_labels.tolist())
            sentiment_labels.extend(batch_sentiment_labels.tolist())
            values.extend(batch_values.tolist())

    return total_loss / len(dataloader), (stock_outputs, sentiment_outputs, value_outputs), (stock_labels, sentiment_labels, values)

In [39]:
from sklearn.metrics import confusion_matrix, f1_score
import numpy as np

val_loss, (stock_outputs, sentiment_outputs, value_outputs), (stock_labels, sentiment_labels, values) = validate(val_loader, model)
print(f'Validation loss {val_loss}')

# compute confusion matrix, F1 score, etc
stock_cm = confusion_matrix(stock_labels, stock_outputs)
sentiment_cm = confusion_matrix(sentiment_labels, sentiment_outputs)

stock_f1 = f1_score(stock_labels, stock_outputs, average="weighted")
sentiment_f1 = f1_score(sentiment_labels, sentiment_outputs, average="weighted")

print('Stock Classification F1: ', stock_f1, '\nConfusion Matrix:\n', stock_cm)
print('Sentiment Classification F1: ', sentiment_f1, '\nConfusion Matrix:\n', sentiment_cm)

# for regression, we use root mean squared error (RMSE) instead of F1, etc.
value_rmse = np.sqrt(MSELoss()(torch.tensor(values, device=device).cpu(), torch.tensor(value_outputs, device=device).cpu())) 
print('Regression RMSE: ', value_rmse.item())


Validation loss 1.6258095502853394
Stock Classification F1:  0.8238874194457986 
Confusion Matrix:
 [[119   5   2   4   4   2]
 [  2  20   0   0   0   0]
 [  4   0  12   0   0   0]
 [  5   0   0  16   0   0]
 [  4   0   0   0  18   0]
 [  9   0   0   0   0  11]]
Sentiment Classification F1:  0.8234362395055037 
Confusion Matrix:
 [[117   4  15]
 [  8  42   1]
 [ 13   1  36]]
Regression RMSE:  1.2860907316207886


  return F.mse_loss(input, target, reduction=self.reduction)


In [40]:
train_loss, (stock_outputs, sentiment_outputs, value_outputs), (stock_labels, sentiment_labels, values) = validate(train_loader, model)
print(f'Train loss {train_loss}')

# compute confusion matrix, F1 score, etc
stock_cm = confusion_matrix(stock_labels, stock_outputs)
sentiment_cm = confusion_matrix(sentiment_labels, sentiment_outputs)

stock_f1 = f1_score(stock_labels, stock_outputs, average="weighted")
sentiment_f1 = f1_score(sentiment_labels, sentiment_outputs, average="weighted")

print('Stock Classification F1: ', stock_f1, '\nConfusion Matrix:\n', stock_cm)
print('Sentiment Classification F1: ', sentiment_f1, '\nConfusion Matrix:\n', sentiment_cm)

# for regression, we use root mean squared error (RMSE) instead of F1, etc.
value_rmse = np.sqrt(MSELoss()(torch.tensor(values, device=device).cpu(), torch.tensor(value_outputs, device=device).cpu())) 
print('Regression RMSE: ', value_rmse.item())

Train loss 0.8533780574798584
Stock Classification F1:  0.9174713675354278 
Confusion Matrix:
 [[499  20   7   7  11   4]
 [  0  89   0   0   0   0]
 [  3   0  73   1   0   0]
 [  5   0   0  70   0   0]
 [  3   0   0   0  67   0]
 [ 17   0   0   0   0  69]]
Sentiment Classification F1:  0.9537804469940004 
Confusion Matrix:
 [[515   7  26]
 [  8 190   0]
 [  3   0 196]]
Regression RMSE:  1.3598183393478394


  return F.mse_loss(input, target, reduction=self.reduction)


In [41]:
# quantized model
import torch.quantization

model = model.eval()

# Quantize
model_q = torch.quantization.quantize_dynamic(
    model, {torch.nn.Linear}, dtype=torch.qint8
)

print(model_q)

RuntimeError: Didn't find engine for operation quantized::linear_prepack NoQEngine

In [42]:
def predict(model, dataset_item, scaler):
    model.eval()
    with torch.no_grad():
        inputs = {key: val.unsqueeze(0).to(device) for key, val in dataset_item.items()} # unsqueeze to mimic batch dim
        inputs.pop('stock_labels')
        inputs.pop('sentiment_labels')
        inputs.pop('values')

        stock_labels_pred, sentiment_labels_pred, regression_values_pred = model(**inputs)
        
        stock_label = torch.argmax(stock_labels_pred, dim=1).item()
        sentiment_label = torch.argmax(sentiment_labels_pred, dim=1).item()
        regression_value = scaler.inverse_transform(regression_values_pred.cpu().numpy()) # inverse transform of scaling

    return stock_label, sentiment_label, regression_value[0][0]  # return the single value from the 2D array

In [43]:
# reality check

for i in range(100):
    stock_label, sentiment_label, value = predict(model, train_dataset[i], scaler)
    print(f'Example {i}:')
    print(f'Predicted stock_class: {stock_label}, sentiment_class: {sentiment_label}, value: {value}')
    print(f'True stock_class: {train_dataset[i]["stock_labels"].item()}, sentiment_class: {train_dataset[i]["sentiment_labels"].item()}, value: {scaler.inverse_transform([[train_dataset[i]["values"].item()]])[0][0]}')
    print("---")

Example 0:
Predicted stock_class: 0, sentiment_class: 0, value: 0.010525597259402275
True stock_class: 0, sentiment_class: 0, value: 3.0091839122500103e-12
---
Example 1:
Predicted stock_class: 0, sentiment_class: 0, value: 0.0019254387589171529
True stock_class: 0, sentiment_class: 0, value: 3.0091839122500103e-12
---
Example 2:
Predicted stock_class: 0, sentiment_class: 0, value: 9.125718861469068e-06
True stock_class: 0, sentiment_class: 0, value: 3.0091839122500103e-12
---
Example 3:
Predicted stock_class: 1, sentiment_class: 2, value: -0.0347357876598835
True stock_class: 1, sentiment_class: 2, value: -0.025382427936891654
---
Example 4:
Predicted stock_class: 0, sentiment_class: 0, value: 0.0011488996678963304
True stock_class: 0, sentiment_class: 0, value: 3.0091839122500103e-12
---
Example 5:
Predicted stock_class: 0, sentiment_class: 0, value: 0.0019121920922771096
True stock_class: 0, sentiment_class: 0, value: 3.0091839122500103e-12
---
Example 6:
Predicted stock_class: 3, s

device(type='mps')

# Testing model saving and loading

In [23]:
torch.save(model.state_dict(), "082_081.pt")

In [17]:
def prepare_input(text, tokenizer):
    """
    Takes a string, tokenizes, and prepares it into expected format (list of token ids, attention masks, etc.) ready for model input

    Arguments:
    text -- string, Raw text string
    tokenizer -- transformers.Tokenizer, Initialized tokenizer

    Returns:
    input_dict -- dictionary, Contains required inputs for model
    """

    # Tokenize the text
    encoding = tokenizer.encode_plus(
        text,
        truncation=True, 
        padding=True,
        return_tensors='pt'  # Return PyTorch tensors
    )

    # Get the input ids and attention masks from tokenizer and convert to tensors
    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']

    # Put all tensor entries into a single dictionary
    input_dict = {
        'input_ids': input_ids,
        'token_type_ids': torch.zeros(input_ids.shape, dtype=torch.long),
        'attention_mask': attention_mask,
    }
    
    return input_dict

In [18]:
dummy_model = CustomBERTModel()
dummy_model.load_state_dict(torch.load('best_model.pt'))
dummy_model = dummy_model.to("cpu")

In [19]:
res = prepare_input("Pfizer faces backlash over possible closure of regional office.", tokenizer)
res

{'input_ids': tensor([[  101,  1052,  8873,  6290,  5344, 25748,  2058,  2825,  8503,  1997,
           3164,  2436,  1012,   102]]),
 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [20]:
def predict_loaded_from_loaded_model(model, res, scaler):
    model.eval()
    with torch.no_grad():
        inputs = {key: val.to(device) for key, val in res.items()}  

        stock_labels_pred, sentiment_labels_pred, regression_values_pred = model(inputs["input_ids"], inputs["attention_mask"], inputs["token_type_ids"])
        
        stock_label = torch.argmax(stock_labels_pred, dim=1).item()
        sentiment_label = torch.argmax(sentiment_labels_pred, dim=1).item()
        regression_value = scaler.inverse_transform(regression_values_pred.cpu().numpy()) # inverse transform of scaling

    return stock_label, sentiment_label, regression_value[0][0]  # return the single value 

In [30]:
stock_label, sentiment_label, value = predict_loaded_from_loaded_model(dummy_model, res, scaler)

In [31]:
stock_label, sentiment_label, value

(0, 0, -0.017077213)