In [2]:
import pandas as pd
from datasets import Dataset, DatasetDict, load_metric
from transformers import BertTokenizer, BertModel,BertForSequenceClassification, Trainer, TrainingArguments
import numpy as np

import evaluate
import torch

## Data Preprocessing

In [3]:
data_training = pd.read_csv('/Users/stoneman/Library/CloudStorage/OneDrive-Vanderbilt/AB_Project/5899-spring24-project1-team2/sentiment_updated_sortedDate_Training.csv', header=0)
data_prediction = pd.read_csv('/Users/stoneman/Library/CloudStorage/OneDrive-Vanderbilt/AB_Project/5899-spring24-project1-team2/sentiment_updated_sortedDate_Testing.csv', header=0)

In [16]:
data_training.head(2)

Unnamed: 0,summary,description,Ticker,adjusted_date,sentiment
0,Osaka Governor Hirofumi Yoshimura said that th...,Years of delay to plans for Japan‚Äö√Ñ√¥s firs...,MGM,9/15/2022 0:00,negative
1,MetLife (MET) is a Finance stock that has seen...,Dividends are one of the best benefits to bein...,MET,9/15/2022 0:00,positive


In [17]:
# check the average length of the columns: summary and description
print('data_training', data_training['summary'].apply(lambda x: len(x.split())).describe())
print('data_prediction', data_prediction['summary'].apply(lambda x: len(x.split())).describe())

data_training count    1086.000000
mean      115.712707
std        45.145769
min        39.000000
25%        90.000000
50%       111.000000
75%       132.000000
max       817.000000
Name: summary, dtype: float64
data_prediction count    3268.000000
mean      120.356487
std        40.700251
min        39.000000
25%        94.000000
50%       117.000000
75%       141.000000
max       540.000000
Name: summary, dtype: float64


In [18]:
data_training['sentiment'].unique()

array(['negative', 'positive', 'neutral'], dtype=object)

In [19]:
data_training['sentiment'].value_counts()

positive    809
negative    251
neutral      26
Name: sentiment, dtype: int64

In [20]:
data_training.shape

(1086, 5)

In [21]:
data_training['labels'] = data_training['sentiment'].map({'positive': 0, 'neutral': 2, 'negative': 1})

In [22]:
data_training.head(2)

Unnamed: 0,summary,description,Ticker,adjusted_date,sentiment,labels
0,Osaka Governor Hirofumi Yoshimura said that th...,Years of delay to plans for Japan‚Äö√Ñ√¥s firs...,MGM,9/15/2022 0:00,negative,1
1,MetLife (MET) is a Finance stock that has seen...,Dividends are one of the best benefits to bein...,MET,9/15/2022 0:00,positive,0


In [23]:
data_training = data_training[['summary', 'labels']]
data_training.columns = ['text', 'labels']
data_training.to_csv('data_training.csv', index=False)

In [24]:
data_training.head(2)

Unnamed: 0,text,labels
0,Osaka Governor Hirofumi Yoshimura said that th...,1
1,MetLife (MET) is a Finance stock that has seen...,0


## Prepare Dataset

In [25]:
dataset = Dataset.from_pandas(data_training[['text', 'labels']].reset_index(drop=True))

In [26]:
dataset

Dataset({
    features: ['text', 'labels'],
    num_rows: 1086
})

In [27]:
dataset = dataset.train_test_split(test_size=0.2)

In [28]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 868
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 218
    })
})

In [64]:
dataset['train']['text'][0]

'Active With The Activists: Sysco Strike Ends; Teamsters Declare Victory. We look forward to getting back to business as usual and returning our focus to servicing our customers and community.‚Äö√Ñ√π\n\nThe New England Sysco truck drivers‚Äö√Ñ√¥ union is relatively young, organized with the Teamsters several years ago due to dissatisfaction over issues including health insurance and wages. \n\nWith support from Teamsters Local 633 of New Hampshire, Sysco drivers picketed in Manchester, Bow, and Epping. The union failed to win a shift from the company-sponsored 401-K retirement plan to the union-sponsored pension fund.'

In [65]:
dataset['train']['labels'][0]

0

## Fine-Tuning BERT Model only on classifier layer

In [15]:
# Load the pre-trained BERT tokenizer and model
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
finbert_cls_model = BertForSequenceClassification.from_pretrained('ProsusAI/finbert')

Some weights of the model checkpoint at ProsusAI/finbert were not used when initializing BertModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [36]:
print(torch.backends.mps.is_built())
# Check if GPU is available and set the device accordingly
device = torch.device("mps")  # mps is for Apple Silicon GPU

True


In [37]:
# Move the model to the GPU
finbert_cls_model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [71]:
finbert_cls_model.config

BertConfig {
  "_name_or_path": "ProsusAI/finbert",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "positive",
    "1": "negative",
    "2": "neutral"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "negative": 1,
    "neutral": 2,
    "positive": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.27.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [66]:
bert_input = bert_tokenizer(dataset['train']['text'][2], return_tensors='pt')

In [67]:
bert_input

{'input_ids': tensor([[  101,  3341,  7925, 26566,  1733,  1132, 24498,  1158,  1146,  7352,
          1113,  1207,  2058,  1496,  1106,  1126, 12104, 16907,  1104,  2331,
          2205,  4481,  1496,  1106,  1344, 16935,  5600,   119,  1109, 25630,
          1138,  5605,  1313,  7925, 26566,  1733,   112, 18155,   117,  5416,
          1147, 17901,  1177, 10832,   117,  1105,  1103,   156,   111,   153,
          3291, 24729, 13068, 10204,  3341, 12851, 12859, 10146,  1146,  3746,
           119,  3078,   110,  1177,  1677,  1142,  1214,   119,  1109,  1954,
          1903,  2603,  1104,  1807,   128,   110,  1113,  1103,  1927,  1476,
           118,  1214,  4275, 16935,  1110,  1543,  1146, 20407,  4481,  1750,
          8394,   117, 16863,  1103,  4528,  1104,  3685,  4481,   117,  1134,
          1132,  3417,  1167, 19017,  1190,  1207,  2058,   119,  3341,  7925,
         26566,  1733,  1113,  1903,  1138,  2120,  7352,  1107,  1164,  1160,
           118, 12704,  1104,  1147,  

In [68]:
bert_input = {k: v.to(device) for k, v in bert_input.items()}  # Move the input tensors to MPS device

output = finbert_cls_model(**bert_input)  # Run the model

In [79]:
output.logits

# Assuming `output` is the model output and `finbert_cls_model.config.id2label` is the id-to-label mapping
probabilities = torch.nn.functional.softmax(output.logits, dim=-1)

# Assuming 'probabilities' is your probabilities tensor
probabilities_cpu = probabilities.to('cpu')
predicted_class_index = torch.argmax(probabilities_cpu, dim=-1)
predicted_class_index


# Get the predicted label from the id2label mapping in the model's configuration
predicted_label = finbert_cls_model.config.id2label[predicted_class_index.item()]

predicted_label

tensor([2])

In [38]:
# Function to tokenize the text data
def tokenize_function(examples):
    return bert_tokenizer(examples['text'], padding='max_length', truncation=True)

train_dataset = dataset['train']
test_dataset = dataset['test']
# Tokenize the text data in the train and test datasets
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)

# Define the compute_metrics function for accuracy
accuracy_metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy_metric.compute(predictions=predictions, references=labels)

# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01
)

# Initialize the Trainer
trainer = Trainer(
    model=finbert_cls_model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    compute_metrics=compute_metrics,
)


  accuracy_metric = load_metric("accuracy")


In [83]:
# check the number of trainable parameters
total_params = sum(p.numel() for p in finbert_cls_model.parameters() if p.requires_grad)
print(f'{total_params:,} total parameters.')

109,484,547 total parameters.


In [90]:
# Freeze all layers in the BERT model
for param in finbert_cls_model.bert.parameters():
    param.requires_grad = False

In [91]:
# check the number of trainable parameters
total_params = sum(p.numel() for p in finbert_cls_model.parameters() if p.requires_grad)
print(f'{total_params:,} total parameters.')

2,307 total parameters.


In [94]:

# Train and evaluate the model
trainer.train()

# Evaluate the model on the test dataset
results = trainer.evaluate()

# Print the accuracy
print(results)


 46%|████▌     | 500/1090 [31:51<36:55,  3.75s/it]  

{'loss': 1.3428, 'learning_rate': 5e-05, 'epoch': 4.59}


 92%|█████████▏| 1000/1090 [1:03:32<05:41,  3.80s/it]

{'loss': 0.6466, 'learning_rate': 7.627118644067798e-06, 'epoch': 9.17}


100%|██████████| 1090/1090 [1:09:28<00:00,  3.82s/it]


{'train_runtime': 4168.7992, 'train_samples_per_second': 2.082, 'train_steps_per_second': 0.261, 'train_loss': 0.968447148034332, 'epoch': 10.0}


100%|██████████| 28/28 [01:37<00:00,  3.50s/it]

{'eval_loss': 0.6671083569526672, 'eval_accuracy': 0.7201834862385321, 'eval_runtime': 101.772, 'eval_samples_per_second': 2.142, 'eval_steps_per_second': 0.275, 'epoch': 10.0}





## Fine-Tuning BERT Model on only bert.pooler and classifier layers

In [30]:
# Freeze all layers in the BERT model
for param in finbert_cls_model.bert.parameters():
    param.requires_grad = False

In [32]:
for param in finbert_cls_model.bert.pooler.parameters():
    param.requires_grad = True

In [33]:
# check the number of trainable parameters
total_params = sum(p.numel() for p in finbert_cls_model.parameters() if p.requires_grad)
print(f'{total_params:,} total parameters.')

592,899 total parameters.


In [39]:
# Train and evaluate the model
trainer.train()

# Evaluate the model on the test dataset
results = trainer.evaluate()

# Print the accuracy
print(results)


 46%|████▌     | 500/1090 [32:34<39:15,  3.99s/it]  

{'loss': 0.6478, 'learning_rate': 5e-05, 'epoch': 4.59}


 92%|█████████▏| 1000/1090 [1:05:39<05:58,  3.98s/it]

{'loss': 0.6325, 'learning_rate': 7.627118644067798e-06, 'epoch': 9.17}


100%|██████████| 1090/1090 [1:11:38<00:00,  3.94s/it]


{'train_runtime': 4298.2406, 'train_samples_per_second': 2.019, 'train_steps_per_second': 0.254, 'train_loss': 0.6399818245424043, 'epoch': 10.0}


100%|██████████| 28/28 [01:38<00:00,  3.51s/it]

{'eval_loss': 0.6771977543830872, 'eval_accuracy': 0.7201834862385321, 'eval_runtime': 101.9094, 'eval_samples_per_second': 2.139, 'eval_steps_per_second': 0.275, 'epoch': 10.0}





In [95]:
trainer.save_model("./finbert_cls_model")

# Inferencing

In [6]:
# Load the pre-trained BERT tokenizer and model
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

fine_tune_model_path = "/Users/stoneman/Library/CloudStorage/OneDrive-Vanderbilt/AB_Project/5899-spring24-project1-team2/finbert_cls_model"
finbert_cls_model = BertForSequenceClassification.from_pretrained(fine_tune_model_path)

device = torch.device("mps")  # mps is for Apple Silicon GPU
finbert_cls_model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [7]:
def batch_predict(data, model, tokenizer, batch_size=32):
    predictions = []

    for i in range(0, len(data), batch_size):
        batch = data[i:i+batch_size]
        inputs = tokenizer(list(batch), padding=True, truncation=True, return_tensors='pt')
        inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)
        
        batch_probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
        batch_probabilities = batch_probabilities.cpu().numpy()
        predictions.extend(batch_probabilities)

    return np.array(predictions)

# Perform batch inference
batch_size = 32  # Adjust based on your device's memory
probabilities = batch_predict(data_prediction['summary'], finbert_cls_model, bert_tokenizer, batch_size=batch_size)

# Convert probabilities to DataFrame and concatenate with the original DataFrame
prob_df = pd.DataFrame(probabilities, columns=['positive', 'negative', 'neutral'])
data_prediction = pd.concat([data_prediction.reset_index(drop=True), prob_df], axis=1)


In [8]:
data_prediction.to_csv('data_prediction.csv', index=False)

In [9]:
data_prediction

Unnamed: 0,summary,description,Ticker,adjusted_date,sentiment,positive,negative,neutral
0,Walgreens Boots Alliance has sold its remainin...,Walgreens Boots Alliance has sold its remainin...,CVS,1/3/2023 0:00,positive,0.786601,0.192157,0.021241
1,\n\nAlthough the financial companies don't mak...,"In this article, we will take a look at 10 of ...",BLK,1/3/2023 0:00,positive,0.802762,0.173660,0.023579
2,Cigna Group reported strong second-quarter res...,Cigna Group second-quarter results beat Wall S...,CI,1/3/2023 0:00,positive,0.768271,0.209153,0.022576
3,The Environmental Protection Agency has ordere...,The Environmental Protection Agency has ordere...,NSC,1/3/2023 0:00,positive,0.803227,0.173373,0.023400
4,Sen. Joe Manchin (D-WV) is blaming Secretary o...,WASHINGTON ‚Äî Sen. Joe Manchin (D-WV) is blam...,NSC,1/3/2023 0:00,positive,0.749967,0.232096,0.017937
...,...,...,...,...,...,...,...,...
3263,"Vauxhall, the owner of Vauxhall and the UK car...",Stellantis may be premature in saying Brexit d...,F,9/7/2023 0:00,negative,0.764649,0.199559,0.035792
3264,The Federal Trade Commission is seeking to blo...,The biotech company is facing a patent cliff s...,ABBV,9/7/2023 0:00,positive,0.750436,0.228293,0.021271
3265,Ford has announced it is recalling over 1.2 mi...,Just a day after Honda announced a half a mill...,F,9/7/2023 0:00,positive,0.789926,0.192225,0.017849
3266,Ford CEO Jim Farley discussed the company's st...,Ford Making EVs Means Turning the Clock Back 1...,F,9/7/2023 0:00,negative,0.763356,0.216059,0.020585
