In [23]:
import pandas as pd
from datasets import Dataset, DatasetDict, load_metric
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import numpy as np

import evaluate
import torch


## Data Preprocessing

In [4]:
data = pd.read_excel('finished_annotation_5.2k.xlsx', header=0)

In [5]:
data = data[['title & content', 'summary', 'description', 'content', 'title', 'human_label_sentiment']]

In [6]:
data['human_label_sentiment'].unique()

array(['No', 'Positive', 'Negative', 'Neutral', 'Neutral ', ' Neutral'],
      dtype=object)

In [7]:
data['human_label_sentiment'] = data['human_label_sentiment'].apply(lambda x: 'Neutral' if x.strip() == 'Neutral' else x)

In [8]:
data['human_label_sentiment'].unique()

array(['No', 'Positive', 'Negative', 'Neutral'], dtype=object)

In [9]:
data_sentiment = data[data['human_label_sentiment'] != 'No']

In [10]:
data_sentiment['human_label_sentiment'].unique()

array(['Positive', 'Negative', 'Neutral'], dtype=object)

In [11]:
data_sentiment['human_label_sentiment'].value_counts()

human_label_sentiment
Negative    1801
Neutral      948
Positive     836
Name: count, dtype: int64

In [12]:
data_sentiment.shape

(3585, 6)

In [13]:
data_sentiment['sentiment_score'] = data_sentiment['human_label_sentiment'].apply(lambda x: 0 if x == 'Neutral' else 1 if x == 'Positive' else -1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_sentiment['sentiment_score'] = data_sentiment['human_label_sentiment'].apply(lambda x: 0 if x == 'Neutral' else 1 if x == 'Positive' else -1)


In [14]:
data_sentiment = data_sentiment[['title & content', 'human_label_sentiment', 'sentiment_score']]
data_sentiment.columns = ['text', 'human_label', 'labels']
data_sentiment.to_csv('prepared_sentiment_data.csv', index=False)

In [15]:
data_sentiment

Unnamed: 0,text,human_label,labels
3,"Shareholders v. Tesla, Nasdaq's diversity rule...",Positive,1
5,Robert Half Named One of Barron's Most Sustain...,Positive,1
6,The new electric USPS mail truck is America‚Äö...,Positive,1
8,"FedEx closing more locations, planning to furl...",Negative,-1
9,"FedEx Parks Planes, Maersk Cancels Sails: Worl...",Negative,-1
...,...,...,...
5211,American Airlines Pilots' Union Calls Strike A...,Neutral,0
5212,Ford recalling over 1.2 MILLION cars over 'ser...,Negative,-1
5213,Ford Making EVs Means Turning the Clock Back 1...,Neutral,0
5214,Former SVB chief says the Fed never discussed ...,Negative,-1


In [16]:
data_sentiment['len'] = data_sentiment['text'].apply(lambda x: len(x))
data_sentiment['len'].describe()

count    3585.000000
mean     3379.271409
std      1773.055330
min       333.000000
25%      2033.000000
50%      3192.000000
75%      4678.000000
max      7436.000000
Name: len, dtype: float64

## Prepare Dataset

In [17]:
dataset = Dataset.from_pandas(data_sentiment[['text', 'human_label', 'labels']].reset_index(drop=True))

In [18]:
dataset

Dataset({
    features: ['text', 'human_label', 'labels'],
    num_rows: 3585
})

In [19]:
dataset = dataset.train_test_split(test_size=0.2)

In [20]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'human_label', 'labels'],
        num_rows: 2868
    })
    test: Dataset({
        features: ['text', 'human_label', 'labels'],
        num_rows: 717
    })
})

In [21]:
dataset.save_to_disk('dataset')

Saving the dataset (0/1 shards):   0%|          | 0/2868 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/717 [00:00<?, ? examples/s]

## Fine-Tuning BERT Model on all parameters

In [62]:
# Load the pre-trained BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)


'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 222818a5-d1da-4126-add9-8573dca2e940)')' thrown while requesting HEAD https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt
'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 47d62d3f-f8e2-4005-97df-2ef02a2892a6)')' thrown while requesting HEAD https://huggingface.co/bert-base-uncased/resolve/main/config.json
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 7992b227-e8c6-4faa-b620-c9418ed426ad)')' thrown while requesti

In [63]:
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [49]:
print(torch.backends.mps.is_built())
# Check if GPU is available and set the device accordingly
device = torch.device("mps") if torch.backends.mps.is_built() else torch.device("cpu")  # mps is for Apple Silicon GPU

True


In [50]:

# Move the model to the GPU
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [91]:
# Function to tokenize the text data
def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True)

train_dataset = dataset['train']
test_dataset = dataset['test']
# Tokenize the text data in the train and test datasets
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)

# Define the compute_metrics function for accuracy
accuracy_metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy_metric.compute(predictions=predictions, references=labels)

# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    compute_metrics=compute_metrics,
)


Map:   0%|          | 0/2868 [00:00<?, ? examples/s]

Map:   0%|          | 0/717 [00:00<?, ? examples/s]

In [92]:

# Train and evaluate the model
trainer.train()

# Evaluate the model on the test dataset
results = trainer.evaluate()

# Print the accuracy
print(results)


  0%|          | 0/1077 [00:00<?, ?it/s]

{'loss': 0.3215, 'learning_rate': 5e-05, 'epoch': 1.39}
{'loss': 0.2506, 'learning_rate': 6.672443674176777e-06, 'epoch': 2.79}
{'train_runtime': 3544.7119, 'train_samples_per_second': 2.427, 'train_steps_per_second': 0.304, 'train_loss': 0.2774076461791992, 'epoch': 3.0}


  0%|          | 0/90 [00:00<?, ?it/s]

{'eval_loss': 0.45036745071411133, 'eval_accuracy': 0.3905160390516039, 'eval_runtime': 29.7873, 'eval_samples_per_second': 24.071, 'eval_steps_per_second': 3.021, 'epoch': 3.0}


## Fine-Tuning BERT Model on only last layer

In [44]:
# check the number of trainable parameters
total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'{total_params:,} total parameters.')

109,484,547 total parameters.


In [45]:
# Freeze all layers in the BERT model
for param in model.bert.parameters():
    param.requires_grad = False

In [46]:
# check the number of trainable parameters
total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'{total_params:,} total parameters.')

2,307 total parameters.


In [96]:
# Train and evaluate the model
trainer.train()

# Evaluate the model on the test dataset
results = trainer.evaluate()

# Print the accuracy
print(results)


  0%|          | 0/1077 [00:00<?, ?it/s]

{'loss': 0.1094, 'learning_rate': 5e-05, 'epoch': 1.39}
{'loss': 0.1153, 'learning_rate': 6.672443674176777e-06, 'epoch': 2.79}
{'train_runtime': 443.4699, 'train_samples_per_second': 19.402, 'train_steps_per_second': 2.429, 'train_loss': 0.11023234878299186, 'epoch': 3.0}


  0%|          | 0/90 [00:00<?, ?it/s]

{'eval_loss': 0.473304808139801, 'eval_accuracy': 0.3877266387726639, 'eval_runtime': 28.7564, 'eval_samples_per_second': 24.934, 'eval_steps_per_second': 3.13, 'epoch': 3.0}


In [97]:
trainer.save_model("./bert_sentiment_model")

In [None]:
# Function to tokenize the text data
def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True)

train_dataset = dataset['train']
test_dataset = dataset['test']
# Tokenize the text data in the train and test datasets
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)

# Define the compute_metrics function for accuracy
accuracy_metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy_metric.compute(predictions=predictions, references=labels)

# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=50,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    compute_metrics=compute_metrics,
)


Map:   0%|          | 0/2868 [00:00<?, ? examples/s]

Map:   0%|          | 0/717 [00:00<?, ? examples/s]

  accuracy_metric = load_metric("accuracy")


In [51]:
# Train and evaluate the model
trainer.train()

# Evaluate the model on the test dataset
results = trainer.evaluate()

# Print the accuracy
print(results)

  0%|          | 0/17950 [00:00<?, ?it/s]

{'loss': 0.1815, 'learning_rate': 5e-05, 'epoch': 1.39}
{'loss': 0.1668, 'learning_rate': 4.856733524355301e-05, 'epoch': 2.79}
{'loss': 0.1674, 'learning_rate': 4.713467048710602e-05, 'epoch': 4.18}
{'loss': 0.1689, 'learning_rate': 4.570200573065903e-05, 'epoch': 5.57}
{'loss': 0.1638, 'learning_rate': 4.426934097421204e-05, 'epoch': 6.96}
{'loss': 0.1673, 'learning_rate': 4.2836676217765046e-05, 'epoch': 8.36}
{'loss': 0.1667, 'learning_rate': 4.140401146131805e-05, 'epoch': 9.75}
{'loss': 0.1606, 'learning_rate': 3.997134670487106e-05, 'epoch': 11.14}
{'loss': 0.167, 'learning_rate': 3.853868194842407e-05, 'epoch': 12.53}
{'loss': 0.1625, 'learning_rate': 3.7106017191977077e-05, 'epoch': 13.93}
{'loss': 0.163, 'learning_rate': 3.567335243553009e-05, 'epoch': 15.32}
{'loss': 0.1752, 'learning_rate': 3.4240687679083095e-05, 'epoch': 16.71}
{'loss': 0.1583, 'learning_rate': 3.280802292263611e-05, 'epoch': 18.11}
{'loss': 0.166, 'learning_rate': 3.1375358166189114e-05, 'epoch': 19.5}
{

  0%|          | 0/90 [00:00<?, ?it/s]

{'eval_loss': 0.122591033577919, 'eval_accuracy': 0.4895397489539749, 'eval_runtime': 26.635, 'eval_samples_per_second': 26.919, 'eval_steps_per_second': 3.379, 'epoch': 50.0}


In [52]:
trainer.save_model("./bert_sentiment_model")

In [53]:
# Train and evaluate the model
trainer.train()

# Evaluate the model on the test dataset
results = trainer.evaluate()

# Print the accuracy
print(results)

  0%|          | 0/17950 [00:00<?, ?it/s]

{'loss': 0.1596, 'learning_rate': 5e-05, 'epoch': 1.39}
{'loss': 0.1644, 'learning_rate': 4.856733524355301e-05, 'epoch': 2.79}
{'loss': 0.1641, 'learning_rate': 4.713467048710602e-05, 'epoch': 4.18}
{'loss': 0.1612, 'learning_rate': 4.570200573065903e-05, 'epoch': 5.57}
{'loss': 0.1571, 'learning_rate': 4.426934097421204e-05, 'epoch': 6.96}
{'loss': 0.1664, 'learning_rate': 4.2836676217765046e-05, 'epoch': 8.36}
{'loss': 0.1603, 'learning_rate': 4.140401146131805e-05, 'epoch': 9.75}
{'loss': 0.1603, 'learning_rate': 3.997134670487106e-05, 'epoch': 11.14}
{'loss': 0.1667, 'learning_rate': 3.853868194842407e-05, 'epoch': 12.53}
{'loss': 0.1623, 'learning_rate': 3.7106017191977077e-05, 'epoch': 13.93}
{'loss': 0.1464, 'learning_rate': 3.567335243553009e-05, 'epoch': 15.32}
{'loss': 0.17, 'learning_rate': 3.4240687679083095e-05, 'epoch': 16.71}
{'loss': 0.1577, 'learning_rate': 3.280802292263611e-05, 'epoch': 18.11}
{'loss': 0.1622, 'learning_rate': 3.1375358166189114e-05, 'epoch': 19.5}


  0%|          | 0/90 [00:00<?, ?it/s]

{'eval_loss': 0.12130001187324524, 'eval_accuracy': 0.4895397489539749, 'eval_runtime': 26.3239, 'eval_samples_per_second': 27.238, 'eval_steps_per_second': 3.419, 'epoch': 50.0}
