In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m30.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.17.3-py3-none-any.whl (295 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m29.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m43.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m64.4 MB/s[0m eta [36m0:00:00[0m
Insta

In [2]:
#Import libraries
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, BertConfig, get_linear_schedule_with_warmup
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from keras.utils import pad_sequences
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import numpy as np
import random
from sklearn import metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [3]:
#Connect to Google Drive
from google.colab import drive
drive.mount('/content/drive')

import warnings
warnings.filterwarnings("ignore")

device = torch.device("cuda")
torch.cuda.empty_cache()

Mounted at /content/drive


##Data loading and preprocessing

In [4]:
#Text preprocessing
data = pd.read_csv('/content/drive/MyDrive/University/MODULE5/IMDB Dataset.csv')
print(data.shape)
data.sample(5)

(50000, 2)


Unnamed: 0,review,sentiment
21268,"As you may have gathered from the title, I who...",negative
39816,Not worth the video rental or the time or the ...,negative
19684,I found this movie to be okay.<br /><br />On p...,negative
40724,this a haunting piece of work.its only ten min...,positive
30389,This is the most compelling and excellent perf...,positive


In [5]:
#Label 0 = negative sentiment, 1 = positive
data['sentiment'] = data['sentiment'].replace(1,'positive')
data['sentiment'] = data['sentiment'].replace(0,'negative')

label_encoder = LabelEncoder()
data['sentiment'] = label_encoder.fit_transform(data['sentiment'])

#Get list of sentences and labels
sentences = data.review.values
labels = data.sentiment.values

##Text tokenization and conversion to BERT input features

In [6]:
#BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)

#Tokenize and get IDs from one sentence
print('Original: ', sentences[413])
print('Tokenized: ', tokenizer.tokenize(sentences[413]))
print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentences[413])))

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Original:  A rating of "1" does not begin to express how dull, depressing and relentlessly bad this movie is.
Tokenized:  ['a', 'rating', 'of', '"', '1', '"', 'does', 'not', 'begin', 'to', 'express', 'how', 'dull', ',', 'de', '##pressing', 'and', 'relentless', '##ly', 'bad', 'this', 'movie', 'is', '.']
Token IDs:  [1037, 5790, 1997, 1000, 1015, 1000, 2515, 2025, 4088, 2000, 4671, 2129, 10634, 1010, 2139, 24128, 1998, 21660, 2135, 2919, 2023, 3185, 2003, 1012]


In [7]:
#Complete tokenization and IDs for all sentences
input_ids = []
for sent in sentences:
  id = tokenizer.encode(sent)
  input_ids.append(id)

Token indices sequence length is longer than the specified maximum sequence length for this model (563 > 512). Running this sequence through the model will result in indexing errors


In [8]:
#Find max sentence length
# max_len = max([len(sen) for sen in input_ids])
# print('Max sentence length: ', max_len)

max_len = 512

#Pad the input to the max length
input_ids = pad_sequences(input_ids, maxlen=max_len, dtype="long", value=0, truncating="post", padding="post")

#Create attention masks
attention_mask = []
for sent in input_ids:
  att_mask = [int(token_id > 0) for token_id in sent]
  attention_mask.append(att_mask)

In [9]:
#Split dataset into training and testing and validation
X_train, X_test, y_train, y_test = train_test_split(input_ids, labels, random_state=10, test_size=0.5)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, random_state=10, test_size=0.2)

#Split attention masks
train_masks, test_masks, train_label, _ = train_test_split(attention_mask, labels,  random_state=10, test_size=0.5)
train_masks, val_masks, _, _ = train_test_split(train_masks, train_label,  random_state=10, test_size=0.2)

#Convert input data to tensors
X_train = torch.tensor(X_train)
X_test = torch.tensor(X_test)
X_val = torch.tensor(X_val)

y_train = torch.tensor(y_train)
y_test = torch.tensor(y_test)
y_val = torch.tensor(y_val)

train_masks = torch.tensor(train_masks)
test_masks = torch.tensor(test_masks)
val_masks = torch.tensor(val_masks)

##Model definition, training, and evaluation

In [10]:
#Load the pre-trained BERT model and tune
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels = 2, output_attentions = False, output_hidden_states = False, )
optimizer = AdamW(model.parameters(),lr = 2e-5,eps = 1e-8)
model.cuda()

batch_size = 16

#Create the DataLoader for training
train_data = TensorDataset(X_train, train_masks, y_train)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

#Create the DataLoader for validation
validation_data = TensorDataset(X_val, val_masks, y_val)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

epochs = 2
total_steps = len(train_dataloader)*epochs
scheduler = get_linear_schedule_with_warmup(optimizer,num_warmup_steps = 0,num_training_steps = total_steps)

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
#Function to calculate the accuracy
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [12]:
#Train model
seed_val = 10
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

loss_values = []
for i in range(0, epochs):
  print('Epoch {:} / {:}'.format(i + 1, epochs))
  total_loss = 0
  model.train()
  for step, batch in enumerate(train_dataloader):
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_labels = batch[2].to(device)
    model.zero_grad()
    outputs = model(b_input_ids,token_type_ids=None,attention_mask=b_input_mask,labels=b_labels)
    loss = outputs[0]
    total_loss += loss.item()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    optimizer.step()
    scheduler.step()
  avg_train_loss = total_loss / len(train_dataloader)
  loss_values.append(avg_train_loss)
  print('Average training loss: {0:.2f}'.format(avg_train_loss))

  model.eval()
  eval_loss, eval_accuracy = 0, 0
  nb_eval_steps, nb_eval_examples = 0, 0
  for batch in validation_dataloader:
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch
    with torch.no_grad():
      outputs = model(b_input_ids,token_type_ids=None, attention_mask=b_input_mask)
    logits = outputs[0]
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
    tmp_eval_accuracy = flat_accuracy(logits, label_ids)
    eval_accuracy += tmp_eval_accuracy
    nb_eval_steps += 1
  print('Accuracy: {0:.2f}'.format(eval_accuracy/nb_eval_steps))

Epoch 1 / 2
Average training loss: 0.26
Accuracy: 0.93
Epoch 2 / 2
Average training loss: 0.13
Accuracy: 0.93


In [13]:
#Create the DataLoader for testing
prediction_data = TensorDataset(X_test, test_masks, y_test)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

In [14]:
#Evaluate model on the test set
model.eval()
predictions , true_labels = [], []

for batch in prediction_dataloader:
  batch = tuple(t.to(device) for t in batch)
  b_input_ids, b_input_mask, b_labels = batch
  with torch.no_grad():
      outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
  logits = outputs[0]

  logits = logits.detach().cpu().numpy()
  label_ids = b_labels.to('cpu').numpy()

  predictions.append(logits)
  true_labels.append(label_ids)

flat_predictions = [item for sublist in predictions for item in sublist]
flat_predictions = np.argmax(flat_predictions, axis=1).flatten()
flat_true_labels = [item for sublist in true_labels for item in sublist]

In [15]:
#Performance metrics
accuracy = accuracy_score(flat_true_labels, flat_predictions)
precision = precision_score(flat_true_labels, flat_predictions, average='weighted')
recall= recall_score(flat_true_labels, flat_predictions, average='weighted')
f1_score = metrics.f1_score(flat_true_labels, flat_predictions, average='weighted')
print('Performance Evaluation:')
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1_score)

Performance Evaluation:
Accuracy: 0.9312
Precision: 0.9316237983314802
Recall: 0.9312
F1 Score: 0.9311874412173906


##Sample movie review predictions and explanations

In [16]:
def get_predictions(text,max_len):
  encoded_text = tokenizer.encode_plus(text, add_special_tokens=True, max_length=max_len, truncation=True, return_token_type_ids=False, padding='max_length',return_attention_mask=True,return_tensors='pt')
  input_ids = encoded_text["input_ids"].to(device)
  attention_mask = encoded_text["attention_mask"].to(device)
  outputs = model(input_ids, attention_mask)
  _, predictions = torch.max(outputs[0], dim=1)
  return predictions

In [17]:
text = data.iloc[21042][0]
prediction = get_predictions(text,max_len)
print(text,':', prediction)

Brilliant. Ranks along with Citizen Kane, The Matrix and Godfathers. Must see, at least for basset in her early days. Watch it. : tensor([1], device='cuda:0')
