<a href="https://colab.research.google.com/github/samancha/nlp-master/blob/main/mod5/NLP_mod5_assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data loading and preprocessing.

In [None]:
from google.colab import drive
drive.mount('/content/drive/')

In [None]:
! pip install transformers

In [4]:
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset, random_split
import torch
import nltk
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import datetime 
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score
from sklearn.metrics import classification_report
# Download the stopwords and tokenizer from nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

seed_val = 42
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

[nltk_data] Downloading package stopwords to /Users/Steve/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/Steve/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
def preprocess_text(text):
    # Convert text to lowercase
    words = word_tokenize(text)

    # Convert words to lowercase
    words = [word.lower() for word in words]

    # Remove punctuation from words
    words = [word for word in words if word.isalnum()]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]

    # Remove duplicate words
    unique_words = list(dict.fromkeys(words))

    # Join the words back into a string
    text = ' '.join(unique_words)

    return text


In [7]:
df = pd.read_csv('IMDB Dataset.csv', )
df['label'] = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)
df['processed_review'] = df['review'].apply(preprocess_text)
df.reset_index(drop=True)
display(df.head())

Unnamed: 0,review,sentiment,label,processed_review
0,One of the other reviewers has mentioned that ...,positive,1,one reviewers mentioned watching 1 oz episode ...
1,A wonderful little production. <br /><br />The...,positive,1,wonderful little production br filming techniq...
2,I thought this was a wonderful way to spend ti...,positive,1,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,negative,0,basically family little boy jake thinks zombie...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1,petter mattei love time money visually stunnin...


# Text tokenization and conversion to BERT input features.

In [8]:
# Load the pre-trained BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)

Initializing processed reviews as inputs and associated labels

In [9]:
inputs = df.processed_review.values
labels = df.label.values
print("Train data size ", len(inputs))
print('Original: ', inputs[0])
# Print the sentence split into tokens.
print('Tokenized: ', tokenizer.tokenize(inputs[0]))
# Print the sentence mapped to token ids.
print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(inputs[0])))

Train data size  50000
Original:  one reviewers mentioned watching 1 oz episode hooked right exactly happened br first thing struck brutality unflinching scenes violence set word go trust show faint hearted timid pulls punches regards drugs sex hardcore classic use called nickname given oswald maximum security state penitentary focuses mainly emerald city experimental section prison cells glass fronts face inwards privacy high agenda em home many aryans muslims gangstas latinos christians italians irish scuffles death stares dodgy dealings shady agreements never far would say main appeal due fact goes shows dare forget pretty pictures painted mainstream audiences charm romance mess around ever saw nasty surreal could ready watched developed taste got accustomed levels graphic injustice crooked guards sold nickel inmates kill order get away well mannered middle class turned bitches lack street skills experience may become comfortable uncomfortable viewing thats touch darker side
Tokeniz

In [10]:
# Tokenize all of the sentences and map the tokens to thier word IDs.
input_ids = []
attention_masks = []

# For every sentence...
for sent in inputs:
    # `encode_plus` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    #   (5) Pad or truncate the sentence to `max_length`
    #   (6) Create attention masks for [PAD] tokens.
    encoded_dict = tokenizer.encode_plus(
                        sent,                      # Sentence to encode.
                        max_length = 64,           # Pad & truncate all sentences.
                        padding='max_length',
                        truncation=True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )

    # Add the encoded sentence to the list.
    input_ids.append(encoded_dict['input_ids'])

    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])

print('Original: ', inputs[0])
print('Token IDs:', input_ids[0])
print('Tokenized:', tokenizer.decode(input_ids[0][0]))
print('Attention_mask', attention_masks[0])

Original:  one reviewers mentioned watching 1 oz episode hooked right exactly happened br first thing struck brutality unflinching scenes violence set word go trust show faint hearted timid pulls punches regards drugs sex hardcore classic use called nickname given oswald maximum security state penitentary focuses mainly emerald city experimental section prison cells glass fronts face inwards privacy high agenda em home many aryans muslims gangstas latinos christians italians irish scuffles death stares dodgy dealings shady agreements never far would say main appeal due fact goes shows dare forget pretty pictures painted mainstream audiences charm romance mess around ever saw nasty surreal could ready watched developed taste got accustomed levels graphic injustice crooked guards sold nickel inmates kill order get away well mannered middle class turned bitches lack street skills experience may become comfortable uncomfortable viewing thats touch darker side
Token IDs: tensor([[  101,  20

In [11]:
bert_input_ids = torch.cat(input_ids, dim=0)
bert_attention_masks = torch.cat(attention_masks, dim=0)
bert_labels = torch.tensor(labels)

# Bert Input Features
print(type(bert_input_ids))
print(type(bert_attention_masks))

<class 'torch.Tensor'>
<class 'torch.Tensor'>


# Model definition, training, and evaluation.

#### Sequence Classifacation

In [12]:
# Load BertForSequenceClassification, the pretrained BERT model with a single
model = BertForSequenceClassification.from_pretrained("bert-base-uncased")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Fine-tune the BERT model on the preprocessed IMDb dataset for sentiment analysis.

### Implement training loops and loss calculation.

In [15]:
batch_size = 64
epochs = 2
optimizer = AdamW(model.parameters(), lr=2e-5)
model.train()

for epoch in range(epochs):
    for i in range(0, bert_input_ids.size(0), batch_size):
        batch_input_ids = bert_input_ids[i:i+batch_size]
        batch_attention_masks = bert_attention_masks[i:i+batch_size]
        batch_labels = bert_labels[i:i+batch_size]

        optimizer.zero_grad()

        outputs = model(
            input_ids=batch_input_ids,
            attention_mask=batch_attention_masks,
            labels=batch_labels
        )

        loss = outputs.loss
        loss.backward()
        optimizer.step()



KeyboardInterrupt: 

In [None]:
predicted_labels = torch.argmax(outputs.logits, dim=1)
print(predicted_labels[0])
# for text, label in zip(inputs, predicted_labels):
#     print(f'Text: {text}\nPredicted Label: {label.item()}\n')

In [None]:
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))

    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))


### Evaluation

testing set using accuracy, precision, recall, and F1-score metrics

In [16]:
# Use train_test_split to split our data into train and validation sets for
# training
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

batch_size = 64

def train_valid_split(input_ids, attention_masks, labels):
  # Use 80% for training and 20% for validation.
  train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, labels, 
                                                              random_state=2021, test_size=0.2)
  # Do the same for the masks.
  train_masks, validation_masks, _, _ = train_test_split(attention_masks, labels,
                                              random_state=2021, test_size=0.2)


  print('example train_input:', train_inputs[0])
  print('example attention_mask', train_masks[0])
  print('type of train_labels:', type(train_labels[0]))
  print('type of validation_labels:', type(validation_labels[0]))

  train_labels = torch.tensor(train_labels)
  validation_labels = torch.tensor(validation_labels)

  # Create the DataLoader for our training set.
  train_data = TensorDataset(train_inputs, train_masks, train_labels)
  train_sampler = RandomSampler(train_data)
  train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

  # Create the DataLoader for our validation set.
  validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
  validation_sampler = SequentialSampler(validation_data)
  validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

  return train_dataloader, validation_dataloader

In [18]:
bert_train_dataloader, bert_validation_dataloader = train_valid_split(bert_input_ids, bert_attention_masks, bert_labels)

example train_input: tensor([  101,  2471,  3585,  8201,  7842,  5737, 10649,  2466,  7987,  7814,
         6463,  2614,  4289, 12991, 20857,  3689,  7315,  3370,  4125,  2991,
         2637,  2034,  2931,  2694,  2739,  8133, 12347,  5896,  3257,  2488,
         3459,  2453,  2979, 20327, 13109,  5714,  6508,  2428,  4276,  3947,
         2204,  4516,  3395,  2190,  2126,  2175,  4050,  2844,  2537,  5300,
         3185,  5450, 23105,  2214, 19857,  4246, 11973,   102,     0,     0,
            0,     0,     0,     0])
example attention_mask tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0])
type of train_labels: <class 'torch.Tensor'>
type of validation_labels: <class 'torch.Tensor'>


  train_labels = torch.tensor(train_labels)
  validation_labels = torch.tensor(validation_labels)


In [None]:
# function to train the model
def train(train_dataloader):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    model.train()
    total_loss, total_accuracy = 0, 0
  
    # empty list to save model predictions
    total_preds=[]
  
    # iterate over batches
    for step,batch in enumerate(train_dataloader):
        
        # progress update after every 50 batches.
        if step % 50 == 0 and not step == 0:
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(train_dataloader)))
        
        # push the batch to gpu
        batch = [r.to(device) for r in batch]
 
        sent_id, mask, labels = batch
        
        # clear previously calculated gradients 
        model.zero_grad()        

        # get model predictions for the current batch
        preds = model(sent_id, mask)

        # compute the loss between actual and predicted values
        loss = cross_entropy(preds, labels)

        # add on to the total loss
        total_loss = total_loss + loss.item()

        # backward pass to calculate the gradients
        loss.backward()

        # clip the the gradients to 1.0. It helps in preventing the exploding gradient problem
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # update parameters
        optimizer.step()

        # model predictions are stored on GPU. So, push it to CPU
        preds=preds.detach().cpu().numpy()

    # append the model predictions
    total_preds.append(preds)

    # compute the training loss of the epoch
    avg_loss = total_loss / len(train_dataloader)
  
      # predictions are in the form of (no. of batches, size of batch, no. of classes).
      # reshape the predictions in form of (number of samples, no. of classes)
    total_preds  = np.concatenate(total_preds, axis=0)

    #returns the loss and predictions
    return avg_loss, total_preds

In [None]:
train(bert_train_dataloader)

In [20]:
from transformers import get_linear_schedule_with_warmup
import time
import datetime

def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

def format_time(elapsed):
    elapsed_rounded = int(round((elapsed)))
    return str(datetime.timedelta(seconds=elapsed_rounded))

def train_model(model, epochs, train_dataloader, validation_dataloader):      
    optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )
    total_steps = len(train_dataloader) * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, 
                                            num_training_steps = total_steps)
    loss_values = []
    eval_accs = []

    for epoch_i in range(0, epochs):
        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
        print('Training...')
        t0 = time.time()

        total_loss = 0
        model.train()

        for step, batch in enumerate(train_dataloader):

            if step % 40 == 0 and not step == 0:
                elapsed = format_time(time.time() - t0)
                
                # Report progress.
                print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

            b_input_ids = batch[0] #.to(device)
            b_input_mask = batch[1] #.to(device)
            b_labels = batch[2] #.to(device)

            model.zero_grad()        

            # Perform a forward pass (evaluate the model on this training batch).
            # This will return the loss (rather than the model output) because we
            # have provided the `labels`.
            # The documentation for this `model` function is here: 
            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            outputs = model(b_input_ids, 
                        token_type_ids=None, 
                        attention_mask=b_input_mask, 
                        labels=b_labels)
            
            # The call to `model` always returns a tuple, so we need to pull the 
            # loss value out of the tuple.
            loss = outputs[0]

            # Accumulate the training loss over all of the batches so that we can
            # calculate the average loss at the end. `loss` is a Tensor containing a
            # single value; the `.item()` function just returns the Python value 
            # from the tensor.
            total_loss += loss.item()

            # Perform a backward pass to calculate the gradients.
            loss.backward()

            # Clip the norm of the gradients to 1.0.
            # This is to help prevent the "exploding gradients" problem.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update parameters and take a step using the computed gradient.
            # The optimizer dictates the "update rule"--how the parameters are
            # modified based on their gradients, the learning rate, etc.
            optimizer.step()

            # Update the learning rate.
            scheduler.step()

        # Calculate the average loss over the training data.
        avg_train_loss = total_loss / len(train_dataloader)            
        
        # Store the loss value for plotting the learning curve.
        loss_values.append(avg_train_loss)

        print("")
        print("  Average training loss: {0:.2f}".format(avg_train_loss))
        print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))
            

        print("Running Validation...")

        t0 = time.time()
        model.eval()

        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0

        # Evaluate data for one epoch
        for batch in validation_dataloader:
            # batch = tuple(t.to(device) for t in batch)
            batch = tuple(t for t in batch)
            b_input_ids, b_input_mask, b_labels = batch
            
            with torch.no_grad():        
                # Forward pass, calculate logit predictions.
                # This will return the logits rather than the loss because we have
                # not provided labels.
                # token_type_ids is the same as the "segment ids", which 
                # differentiates sentence 1 and 2 in 2-sentence tasks.
                outputs = model(b_input_ids, 
                                token_type_ids=None, 
                                attention_mask=b_input_mask)
            
            # Get the "logits" output by the model. The "logits" are the output
            # values prior to applying an activation function like the softmax.
            logits = outputs[0]
            # Move logits and labels to CPU
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()
            # Calculate the accuracy for this batch of test sentences.
            tmp_eval_accuracy = flat_accuracy(logits, label_ids)
            # Accumulate the total accuracy.
            eval_accuracy += tmp_eval_accuracy
            # Track the number of batches
            nb_eval_steps += 1

        avg_eval_acc = eval_accuracy/nb_eval_steps
        print("  Accuracy: {0:.2f}".format(avg_eval_acc))
        print("  Validation took: {:}".format(format_time(time.time() - t0)))
        eval_accs.append(avg_eval_acc)
    print("")
    print("Training complete!")
    return loss_values, eval_accs

In [21]:
freeze_bert_loss_vals, freeze_bert_eval_accs = train_model(model, 4, bert_train_dataloader, bert_validation_dataloader) # about 1 minute for 4 epochs using CPU




Training...


# Sample movie review predictions and explanations.


In [19]:
# Perform inference
with torch.no_grad():
    outputs = model(bert_input_ids[0], attention_mask=bert_attention_masks[0]).logits

# Get predicted label
predicted_label = torch.argmax(outputs, dim=1).item()

# Define label names
label_names = [1, 0]

# Print result
print("Text:", input_ids[0])
print("Predicted Label:", label_names[predicted_label])

# for text, label in zip(test_texts, predicted_labels):
#     print(f'Text: {text}\nPredicted Label: {label.item()}\n')

ValueError: not enough values to unpack (expected 2, got 1)