In [1]:
## Pre-requisites

In [2]:
import pandas as pd
import numpy as np
import nltk
import contractions
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
import string
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import torch.nn as nn
from transformers import BertModel
import torch
from transformers import BertTokenizer
import torch.nn.functional as F
from tqdm import tqdm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
import xgboost
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error
import math
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW, get_linear_schedule_with_warmup
import time

%matplotlib inline

In [3]:
## Load Dataset

In [4]:
imdb_data = pd.read_csv('IMDB_dataset.csv')

In [5]:
imdb_data

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [6]:
labelEncoder = LabelEncoder()

In [7]:
encoded = labelEncoder.fit_transform(imdb_data['sentiment'])
imdb_data.drop(labels=['sentiment'], axis=1, inplace=True)
imdb_data['sentiment'] = encoded
imdb_data

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1
...,...,...
49995,I thought this movie did a down right good job...,1
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",0
49997,I am a Catholic taught in parochial elementary...,0
49998,I'm going to have to disagree with the previou...,0


In [8]:
imdb_data.describe()

Unnamed: 0,sentiment
count,50000.0
mean,0.5
std,0.500005
min,0.0
25%,0.0
50%,0.5
75%,1.0
max,1.0


In [9]:
## Data Cleaning

In [10]:
### Check for nulls

In [11]:
imdb_data.isna().sum()

review       0
sentiment    0
dtype: int64

In [12]:
imdb_data.dropna(axis=0, inplace=True)

In [13]:
### To lower case

In [14]:
imdb_data['review'] = imdb_data['review'].str.lower()

In [15]:
### Remove HTML and URLs from comments

In [16]:
imdb_data['review'] = imdb_data['review']\
    .replace('http\S+', ' ', regex=True)\
    .replace('www\S+', ' ', regex=True)\
    .replace('<\S+>', ' ', regex=True)

In [17]:
### Remove non-alphabetical characters

In [18]:
imdb_data['review'] = imdb_data['review']\
    .replace('[^\w +]', ' ', regex=True)\
    .replace(r'([\;\:\|•«\n])', ' ', regex=True)

In [19]:
### Remove unnecessary spaces

In [20]:
imdb_data['review'] = imdb_data['review'].str.strip()
imdb_data['review'] = imdb_data['review']\
    .replace(' +', ' ', regex=True)

In [21]:
### Resolving contractions

In [22]:
imdb_data['review'] = imdb_data['review']\
    .apply(lambda row: contractions.fix(row))

In [23]:
### Remove digits and words with digits

In [24]:
imdb_data['review'] = imdb_data['review']\
    .replace('\w*\d\w*', ' ', regex=True)

In [25]:
### Post data cleaning

In [26]:
imdb_data

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,1
1,a wonderful little production br br the filmin...,1
2,i thought this was a wonderful way to spend ti...,1
3,basically there s a family where a little boy ...,0
4,petter mattei s love in the time of money is a...,1
...,...,...
49995,i thought this movie did a down right good job...,1
49996,bad plot bad dialogue bad acting idiotic direc...,0
49997,i am a catholic taught in parochial elementary...,0
49998,i m going to have to disagree with the previou...,0


In [27]:
## Train - Valid - Test split

In [28]:
train_imdb, temp = train_test_split(imdb_data, train_size=0.80, shuffle=True)
valid_imdb, test_imdb = train_test_split(temp, train_size=0.50, shuffle=True)

In [29]:
print(train_imdb.shape)
print(valid_imdb.shape)
print(test_imdb.shape)

(40000, 2)
(5000, 2)
(5000, 2)


In [30]:
## Fine Tuning BERT for classification

In [31]:
if torch.cuda.is_available():       
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
Device name: NVIDIA GeForce RTX 3060 Laptop GPU


In [32]:
class BertClassifier(nn.Module):
    """Bert Model for Classification Tasks.
    """
    def __init__(self, freeze_bert=False):
        """
        @param    bert: a BertModel object
        @param    classifier: a torch.nn.Module classifier
        @param    freeze_bert (bool): Set `False` to fine-tune the BERT model
        """
        super(BertClassifier, self).__init__()

        # Instantiate BERT model
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        
        # Specify hidden size of BERT, hidden size of our classifier, and number of labels
        D_in, H, D_out = self.bert.config.hidden_size, 50, 2

        # Instantiate an one-layer feed-forward classifier
        self.classifier = nn.Sequential(
            nn.Linear(D_in, H),
            nn.ReLU(),
            #nn.Dropout(0.5),
            nn.Linear(H, D_out)
        )

        # Freeze the BERT model
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False
        
    def forward(self, input_ids, attention_mask):
        """
        Feed input to BERT and the classifier to compute logits.
        @param    input_ids (torch.Tensor): an input tensor with shape (batch_size,
                      max_length)
        @param    attention_mask (torch.Tensor): a tensor that hold attention mask
                      information with shape (batch_size, max_length)
        @return   logits (torch.Tensor): an output tensor with shape (batch_size,
                      num_labels)
        """
        # Feed input to BERT
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask)
        
        # Extract the last hidden state of the token `[CLS]` for classification task
        last_hidden_state_cls = outputs[0][:, 0, :]

        # Feed input to classifier to compute logits
        logits = self.classifier(last_hidden_state_cls)

        return logits

In [33]:
bert_classifier = BertClassifier(freeze_bert=False)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [34]:
# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [35]:
MAX_LEN = 350
def preprocessing_for_bert(data):
    """Perform required preprocessing steps for pretrained BERT.
    @param    data (np.array): Array of texts to be processed.
    @return   input_ids (torch.Tensor): Tensor of token ids to be fed to a model.
    @return   attention_masks (torch.Tensor): Tensor of indices specifying which
                  tokens should be attended to by the model.
    """
    # Create empty lists to store outputs
    input_ids = []
    attention_masks = []

    # For every sentence...
    for sent in data:
        # `encode_plus` will:
        #    (1) Tokenize the sentence
        #    (2) Add the `[CLS]` and `[SEP]` token to the start and end
        #    (3) Truncate/Pad sentence to max length
        #    (4) Map tokens to their IDs
        #    (5) Create attention mask
        #    (6) Return a dictionary of outputs
        encoded_sent = tokenizer.encode_plus(
            text=sent,  # Preprocess sentence
            add_special_tokens=True,        # Add `[CLS]` and `[SEP]`
            max_length=MAX_LEN,                  # Max length to truncate/pad
            truncation=True,                # Truncaate long sentences to max len
            padding='max_length',
            #return_tensors='pt',           # Return PyTorch tensor
            return_attention_mask=True      # Return attention mask
            )
        
        # Add the outputs to the lists
        input_ids.append(encoded_sent.get('input_ids'))
        attention_masks.append(encoded_sent.get('attention_mask'))

    # Convert lists to tensors
    input_ids = torch.tensor(input_ids)
    attention_masks = torch.tensor(attention_masks)

    return input_ids, attention_masks

In [36]:
### Create Torch Dataloaders

In [37]:
# Run function `preprocessing_for_bert` on the train set and the validation set
print('Tokenizing data...')
train_inputs, train_masks = preprocessing_for_bert(train_imdb['review'])
val_inputs, val_masks = preprocessing_for_bert(valid_imdb['review'])

Tokenizing data...


In [38]:
# Convert other data types to torch.Tensor
train_labels = torch.tensor(np.array(train_imdb['sentiment']))
val_labels = torch.tensor(np.array(valid_imdb['sentiment']))

batch_size = 8

# Create the DataLoader for our training set
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Create the DataLoader for our validation set
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

In [39]:
def initialize_model(epochs=4):
    """Initialize the Bert Classifier, the optimizer and the learning rate scheduler.
    """
    # Instantiate Bert Classifier
    bert_classifier = BertClassifier(freeze_bert=False)

    # Tell PyTorch to run the model on GPU
    bert_classifier.to(device)

    # Create the optimizer
    optimizer = AdamW(bert_classifier.parameters(),
                      lr=5e-5,    # Default learning rate
                      eps=1e-8    # Default epsilon value
                      )

    # Total number of training steps
    total_steps = len(train_dataloader) * epochs

    # Set up the learning rate scheduler
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0, # Default value
                                                num_training_steps=total_steps)
    return bert_classifier, optimizer, scheduler

In [40]:
### BERT Training Loop

In [41]:
loss_fn = nn.CrossEntropyLoss()

def train(model, train_dataloader, val_dataloader=None, epochs=2, evaluation=False):
    """Train the BertClassifier model.
    """
    # Start training loop
    min_error = math.inf
    print("Start training...\n")
    for epoch_i in range(epochs):
        # =======================================
        #               Training
        # =======================================
        # Print the header of the result table
        print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}")
        print("-"*70)

        # Measure the elapsed time of each epoch
        t0_epoch, t0_batch = time.time(), time.time()

        # Reset tracking variables at the beginning of each epoch
        total_loss, batch_loss, batch_counts = 0, 0, 0

        # Put the model into the training mode
        model.train()

        # For each batch of training data...
        for step, batch in enumerate(train_dataloader):
            batch_counts +=1
            # Load batch to GPU
            b_input_ids, b_attn_mask, b_labels = tuple(t.type(torch.LongTensor).to(device) for t in batch)
            
            

            # Zero out any previously calculated gradients
            model.zero_grad()

            # Perform a forward pass. This will return logits.
            logits = model(b_input_ids, b_attn_mask)

            # Compute loss and accumulate the loss values
            loss = loss_fn(logits, b_labels)
            batch_loss += loss.item()
            total_loss += loss.item()

            # Perform a backward pass to calculate gradients
            loss.backward()

            # Clip the norm of the gradients to 1.0 to prevent "exploding gradients"
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update parameters and the learning rate
            optimizer.step()
            scheduler.step()

            # Print the loss values and time elapsed for every 20 batches
            if (step % 20 == 0 and step != 0) or (step == len(train_dataloader) - 1):
                # Calculate time elapsed for 20 batches
                time_elapsed = time.time() - t0_batch

                # Print training results
                print(f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}")

                # Reset batch tracking variables
                batch_loss, batch_counts = 0, 0
                t0_batch = time.time()

        # Calculate the average loss over the entire training data
        avg_train_loss = total_loss / len(train_dataloader)

        print("-"*70)
        # =======================================
        #               Evaluation
        # =======================================
        if evaluation == True:
            # After the completion of each training epoch, measure the model's performance
            # on our validation set.
            val_loss, val_accuracy = evaluate(model, val_dataloader)

            # Print performance over the entire training data
            time_elapsed = time.time() - t0_epoch
            
            print(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {val_accuracy:^9.2f} | {time_elapsed:^9.2f}")
            print("-"*70)
        print("\n")
        if val_loss <= min_error:
            print('Min Error decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
            min_error,
            val_loss))
            torch.save(model.state_dict(), 'BERT-new-best-saved-model.pt')
            min_error = val_loss
    
    print("Training complete!")

In [42]:
def evaluate(model, val_dataloader):
    """After the completion of each training epoch, measure the model's performance
    on our validation set.
    """
    # Put the model into the evaluation mode. The dropout layers are disabled during
    # the test time.
    model.eval()

    # Tracking variables
    val_accuracy = []
    val_loss = []

    # For each batch in our validation set...
    for batch in val_dataloader:
        # Load batch to GPU
        b_input_ids, b_attn_mask, b_labels = tuple(t.type(torch.LongTensor).to(device) for t in batch)

        # Compute logits
        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)

        # Compute loss
        loss = loss_fn(logits, b_labels)
        val_loss.append(loss.item())

        # Get the predictions
        preds = torch.argmax(logits, dim=1).flatten()

        # Calculate the accuracy rate
        accuracy = (preds == b_labels).cpu().numpy().mean() * 100
        val_accuracy.append(accuracy)

    # Compute the average accuracy and loss over the validation set.
    val_loss = np.mean(val_loss)
    val_accuracy = np.mean(val_accuracy)

    return val_loss, val_accuracy

In [43]:
### Start Training

In [44]:
bert_classifier, optimizer, scheduler = initialize_model(epochs=2)
train(bert_classifier, train_dataloader, val_dataloader, epochs=2, evaluation=True)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Start training...

 Epoch  |  Batch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
----------------------------------------------------------------------
   1    |   20    |   0.699387   |     -      |     -     |   15.00  
   1    |   40    |   0.710280   |     -      |     -     |   11.12  
   1    |   60    |   0.684048   |     -      |     -     |   11.17  
   1    |   80    |   0.563364   |     -      |     -     |   11.22  
   1    |   100   |   0.595891   |     -      |     -     |   11.27  
   1    |   120   |   0.452116   |     -      |     -     |   11.30  
   1    |   140   |   0.542669   |     -      |     -     |   11.30  
   1    |   160   |   0.340593   |     -      |     -     |   11.28  
   1    |   180   |   0.481458   |     -      |     -     |   11.38  
   1    |   200   |   0.406017   |     -      |     -     |   11.35  
   1    |   220   |   0.477809   |     -      |     -     |   11.25  
   1    |   240   |   0.523949   |     -      |     -     |   11.25  


   1    |  2320   |   0.225517   |     -      |     -     |   22.00  
   1    |  2340   |   0.179775   |     -      |     -     |   21.96  
   1    |  2360   |   0.336086   |     -      |     -     |   21.98  
   1    |  2380   |   0.400798   |     -      |     -     |   22.08  
   1    |  2400   |   0.291477   |     -      |     -     |   21.88  
   1    |  2420   |   0.268629   |     -      |     -     |   21.93  
   1    |  2440   |   0.300277   |     -      |     -     |   21.99  
   1    |  2460   |   0.212890   |     -      |     -     |   22.03  
   1    |  2480   |   0.420676   |     -      |     -     |   21.82  
   1    |  2500   |   0.328386   |     -      |     -     |   21.96  
   1    |  2520   |   0.385793   |     -      |     -     |   22.02  
   1    |  2540   |   0.293414   |     -      |     -     |   21.93  
   1    |  2560   |   0.270514   |     -      |     -     |   21.99  
   1    |  2580   |   0.207275   |     -      |     -     |   21.91  
   1    |  2600   | 

   1    |  4680   |   0.281184   |     -      |     -     |   22.16  
   1    |  4700   |   0.257668   |     -      |     -     |   22.09  
   1    |  4720   |   0.266685   |     -      |     -     |   21.94  
   1    |  4740   |   0.314006   |     -      |     -     |   21.94  
   1    |  4760   |   0.230425   |     -      |     -     |   22.15  
   1    |  4780   |   0.267500   |     -      |     -     |   22.17  
   1    |  4800   |   0.137867   |     -      |     -     |   22.07  
   1    |  4820   |   0.256123   |     -      |     -     |   22.07  
   1    |  4840   |   0.275214   |     -      |     -     |   22.22  
   1    |  4860   |   0.326402   |     -      |     -     |   22.01  
   1    |  4880   |   0.240917   |     -      |     -     |   22.02  
   1    |  4900   |   0.292458   |     -      |     -     |   22.07  
   1    |  4920   |   0.345489   |     -      |     -     |   22.10  
   1    |  4940   |   0.412601   |     -      |     -     |   22.17  
   1    |  4960   | 

   2    |  1920   |   0.234328   |     -      |     -     |   21.84  
   2    |  1940   |   0.121158   |     -      |     -     |   21.84  
   2    |  1960   |   0.110569   |     -      |     -     |   21.91  
   2    |  1980   |   0.134302   |     -      |     -     |   21.90  
   2    |  2000   |   0.420544   |     -      |     -     |   21.92  
   2    |  2020   |   0.171808   |     -      |     -     |   21.93  
   2    |  2040   |   0.172926   |     -      |     -     |   21.96  
   2    |  2060   |   0.227374   |     -      |     -     |   21.99  
   2    |  2080   |   0.240127   |     -      |     -     |   22.02  
   2    |  2100   |   0.161330   |     -      |     -     |   21.92  
   2    |  2120   |   0.124510   |     -      |     -     |   21.96  
   2    |  2140   |   0.321210   |     -      |     -     |   21.94  
   2    |  2160   |   0.087276   |     -      |     -     |   21.87  
   2    |  2180   |   0.132115   |     -      |     -     |   21.91  
   2    |  2200   | 

   2    |  4280   |   0.103932   |     -      |     -     |   21.83  
   2    |  4300   |   0.145844   |     -      |     -     |   21.80  
   2    |  4320   |   0.128802   |     -      |     -     |   21.84  
   2    |  4340   |   0.187283   |     -      |     -     |   21.85  
   2    |  4360   |   0.135049   |     -      |     -     |   21.81  
   2    |  4380   |   0.150776   |     -      |     -     |   21.77  
   2    |  4400   |   0.101563   |     -      |     -     |   21.86  
   2    |  4420   |   0.123992   |     -      |     -     |   21.90  
   2    |  4440   |   0.239859   |     -      |     -     |   21.95  
   2    |  4460   |   0.175891   |     -      |     -     |   21.81  
   2    |  4480   |   0.120384   |     -      |     -     |   21.89  
   2    |  4500   |   0.087169   |     -      |     -     |   21.84  
   2    |  4520   |   0.144038   |     -      |     -     |   21.84  
   2    |  4540   |   0.087797   |     -      |     -     |   21.89  
   2    |  4560   | 

In [45]:
### Prediction with BERT

In [46]:
def bert_predict(model, test_dataloader):
    """Perform a forward pass on the trained BERT model to predict probabilities
    on the test set.
    """
    # Put the model into the evaluation mode. The dropout layers are disabled during
    # the test time.
    model.eval()

    all_logits = []

    # For each batch in our test set...
    for batch in test_dataloader:
        # Load batch to GPU
        b_input_ids, b_attn_mask = tuple(t.to(device) for t in batch)[:2]

        # Compute logits
        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)
        all_logits.append(logits)
    
    # Concatenate logits from each batch
    all_logits = torch.cat(all_logits, dim=0)

    # Apply softmax to calculate probabilities
    probs = F.softmax(all_logits, dim=1).cpu().numpy()

    return probs

In [47]:
### Testing model on test data

In [48]:
print('Tokenizing data...')
test_inputs, test_masks = preprocessing_for_bert(test_imdb['review'])

# Create the DataLoader for our test set
test_dataset = TensorDataset(test_inputs, test_masks)
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=8)

Tokenizing data...


In [49]:
probs = bert_predict(bert_classifier, test_dataloader)

In [50]:
threshold = 0.5
preds = np.where(probs[:, 1] > threshold, 1, 0)

In [51]:
accuracy_score(test_imdb['sentiment'], preds)

0.9318