# Sentiment Classification with DistilBERT

## Import Libraries

In [60]:
import pandas as pd 
import numpy as np 

from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch 
import torch.nn.functional as F

from datasets import load_dataset

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, average_precision_score, confusion_matrix

In [21]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

## Load Data

In [38]:
# Load the test split of the 'imdb' dataset.
data = load_dataset('imdb', split='test')

# Labels as a NumPy array so boolean masks such as (y == 1) can be built.
y = np.array(data['label'])
# Keep the reviews as an object array of Python strings. Without
# dtype=object NumPy creates a fixed-width unicode array in which every
# element is padded to the length of the longest review, which inflates
# memory use. Boolean-mask indexing (X[y == 1]) works the same either way.
X = np.array(data['text'], dtype=object)

## Descriptive Statistics

In [39]:
# Report how many reviews were loaded.
num_samples = len(X)
print('Number of samples:', num_samples)

Number of samples: 25000


In [40]:
# Whitespace-separated word count of every review.
len_of_text = [len(review.split()) for review in X]

# Each summary line pairs its printed label with the NumPy reducer used.
length_summaries = (
    ('Average length of text:', np.mean),
    ('Min length of text:', np.min),
    ('Max length of text:', np.max),
    ('Median length of text:', np.median),
)
for summary_label, reducer in length_summaries:
    print(summary_label, reducer(len_of_text))
print()

Average length of text: 228.52668
Min length of text: 4
Max length of text: 2278
Median length of text: 172.0



In [41]:
# Mean of the labels, scaled to a percentage.
positive_fraction = np.mean(y)
print('Percent of positive reviews in set:', positive_fraction * 100)


Percent of positive reviews in set: 50.0


## Example Reviews

In [42]:
# Show one review with label 1 (positive), selected via a boolean mask.
positive_reviews = X[y == 1]
print('Example positive review:')
print(positive_reviews[65])
print()

# Show one review with label 0 (negative).
negative_reviews = X[y == 0]
print('Example negative review:')
print(negative_reviews[25])

Example positive review:
Fun, entertaining movie about WWII German spy (Julie Andrews!) falling in love with American pilot (Rock Hudson), while trying to get secrets from him. For some reason this was attacked by critics and shunned by the public in 1970--I can't see why. It's beautifully shot, has wonderful costumes and interiors, and exciting aerial dogfights. Also it has Andrews doing a strip-tease (strictly PG material) and singing a beautiful song--"Whistling in the Dark". The movie does have problems. Andrews and Hudson did not get along during the shooting of this--and it shows. Their love scenes lack spark and they have zero sexual chemistry. Still, they turn in OK performances. The film is a little long (even in the 105 min director's cut I saw) and gets way too dark and serious at the end. Still, worth catching. Try seeing the directors cut...the other one runs half an hour longer!

Example negative review:
I of course saw the previews for this at the beginning of some other

## Simple Preprocessing Function

In [43]:
def preproccess_text(text: str) -> str:
    """Return *text* with surrounding whitespace removed, in lower case."""
    trimmed = text.strip()
    return trimmed.lower()

# Normalise every review. The reviews live in `X`; the original iterated
# over `X_test`, which is never defined, and raised NameError.
# After this line X is a plain Python list of strings.
X = [preproccess_text(text) for text in X]

## Load Model 

In [44]:
# Checkpoint identifier shared by the tokenizer and the classifier.
MODEL_NAME = 'distilbert-base-uncased-finetuned-sst-2-english'

# Tokenizer that matches the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Load the sequence-classification model, then move it to the chosen device.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
model = model.to(device)

## Prediction Function

In [55]:
def predict_sentiment(text):
    """Classify one review or a batch of reviews with the loaded model.

    Args:
        text: A string or a list of strings to classify. Inputs are
            tokenized together, padded to a common length and truncated
            to the tokenizer's maximum length.

    Returns:
        A ``(label, prob)`` tuple of tensors with one entry per input:
        ``label`` is the index of the highest-probability class and
        ``prob`` is the softmax probability of class index 1 (used as the
        positive-class score by the evaluation code in this notebook).
        Both tensors are on ``device`` and carry no autograd graph.
    """
    inputs = tokenizer(
        text, return_tensors='pt', truncation=True, padding=True
    ).to(device)
    # Inference only: skip building the autograd graph so activations are
    # not kept alive and the returned tensors have no grad_fn attached.
    with torch.no_grad():
        outputs = model(**inputs)
        probs = F.softmax(outputs.logits, dim=1)
    label = torch.argmax(probs, dim=1)
    return label, probs[:, 1]

In [56]:
# Quick check of the prediction function on the first four reviews.
predict_sentiment(X[:4])

(tensor([0, 0, 0, 0]),
 tensor([3.8375e-04, 3.8294e-01, 2.8993e-04, 4.2437e-03],
        grad_fn=<SelectBackward0>))

## Predict on Reviews

In [59]:
BATCH_SIZE = 64
# Print progress once every this many batches. Counting batches instead of
# testing `i % 640` keeps the reporting cadence correct if BATCH_SIZE changes.
LOG_EVERY_N_BATCHES = 10

model.eval()  # evaluation mode (e.g. turns off dropout layers)
pred, pred_proba = [], []
with torch.no_grad():  # inference only: no autograd bookkeeping
    for batch_idx, i in enumerate(range(0, len(X), BATCH_SIZE)):
        batch = X[i:i + BATCH_SIZE]
        labels, probs = predict_sentiment(batch)
        # Move results to the CPU before converting to NumPy values.
        pred.extend(labels.cpu().numpy())
        pred_proba.extend(probs.cpu().numpy())

        if batch_idx % LOG_EVERY_N_BATCHES == 0:
            print(f'Processed {i + len(batch)} samples...')

Processed 64 samples...
Processed 704 samples...
Processed 1344 samples...
Processed 1984 samples...
Processed 2624 samples...
Processed 3264 samples...
Processed 3904 samples...
Processed 4544 samples...
Processed 5184 samples...
Processed 5824 samples...
Processed 6464 samples...
Processed 7104 samples...
Processed 7744 samples...
Processed 8384 samples...
Processed 9024 samples...
Processed 9664 samples...
Processed 10304 samples...
Processed 10944 samples...
Processed 11584 samples...
Processed 12224 samples...
Processed 12864 samples...
Processed 13504 samples...
Processed 14144 samples...
Processed 14784 samples...
Processed 15424 samples...
Processed 16064 samples...
Processed 16704 samples...
Processed 17344 samples...
Processed 17984 samples...
Processed 18624 samples...
Processed 19264 samples...
Processed 19904 samples...
Processed 20544 samples...
Processed 21184 samples...
Processed 21824 samples...
Processed 22464 samples...
Processed 23104 samples...
Processed 23744 samples...

In [61]:
# Metrics computed from the hard label predictions.
label_based_metrics = (
    ('Accuracy:', accuracy_score),
    ('F1 score:', f1_score),
    ('Precision:', precision_score),
    ('Recall:', recall_score),
)
for metric_title, metric_fn in label_based_metrics:
    print(metric_title, metric_fn(y, pred))

# Metrics computed from the predicted probability of class index 1.
score_based_metrics = (
    ('ROC AUC:', roc_auc_score),
    ('Average Precision:', average_precision_score),
)
for metric_title, metric_fn in score_based_metrics:
    print(metric_title, metric_fn(y, pred_proba))

print('Confusion matrix:')
print(confusion_matrix(y, pred))

Accuracy: 0.89072
F1 score: 0.8874794069192751
Precision: 0.9146010186757215
Recall: 0.86192
ROC AUC: 0.9587060063999999
Average Precision: 0.9576490987225383
Confusion matrix:
[[11494  1006]
 [ 1726 10774]]
