In [23]:
import numpy  as np
import pandas as pd

import torch
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')

PATH    = "../datasets/"
FILE    = "movie_reviewsBERT.csv"
batch_1 = pd.read_csv(PATH + FILE, delimiter=',', header=None)

print(batch_1.shape)
ROW =1
print("Review 1st column: " + batch_1.iloc[ROW][0])
print("Rating 2nd column: " + str(batch_1.iloc[ROW][1]))

# Show counts for review scores.
print("** Showing review counts")
print(batch_1[1].value_counts())

# Load pretrained models.
# For DistilBERT:
model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel,
                                                    ppb.DistilBertTokenizer, 'distilbert-base-uncased')

## Want BERT instead of distilBERT? Uncomment the following line:
#model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model     = model_class.from_pretrained(pretrained_weights)

# Tokenize the sentences.
tokenized = batch_1[0].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))
print("\n****************** Tokenized reviews ")
print(tokenized)
print(tokenized.values)
print("******************")

# For processing we convert to 2D array.
max_len = 0

# Get maximum number of tokens (get biggest sentence).
print("\nGetting maximum number of tokens in a sentence")
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

print("Most tokens in a review (max_len): " + str(max_len))

# Add padding
print("------------")
print("Padded so review arrays as same size: ")
padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])
print("These are the padded reviews:")
print(padded)
print("This is the last padded sentence:")
LAST_INDEX = len(batch_1) -1
print(padded[LAST_INDEX])
print("\n------------")
print("Attention mask tells BERT to ignore the padding.")

# If we directly send padded data to BERT, that would slightly confuse it.
# We need to create another variable to tell it to ignore (mask) the padding
# we've added when it's processing its input. That's what attention_mask is
attention_mask = np.where(padded != 0, 1, 0)
print(attention_mask.shape)
print(attention_mask)
print(attention_mask[LAST_INDEX])
print("=============")

input_ids = torch.tensor(padded)
attention_mask = torch.tensor(attention_mask)
print("Input ids which are padded reviews in torch tensor format:")
print(input_ids)
print("Attention mask in torch tensor format:")
print(attention_mask)
print("++++++++++++++")

# The model() function runs our sentences through BERT. The results of the
# processing will be returned into last_hidden_states.
print("BERT model transforms tokens and attention mask tensors into features ")
print("for logistic regression.")
with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

# We'll save those in the features variable, as they'll serve as the
# features to our logitics regression model.
features = last_hidden_states[0][:,0,:].numpy()

print("Let's see the features: ")
print(features)
print(features[1999])
print("-------------------------")

from   sklearn.linear_model    import LogisticRegression
from   sklearn.model_selection import train_test_split

# The labels indicating which sentence is positive and negative
# now go into the labels variable
labels = batch_1[1]

# Let's now split our datset into a training set and testing set (even
# though we're using 2,000 sentences from the SST2 training set).
train_features, test_features, train_labels, test_labels = train_test_split(features, labels)
train_labels.describe()
lr_clf = LogisticRegression(solver='liblinear')
lr_clf.fit(train_features, train_labels)

print(lr_clf.score(test_features, test_labels))

(2001, 2)
Review 1st column: a stirring , funny and finally transporting re imagining of beauty and the beast and 1930s horror films
Rating 2nd column: 1
** Showing review counts
1    1042
0     959
Name: 1, dtype: int64


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).



****************** Tokenized reviews 
0                                        [101, 1014, 102]
1       [101, 1037, 18385, 1010, 6057, 1998, 2633, 182...
2       [101, 4593, 2128, 27241, 23931, 2013, 1996, 62...
3       [101, 2027, 3653, 23545, 2037, 4378, 24185, 10...
4       [101, 2023, 2003, 1037, 17453, 14726, 19379, 1...
                              ...                        
1996    [101, 2205, 20857, 1998, 11865, 16643, 2135, 5...
1997    [101, 2009, 2515, 1050, 1005, 1056, 2147, 2004...
1998    [101, 2023, 2028, 8704, 2005, 1996, 11848, 199...
1999    [101, 1999, 1996, 2171, 1997, 2019, 9382, 1898...
2000    [101, 1996, 3185, 2003, 25757, 2011, 1037, 244...
Name: 0, Length: 2001, dtype: object
[list([101, 1014, 102])
 list([101, 1037, 18385, 1010, 6057, 1998, 2633, 18276, 2128, 16603, 1997, 5053, 1998, 1996, 6841, 1998, 5687, 5469, 3152, 102])
 list([101, 4593, 2128, 27241, 23931, 2013, 1996, 6276, 2282, 2723, 1997, 2151, 2445, 12217, 7815, 102])
 ...
 list([101, 2023, 2028,

In [22]:
dfEx = pd.DataFrame(columns=[0,1])
dfEx = dfEx.append({0:"This brilliant movie is jaw-dropping.", 1:1},
                   ignore_index=True)
dfEx = dfEx.append({0:"This movie is awful.", 1:0}, ignore_index=True)

tokenized_lines = dfEx[0].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))
print("\n Sentences Tokenized: ")
print(tokenized_lines.values)

# padding
padded = np.array([i + [0] * (max_len - len(i)) for i in tokenized_lines.values])
print("----Padded------")
print(padded)
print("----------------")

# add attention mask
attention_mask = np.where(padded != 0, 1, 0)
print("------ Attention Mask ------")
print(attention_mask)

# Convert attention mask in tensor format
attention_mask = torch.tensor(attention_mask)
print("------ Torch Attention Mask ------")
print(attention_mask)

# Convert padded reviews in tensor format
input_ids = torch.tensor(padded)
print("------ Torch Padded ------")
print(input_ids)

# The model() function runs our sentences through BERT. The results of the
# processing will be returned into last_hidden_states.
print("BERT model transforms tokens and attention mask tensors into features ")
print("for logistic regression.")
with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

# We'll save those in the features variable, as they'll serve as the
# features to our logitics regression model.
features = last_hidden_states[0][:,0,:].numpy()

print("Let's see the features: ")
print(features)
print("-------------------------")



 Sentences Tokenized: 
[list([101, 2023, 8235, 3185, 2003, 5730, 1011, 7510, 1012, 102])
 list([101, 2023, 3185, 2003, 9643, 1012, 102])]
----Padded------
[[ 101 2023 8235 3185 2003 5730 1011 7510 1012  102    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0]
 [ 101 2023 3185 2003 9643 1012  102    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0]]
----------------
------ Attention Mask ------
[[1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

ValueError: Found input variables with inconsistent numbers of samples: [2, 2001]

In [25]:
import time
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from joblib.logger import format_time
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import AutoModel, BertTokenizerFast

# specify GPU
device = torch.device("cpu")

PATH = "../datasets/"
df   = pd.read_csv(PATH + "spamdata_lite.csv")
df.head()

# Create training set.
train_text, temp_text, train_labels, temp_labels = train_test_split(df['text'], df['label'], random_state=2018,
                                                                    test_size=0.3, stratify=df['label'])

# Use temp set from above to create validation and test set.
# Validation is used to test the model while training. Test is used
# to validate the model after training.
val_text, test_text, val_labels, test_labels = train_test_split(temp_text, temp_labels, random_state=2018,
                                                                test_size=0.5, stratify=temp_labels)

# import BERT-base pretrained model.
# We are using weights that are suitable for uncased content. However if
# upper and lower case words are relevant for your domain use cased.
bertModel = AutoModel.from_pretrained('bert-base-uncased')

# Load the BERT tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# Tokenize and encode sentences.
def prepXandY(text, labels):
    textList = text.tolist()
    tokens = tokenizer.batch_encode_plus(
        textList,
        max_length=25,
        pad_to_max_length=True,
        truncation=True
    )

    seq = torch.tensor(tokens['input_ids'])
    mask = torch.tensor(tokens['attention_mask'])
    y = torch.tensor(labels.tolist())

    return seq, mask, y

print(test_text)

# Prepare the data.
def getTensor(text, labels):
    seq, mask, y = prepXandY(train_text, train_labels)
    tensorData = TensorDataset(seq, mask, y)
    return tensorData

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# define a batch size
batch_size = 32

# wrap tensors
train_data = getTensor(train_text, train_labels)
val_data   = getTensor(val_text, val_labels)

# sampler for sampling the data during training
train_sampler = RandomSampler(train_data)

# dataLoader for train set
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# sampler for sampling the data during training
val_sampler = SequentialSampler(val_data)

# dataLoader for validation set
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# freeze all the parameters
for param in bertModel.parameters():
    param.requires_grad = False

class BERT_Arch(nn.Module):
    def __init__(self, bert):
        super(BERT_Arch, self).__init__()
        self.bert = bert

        # dropout layer
        self.dropout = nn.Dropout(0.1)

        # relu activation function
        self.relu = nn.ReLU()

        # dense layer 1
        self.fc1 = nn.Linear(768, 512)

        # dense layer 2 (Output layer)
        self.fc2 = nn.Linear(512, 2)

        # softmax activation function
        self.softmax = nn.LogSoftmax(dim=1)

    # define the forward pass
    def forward(self, sent_id, mask):
        # pass the inputs to the model
        _, cls_hs = self.bert(sent_id, attention_mask=mask)
        x = self.fc1(cls_hs)
        x = self.relu(x)
        x = self.dropout(x)

        # output layer
        x = self.fc2(x)

        # apply softmax activation
        x = self.softmax(x)
        return x

# pass the pre-trained BERT to our define architecture
model = BERT_Arch(bertModel)

# push the model to GPU
model = model.to(device)

# optimizer from hugging face transformers
from transformers import AdamW

# define the optimizer
optimizer = AdamW(model.parameters(),
                  lr=1e-5)  # learning rate

from sklearn.utils.class_weight import compute_class_weight

# compute the class weights
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(train_labels), y=train_labels)
print("Class Weights:", class_weights)

# converting list of class weights to a tensor
weights = torch.tensor(class_weights, dtype=torch.float)

# push to GPU (if it exists)
weights = weights.to(device)

# define the loss function
cross_entropy = nn.NLLLoss(weight=weights)

# number of training epochs
epochs = 10

from sklearn.utils.class_weight import compute_class_weight

# compute the class weights
class_weights = compute_class_weight(class_weight ='balanced',
                                     classes=np.unique(train_labels),
                                     y=train_labels)
print("Class Weights:", class_weights)

# function to train the model
def train():
    model.train()

    total_loss, total_accuracy = 0, 0

    # empty list to save model predictions
    total_preds = []

    # iterate over batches
    for step, batch in enumerate(train_dataloader):

        # progress update after every 50 batches.
        if step % 50 == 0 and not step == 0:
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(train_dataloader)))

        # push the batch to gpu
        batch = [r.to(device) for r in batch]

        sent_id, mask, labels = batch

        # clear previously calculated gradients
        model.zero_grad()

        # get model predictions for the current batch
        preds = model(sent_id, mask)

        # compute the loss between actual and predicted values
        loss = cross_entropy(preds, labels)

        # add on to the total loss
        total_loss = total_loss + loss.item()

        # backward pass to calculate the gradients
        loss.backward()

        # clip the the gradients to 1.0. It helps in preventing the exploding gradient problem
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # update parameters
        optimizer.step()

        # model predictions are stored on GPU. So, push it to CPU
        preds = preds.detach().cpu().numpy()

        # append the model predictions
        total_preds.append(preds)

    # compute the training loss of the epoch
    avg_loss = total_loss / len(train_dataloader)

    # predictions are in the form of (no. of batches, size of batch, no. of classes).
    # reshape the predictions in form of (number of samples, no. of classes)
    total_preds = np.concatenate(total_preds, axis=0)

    # returns the loss and predictions
    return avg_loss, total_preds

# function for evaluating the model
def evaluate():
    print("\nEvaluating...")

    # deactivate dropout layers
    model.eval()

    total_loss, total_accuracy = 0, 0

    # empty list to save the model predictions
    total_preds = []

    # iterate over batches
    for step, batch in enumerate(val_dataloader):

        # Progress update every 50 batches.
        if step % 50 == 0 and not step == 0:

            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(val_dataloader)))

        # push the batch to gpu
        batch = [t.to(device) for t in batch]

        sent_id, mask, labels = batch

        # deactivate autograd
        with torch.no_grad():

            # model predictions
            preds = model(sent_id, mask)

            # compute the validation loss between actual and predicted values
            loss = cross_entropy(preds, labels)
            total_loss = total_loss + loss.item()
            preds = preds.detach().cpu().numpy()
            total_preds.append(preds)

    # compute the validation loss of the epoch
    avg_loss = total_loss / len(val_dataloader)

    # reshape the predictions in form of (number of samples, no. of classes)
    total_preds = np.concatenate(total_preds, axis=0)
    return avg_loss, total_preds

# set initial loss to infinite
best_valid_loss = float('inf')

# empty lists to store training and validation loss of each epoch
train_losses = []
valid_losses = []

# for each epoch
for epoch in range(epochs):
    print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))

    # train model
    train_loss, _ = train()
    valid_loss, _ = evaluate()

    # save the best model
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')

    # append training and validation loss
    train_losses.append(train_loss)
    valid_losses.append(valid_loss)

    print(f'\nTraining Loss: {train_loss:.3f}')
    print(f'Validation Loss: {valid_loss:.3f}')

# load weights of best model
path = 'saved_weights.pt'
model.load_state_dict(torch.load(path))

# Validate with test data.
test_seq, test_mask, test_y = prepXandY(test_text, test_labels)

# get predictions for test data
with torch.no_grad():
    preds = model(test_seq.to(device), test_mask.to(device))
    preds = preds.detach().cpu().numpy()

# %%
print("Show preds before report")
print(preds)
preds = np.argmax(preds, axis=1)
print("Preds after argmax")
print(preds)
print(classification_report(test_y, preds))

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

126    Just so that you know,yetunde hasn't sent mone...
75           I am waiting machan. Call me once you free.
90     Yeah do! Don‰Û÷t stand to close tho- you‰Û÷ll ...
48     Yeah hopefully, if tyler can't do it I could m...
119    PRIVATE! Your 2004 Account Statement for 07742...
169    Yes :)it completely in out of form:)clark also...
144                            Yes see ya not on the dot
38                           Anything lor... U decide...
114    Wa, ur openin sentence very formal... Anyway, ...
57                    Sorry, I'll call later in meeting.
98     Hi. Wk been ok - on hols now! Yes on for a bit...
10     I'm gonna be home soon and i don't want to tal...
162    I'm so in love with you. I'm excited each day ...
65     As a valued customer, I am pleased to advise y...
33     For fear of fainting with the of all that hous...
124             ÌÏ predict wat time Ì_'ll finish buying?
95     Your free ringtone is waiting to be collected....
89             Ela kano.,il dow

TypeError: linear(): argument 'input' (position 1) must be Tensor, not str