In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Install the Hugging Face `transformers` package quietly.
# NOTE(review): the version is not pinned, so a fresh run installs whatever release is current —
# confirm that the names imported from `transformers` further down still exist in that release.
!pip install -qq transformers

In [None]:
import transformers
import torch
import torch.nn as nn

In [None]:
from tqdm import tqdm
from sklearn import model_selection
from sklearn import metrics
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

# Define Configurations

In [None]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU so the
# notebook still runs end to end (slowly) on a machine without CUDA.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Every review is padded / truncated to this many tokens.
MAX_LEN = 64

# Batch sizes for the three data loaders.
TRAIN_BATCH_SIZE = 64
VALID_BATCH_SIZE = 64
TEST_BATCH_SIZE = 64

# Number of passes over the training set.
EPOCHS = 2

# Input CSV and the file the best model weights (by validation accuracy) are written to.
TRAINING_FILE = "/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv"
MODEL_PATH = "./model_BERT.bin"

# Pretrained checkpoint name shared by the tokenizer and the classifier backbone.
BERT_PRE_TRAINED_MODEL = "bert-base-uncased"
TOKENIZER = transformers.BertTokenizer.from_pretrained(BERT_PRE_TRAINED_MODEL)

In [None]:
# Load the reviews CSV; every missing cell is replaced with the literal string "none".
dfx = pd.read_csv(TRAINING_FILE).fillna("none")

In [None]:
# First rows of the frame, then its (rows, columns) shape, one per line.
print(dfx.head(), dfx.shape, sep="\n")

In [None]:
# Distinct sentiment labels, in the sorted order np.unique returns them.
class_names = [*np.unique(dfx.sentiment)]
print(class_names, len(class_names), sep="\n")

In [None]:
import seaborn as sns

# Class balance of the sentiment column.
# The column is passed explicitly as `x`: recent seaborn releases treat the first positional
# argument as `data`, so a bare positional Series is no longer read as the x variable.
sns.countplot(x=dfx.sentiment)

# Convert sentiment labels to integers (positive → 1, negative → 0)

In [None]:
# Encode the labels as integers: "positive" -> 1, anything else -> 0.
# The guard makes the cell safe to re-run: without it, a second execution would compare the
# already-encoded integers against "positive" and silently turn every label into 0.
if not pd.api.types.is_integer_dtype(dfx.sentiment):
    dfx.sentiment = dfx.sentiment.apply(lambda x: 1 if x == "positive" else 0)

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
dfx.info()

# Splitting dataset into Train-Validation-Test

In [None]:
# Stratified split of the full frame: sample1 receives 20% of the rows, sample2 the other 80%.
dfx_sample1, dfx_sample2 = model_selection.train_test_split(
    dfx,
    test_size=0.8,
    random_state=42,
    stratify=dfx.sentiment.values,
)
print(dfx_sample1.shape, dfx_sample2.shape, sep="\n")

In [None]:
# Stratified 80/20 split of the 20% sample; both parts get a fresh 0..n-1 index.
df_train, df_valid = (
    part.reset_index(drop=True)
    for part in model_selection.train_test_split(
        dfx_sample1,
        test_size=0.2,
        random_state=42,
        stratify=dfx_sample1.sentiment.values,
    )
)
print(df_train.shape, df_valid.shape, sep="\n")

In [None]:
# Halve the held-out rows (stratified) into a validation part and a test part.
# Note: df_valid is both the input and an output of this cell, so running the cell a second
# time splits the already-halved validation set again.
df_valid, df_test = (
    part.reset_index(drop=True)
    for part in model_selection.train_test_split(
        df_valid,
        test_size=0.5,
        random_state=42,
        stratify=df_valid.sentiment.values,
    )
)
print(df_valid.shape, df_test.shape, sep="\n")

# BERT Preprocessing of Input dataset - Tokenization of data into Input IDs and Attention Mask

In [None]:
class IMDBDataset:
    """Map-style dataset that tokenizes one review per item.

    Args:
        reviews: Indexable collection of review texts.
        targets: Indexable collection of integer labels, aligned with ``reviews``.
        tokenizer: Object exposing a Hugging Face style ``encode_plus`` method.
        max_len: Length (in tokens) every review is padded or truncated to.
    """

    def __init__(self, reviews, targets, tokenizer, max_len):
        self.reviews = reviews
        self.targets = targets
        # Use the arguments the caller passed in; previously they were ignored in favour of
        # the module-level TOKENIZER / MAX_LEN globals.
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of reviews."""
        return len(self.reviews)

    def __getitem__(self, item):
        """Return the raw text, token ids, attention mask and label for one review."""
        review = str(self.reviews[item])
        target = self.targets[item]

        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            # `padding` / `truncation` replace the deprecated `pad_to_max_length`; truncation
            # is requested explicitly so every item has exactly `max_len` tokens and the
            # default collate function can stack a batch.
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'review_text': review,
            # encode_plus returns (1, max_len) tensors; flatten drops the leading batch axis.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.long),
        }

# DataLoader

In [None]:
train_dataset = IMDBDataset(
    reviews=df_train.review.values,
    targets=df_train.sentiment.values,
    tokenizer=TOKENIZER,
    max_len=MAX_LEN,
)
# shuffle=True draws the training examples in a new random order every epoch; without it
# each epoch replays the exact same sequence of batches.
train_data_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,
    num_workers=4,
)

In [None]:
# Number of training examples.
len(train_dataset)

In [None]:
# Number of batches the training loader yields per epoch.
len(train_data_loader)

In [None]:
# Validation split wrapped in the same dataset class; served by a single worker process.
valid_dataset = IMDBDataset(
    reviews=df_valid.review.values,
    targets=df_valid.sentiment.values,
    tokenizer=TOKENIZER,
    max_len=MAX_LEN,
)
valid_data_loader = torch.utils.data.DataLoader(
    valid_dataset,
    batch_size=VALID_BATCH_SIZE,
    num_workers=1,
)

In [None]:
# Number of validation examples.
len(valid_dataset)

In [None]:
# Number of batches the validation loader yields.
len(valid_data_loader)

# BERT Classifier Transformer

In [None]:
class BERTBaseUncasedClassifier(nn.Module):
    """Pretrained BERT encoder followed by dropout and a linear classification head.

    Args:
        n_classes: Number of output classes (size of the logits vector).
    """

    def __init__(self, n_classes):
        super().__init__()
        self.bert = transformers.BertModel.from_pretrained(BERT_PRE_TRAINED_MODEL)
        # Regularization between the encoder and the head.
        self.bert_drop = nn.Dropout(p=0.3)
        # Fully connected layer mapping the encoder's hidden size to class logits.
        self.out = nn.Linear(
            in_features=self.bert.config.hidden_size,
            out_features=n_classes,
        )

    def forward(self, input_ids, attention_mask):
        """Return the unnormalized class scores for a batch of token ids and masks."""
        # With return_dict=False the encoder returns a tuple; only its second element
        # (the pooled output) is used.
        _sequence_output, pooled_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=False,
        )
        dropped = self.bert_drop(pooled_output)
        logits = self.out(dropped)
        return logits

In [None]:
# Build the classifier with one output per class and move it onto the configured device.
device = torch.device(DEVICE)
model = BERTBaseUncasedClassifier(n_classes=len(class_names)).to(device)
# Last expression of the cell: displays the module structure.
model

# AdamW Optimizer

In [None]:
# The linear schedule needs the total number of optimizer steps: one per batch, every epoch.
total_steps = EPOCHS * len(train_data_loader)

# NOTE(review): this AdamW is the one imported from `transformers` at the top of the notebook
# (it accepts `correct_bias`) — confirm the installed release still ships it.
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)
loss_fn = nn.CrossEntropyLoss().to(device)

print("Length of Train Data Loader: ",len(train_data_loader))
print("Total Steps: ",total_steps)
print("Epochs: ",EPOCHS)

# Training Function

In [None]:
def train_epoch(model,data_loader,loss_fn,optimizer,device,scheduler,n_examples):
    """Train ``model`` for one pass over ``data_loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``, returning logits.
        data_loader: Sized iterable of dict batches with ``input_ids``, ``attention_mask``
            and ``targets`` tensors.
        loss_fn: Callable ``loss_fn(outputs, targets)`` returning a scalar loss tensor.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler stepped once per batch.
        n_examples: Denominator used for the accuracy.

    Returns:
        ``(accuracy, mean_loss)``: accuracy is a 0-dim float64 tensor on the CPU; mean_loss is
        the mean of the per-batch losses (NaN when the loader yields no batches).
    """
    model = model.train()
    losses = []
    # Tensor accumulator instead of the int 0, so the `.double()` below also works when the
    # loader is empty.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)
    for d in tqdm(data_loader, total=len(data_loader)):
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        targets = d["targets"].to(device)

        # Clear gradients before the backward pass so anything left over from an interrupted
        # earlier run cannot leak into this step.
        optimizer.zero_grad()
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, targets)
        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())
        loss.backward()
        # Cap the global gradient norm at 1.0 before the update.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

    # Move the accuracy to the CPU so callers can store and plot it without hitting
    # "can't convert cuda tensor to numpy".
    accuracy = correct_predictions.double().cpu() / n_examples
    mean_loss = np.mean(losses) if losses else float("nan")
    return accuracy, mean_loss

# Evaluation Function

In [None]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
    """Evaluate ``model`` on ``data_loader`` without updating any weights.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``, returning logits.
        data_loader: Sized iterable of dict batches with ``input_ids``, ``attention_mask``
            and ``targets`` tensors.
        loss_fn: Callable ``loss_fn(outputs, targets)`` returning a scalar loss tensor.
        device: Device the batch tensors are moved to.
        n_examples: Denominator used for the accuracy.

    Returns:
        ``(accuracy, mean_loss)``: accuracy is a 0-dim float64 tensor on the CPU; mean_loss is
        the mean of the per-batch losses (NaN when the loader yields no batches).
    """
    model = model.eval()
    losses = []
    # Tensor accumulator instead of the int 0, so the `.double()` below also works when the
    # loader is empty.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)
    with torch.no_grad():
        for d in tqdm(data_loader, total=len(data_loader)):
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["targets"].to(device)
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            _, preds = torch.max(outputs, dim=1)
            loss = loss_fn(outputs, targets)
            correct_predictions += torch.sum(preds == targets)
            losses.append(loss.item())

    # Move the accuracy to the CPU so callers can store and plot it without hitting
    # "can't convert cuda tensor to numpy".
    accuracy = correct_predictions.double().cpu() / n_examples
    mean_loss = np.mean(losses) if losses else float("nan")
    return accuracy, mean_loss

# BERT Training process

In [None]:
import warnings
warnings.filterwarnings("ignore")

from collections import defaultdict

# Per-epoch metrics, keyed by 'train_acc', 'train_loss', 'val_acc', 'val_loss'.
history = defaultdict(list)
best_accuracy = 0
for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)
    train_acc, train_loss = train_epoch(
        model,
        train_data_loader,
        loss_fn,
        optimizer,
        device,
        scheduler,
        len(df_train)
    )
    print(f'Train loss {train_loss} accuracy {train_acc}')
    val_acc, val_loss = eval_model(
        model,
        valid_data_loader,
        loss_fn,
        device,
        len(df_valid)
    )
    print(f'Val   loss {val_loss} accuracy {val_acc}')
    print()
    # Store plain Python floats: the accuracies come back as torch tensors, and a list of
    # tensors that live on the GPU cannot be converted by matplotlib when the history is plotted.
    history['train_acc'].append(float(train_acc))
    history['train_loss'].append(float(train_loss))
    history['val_acc'].append(float(val_acc))
    history['val_loss'].append(float(val_loss))

    # Checkpoint written after every epoch (overwriting the previous one). The 'model' entry
    # is a freshly constructed classifier that the loader fills from 'state_dict'.
    # NOTE(review): this builds a new pretrained model on every epoch and pickles the whole
    # module; the checkpoint layout is kept unchanged because load_checkpoint reads these keys.
    checkpoint = {'model': BERTBaseUncasedClassifier(len(class_names)),
                  'state_dict': model.state_dict(),
                  'optimizer' : optimizer.state_dict()}

    torch.save(checkpoint, 'checkpoint.pth')

    # Keep a separate copy of the weights with the best validation accuracy seen so far.
    if val_acc > best_accuracy:
        torch.save(model.state_dict(), MODEL_PATH)
        best_accuracy = val_acc

In [None]:
import matplotlib.pyplot as plt

# float() turns each stored accuracy into a plain number first: the history may hold torch
# tensors (possibly on the GPU), which matplotlib cannot convert to arrays by itself.
plt.plot([float(acc) for acc in history['train_acc']], label='train accuracy')
plt.plot([float(acc) for acc in history['val_acc']], label='validation accuracy')
plt.title('Training history')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend()
plt.ylim([0, 1]);

# Test Dataset Loader

In [None]:
# Held-out test split wrapped in the same dataset class as the train / validation splits.
test_dataset = IMDBDataset(
    reviews=df_test.review.values,
    targets=df_test.sentiment.values,
    tokenizer=TOKENIZER,
    max_len=MAX_LEN,
)
test_data_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=TEST_BATCH_SIZE,
    num_workers=4,
)

In [None]:
# Number of test examples, then number of test batches.
print(len(test_dataset), len(test_data_loader), sep="\n")

In [None]:
def get_predictions(model, data_loader, device=None):
    """Run ``model`` over ``data_loader`` and collect its predictions.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``, returning logits.
        data_loader: Iterable of dict batches with ``review_text``, ``input_ids``,
            ``attention_mask`` and ``targets`` entries.
        device: Device the batch tensors are moved to. Defaults to the device of the model's
            first parameter (CPU for a parameter-free model), so the function no longer
            depends on a global ``device`` variable.

    Returns:
        ``(review_texts, predictions, prediction_probs, real_values)``: the list of texts, a
        1-D tensor of predicted class indices, a 2-D tensor of per-class probabilities
        (softmax of the logits, each row sums to 1) and a 1-D tensor of true labels. All
        tensors are on the CPU; they are empty when the loader yields no batches.
    """
    model = model.eval()
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    review_texts = []
    pred_batches = []
    prob_batches = []
    target_batches = []
    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["targets"].to(device)
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask)
            _, preds = torch.max(outputs, dim=1)
            # The model emits raw logits; normalize them so the returned values really are
            # probabilities.
            probs = torch.softmax(outputs, dim=1)
            review_texts.extend(d["review_text"])
            # Keep whole batches and concatenate once at the end instead of splitting every
            # batch into per-example tensors.
            pred_batches.append(preds.cpu())
            prob_batches.append(probs.cpu())
            target_batches.append(targets.cpu())

    if not pred_batches:
        # Nothing to concatenate: return empty tensors rather than raising.
        empty_labels = torch.empty(0, dtype=torch.long)
        return review_texts, empty_labels, torch.empty(0, 0), empty_labels.clone()

    predictions = torch.cat(pred_batches)
    prediction_probs = torch.cat(prob_batches)
    real_values = torch.cat(target_batches)
    return review_texts, predictions, prediction_probs, real_values

In [None]:
# Collect texts, predicted classes, model outputs and true labels for the test loader.
# NOTE(review): `model` here is the in-memory model as left by the last training epoch, not
# necessarily the best-validation weights written to MODEL_PATH — confirm that is intended.
y_review_texts, y_pred, y_pred_probs, y_test = get_predictions(model,test_data_loader)

In [None]:
# Sanity check: the four outputs should all have the same length (one entry per test example).
print(len(y_review_texts),len(y_pred),len(y_pred_probs),len(y_test))

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# Per-class precision / recall / F1 on the test set, labelled with the class names.
print(
    classification_report(
        y_true=y_test,
        y_pred=y_pred,
        target_names=class_names,
    )
)

In [None]:
def show_confusion_matrix(confusion_matrix):
    """Draw ``confusion_matrix`` as an annotated blue heatmap with labelled axes.

    Rows are labelled as the true sentiment and columns as the predicted sentiment.
    """
    hmap = sns.heatmap(confusion_matrix, annot=True, fmt="d", cmap="Blues")
    # Same tick-label treatment for both axes; only the rotation differs (y first, then x).
    for axis, angle in ((hmap.yaxis, 0), (hmap.xaxis, 30)):
        axis.set_ticklabels(axis.get_ticklabels(), rotation=angle, ha='right')
    plt.ylabel('True sentiment')
    plt.xlabel('Predicted sentiment')
# Confusion matrix of test labels vs. predictions, labelled with the class names and plotted.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
df_cm = pd.DataFrame(data=cm, index=class_names, columns=class_names)
show_confusion_matrix(confusion_matrix=df_cm)

In [None]:
def load_checkpoint(filepath):
    """Rebuild a frozen, inference-ready model from a checkpoint file.

    The checkpoint is expected to be a dict with a ``'model'`` entry (a module instance) and
    a ``'state_dict'`` entry (the weights to load into it).

    Args:
        filepath: Path of the checkpoint file.

    Returns:
        The module with the saved weights loaded, every parameter's ``requires_grad`` set to
        False, switched to evaluation mode.
    """
    # The checkpoint contains a pickled module object, so it has to be fully unpickled
    # (weights_only=False). Unpickling can execute arbitrary code: only load checkpoints you
    # created yourself. map_location="cpu" lets the file load on a machine without a GPU.
    try:
        checkpoint = torch.load(filepath, map_location="cpu", weights_only=False)
    except TypeError:
        # PyTorch releases that predate the `weights_only` argument.
        checkpoint = torch.load(filepath, map_location="cpu")
    model_load = checkpoint['model']
    model_load.load_state_dict(checkpoint['state_dict'])
    for parameter in model_load.parameters():
        parameter.requires_grad = False

    # A freshly built module starts in training mode; switch to eval so dropout is disabled
    # and predictions are deterministic.
    model_load.eval()

    return model_load

In [None]:
# Rebuild the model from the checkpoint file written by the training loop and show its structure.
model_load = load_checkpoint('checkpoint.pth')
print(model_load)

In [None]:
# Move the reloaded model onto the same device the input tensors are sent to below.
model_load = model_load.to(device)

In [None]:
review_text = "I hate love working on BERT!!"

# Tokenize exactly like the dataset does: pad / truncate to MAX_LEN tokens.
# `padding` / `truncation` replace the deprecated `pad_to_max_length` flag.
encoded_review = TOKENIZER.encode_plus(
  review_text,
  max_length=MAX_LEN,
  add_special_tokens=True,
  return_token_type_ids=False,
  padding='max_length',
  truncation=True,
  return_attention_mask=True,
  return_tensors='pt',
)

raw_input_ids = encoded_review['input_ids'].to(device)
raw_attention_mask = encoded_review['attention_mask'].to(device)

# Inference only: disable dropout and skip gradient bookkeeping so the prediction is
# deterministic.
model_load.eval()
with torch.no_grad():
    raw_output = model_load(raw_input_ids, raw_attention_mask)
_, raw_prediction = torch.max(raw_output, dim=1)
print(f'Review text: {review_text}')
# .item() turns the one-element tensor into a plain int before indexing the list.
print(f'Sentiment  : {class_names[raw_prediction.item()]}')