# Amazon Review: Tagging Negative Reviews in Amazon Product Reviews with a BERT Model

# Introduction

### Connecting to Google Colab

In [None]:
# Mount Google Drive so that the dataset and model paths under /content/drive are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install -q transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m52.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m94.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 KB[0m [31m24.8 MB/s[0m eta [36m0:00:00[0m
[?25h

### Importing Libraries

In [None]:
import csv
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.metrics import classification_report
import time
import torch
from tabulate import tabulate
from torch import cuda
from torch.utils.data import Dataset, DataLoader
import transformers
from transformers import AutoTokenizer, AutoModel

### Check Device

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = 'cuda' if cuda.is_available() else 'cpu'

### Dataset

In [None]:
# NOTE: despite the df_ prefix these are CSV file paths, not DataFrames;
# get_data_loaders reads them with pd.read_csv.
df_train = "/content/drive/MyDrive/Masters_Thesis/Dataset/encoded_data/encoded_topic_train.csv"
# NOTE: the "validation" data used in this notebook is the encoded *test* split file.
df_val = "/content/drive/MyDrive/Masters_Thesis/Dataset/encoded_data/encoded_topic_test.csv"
# Name of the label column that CustomDataset reads from each CSV.
target_list = "encoded_topic"

In [None]:
# NOTE: get_data_loaders is defined in the "Data Preprocessing & DataLoader" section
# further down; on a fresh kernel that cell has to be executed before this one.
tokenizer = AutoTokenizer.from_pretrained('roberta-base')
# Both loaders use a batch size of 8; max_len keeps its default of 512.
train_loader, valid_loader = get_data_loaders(df_train, df_val, tokenizer, train_batch_size = 8, valid_batch_size = 8)

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

RobertaTokenizerFast(name_or_path='roberta-base', vocab_size=50265, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)})


### Model Implementation, Prediction, and Saving Files

In [None]:
# NOTE: BERTModel is defined in the "BERT Model" section further down;
# on a fresh kernel that cell has to be executed before this one.
model = BERTModel()
model.to(device)
# train() reads loss_function and optimizer as globals; valid() reads loss_function.
loss_function = torch.nn.CrossEntropyLoss()
LEARNING_RATE = 2e-5
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

Downloading pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Fine-tune for 20 epochs; train() prints one row (loss, accuracy, elapsed minutes) per epoch.
train(total_epoch=20, model=model, train_loader=train_loader)

 Epoch  |  Train Loss  | Train Accuracy |  Elapsed 
---------------------------------------------------
   1    |   0.262634   |  89.475318   |   3.93   
   2    |   0.131984   |  95.860499   |   3.85   
   3    |   0.096455   |  96.947118   |   3.85   
   4    |   0.075153   |  97.785367   |   3.85   
   5    |   0.055314   |  98.468385   |   3.85   
   6    |   0.044060   |  98.778847   |   3.85   
   7    |   0.036800   |  99.037566   |   3.85   
   8    |   0.030531   |  99.110007   |   3.85   
   9    |   0.034959   |  99.089310   |   3.85   
  10    |   0.030096   |  99.254890   |   3.85   
  11    |   0.028092   |  99.265239   |   3.85   
  12    |   0.026596   |  99.348029   |   3.85   
  13    |   0.027726   |  99.203146   |   3.85   
  14    |   0.021930   |  99.451516   |   3.85   
  15    |   0.022873   |  99.275587   |   3.85   
  16    |   0.024166   |  99.337680   |   3.85   
  17    |   0.017642   |  99.534306   |   3.85   
  18    |   0.019256   |  99.482562   |   3.85

In [None]:
# Evaluate on valid_loader; valid() returns one dict (text, predicted label, target label) per example.
predictions = valid(model, valid_loader) 

 Train Loss  | Train Accuracy |  Elapsed 
-----------------------------------------
  0.198969   |   96.771523    |   0.30   


In [None]:
# Destination folder on Drive for the fine-tuned weights (written as model.pth).
path = "/content/drive/MyDrive/Masters_Thesis/Models/BERT/Topic/Topic_only_back_01"
saveModel(model=model, path=path)

All files saved


In [None]:
# Write the validation predictions to <path>/result.csv.
saveCSVValidationResult(predictions,path)

### Data Preprocessing & DataLoader

In [None]:
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes review texts on demand.

    Args:
        df: DataFrame with a ``reviewText`` column and a label column.
        tokenizer: Object exposing ``encode_plus`` (e.g. a Hugging Face tokenizer).
        max_len: Length every encoded sequence is truncated / padded to.
        target_column: Name of the label column. When omitted, the notebook-level
            ``target_list`` variable is used (the original behaviour).
    """

    def __init__(self, df, tokenizer, max_len, target_column=None):
        if target_column is None:
            # Backward-compatible default: fall back to the notebook global.
            target_column = target_list
        self.df = df
        self.max_len = max_len
        self.text = df.reviewText
        self.tokenizer = tokenizer
        self.targets = df[target_column].values

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.df)

    def __getitem__(self, index):
        """Encode the ``index``-th row and return its tensors plus the raw text.

        Returns:
            dict with ``ids``, ``mask``, ``token_type_ids`` and ``targets`` as
            ``torch.long`` tensors, and ``text`` holding the untokenized review.
        """
        # Positional lookup: a sampler hands out positions 0..len-1, and the
        # targets array is positional too. A label-based ``self.text[index]``
        # raises KeyError as soon as the frame does not carry a 0..n-1 index
        # (e.g. after filtering rows).
        text = self.text.iloc[index]
        inputs = self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True
        )
        # encode_plus returns ids, attention mask and token type ids in one dict.
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]
        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.long),
            'text' : text
        }

def get_data_loaders(train_dataframe, valid_dataframe, tokenizer, max_len=512, train_batch_size=16, valid_batch_size=16, learning_rate=2e-5):
    """Read the train / validation CSV files and wrap each one in a DataLoader.

    Args:
        train_dataframe: Path of the training CSV (passed to ``pd.read_csv``).
        valid_dataframe: Path of the validation CSV (passed to ``pd.read_csv``).
        tokenizer: Tokenizer handed to ``CustomDataset``; it is also printed.
        max_len: Sequence length handed to ``CustomDataset``.
        train_batch_size: Batch size of the (shuffled) training loader.
        valid_batch_size: Batch size of the (unshuffled) validation loader.
        learning_rate: Accepted for interface compatibility; not used here.

    Returns:
        Tuple ``(train_loader, valid_loader)``.
    """
    train_frame = pd.read_csv(train_dataframe)
    valid_frame = pd.read_csv(valid_dataframe)
    print(tokenizer)

    train_set = CustomDataset(train_frame, tokenizer, max_len)
    valid_set = CustomDataset(valid_frame, tokenizer, max_len)

    # Options shared by both loaders; only batch size and shuffling differ.
    common = dict(num_workers=1, pin_memory=True)
    loaders = (
        DataLoader(train_set, batch_size=train_batch_size, shuffle=True, **common),
        DataLoader(valid_set, batch_size=valid_batch_size, shuffle=False, **common),
    )
    return loaders

### BERT Model

In [None]:
class BERTModel(torch.nn.Module):
    """Pretrained transformer encoder followed by dropout and a linear classifier.

    Args:
        model_name: Checkpoint name or path given to ``AutoModel.from_pretrained``.
            Defaults to ``'roberta-base'``.
        num_labels: Number of output classes. Defaults to 2.
        dropout: Dropout probability applied before the classifier. Defaults to 0.5.

    The attribute names (``roberta``, ``dropout``, ``fc``) are kept so that
    previously saved state dicts still load.
    """

    def __init__(self, model_name='roberta-base', num_labels=2, dropout=0.5):
        super().__init__()
        self.roberta = AutoModel.from_pretrained(model_name)
        self.dropout = torch.nn.Dropout(dropout)
        # Take the feature width from the loaded encoder instead of hard-coding
        # 768, so checkpoints of other sizes get a matching classifier.
        self.fc = torch.nn.Linear(self.roberta.config.hidden_size, num_labels)

    def forward(self, ids, mask, token_type_ids):
        """Return the raw class logits for a batch.

        Args:
            ids: Token id tensor.
            mask: Attention mask tensor.
            token_type_ids: Token type id tensor.

        Returns:
            Output of the linear classifier, one row of logits per input row.
        """
        # With return_dict=False the encoder returns a tuple; the second element
        # is used as the sentence-level feature vector (the pooled output for
        # RoBERTa/BERT-style encoders — the chosen checkpoint must provide one).
        _, features = self.roberta(ids, attention_mask=mask, token_type_ids=token_type_ids, return_dict=False)
        features = self.dropout(features)
        output = self.fc(features)
        return output

### Training & Validation

In [None]:

def calcuate_accu(big_idx, targets):
    """Count the positions where the predicted class index equals the target.

    Args:
        big_idx: Tensor of predicted class indices.
        targets: Tensor of ground-truth class indices.

    Returns:
        Number of matching positions as a Python number.
    """
    matches = big_idx == targets
    return matches.sum().item()

def train(total_epoch, model, train_loader):
    """Train ``model`` for ``total_epoch`` epochs and print one metrics row per epoch.

    Relies on the notebook-level globals ``device``, ``loss_function`` and
    ``optimizer`` (the optimizer must have been built over ``model``'s parameters).

    Args:
        total_epoch: Number of passes over ``train_loader``.
        model: Module called as ``model(ids, mask, token_type_ids)`` returning logits.
        train_loader: Iterable of batches; each batch is a dict with the keys
            ``ids``, ``mask``, ``token_type_ids`` and ``targets``.

    Returns:
        None. Mean batch loss, accuracy in percent and elapsed minutes are printed.
    """
    # The accuracy column is 14 wide in both the header and the rows so the
    # table lines up with the 51-character separator.
    print(f"{'Epoch':^7} | {'Train Loss':^12} | {'Train Accuracy':^14} | {'Elapsed':^9}")
    print("-"*51)
    for epoch in range(total_epoch):
        t0_epoch = time.time()
        tr_loss = 0.0
        n_correct = 0
        nb_tr_steps = 0
        nb_tr_examples = 0
        model.train()
        for data in train_loader:
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.long)

            optimizer.zero_grad()
            outputs = model(ids, mask, token_type_ids)
            loss = loss_function(outputs, targets)
            loss.backward()
            optimizer.step()

            tr_loss += loss.item()
            # detach() instead of the legacy .data: the predictions are only
            # used for bookkeeping and must stay out of the autograd graph.
            _, big_idx = torch.max(outputs.detach(), dim=1)
            n_correct += calcuate_accu(big_idx, targets)

            nb_tr_steps += 1
            nb_tr_examples += targets.size(0)

        time_elapsed = (time.time() - t0_epoch)/60
        # An empty loader yields no steps; report NaN instead of raising
        # ZeroDivisionError.
        epoch_loss = tr_loss/nb_tr_steps if nb_tr_steps else float('nan')
        epoch_accu = (n_correct*100)/nb_tr_examples if nb_tr_examples else float('nan')
        print(f"{epoch + 1:^7} | {epoch_loss:^12.6f} | {epoch_accu:^14.6f} | {time_elapsed:^9.2f}")
 

def valid(model, testing_loader):
    """Evaluate ``model`` on ``testing_loader`` and collect per-example predictions.

    Relies on the notebook-level globals ``device`` and ``loss_function``.

    Args:
        model: Module called as ``model(ids, mask, token_type_ids)`` returning logits.
        testing_loader: Iterable of batches; each batch is a dict with the keys
            ``ids``, ``mask``, ``token_type_ids``, ``targets`` and ``text``.

    Returns:
        List of dicts with the keys ``text``, ``predicted`` and ``target``; the
        two labels are the strings produced by ``switch_issue``.
    """
    # These are evaluation metrics, so the header must not call them "Train ...".
    print(f"{'Valid Loss':^12} | {'Valid Accuracy':^14} | {'Elapsed':^9}")
    print("-"*41)
    model.eval()
    t0_epoch = time.time()
    total_loss = 0.0
    n_correct = 0
    n_steps = 0
    n_examples = 0
    predictions = []
    with torch.no_grad():
        for data in testing_loader:
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.long)
            text = data['text']
            outputs = model(ids, mask, token_type_ids)

            loss = loss_function(outputs, targets)
            total_loss += loss.item()
            _, big_idx = torch.max(outputs, dim=1)
            n_correct += calcuate_accu(big_idx, targets)

            n_steps += 1
            n_examples += targets.size(0)

            for sample_text, predicted_idx, target_idx in zip(text, big_idx.tolist(), targets.tolist()):
                predictions.append({
                    'text': sample_text,
                    'predicted': switch_issue(predicted_idx),
                    'target': switch_issue(target_idx)
                })

    time_elapsed = (time.time() - t0_epoch)/60
    # An empty loader yields no steps; report NaN instead of raising
    # ZeroDivisionError.
    epoch_loss = total_loss/n_steps if n_steps else float('nan')
    epoch_accu = (n_correct*100)/n_examples if n_examples else float('nan')
    print(f"{epoch_loss:^12.6f} | {epoch_accu:^14.6f} | {time_elapsed:^9.2f}")
    return predictions

def switch_issue(case):
    """Translate an encoded class into its label string.

    Args:
        case: Encoded class value.

    Returns:
        ``"Product Issue"`` when ``case`` equals 0, ``"Seller Issue"`` for
        every other value.
    """
    return "Product Issue" if case == 0 else "Seller Issue"

### Save Files

In [None]:
def saveModel(model, path, tokenizer=None):
    """Save the model weights (and optionally the tokenizer) under ``path``.

    Args:
        model: Module whose ``state_dict()`` is written to ``<path>/model.pth``.
        path: Target directory; it is created when it does not exist yet.
        tokenizer: Optional object exposing ``save_pretrained``. When given, it
            is saved to ``<path>/vocab``, the location ``loadModel`` reads from.
            When omitted, only the weights are written (the original behaviour).
    """
    import os  # local import: the notebook's import cell does not provide os

    MODEL_PATH = path+'/model.pth'
    VOCAB_PATH = path+'/vocab'
    # torch.save does not create missing parent directories.
    os.makedirs(path, exist_ok=True)
    torch.save(model.state_dict(), MODEL_PATH)
    if tokenizer is not None:
        tokenizer.save_pretrained(VOCAB_PATH)
    print('All files saved')

def loadModel(path, tokenizer_name='roberta-base'):
    """Load saved weights into the notebook-level ``model`` and load a tokenizer.

    Relies on the global ``model`` already being constructed; its parameters
    are overwritten in place with the contents of ``<path>/model.pth``.

    Args:
        path: Directory holding ``model.pth`` and, optionally, a ``vocab``
            directory with a saved tokenizer.
        tokenizer_name: Checkpoint name used for the tokenizer when
            ``<path>/vocab`` does not exist. Defaults to ``'roberta-base'``,
            the tokenizer this notebook trains with.

    Returns:
        Tuple ``(model, tokenizer)``.
    """
    import os  # local import: the notebook's import cell does not provide os

    MODEL_PATH = path+'/model.pth'
    VOCAB_PATH = path+'/vocab'
    # Deserialize onto the CPU so that weights saved from a GPU run also load
    # on a CPU-only runtime; load_state_dict then copies the values into the
    # model's existing parameters.
    model.load_state_dict(torch.load(MODEL_PATH, map_location='cpu'))
    # The vocab directory only exists if a tokenizer was saved alongside the
    # weights; otherwise fall back to the named pretrained tokenizer.
    tokenizer_source = VOCAB_PATH if os.path.isdir(VOCAB_PATH) else tokenizer_name
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_source)

    return model, tokenizer

In [None]:
def saveCSVValidationResult(predictions, path):
    """Write the prediction records to ``<path>/result.csv``.

    Args:
        predictions: Iterable of dicts with the keys ``text``, ``predicted``
            and ``target``.
        path: Directory that receives ``result.csv`` (UTF-8, overwritten).
    """
    csv_path = path+"/result.csv"
    header = ['Text', 'Predicted', 'Target']
    # Lazy row generator: rows are produced one at a time while writing.
    rows = ([entry['text'], entry['predicted'], entry['target']] for entry in predictions)
    with open(csv_path, mode='w', newline='', encoding='utf-8') as handle:
        out = csv.writer(handle)
        out.writerow(header)
        out.writerows(rows)