# Amazon Reviews: Tagging Negative Reviews in Amazon Product Reviews with a BERT Model

# Introduction

### Connecting to Google Colab

In [2]:
# Mount Google Drive at /content/drive; the dataset and model paths used in
# this notebook all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Use the %pip magic (rather than !pip) so the package is installed into the
# environment of the running kernel.
# NOTE(review): the version is unpinned, so re-runs may pull a different
# transformers release -- consider pinning once the working version is known.
%pip install -q transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m82.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 KB[0m [31m26.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m104.6 MB/s[0m eta [36m0:00:00[0m
[?25h

### Importing Libraries

In [4]:
import csv
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.metrics import classification_report
import time
import torch
from tabulate import tabulate
from torch import cuda
from torch.utils.data import Dataset, DataLoader
import transformers
from transformers import AutoTokenizer, AutoModel


### Check Device

In [5]:
# Pick the compute device once: the GPU when CUDA is usable, the CPU otherwise.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

### Dataset

In [6]:
# Paths of the encoded train / validation CSV files on the mounted Drive.
# NOTE: despite the `df_` prefix these are path strings, not DataFrames;
# get_data_loaders() loads them with pd.read_csv. The validation loader is
# built from the "*_test.csv" file.
df_train = "/content/drive/MyDrive/Masters_Thesis/Dataset/encoded_data/encoded_topic(seller)2subTopic_train.csv"
df_val = "/content/drive/MyDrive/Masters_Thesis/Dataset/encoded_data/encoded_topic(seller)2subTopic_test.csv"
# Name of the label column that CustomDataset reads from each DataFrame
# (a single column name, even though the variable is called `target_list`).
target_list = "encode_sub_seller_topic"

In [12]:
# NOTE: get_data_loaders / CustomDataset are defined in a cell further down
# this notebook ("Data Preprocessing & DataLoader", In [7]); that cell must be
# run before this one, so a top-to-bottom "Run All" fails as the file is laid out.
tokenizer = AutoTokenizer.from_pretrained('roberta-base')
# Batch size 8 for both loaders; max_len keeps get_data_loaders' default (512).
train_loader, valid_loader = get_data_loaders(df_train, df_val, tokenizer, train_batch_size = 8, valid_batch_size = 8)

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

RobertaTokenizerFast(name_or_path='roberta-base', vocab_size=50265, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)})


### Model

In [13]:
# NOTE: BERTModel is defined in a later cell of this file ("BERT Model", In [8]);
# run that cell first.
model = BERTModel()
model.to(device)
# train() and valid() read `loss_function`, `optimizer` and `device` as
# module-level globals, so these names must exist before either is called.
loss_function = torch.nn.CrossEntropyLoss()
LEARNING_RATE = 2e-5
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

Downloading pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [14]:
# Fine-tune for 20 epochs; train() prints one row (loss, accuracy %, minutes) per epoch.
# NOTE(review): the recorded output below lists only 18 of the 20 epochs --
# the run may have been interrupted; confirm before relying on the saved model.
train(total_epoch=20, model=model, train_loader=train_loader)

 Epoch  |  Train Loss  | Train Accuracy |  Elapsed 
---------------------------------------------------
   1    |   0.117592   |  95.571732   |   1.39   
   2    |   0.031916   |  99.059751   |   1.32   
   3    |   0.024668   |  99.363057   |   1.31   
   4    |   0.017896   |  99.514710   |   1.31   
   5    |   0.012348   |  99.757355   |   1.31   
   6    |   0.010203   |  99.727025   |   1.31   
   7    |   0.005108   |  99.848347   |   1.31   
   8    |   0.003548   |  99.939339   |   1.31   
   9    |   0.013344   |  99.727025   |   1.31   
  10    |   0.010972   |  99.818016   |   1.31   
  11    |   0.005969   |  99.818016   |   1.31   
  12    |   0.015679   |  99.696694   |   1.31   
  13    |   0.007551   |  99.787686   |   1.31   
  14    |   0.004824   |  99.878678   |   1.31   
  15    |   0.002386   |  99.878678   |   1.31   
  16    |   0.005537   |  99.878678   |   1.31   
  17    |   0.007919   |  99.878678   |   1.31   
  18    |   0.003897   |  99.939339   |   1.31

In [15]:
# Evaluate on the validation loader. valid() returns one dict per example with
# the keys 'text', 'predicted' and 'target' (labels already mapped to strings).
predictions = valid(model, valid_loader) 

 Train Loss  | Train Accuracy |  Elapsed 
-----------------------------------------
  0.063883   |   99.271845    |   0.10   


In [16]:
# Output folder on Drive for this run; saveModel writes `model.pth` into it and
# the next cell writes `result.csv` into the same folder.
path = "/content/drive/MyDrive/Masters_Thesis/Models/BERT/Topic(Seller)2SubTopic/Topic(Seller)2SubTopic_only_back_01"
saveModel(model=model, path=path)

All files saved


In [17]:
# Write the per-example validation predictions to `<path>/result.csv`.
saveCSVValidationResult(predictions,path)

### Data Preprocessing & DataLoader

In [7]:
class CustomDataset(Dataset):
    """Tokenizes review texts on demand and pairs each with its class label.

    Args:
        df: DataFrame with a ``reviewText`` column and a label column.
        tokenizer: Tokenizer object exposing ``encode_plus`` (the notebook
            passes a Hugging Face ``AutoTokenizer``).
        max_len: Length every encoded sequence is padded / truncated to.
        target_col: Name of the label column. When omitted, the notebook-level
            ``target_list`` variable is used (original behaviour).
    """

    def __init__(self, df, tokenizer, max_len, target_col=None):
        if target_col is None:
            # Backward-compatible fallback to the global set in the dataset cell.
            target_col = target_list
        self.df = df
        self.max_len = max_len
        self.text = df.reviewText
        self.tokenizer = tokenizer
        self.targets = df[target_col].values

    def __len__(self):
        """Number of rows (examples) in the underlying DataFrame."""
        return len(self.df)

    def __getitem__(self, index):
        """Return the encoded example at *position* ``index``.

        Returns:
            dict with long tensors ``ids``, ``mask``, ``token_type_ids`` and
            ``targets``, plus the raw ``text`` string.
        """
        # Positional lookup: DataLoader samplers yield positions 0..len-1, while
        # plain ``series[index]`` is label-based and fails (or returns the wrong
        # row) whenever the DataFrame index is not the default 0..n-1 range.
        text = self.text.iloc[index]
        inputs = self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True
        )
        # encode_plus returns a dict-like holding ids, attention mask and token type ids.
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]
        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.long),
            'text' : text
        }

def get_data_loaders(train_dataframe, valid_dataframe, tokenizer, max_len=512, train_batch_size=16, valid_batch_size=16, learning_rate=2e-5):
    """Create the training and validation DataLoaders from two CSV files.

    Args:
        train_dataframe: Path of the training CSV (passed to ``pd.read_csv``).
        valid_dataframe: Path of the validation CSV (passed to ``pd.read_csv``).
        tokenizer: Tokenizer handed to each ``CustomDataset``; it is also printed.
        max_len: Sequence length forwarded to ``CustomDataset``.
        train_batch_size: Batch size of the (shuffled) training loader.
        valid_batch_size: Batch size of the (unshuffled) validation loader.
        learning_rate: Accepted for compatibility; not used by this function.

    Returns:
        Tuple ``(train_loader, valid_loader)``.
    """
    # Read both files first, then echo the tokenizer, then build the datasets.
    frames = [pd.read_csv(csv_path) for csv_path in (train_dataframe, valid_dataframe)]
    print(tokenizer)
    train_set, valid_set = (CustomDataset(frame, tokenizer, max_len) for frame in frames)

    # Settings shared by both loaders.
    shared = dict(num_workers=1, pin_memory=True)
    return (
        DataLoader(train_set, batch_size=train_batch_size, shuffle=True, **shared),
        DataLoader(valid_set, batch_size=valid_batch_size, shuffle=False, **shared),
    )

### BERT Model

In [8]:
class BERTModel(torch.nn.Module):
    """Sequence classifier: pretrained encoder -> dropout -> linear head.

    Args:
        num_labels: Number of output classes (default 2, as before).
        model_name: Checkpoint passed to ``AutoModel.from_pretrained``
            (default ``'roberta-base'``, as before).
        dropout: Dropout probability applied to the pooled features
            (default 0.5, as before).

    The attribute names ``roberta``, ``dropout`` and ``fc`` are kept so that
    state_dicts saved from the original class still load.
    """

    def __init__(self, num_labels=2, model_name='roberta-base', dropout=0.5):
        super(BERTModel, self).__init__()
        self.roberta = AutoModel.from_pretrained(model_name)
        self.dropout = torch.nn.Dropout(dropout)
        # Take the head's input width from the encoder config instead of
        # hard-coding 768, so other checkpoint sizes work too.
        self.fc = torch.nn.Linear(self.roberta.config.hidden_size, num_labels)

    def forward(self, ids, mask, token_type_ids):
        """Return raw class scores (logits) for a batch.

        Args:
            ids: Token id tensor.
            mask: Attention mask tensor.
            token_type_ids: Token type id tensor.
        """
        # With return_dict=False the encoder returns a tuple; the second element
        # is used as the per-sequence feature vector (the encoder must provide it).
        encoder_out = self.roberta(ids, attention_mask=mask, token_type_ids=token_type_ids, return_dict=False)
        features = self.dropout(encoder_out[1])
        return self.fc(features)

### Training & Validation

In [9]:

def calcuate_accu(big_idx, targets):
    """Count how many predicted class indices equal their targets.

    Args:
        big_idx: Predicted class indices (torch tensors in this notebook).
        targets: Ground-truth class indices, same shape as ``big_idx``.

    Returns:
        The number of matching positions, as returned by ``.sum().item()``.
    """
    matches = big_idx == targets
    return matches.sum().item()

def train(total_epoch, model, train_loader, criterion=None, optim=None, run_device=None):
    """Train ``model`` and print one metrics row per epoch.

    Args:
        total_epoch: Number of passes over ``train_loader``.
        model: Module called as ``model(ids, mask, token_type_ids)``.
        train_loader: Iterable of batches; each batch is a mapping with the
            keys ``'ids'``, ``'mask'``, ``'token_type_ids'`` and ``'targets'``.
        criterion: Loss callable. Defaults to the notebook-level ``loss_function``.
        optim: Optimizer. Defaults to the notebook-level ``optimizer``.
        run_device: Device the batches are moved to. Defaults to the
            notebook-level ``device``.

    Returns:
        None. Loss (mean over batches), accuracy (%) and elapsed minutes are
        printed for every epoch.
    """
    # The original relied on hidden globals; keep them as the fallback so
    # existing calls behave the same.
    criterion = loss_function if criterion is None else criterion
    optim = optimizer if optim is None else optim
    run_device = device if run_device is None else run_device

    # Column widths are 7 / 12 / 14 / 9; rows use the same widths so the table
    # lines up (the accuracy column was previously formatted 12 wide).
    print(f"{'Epoch':^7} | {'Train Loss':^12} | {'Train Accuracy':^14} | {'Elapsed':^9}")
    print("-"*51)
    for epoch in range(total_epoch):
        t0_epoch = time.time()
        tr_loss = 0.0
        n_correct = 0
        nb_tr_steps = 0
        nb_tr_examples = 0
        model.train()
        for data in train_loader:
            ids = data['ids'].to(run_device, dtype=torch.long)
            mask = data['mask'].to(run_device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(run_device, dtype=torch.long)
            targets = data['targets'].to(run_device, dtype=torch.long)

            outputs = model(ids, mask, token_type_ids)

            loss = criterion(outputs, targets)
            tr_loss += loss.item()
            # detach() replaces the legacy `.data` access; only the indices are needed.
            _, big_idx = torch.max(outputs.detach(), dim=1)
            n_correct += (big_idx == targets).sum().item()

            nb_tr_steps += 1
            nb_tr_examples += targets.size(0)

            optim.zero_grad()
            loss.backward()
            optim.step()

        time_elapsed = (time.time() - t0_epoch)/60
        # An empty loader would otherwise raise ZeroDivisionError; report NaN instead.
        epoch_loss = tr_loss/nb_tr_steps if nb_tr_steps else float('nan')
        epoch_accu = (n_correct*100)/nb_tr_examples if nb_tr_examples else float('nan')
        print(f"{epoch + 1:^7} | {epoch_loss:^12.6f} | {epoch_accu:^14.6f} | {time_elapsed:^9.2f}")
 

def valid(model, testing_loader, criterion=None, run_device=None):
    """Evaluate ``model`` on ``testing_loader`` and collect per-example predictions.

    Args:
        model: Module called as ``model(ids, mask, token_type_ids)``.
        testing_loader: Iterable of batches; each batch is a mapping with the
            keys ``'ids'``, ``'mask'``, ``'token_type_ids'``, ``'targets'``
            and ``'text'``.
        criterion: Loss callable. Defaults to the notebook-level ``loss_function``.
        run_device: Device the batches are moved to. Defaults to the
            notebook-level ``device``.

    Returns:
        List of dicts with the keys ``'text'``, ``'predicted'`` and ``'target'``;
        the two labels are mapped to strings through ``switch_issue``.
    """
    # Fall back to the globals the original implementation used.
    criterion = loss_function if criterion is None else criterion
    run_device = device if run_device is None else run_device

    # These are validation metrics; the header previously said "Train ...".
    print(f"{'Valid Loss':^12} | {'Valid Accuracy':^14} | {'Elapsed':^9}")
    print("-"*41)
    model.eval()
    t0_epoch = time.time()
    total_loss = 0.0
    n_correct = 0
    n_steps = 0
    n_examples = 0
    predictions = []
    with torch.no_grad():
        for data in testing_loader:
            ids = data['ids'].to(run_device, dtype=torch.long)
            mask = data['mask'].to(run_device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(run_device, dtype=torch.long)
            targets = data['targets'].to(run_device, dtype=torch.long)
            text = data['text']
            outputs = model(ids, mask, token_type_ids)

            loss = criterion(outputs, targets)
            total_loss += loss.item()
            # No `.data` needed: gradients are already disabled by no_grad().
            _, big_idx = torch.max(outputs, dim=1)
            n_correct += (big_idx == targets).sum().item()

            n_steps += 1
            n_examples += targets.size(0)

            for review, pred, gold in zip(text, big_idx.tolist(), targets.tolist()):
                predictions.append({
                    'text': review,
                    'predicted': switch_issue(pred),
                    'target': switch_issue(gold)
                })

    time_elapsed = (time.time() - t0_epoch)/60
    # An empty loader would otherwise raise ZeroDivisionError; report NaN instead.
    epoch_loss = total_loss/n_steps if n_steps else float('nan')
    epoch_accu = (n_correct*100)/n_examples if n_examples else float('nan')
    print(f"{epoch_loss:^12.6f} | {epoch_accu:^14.6f} | {time_elapsed:^9.2f}")
    return predictions

def switch_issue(issue_type):
    """Translate an encoded class id into its human-readable issue name.

    Args:
        issue_type: Encoded label; 0 and 1 are the known ids.

    Returns:
        The issue name, or ``"Invalid Issue Type"`` for any other id.
    """
    names = {
        0: 'Product Authenticity Issue',
        1: 'Delivery and Return Issue',
    }
    try:
        return names[issue_type]
    except KeyError:
        return "Invalid Issue Type"

### Save Files

In [10]:
def saveModel(model, path, tokenizer=None):
    """Save the model weights (and optionally the tokenizer) under ``path``.

    Args:
        model: Module whose ``state_dict()`` is written to ``<path>/model.pth``.
        path: Target directory; it is created when it does not exist yet.
        tokenizer: Optional object with ``save_pretrained``. When given, it is
            written to ``<path>/vocab``, the location ``loadModel`` reads
            from. When omitted only the weights are saved (original behaviour).
    """
    import os  # local import keeps this cell self-contained

    MODEL_PATH = os.path.join(path, 'model.pth')
    VOCAB_PATH = os.path.join(path, 'vocab')
    # torch.save does not create missing parent directories.
    os.makedirs(path, exist_ok=True)
    torch.save(model.state_dict(), MODEL_PATH)
    if tokenizer is not None:
        tokenizer.save_pretrained(VOCAB_PATH)
    print('All files saved')

def loadModel(path, tokenizer_name='roberta-base'):
    """Load saved weights into the notebook-level ``model`` and return a tokenizer.

    Args:
        path: Directory containing ``model.pth`` (and optionally ``vocab``).
        tokenizer_name: Checkpoint used for the tokenizer when ``<path>/vocab``
            does not exist. Defaults to ``'roberta-base'``, the tokenizer this
            notebook trains with.

    Returns:
        Tuple ``(model, tokenizer)``. ``model`` is the already-constructed
        global ``model`` object, updated in place.
    """
    import os  # local import keeps this cell self-contained

    MODEL_PATH = os.path.join(path, 'model.pth')
    VOCAB_PATH = os.path.join(path, 'vocab')
    # NOTE(security): torch.load unpickles the file -- only load checkpoints you trust.
    # map_location='cpu' lets a checkpoint saved on a GPU runtime load on a
    # CPU-only one; load_state_dict then copies the values into the model's
    # own parameters on whatever device they live on.
    state_dict = torch.load(MODEL_PATH, map_location='cpu')
    model.load_state_dict(state_dict)
    if os.path.isdir(VOCAB_PATH):
        tokenizer = AutoTokenizer.from_pretrained(VOCAB_PATH)
    else:
        # No tokenizer was saved next to the weights; use the base tokenizer.
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    return model, tokenizer

In [11]:
def saveCSVValidationResult(predictions, path):
    """Write validation predictions to ``<path>/result.csv``.

    Args:
        predictions: Iterable of mappings with the keys ``'text'``,
            ``'predicted'`` and ``'target'``.
        path: Existing directory that receives ``result.csv``.

    The file is UTF-8 encoded and starts with the header row
    ``Text, Predicted, Target``.
    """
    csv_path = path+"/result.csv"
    # Lazily turn each prediction into one CSV row.
    rows = ([item['text'], item['predicted'], item['target']] for item in predictions)
    with open(csv_path, mode='w', newline='', encoding='utf-8') as handle:
        out = csv.writer(handle)
        out.writerow(['Text', 'Predicted', 'Target'])
        out.writerows(rows)