# Amazon Review - Tagging Negative Reviews in Amazon Product Reviews with a DistilBERT Model

# Introduction

### Connecting to Google Colab

In [None]:
# Mount Google Drive at /content/drive; the dataset CSV paths and the model
# output folder used further down all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install -q transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m91.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m97.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m24.9 MB/s[0m eta [36m0:00:00[0m
[?25h

### Importing Libraries

In [None]:
import csv
import os
import time

import numpy as np
import pandas as pd
import torch
import transformers
from sklearn import metrics
from torch import cuda
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertModel, DistilBertTokenizer

### Check Device

In [None]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

### Dataset

In [None]:
# Despite the `df_` prefix these are CSV *paths*; get_data_loaders reads them
# with pandas.
df_train = (
    "/content/drive/MyDrive/Masters_Thesis/Dataset/encoded_data/"
    "encoded_sub_topic_train.csv"
)
df_val = (
    "/content/drive/MyDrive/Masters_Thesis/Dataset/encoded_data/"
    "encoded_sub_topic_test.csv"
)
# Name of the label column that CustomDataset reads from each CSV.
target_list = "encoded_sub_topic"

In [None]:
# The tokenizer vocabulary has to match the checkpoint that DistilBERTClass
# loads ("distilbert-base-uncased"). Tokenizing with the cased vocabulary would
# feed the uncased model token ids that its pretrained embeddings were not
# trained on.
# NOTE: get_data_loaders is defined in a later cell ("Data Preprocessing &
# DataLoader"); on a fresh kernel that cell has to be executed before this one.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
train_loader, valid_loader = get_data_loaders(
    df_train, df_val, tokenizer, train_batch_size=8, valid_batch_size=8
)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

DistilBertTokenizer(name_or_path='distilbert-base-cased', vocab_size=28996, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)


### Model Implementation

In [None]:
# Build the classifier, move it to the selected device, and create the loss
# and optimizer that train()/valid() pick up as notebook-level names.
# NOTE: DistilBERTClass is defined in a later cell ("DistilBERT Model").
LEARNING_RATE = 2e-5
model = DistilBERTClass().to(device)
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Fine-tune for up to 20 epochs; train() prints one table row per epoch.
# NOTE(review): the stored output below lists only 18 rows, so the recorded
# run presumably stopped before finishing - confirm before quoting results.
train(total_epoch=20, model=model, train_loader=train_loader)

 Epoch  |  Train Loss  | Train Accuracy |  Elapsed 
---------------------------------------------------
   1    |   0.761247   |  71.364862   |   7.32   
   2    |   0.421138   |  86.112840   |   7.51   
   3    |   0.292861   |  90.997718   |   7.51   
   4    |   0.207193   |  93.777225   |   7.48   
   5    |   0.145328   |  95.799627   |   7.49   
   6    |   0.106660   |  96.764157   |   7.48   
   7    |   0.083526   |  97.656088   |   7.48   
   8    |   0.061753   |  98.143539   |   7.49   
   9    |   0.053674   |  98.485791   |   7.49   
  10    |   0.050326   |  98.548019   |   7.48   
  11    |   0.044843   |  98.786559   |   7.47   
  12    |   0.035328   |  98.973242   |   7.49   
  13    |   0.035734   |  98.807301   |   7.49   
  14    |   0.033680   |  98.796930   |   7.48   
  15    |   0.028589   |  98.952499   |   7.48   
  16    |   0.030301   |  99.025099   |   7.48   
  17    |   0.024558   |  99.045841   |   7.48   
  18    |   0.024653   |  99.139183   |   7.48

In [None]:
# Evaluate on the validation loader; valid() prints loss/accuracy and returns
# one dict per example with the keys 'text', 'predicted' and 'target'.
predictions = valid(model, valid_loader)

 Valid Loss  | Valid Accuracy |  Elapsed 
-----------------------------------------
  0.848262   |   86.146827    |   0.69   


In [None]:
# Base output folder on Drive; saveModel appends
# '/SubTopic/back_only_01/model.pth' to it and stores the model's state_dict.
path = "/content/drive/MyDrive/Masters_Thesis/Models/DISTILBERT"
saveModel(model=model, path=path)

All files saved


In [None]:
# Write the per-example validation predictions to
# <path>/SubTopic/back_only_01/result.csv.
saveCSVValidationResult(predictions,path)

### Data Preprocessing & DataLoader

In [None]:
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one review per item.

    Args:
        df: DataFrame with a ``reviewText`` column and a label column.
        tokenizer: Callable tokenizer; it is invoked with the review text and
            must return a mapping containing ``input_ids``,
            ``attention_mask`` and ``token_type_ids``.
        max_len: Length every sequence is padded / truncated to.
        target_col: Name of the label column. When omitted, the notebook-level
            ``target_list`` name is used, which is what the original code did.

    Each item is a dict with the tensors ``ids``, ``mask``,
    ``token_type_ids`` and ``targets`` (all ``torch.long``) plus the raw
    ``text``.
    """

    def __init__(self, df, tokenizer, max_len, target_col=None):
        self.df = df
        self.max_len = max_len
        self.text = df.reviewText
        self.tokenizer = tokenizer
        label_column = target_list if target_col is None else target_col
        self.targets = df[label_column].values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        # DataLoader samplers hand out positions 0..len-1, so look the text up
        # by position. Label-based ``self.text[index]`` raises KeyError as soon
        # as the frame does not carry a default 0..n-1 index (e.g. after
        # filtering rows).
        text = self.text.iloc[index]
        # Calling the tokenizer directly is the supported replacement for the
        # deprecated ``encode_plus`` and yields the same fields.
        inputs = self.tokenizer(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True
        )
        # The encoding behaves like a dict holding ids, attention mask and
        # token type ids.
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]
        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.long),
            'text' : text
        }

def get_data_loaders(train_dataframe, valid_dataframe, tokenizer, max_len=512, train_batch_size=16, valid_batch_size=16, learning_rate=2e-5):
    """Build the training and validation DataLoaders.

    Args:
        train_dataframe: CSV path (as before) or an already loaded DataFrame
            holding the training examples.
        valid_dataframe: CSV path or DataFrame holding the validation examples.
        tokenizer: Tokenizer handed to CustomDataset.
        max_len: Sequence length passed to CustomDataset.
        train_batch_size: Batch size of the (shuffled) training loader.
        valid_batch_size: Batch size of the (unshuffled) validation loader.
        learning_rate: Unused; kept only so existing calls keep working.

    Returns:
        Tuple ``(train_loader, valid_loader)``.
    """

    def _as_frame(source):
        # Accept an in-memory frame as well as a path. A passed frame gets a
        # fresh 0..n-1 index so positional and label lookups agree downstream.
        if isinstance(source, pd.DataFrame):
            return source.reset_index(drop=True)
        return pd.read_csv(source)

    train_frame = _as_frame(train_dataframe)
    valid_frame = _as_frame(valid_dataframe)
    print(tokenizer)

    # Pinned host memory only helps host-to-GPU copies; requesting it on a
    # CPU-only runtime just triggers a warning.
    pin = torch.cuda.is_available()

    train_dataset = CustomDataset(train_frame, tokenizer, max_len)
    valid_dataset = CustomDataset(valid_frame, tokenizer, max_len)
    train_loader = DataLoader(train_dataset, batch_size=train_batch_size,
                              num_workers=1, shuffle=True, pin_memory=pin)
    valid_loader = DataLoader(valid_dataset, batch_size=valid_batch_size,
                              num_workers=1, shuffle=False, pin_memory=pin)

    return train_loader, valid_loader

### DistilBERT Model

In [None]:
# Creating the customized model, by adding a drop out and a dense layer on top of distil bert to get the final output for the model. 

class DistilBERTClass(torch.nn.Module):
    """DistilBERT encoder followed by a small classification head.

    The head is Linear -> ReLU -> Dropout -> Linear and is applied to the
    encoder output at sequence position 0.

    Args:
        num_classes: Number of output logits (default 5, the number of issue
            types in ``switch_issue``).
        dropout: Dropout probability used inside the head (default 0.3).
        model_name: Checkpoint passed to ``DistilBertModel.from_pretrained``.

    The attribute names (``l1``, ``pre_classifier``, ``dropout``,
    ``classifier``) are unchanged, so previously saved state dicts still load.
    """

    def __init__(self, num_classes=5, dropout=0.3, model_name="distilbert-base-uncased"):
        super().__init__()
        self.l1 = DistilBertModel.from_pretrained(model_name)
        # Read the encoder width from its config instead of hard-coding 768 so
        # other DistilBERT checkpoints work as well.
        hidden_size = self.l1.config.dim
        self.pre_classifier = torch.nn.Linear(hidden_size, hidden_size)
        self.dropout = torch.nn.Dropout(dropout)
        self.classifier = torch.nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        """Return the class logits for a batch.

        ``token_type_ids`` is accepted so existing three-argument calls keep
        working, but it is not passed to the encoder.
        """
        encoder_out = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = encoder_out[0]
        # Use the vector at the first sequence position as the pooled summary.
        pooled = hidden_state[:, 0]
        pooled = self.pre_classifier(pooled)
        pooled = torch.nn.functional.relu(pooled)
        pooled = self.dropout(pooled)
        return self.classifier(pooled)

### Training & Validation

In [None]:

def calcuate_accu(big_idx, targets):
    """Return how many entries of ``big_idx`` equal the matching ``targets`` entry."""
    is_hit = big_idx == targets
    return is_hit.sum().item()

def train(total_epoch, model, train_loader):
    """Train ``model`` for ``total_epoch`` passes over ``train_loader``.

    Prints a table with one row per epoch: mean batch loss, accuracy in
    percent, and elapsed minutes. Returns nothing.

    Relies on the notebook-level names ``device``, ``loss_function``,
    ``optimizer`` and ``calcuate_accu``.

    Args:
        total_epoch: Number of epochs to run.
        model: Module called as ``model(ids, mask, token_type_ids)``.
        train_loader: Iterable of batches shaped like the dicts produced by
            ``CustomDataset``.
    """
    print(f"{'Epoch':^7} | {'Train Loss':^12} | {'Train Accuracy':^14} | {'Elapsed':^9}")
    print("-"*51)
    for epoch in range(total_epoch):
        t0_epoch = time.time()
        tr_loss = 0
        n_correct = 0
        nb_tr_steps = 0
        nb_tr_examples = 0
        model.train()
        for data in train_loader:
            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype=torch.long)

            outputs = model(ids, mask, token_type_ids)

            loss = loss_function(outputs, targets)
            tr_loss += loss.item()
            # detach() replaces the legacy ``.data`` access; argmax yields the
            # predicted class index for every row.
            big_idx = outputs.detach().argmax(dim=1)
            n_correct += calcuate_accu(big_idx, targets)

            nb_tr_steps += 1
            nb_tr_examples += targets.size(0)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        time_elapsed = (time.time() - t0_epoch)/60
        # max(..., 1) keeps an empty loader from raising ZeroDivisionError; the
        # row then simply reports zeros.
        epoch_loss = tr_loss / max(nb_tr_steps, 1)
        epoch_accu = (n_correct*100) / max(nb_tr_examples, 1)
        # The accuracy column is 14 wide so the rows line up with the header.
        print(f"{epoch + 1:^7} | {epoch_loss:^12.6f} | {epoch_accu:^14.6f} | {time_elapsed:^9.2f}")
 

def valid(model, testing_loader):
    """Evaluate ``model`` on ``testing_loader`` and collect its predictions.

    Prints a one-row table with mean batch loss, accuracy in percent, and
    elapsed minutes.

    Relies on the notebook-level names ``device``, ``loss_function``,
    ``calcuate_accu`` and ``switch_issue``.

    Args:
        model: Module called as ``model(ids, mask, token_type_ids)``.
        testing_loader: Iterable of batches shaped like the dicts produced by
            ``CustomDataset``.

    Returns:
        A list with one dict per example holding ``text``, ``predicted`` and
        ``target``; the latter two are the names returned by ``switch_issue``.
    """
    print(f"{'Valid Loss':^12} | {'Valid Accuracy':^14} | {'Elapsed':^9}")
    print("-"*41)
    model.eval()
    t0_epoch = time.time()
    total_loss = 0
    n_correct = 0
    n_steps = 0
    n_examples = 0
    predictions = []
    with torch.no_grad():
        for data in testing_loader:
            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype=torch.long)
            text = data['text']
            outputs = model(ids, mask, token_type_ids)

            loss = loss_function(outputs, targets)
            total_loss += loss.item()
            # Predicted class index per row (replaces the legacy
            # ``torch.max(outputs.data, dim=1)`` form).
            big_idx = outputs.argmax(dim=1)
            n_correct += calcuate_accu(big_idx, targets)

            n_steps += 1
            n_examples += targets.size(0)

            for i in range(len(text)):
                predictions.append({
                    'text': text[i],
                    'predicted': switch_issue(big_idx[i].item()),
                    'target': switch_issue(targets[i].item())
                    })

    time_elapsed = (time.time() - t0_epoch)/60
    # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
    epoch_loss = total_loss / max(n_steps, 1)
    epoch_accu = (n_correct*100) / max(n_examples, 1)
    print(f"{epoch_loss:^12.6f} | {epoch_accu:^14.6f} | {time_elapsed:^9.2f}")
    return predictions

def switch_issue(issue_type):
    """Translate an encoded issue id (0-4) into its readable issue name.

    Any other value yields "Invalid Issue Type".
    """
    issue_names = dict(enumerate((
        'Product Authenticity Issue',
        'Quality Issue',
        'Design Issue',
        'Delivery and Return Issue',
        'Product Description Issue',
    )))
    if issue_type in issue_names:
        return issue_names[issue_type]
    return "Invalid Issue Type"

### Save Files

In [None]:
def saveModel(model, path):
  """Save ``model.state_dict()`` to ``<path>/SubTopic/back_only_01/model.pth``.

  The target folder is created when it does not exist yet, so saving into a
  fresh output directory no longer fails.

  Args:
    model: Module whose ``state_dict()`` is stored.
    path: Base output directory (string).
  """
  MODEL_PATH = path+'/SubTopic/back_only_01/model.pth'
  os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
  torch.save(model.state_dict(), MODEL_PATH)
  print('All files saved')

def loadModel(path, model=None):
  """Load weights from ``<path>/SubTopic/back_only_01/model.pth`` into a model.

  Args:
    path: Base directory that was given to ``saveModel``.
    model: Module to receive the weights. When omitted, the notebook-level
      ``model`` object is used, which is what the original code did.

  Returns:
    The same model instance with the loaded weights.

  Raises:
    NameError: if ``model`` is omitted and no notebook-level ``model`` exists.
  """
  MODEL_PATH = path+'/SubTopic/back_only_01/model.pth'
  if model is None:
    try:
      model = globals()['model']
    except KeyError:
      raise NameError("name 'model' is not defined") from None
  # Deserialize onto the CPU so a checkpoint written on a GPU runtime also
  # loads on a CPU-only one; load_state_dict then copies the values into the
  # model's own parameters.
  state_dict = torch.load(MODEL_PATH, map_location='cpu')
  model.load_state_dict(state_dict)

  return model

In [None]:
def saveCSVValidationResult(predictions, path):
    """Write validation predictions to ``<path>/SubTopic/back_only_01/result.csv``.

    The file starts with the header row ``Text, Predicted, Target`` followed
    by one row per prediction. The target folder is created when missing.

    Args:
        predictions: Iterable of dicts with the keys ``text``, ``predicted``
            and ``target`` (the structure returned by ``valid``).
        path: Base output directory (string).
    """
    path = path+"/SubTopic/back_only_01/result.csv"
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['Text', 'Predicted', 'Target'])
        writer.writerows(
            [example['text'], example['predicted'], example['target']]
            for example in predictions
        )