# Amazon Reviews: Tagging Negative Reviews in Amazon Product Reviews with a CNN Model

# Introduction


### Connecting to Google Colab

In [None]:
# Mount Google Drive at /content/drive; the dataset and model paths used
# later in this notebook all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Importing Libraries

In [None]:
import csv
import functools
import os
import sys
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torchtext
import tqdm
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader


### Dataset

In [None]:
# Both objects are notebook-level globals: `tokenizer` is read inside
# get_data_loaders and `vectors` inside apply_weights.
tokenizer = torchtext.data.utils.get_tokenizer('basic_english')
# Loads the pretrained FastText vectors; the recorded output below shows the
# wiki.en.vec file being fetched into .vector_cache on this run.
vectors = torchtext.vocab.FastText()

.vector_cache/wiki.en.vec: 6.60GB [00:36, 182MB/s]                            
100%|██████████| 2519370/2519370 [05:42<00:00, 7359.97it/s]


In [None]:
# Despite the `df_` prefix these are CSV file paths, not DataFrames;
# get_data_loaders reads them with pd.read_csv.
df_train = "/content/drive/MyDrive/Masters_Thesis/Dataset/encoded_data/encoded_topic_train.csv"
df_test = "/content/drive/MyDrive/Masters_Thesis/Dataset/encoded_data/encoded_topic_test.csv"
# Name of the label column in both CSVs. train() and valid() also read this
# name as a global when pulling targets out of each batch.
target_list = 'encoded_topic'
# NOTE: get_data_loaders is defined in a later cell ("Data Preprocessing &
# DataLoader"); that cell has to be executed before this one.
train_dataloader, valid_dataloader, vocab_size, pad_index, output_dim, vocab = get_data_loaders(train_dataframe= df_train,valid_dataframe=df_test,target_list=target_list)

### Model Implementation

In [None]:
# Hyperparameters passed to the CNN constructor.
embedding_dim = 300
# NOTE(review): hidden_dim, n_layers and bidirectional are accepted by
# CNN.__init__ but never used in its body.
hidden_dim = 300
n_layers = 2
bidirectional = True
dropout_rate = 0.5

# CNN and count_parameters are defined in later cells ("CNN Model"); run those first.
model = CNN(vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout_rate, 
             pad_index)
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 1,787,102 trainable parameters


In [None]:
# Re-initialise the Linear/LSTM layers and overwrite the embedding matrix with
# the pretrained vectors (see apply_weights, defined in a later cell).
model = apply_weights(model,vocab)

lr = 5e-4

# `optimizer`, `loss_function` and `device` are read as globals by train() and valid().
optimizer = optim.Adam(model.parameters(), lr=lr)
loss_function = nn.CrossEntropyLoss()

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
loss_function = loss_function.to(device)


In [None]:
# train() is defined in a later cell ("Training & Validation") and reads
# optimizer, loss_function, device and target_list from the notebook globals.
train(total_epoch=20, model=model, train_loader=train_dataloader)

 Epoch  |  Train Loss  | Train Accuracy |  Elapsed 
---------------------------------------------------
   1    |   0.332587   |    0.862162    |   0.20   
   2    |   0.159429   |    0.944635    |   0.07   
   3    |   0.109417   |    0.963575    |   0.06   
   4    |   0.080089   |    0.972676    |   0.05   
   5    |   0.055828   |    0.983440    |   0.06   
   6    |   0.041446   |    0.988928    |   0.05   
   7    |   0.028453   |    0.993790    |   0.05   
   8    |   0.024514   |    0.994619    |   0.05   
   9    |   0.022187   |    0.994205    |   0.05   
  10    |   0.017583   |    0.995445    |   0.05   
  11    |   0.017539   |    0.996480    |   0.05   
  12    |   0.018539   |    0.995654    |   0.05   
  13    |   0.015147   |    0.996791    |   0.05   
  14    |   0.013381   |    0.996480    |   0.05   
  15    |   0.015689   |    0.995654    |   0.06   
  16    |   0.016625   |    0.996378    |   0.05   
  17    |   0.017693   |    0.996068    |   0.05   
  18    |   

In [None]:
# valid() prints the validation loss/accuracy and returns one dict per example
# with the keys 'text', 'predicted' and 'target'.
predictions = valid(model, valid_dataloader) 

Validation Loss | Validation Accuracy |  Elapsed 
-----------------------------------------
  0.166346   |    0.958059    |   0.01   


In [None]:
# Base directory for this model's artifacts; saveModel and
# saveCSVValidationResult each append their own sub-path to it.
path = "/content/drive/MyDrive/Masters_Thesis/Models/CNN"
saveModel(model=model, path=path)

All files saved


In [None]:
# Write the validation predictions as CSV under `path`, using the function's
# default fileName.
saveCSVValidationResult(predictions,path)

### Data Preprocessing & DataLoader

In [None]:
def tokenize_data(text, tokenizer, max_length):
    """Tokenize ``text`` and keep at most ``max_length`` tokens.

    Args:
        text: Raw string handed to ``tokenizer``.
        tokenizer: Callable that maps a string to a sequence of tokens.
        max_length: Upper bound on the number of tokens kept.

    Returns:
        Dict with ``'tokens'`` (the truncated token sequence) and
        ``'length'`` (how many tokens were kept).
    """
    truncated = tokenizer(text)[:max_length]
    return {'tokens': truncated, 'length': len(truncated)}

def numericalize_data(example, vocab):
    """Map the tokens of one example to their vocabulary indices.

    Args:
        example: Mapping with a ``'tokens'`` entry (iterable of tokens).
        vocab: Object indexable by token (``vocab[token]`` -> index).

    Returns:
        Dict with a single key ``'ids'`` holding the list of indices,
        in token order.
    """
    token_ids = []
    for tok in example['tokens']:
        token_ids.append(vocab[tok])
    return {'ids': token_ids}

def collate(batch, pad_index,target_list):
    """Merge a list of example dicts into a single padded batch.

    Args:
        batch: List of dicts, each with the keys ``'ids'`` (list of token
            indices), ``'length'``, ``target_list`` (the label) and
            ``'reviewText'`` (the raw text).
        pad_index: Vocabulary index used to right-pad shorter id sequences.
        target_list: Name of the label key in each example; the same name is
            used for the label entry of the returned dict.

    Returns:
        Dict with ``'ids'`` (tensor padded to the longest example, batch
        first), ``'length'`` (stacked lengths), ``target_list`` (stacked
        labels) and ``'text'`` (list of the raw review strings).
    """
    # One 1-D tensor per example; lengths differ, hence the padding below.
    id_tensors = [torch.tensor(example['ids']) for example in batch]
    # pad_sequence documents `padding_value` as a plain number, so pass the
    # index itself rather than wrapping it in a tensor.
    batch_ids = pad_sequence(id_tensors, batch_first=True, padding_value=pad_index)
    batch_length = torch.stack([torch.tensor(example['length']) for example in batch])
    # Labels (not the review text) are what gets stacked into a tensor here.
    batch_label = torch.stack([torch.tensor(example[target_list]) for example in batch])
    # Raw text stays a Python list so valid() can report it next to predictions.
    batch_text = [example['reviewText'] for example in batch]
    return {'ids': batch_ids,
            'length': batch_length,
            target_list: batch_label,
            'text': batch_text}

def get_data_loaders(train_dataframe, valid_dataframe,target_list, max_length=256,min_freq = 5, train_batch_size=64, learning_rate=2e-5):
    """Build the train/validation DataLoaders and the vocabulary from two CSVs.

    Both CSV files must contain a ``'reviewText'`` column and the label column
    named by ``target_list``. Tokenization uses the notebook-level
    ``tokenizer``; the vocabulary is built from the training split only.

    Args:
        train_dataframe: Training CSV, in any form ``pd.read_csv`` accepts.
        valid_dataframe: Validation CSV, in any form ``pd.read_csv`` accepts.
        target_list: Name of the label column.
        max_length: Maximum number of tokens kept per review.
        min_freq: Minimum training-set frequency for a token to enter the
            vocabulary; rarer tokens map to ``'<unk>'``.
        train_batch_size: Batch size used for both loaders. This argument
            used to be ignored in favour of a hard-coded 64; the default is
            now 64 so calls that omit it behave exactly as before.
        learning_rate: Unused; kept so existing calls stay valid.

    Returns:
        Tuple ``(train_dataloader, valid_dataloader, vocab_size, pad_index,
        output_dim, vocab)`` where ``output_dim`` is the number of distinct
        label values in the training split.
    """
    columns = ['ids', target_list, 'length', 'reviewText']

    def _with_tokens(frame):
        # Appends the 'tokens' and 'length' columns produced by tokenize_data.
        token_cols = frame['reviewText'].apply(
            tokenize_data, tokenizer=tokenizer, max_length=max_length).apply(pd.Series)
        return pd.concat([frame, token_cols], axis=1)

    def _with_ids(frame, vocab):
        # Appends the 'ids' column produced by numericalize_data.
        id_cols = frame.apply(numericalize_data, vocab=vocab, axis=1).apply(pd.Series)
        return pd.concat([frame, id_cols], axis=1)

    def _to_examples(frame):
        # One plain dict per row, in the shape collate() expects.
        return frame.apply(lambda row: {
            'ids': row['ids'],
            'length': row['length'],
            target_list: row[target_list],
            'reviewText': row['reviewText']
            }, axis=1).tolist()

    df_train = _with_tokens(pd.read_csv(train_dataframe))
    df_test = _with_tokens(pd.read_csv(valid_dataframe))

    # The vocabulary comes from training tokens only; validation tokens that
    # are missing from it fall back to '<unk>' via the default index.
    special_tokens = ['<unk>', '<pad>']
    vocab = torchtext.vocab.build_vocab_from_iterator(
        df_train['tokens'].tolist(), min_freq=min_freq, specials=special_tokens)
    unk_index = vocab['<unk>']
    pad_index = vocab['<pad>']
    vocab.set_default_index(unk_index)

    df_train = _with_ids(df_train, vocab)[columns]
    df_test = _with_ids(df_test, vocab)[columns]

    vocab_size = len(vocab)
    output_dim = df_train[target_list].nunique()

    train = _to_examples(df_train)
    validation = _to_examples(df_test)

    collate_fn = functools.partial(collate, pad_index=pad_index, target_list=target_list)
    train_dataloader = DataLoader(train, batch_size=train_batch_size, collate_fn=collate_fn, shuffle=True)
    valid_dataloader = DataLoader(validation, batch_size=train_batch_size, collate_fn=collate_fn)

    return train_dataloader, valid_dataloader, vocab_size, pad_index, output_dim, vocab

### CNN Model

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
class CNN(nn.Module):
    """1-D convolutional text classifier.

    Token ids are embedded, passed through one ``Conv1d`` per filter size,
    max-pooled over the sequence axis, concatenated, and fed through dropout
    and a linear layer that produces one logit per class.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Embedding width (also the ``Conv1d`` input channels).
        hidden_dim: Unused; accepted so existing constructor calls keep working.
        output_dim: Number of output classes (logits).
        n_layers: Unused; accepted so existing constructor calls keep working.
        bidirectional: Unused; accepted so existing constructor calls keep working.
        dropout_rate: Dropout probability applied before the linear layer.
        pad_index: Index of the padding token; passed to ``nn.Embedding`` as
            ``padding_idx`` and used to extend inputs that are too short.
        filter_sizes: Kernel size of each convolution. Never mutated here.
        num_filters: Output channels of each convolution; must have the same
            number of entries as ``filter_sizes``. Never mutated here.

    Raises:
        ValueError: If ``filter_sizes`` is empty or its length differs from
            that of ``num_filters``.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional,
                 dropout_rate, pad_index, filter_sizes=[3, 4, 5], num_filters=[100, 100, 100]):

        super().__init__()
        if len(filter_sizes) == 0 or len(filter_sizes) != len(num_filters):
            raise ValueError(
                "filter_sizes and num_filters must be non-empty and of equal length, "
                f"got {len(filter_sizes)} and {len(num_filters)}")

        # Plain attributes (not parameters or buffers), so the state_dict
        # layout is unchanged.
        self.pad_index = pad_index
        self.min_seq_len = max(filter_sizes)

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_index)

        self.conv1d_list = nn.ModuleList([
            nn.Conv1d(in_channels=embedding_dim,
                      out_channels=n_out,
                      kernel_size=kernel)
            for kernel, n_out in zip(filter_sizes, num_filters)
        ])
        self.fc = nn.Linear(sum(num_filters), output_dim)
        self.dropout = nn.Dropout(dropout_rate)
        self.flatten = nn.Flatten()

    def forward(self, ids, length):
        """Compute class logits for a batch of token ids.

        Args:
            ids: Integer tensor of token ids, shape ``(b, max_len)``.
            length: Unused by this model; accepted because the training and
                validation loops pass it.

        Returns:
            Tensor of logits, shape ``(b, output_dim)``.
        """
        # Conv1d raises when the sequence is shorter than its kernel, which
        # happens whenever every review in a batch is very short. Right-pad
        # with the padding token so the widest kernel always fits; inputs that
        # are already long enough are left untouched.
        shortfall = self.min_seq_len - ids.shape[1]
        if shortfall > 0:
            fill = self.pad_index if self.pad_index is not None else 0
            ids = F.pad(ids, (0, shortfall), value=fill)

        # Output shape: (b, max_len, embed_dim)
        x_embed = self.embedding(ids).float()

        # Conv1d expects channels first. Output shape: (b, embed_dim, max_len)
        x_reshaped = x_embed.permute(0, 2, 1)

        # Apply CNN and ReLU. Output shape: (b, num_filters[i], L_out)
        x_conv_list = [F.relu(conv1d(x_reshaped)) for conv1d in self.conv1d_list]

        # Max pooling over the whole sequence. Output shape: (b, num_filters[i], 1)
        x_pool_list = [F.max_pool1d(x_conv, kernel_size=x_conv.shape[2])
                       for x_conv in x_conv_list]

        # Concatenate the pooled features. Output shape: (b, sum(num_filters))
        x_fc = torch.cat([x_pool.squeeze(dim=2) for x_pool in x_pool_list], dim=1)

        x_fc = self.flatten(x_fc)

        # Compute logits. Output shape: (b, n_classes)
        prediction = self.fc(self.dropout(x_fc))
        return prediction

In [None]:
def count_parameters(model):
    """Return the total element count of the model's trainable parameters.

    Only parameters with ``requires_grad`` set are counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total


def initialize_weights(m):
    """Initialise one module in place; meant to be used with ``Module.apply``.

    ``nn.Linear``: Xavier-normal weight, zero bias.
    ``nn.LSTM``: orthogonal weights, zero biases.
    Any other module type is left untouched.
    """
    if isinstance(m, nn.Linear):
        nn.init.xavier_normal_(m.weight)
        nn.init.zeros_(m.bias)
        return

    if not isinstance(m, nn.LSTM):
        return

    # Parameters are told apart by name: anything containing 'bias' is zeroed,
    # anything else containing 'weight' gets an orthogonal init.
    for param_name, tensor in m.named_parameters():
        if 'bias' in param_name:
            nn.init.zeros_(tensor)
        elif 'weight' in param_name:
            nn.init.orthogonal_(tensor)

def apply_weights(model,vocab):
    """Initialise ``model`` and load pretrained vectors into its embedding.

    Runs ``initialize_weights`` over every sub-module, then replaces the data
    of ``model.embedding.weight`` with the vectors that the notebook-level
    ``vectors`` object returns for the vocabulary's tokens (in index order).

    Returns:
        The same ``model`` object, modified in place.
    """
    model.apply(initialize_weights)
    vocab_tokens = vocab.get_itos()
    model.embedding.weight.data = vectors.get_vecs_by_tokens(vocab_tokens)
    return model

### Training & Validation

In [None]:
def train(total_epoch, model, train_loader):
    """Train ``model`` for ``total_epoch`` epochs, printing one row per epoch.

    Reads ``device``, ``target_list``, ``loss_function`` and ``optimizer``
    from the notebook globals.

    Loss and accuracy are reported per example. Previously both were the
    unweighted mean of per-batch values, which over-weights a final batch
    smaller than the rest. The per-example loss assumes ``loss_function``
    returns the batch mean (the ``nn.CrossEntropyLoss`` default).

    Args:
        total_epoch: Number of passes over ``train_loader``.
        model: Module called as ``model(ids, length)`` returning class logits.
        train_loader: Iterable of batch dicts with the keys ``'ids'``,
            ``'length'`` and the label key named by ``target_list``.

    Raises:
        ValueError: If ``train_loader`` yields no examples in an epoch.
    """
    print(f"{'Epoch':^7} | {'Train Loss':^12} | {'Train Accuracy':^12} | {'Elapsed':^9}")
    print("-"*51)
    for epoch in range(total_epoch):
        t0_epoch = time.time()
        loss_sum = 0.0
        n_correct = 0
        n_examples = 0
        model.train()
        for data in train_loader:
            input_ids = data['ids'].to(device)
            length = data['length']
            targets = data[target_list].to(device)

            outputs = model(input_ids, length)
            loss = loss_function(outputs, targets)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Weight the batch-mean loss by batch size so the epoch figure is
            # a true per-example average.
            batch_size = targets.size(0)
            loss_sum += loss.item() * batch_size
            n_correct += (outputs.argmax(dim=-1) == targets).sum().item()
            n_examples += batch_size

        if n_examples == 0:
            raise ValueError("train_loader yielded no examples")

        # Elapsed time is reported in minutes.
        time_elapsed = (time.time() - t0_epoch)/60
        epoch_loss = loss_sum/n_examples
        epoch_accu = n_correct/n_examples
        print(f"{epoch + 1:^7} | {epoch_loss:^12.6f} | {epoch_accu:^14.6f} | {time_elapsed:^9.2f}")

def valid(model, testing_loader):
    """Evaluate ``model`` on ``testing_loader`` and collect per-example predictions.

    Reads ``device``, ``target_list`` and ``loss_function`` from the notebook
    globals and maps class ids to names with ``switch_issue``.

    Loss and accuracy are reported per example. Previously both were the
    unweighted mean of per-batch values, which over-weights a final batch
    smaller than the rest. The per-example loss assumes ``loss_function``
    returns the batch mean (the ``nn.CrossEntropyLoss`` default).

    Args:
        model: Module called as ``model(ids, length)`` returning class logits.
        testing_loader: Iterable of batch dicts with the keys ``'ids'``,
            ``'length'``, ``'text'`` and the label key named by ``target_list``.

    Returns:
        List with one dict per example: ``'text'`` (raw review),
        ``'predicted'`` and ``'target'`` (both as names from ``switch_issue``).

    Raises:
        ValueError: If ``testing_loader`` yields no examples.
    """
    print(f"{'Validation Loss':^12} | {'Validation Accuracy':^12} | {'Elapsed':^9}")
    print("-"*41)
    model.eval()
    t0_epoch = time.time()
    loss_sum = 0.0
    n_correct = 0
    n_examples = 0
    predictions = []
    with torch.no_grad():
        for data in testing_loader:
            input_ids = data['ids'].to(device)
            length = data['length']
            targets = data[target_list].to(device)
            text = data['text']

            outputs = model(input_ids, length)
            loss = loss_function(outputs, targets)

            # argmax once per batch instead of once per example.
            predicted_classes = outputs.argmax(dim=-1)

            batch_size = targets.size(0)
            loss_sum += loss.item() * batch_size
            n_correct += (predicted_classes == targets).sum().item()
            n_examples += batch_size

            for review, predicted, target in zip(text, predicted_classes.tolist(), targets.tolist()):
                predictions.append({
                    'text': review,
                    'predicted': switch_issue(predicted),
                    'target': switch_issue(target)
                    })

    if n_examples == 0:
        raise ValueError("testing_loader yielded no examples")

    # Elapsed time is reported in minutes.
    time_elapsed = (time.time() - t0_epoch)/60
    epoch_loss = loss_sum/n_examples
    epoch_accu = n_correct/n_examples
    print(f"{epoch_loss:^12.6f} | {epoch_accu:^14.6f} | {time_elapsed:^9.2f}")
    return predictions

def switch_issue(issue_type):
    """Translate an encoded class id into its human-readable issue name.

    Returns ``'Product Issue'`` for 0, ``'Seller Issue'`` for 1 and
    ``"Invalid Issue Type"`` for any other value.
    """
    issue_names = {0: 'Product Issue', 1: 'Seller Issue'}
    try:
        return issue_names[issue_type]
    except KeyError:
        return "Invalid Issue Type"

def get_accuracy(prediction, label):
    """Return the fraction of rows whose arg-max class equals the label.

    Args:
        prediction: 2-D tensor of per-class scores, one row per example.
        label: Tensor of true class ids, one per row of ``prediction``.

    Returns:
        0-dim tensor: number of correct rows divided by the number of rows.
    """
    n_rows, _n_classes = prediction.shape
    hits = (prediction.argmax(dim=-1) == label).sum()
    return hits / n_rows


### Save Files

In [None]:
def saveCSVValidationResult(predictions, path, fileName="/Topic/back_only_01/result.csv"):
    """Write validation predictions to a CSV file.

    The file is written to ``path + "/" + fileName``. Missing parent
    directories are created first, so a fresh output location no longer
    fails on ``open``.

    Args:
        predictions: Iterable of dicts with the keys ``'text'``,
            ``'predicted'`` and ``'target'``.
        path: Base output directory.
        fileName: File location relative to ``path``.
    """
    path = path+"/"+fileName
    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(path, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['Text', 'Predicted', 'Target'])
        writer.writerows(
            [example['text'], example['predicted'], example['target']]
            for example in predictions
        )

In [None]:
def saveModel(model, path):
    """Save the model's ``state_dict`` under ``path``.

    The checkpoint is written to ``path + '/Topic/back_only_01/model.pth'``.
    Missing parent directories are created first, so saving to a fresh
    location no longer fails.

    Args:
        model: Module whose ``state_dict()`` is saved.
        path: Base output directory.
    """
    MODEL_PATH = path+'/Topic/back_only_01/model.pth'
    os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
    torch.save(model.state_dict(), MODEL_PATH)
    print('All files saved')

def loadModel(path):
    """Load saved weights into the notebook-level ``model``.

    Reads ``path + '/model.pth'`` and loads it into the global ``model``,
    which must already be constructed with the matching architecture.

    Args:
        path: Directory that directly contains ``model.pth``.

    Returns:
        Tuple ``(model, tokenizer)`` built from the notebook globals.
    """
    # NOTE(review): saveModel writes to path + '/Topic/back_only_01/model.pth',
    # so `path` here has to include that sub-directory. Confirm which
    # convention is intended.
    MODEL_PATH = path+'/model.pth'
    # Deserialize onto the CPU so a checkpoint saved on a GPU runtime still
    # loads on a CPU-only one; load_state_dict then copies the values into the
    # model's existing parameters, wherever they live.
    state_dict = torch.load(MODEL_PATH, map_location=torch.device('cpu'))
    model.load_state_dict(state_dict)
    return model, tokenizer