<a href="https://colab.research.google.com/github/sanjeevr5/NLP_Excercises/blob/main/DL_NLP_With_Torch_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# A multiclass classification using the Embedding bag layer

The `Embedding` layer gives us a vector representation for every token, while the `EmbeddingBag` layer additionally performs an aggregation operation (mean by default) over the tokens of every sentence and returns a single vector per sentence.

Embedding I/P : [['hi', 'how', 'are', 'you'], ['ok', 'bye', '<pad>', '<pad>']]

Embedding layer O/P : (2,4,64)

Embedding bag I/P : ['hi', 'how', 'are', 'you', 'ok', 'bye']

Embedding bag O/P : (2, 64) -> (B, embed_dim). There is no need for padding tokens; the flattened token sequence can be fed directly.

Because the embedding bag performs an aggregate operation, sequential information is lost. There is no need for padding; instead, offsets marking where each sentence starts must be supplied. The offsets for the above example are [0, 4], since the first sentence starts at index 0 and the second sentence starts at index 4.

More at : https://pytorch.org/docs/stable/generated/torch.nn.EmbeddingBag.html

In [None]:
%%capture
# Download the AG News train/test CSV files into the current working directory.
# The %%capture magic above must stay on the first line of the cell; it keeps
# wget's console output out of the notebook.
!wget https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv/train.csv
!wget https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv/test.csv

In [None]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils import data
from torch.nn.utils.rnn import pack_sequence
import time
import torch.optim as optim

# Reproducibility settings: one seed shared by every PyTorch generator.
SEED = 10

torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [None]:
# Read both splits; the files have no header row, so pandas assigns integer column names.
train_data, test_data = (
    pd.read_csv(csv_path, sep=',', header=None)
    for csv_path in ('./train.csv', './test.csv')
)

print(f'Train shape : {train_data.shape} test shape : {test_data.shape}')

Train shape : (120000, 3) test shape : (7600, 3)


In [None]:
def _split_text_and_labels(frame):
    """Return ``(text, label)`` Series extracted from a raw frame.

    Column 1 supplies the text: everything from the first ``' ('`` onward is
    dropped and the remainder is lower-cased. Column 0 supplies the class id,
    shifted down by one so that labels start at 0. Column 2 is not used.
    """
    text = frame.iloc[:, 1].map(lambda raw: raw.split(' (')[0].lower())
    target = frame.iloc[:, 0].map(lambda class_id: int(class_id - 1))
    return text, target


X_train, y_train = _split_text_and_labels(train_data)
X_test, y_test = _split_text_and_labels(test_data)

In [None]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

# torchtext's built-in 'basic_english' tokenizer.
tokenizer = get_tokenizer('basic_english')


def yield_tokens(data_iter):
    """Lazily yield the token list of every text in ``data_iter``."""
    yield from map(tokenizer, data_iter)


# The vocabulary is built from the training texts only. "<unk>" is the single
# special token: EmbeddingBag works on offsets, so no padding token is required.
vocab = build_vocab_from_iterator(yield_tokens(X_train), specials=["<unk>"])
# Tokens missing from the vocabulary resolve to the "<unk>" index.
vocab.set_default_index(vocab["<unk>"])

In [None]:
def _encode(text):
    """Tokenize ``text`` and return its vocabulary ids as a 1-D int64 tensor."""
    # The dtype is pinned on purpose: torch.tensor([]) infers float32, so a text
    # that yields no tokens would otherwise produce a float tensor that cannot be
    # used as embedding indices (and would change the dtype of a concatenated batch).
    return torch.tensor(vocab(tokenizer(text)), dtype=torch.long)


train_encoded = [_encode(item) for item in X_train.values]
test_encoded = [_encode(item) for item in X_test.values]

print('train sample:', train_encoded[0])
print('test sample:', test_encoded[0])

train sample: tensor([  473,   295,     2,   873, 11871,    53,    67,    11,   480])
test sample: tensor([482,   6,  44, 108, 792,  32,  48])


In [None]:
# Number of distinct entries in the vocabulary, including the "<unk>" special token.
print(f'The length of vocabulary is {len(vocab)}')

The length of vocabulary is 39515


In [None]:
from torch.utils.data import DataLoader, Dataset

class Data_Iterator(data.Dataset):
  """Map-style dataset pairing each encoded text with its label.

  Samples are always addressed by position (0 .. len - 1), which is what
  DataLoader samplers produce.
  """

  def __init__(self, text, label):
    """Store the samples.

    Args:
      text: indexable collection of encoded samples.
      label: indexable collection of labels, the same length as ``text``.

    Raises:
      AssertionError: if ``text`` and ``label`` differ in length.
    """
    super(Data_Iterator, self).__init__()
    assert len(text) == len(label)
    self.text = text
    self.label = label

  def __len__(self):
    """Return the number of samples."""
    return len(self.label)

  @staticmethod
  def _at(collection, index):
    """Return the element at position ``index`` of ``collection``.

    pandas objects are read through ``.iloc`` because plain ``series[index]``
    is label-based: on a Series whose index is not 0..n-1 (e.g. after
    filtering or shuffling) it returns the wrong row or raises KeyError.
    """
    if hasattr(collection, 'iloc'):
      return collection.iloc[index]
    return collection[index]

  def __getitem__(self, index):
    """Return the ``(text, label)`` pair stored at position ``index``."""
    return self._at(self.text, index), self._at(self.label, index)

# Wrap the encoded splits as datasets. Note that this rebinds train_data and
# test_data, which held the raw DataFrames until now.
train_data = Data_Iterator(text=train_encoded, label=y_train)
test_data = Data_Iterator(text=test_encoded, label=y_test)

In [None]:
def batchTransformer(batch):
  """Collate ``(token_tensor, label)`` pairs into EmbeddingBag-style inputs.

  Args:
    batch: sequence of ``(news, label)`` pairs, where ``news`` is a 1-D tensor.

  Returns:
    A tuple ``(news, offset, label)``:
      news   - all token tensors concatenated end to end,
      offset - start position of every sample inside ``news``,
      label  - the labels as an int64 tensor.
  """
  token_chunks = []
  targets = []
  starts = []
  cursor = 0  # running start position of the next sample in the flat tensor
  for tokens, target in batch:
    starts.append(cursor)
    cursor += tokens.size(0)
    token_chunks.append(tokens)
    targets.append(target)
  label = torch.tensor(targets, dtype=torch.int64)
  # The total length (final cursor value) is deliberately not an offset.
  offset = torch.tensor(starts)
  news = torch.cat(token_chunks)
  return news, offset, label

# Training batches are reshuffled every epoch. The evaluation loader keeps the
# dataset order: shuffling there adds nothing, makes the evaluation order
# non-deterministic and consumes random-number-generator state.
trainloader = DataLoader(train_data, batch_size=64, shuffle=True, collate_fn=batchTransformer)
testloader = DataLoader(test_data, batch_size=128, shuffle=False, collate_fn=batchTransformer)

In [None]:
class NewsClassifier(nn.Module):
    """EmbeddingBag followed by a single linear classification layer."""

    def __init__(self, vocab_size, embed_dim, num_class):
        """Build the layers and initialise their weights.

        Args:
            vocab_size: number of rows in the embedding table.
            embed_dim: width of each embedding vector.
            num_class: number of output logits.
        """
        super().__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=False)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        """Draw both weight matrices from U(-0.5, 0.5) and zero the bias."""
        bound = 0.5
        nn.init.uniform_(self.embedding.weight, -bound, bound)
        nn.init.uniform_(self.fc.weight, -bound, bound)
        nn.init.zeros_(self.fc.bias)

    def forward(self, text, offsets):
        """Return class logits for the bags described by ``text`` and ``offsets``."""
        return self.fc(self.embedding(text, offsets))

# Embedding width and number of target classes used for this run.
EMBED_DIM = 128
NUM_CLASSES = 4

model = NewsClassifier(len(vocab), EMBED_DIM, NUM_CLASSES)
n_trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'The number of trainable parameters are : {n_trainable:,}')

The number of trainable parameters are : 5,058,436


In [None]:
# Move the model and the loss module to the target device first, and only then
# build the optimizer, so the optimizer is constructed from the parameters as
# they exist on that device (the ordering recommended by the torch.optim docs).
model = model.to(device)
criterion = nn.CrossEntropyLoss()
criterion = criterion.to(device)
optimizer = optim.Adam(model.parameters())

In [None]:
def epoch_time(start_time, end_time):
    """Split the span between two timestamps into whole minutes and seconds.

    Args:
        start_time: start timestamp in seconds.
        end_time: end timestamp in seconds.

    Returns:
        ``(minutes, seconds)`` as integers, both truncated toward zero.
    """
    span = end_time - start_time
    minutes = int(span / 60)
    return minutes, int(span - (minutes * 60))

def accuracy(preds, true):
  """Return the fraction of rows whose highest-scoring column equals ``true``.

  Args:
    preds: 2-D tensor of scores, one row per sample.
    true: tensor of target class indices, one per row of ``preds``.

  Returns:
    A 0-dim float tensor holding the share of correct predictions.
  """
  predicted = torch.max(preds, dim=1).indices
  n_correct = predicted.eq(true).sum().float()
  return n_correct / len(preds)

In [None]:
def train_m(model, iterator, optimizer, l):
  """Run one optimisation pass over ``iterator`` and report mean metrics.

  Args:
    model: module called as ``model(inputs, offsets)``.
    iterator: iterable of ``(inputs, offsets, labels)`` batches.
    optimizer: optimizer that is zeroed and stepped once per batch.
    l: loss callable invoked as ``l(preds, labels)``.

  Returns:
    ``(mean_loss, mean_accuracy)`` as Python floats, averaged over samples.

  Raises:
    ZeroDivisionError: if ``iterator`` yields no samples.
  """
  loss_sum = 0.0
  acc_sum = 0.0
  n_seen = 0
  model.train()

  for inputs, offsets, labels in iterator:
    inputs, offsets, labels = inputs.to(device), offsets.to(device), labels.to(device)
    optimizer.zero_grad()
    preds = model(inputs, offsets)
    # The logits are passed as they are: the former squeeze(1) was a no-op for
    # any model with more than one output class.
    loss = l(preds, labels.long())
    loss.backward()
    optimizer.step()
    # Weight every batch by its size so that a shorter final batch does not
    # count as much as a full one. This assumes ``l`` returns the batch mean
    # (the default reduction of nn.CrossEntropyLoss).
    batch_size = labels.size(0)
    loss_sum += loss.item() * batch_size
    acc_sum += accuracy(preds, labels).item() * batch_size
    n_seen += batch_size
  return loss_sum / n_seen, acc_sum / n_seen

def evaluate_m(model, iterator, l):
  """Evaluate ``model`` on ``iterator`` without updating any weights.

  Args:
    model: module called as ``model(inputs, offsets)``.
    iterator: iterable of ``(inputs, offsets, labels)`` batches.
    l: loss callable invoked as ``l(preds, labels)``.

  Returns:
    ``(mean_loss, mean_accuracy)`` as Python floats, averaged over samples.

  Raises:
    ZeroDivisionError: if ``iterator`` yields no samples.
  """
  loss_sum = 0.0
  acc_sum = 0.0
  n_seen = 0
  model.eval()
  with torch.no_grad():
    for inputs, offsets, labels in iterator:
      inputs, offsets, labels = inputs.to(device), offsets.to(device), labels.to(device)
      preds = model(inputs, offsets)
      # The logits are passed as they are: the former squeeze(1) was a no-op
      # for any model with more than one output class.
      loss = l(preds, labels.long())
      # Weight every batch by its size; otherwise a shorter final batch skews
      # the reported averages. This assumes ``l`` returns the batch mean (the
      # default reduction of nn.CrossEntropyLoss).
      batch_size = labels.size(0)
      loss_sum += loss.item() * batch_size
      acc_sum += accuracy(preds, labels).item() * batch_size
      n_seen += batch_size
  return loss_sum / n_seen, acc_sum / n_seen

In [None]:
N_EPOCHS = 5

# Lowest validation loss seen so far. It is updated inside the loop so it can be
# inspected after training; no checkpoint is saved or restored here.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()

    train_loss, train_acc = train_m(model, trainloader, optimizer, criterion)
    valid_loss, valid_acc = evaluate_m(model, testloader, criterion)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Previously this variable was initialised but never updated.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss

    print(f'Epoch: {epoch+1:02} / {N_EPOCHS} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 / 5 | Epoch Time: 0m 9s
	Train Loss: 0.646 | Train Acc: 78.02%
	 Val. Loss: 0.444 |  Val. Acc: 85.49%
Epoch: 02 / 5 | Epoch Time: 0m 6s
	Train Loss: 0.348 | Train Acc: 88.77%
	 Val. Loss: 0.412 |  Val. Acc: 86.25%
Epoch: 03 / 5 | Epoch Time: 0m 6s
	Train Loss: 0.277 | Train Acc: 90.96%
	 Val. Loss: 0.417 |  Val. Acc: 86.48%
Epoch: 04 / 5 | Epoch Time: 0m 6s
	Train Loss: 0.238 | Train Acc: 92.16%
	 Val. Loss: 0.432 |  Val. Acc: 86.07%
Epoch: 05 / 5 | Epoch Time: 0m 6s
	Train Loss: 0.213 | Train Acc: 92.84%
	 Val. Loss: 0.461 |  Val. Acc: 85.49%


## Predictions

In [None]:
# Maps the 1-based class id returned by predict() to a readable name.
labels = {1: "World", 2: "Sports", 3: "Business", 4: "Sci/Tec"}

def predict(text):
    """Return the 1-based class id the model predicts for a raw ``text`` string.

    The text is tokenized, mapped to vocabulary ids and passed to the model as
    a single bag starting at offset 0.
    """
    # Inference should not depend on whatever mode the last training step left behind.
    model.eval()
    with torch.no_grad():
        # The dtype is pinned so that a text yielding no tokens still produces
        # an integer index tensor (torch.tensor([]) would infer float32).
        token_ids = torch.tensor(vocab(tokenizer(text)), dtype=torch.long, device=device)
        offsets = torch.tensor([0], dtype=torch.long, device=device)
        output = model(token_ids, offsets)
        # The model's classes are 0-based, while the `labels` mapping is 1-based.
        return output.argmax(1).item() + 1
labels[predict('India won the match')]

'Sports'