In [2]:
import torch
import pandas as pd
import numpy as np
from torchtext.data import Field
from tqdm import tqdm

In [3]:
TEXT = Field(sequential=True, tokenize=lambda x: x.split(), lower=True) #spacy's performance is really good but it takes some time to execute.
LABEL = Field(sequential=False, use_vocab=False) #set use_vocab = False when the data is already numerical.

from torchtext.data import TabularDataset

datafields = [("id", None),("conversation",TEXT), ("category", LABEL)]

#If skip_header is set to False, then the headers also get processed!
trn = TabularDataset(path="train_custom.csv", format='csv', skip_header=True, fields=datafields)
tst = TabularDataset(path='test_custom.csv', format='csv', skip_header=True, fields=datafields)

In [4]:
#Creating the vocabulary using GloVe embeddings.
TEXT.build_vocab(trn, vectors="glove.6B.50d")

#print(TEXT.vocab.freqs.most_common(10))

from torchtext.data import Iterator, BucketIterator

train_iter = BucketIterator(
 dataset = trn, # we pass in the datasets we want the iterator to draw data from
 batch_size = 64,
 device= torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
 sort_key=lambda x: len(x.articles), # the BucketIterator needs to be told what function it should use to group the data.
 sort_within_batch=False,
 repeat=False, # we pass repeat=False because we want to wrap this Iterator layer.
 shuffle = False, #Experiment with this to see if you're getting improved performance.
 train = True #Whether the dataset is a training set or not.
 )

test_iter = Iterator(tst, batch_size=64, device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu"), 
                     sort=False, sort_within_batch=False, repeat=False, shuffle=False)

In [5]:
#Extra Code
#print(trn[0].conversation)
#print(TEXT.vocab.stoi) 

#print(len(train_iter)) # This gives the total number of batches.

print(TEXT.vocab.stoi['<unk>'])
print(TEXT.vocab.itos[0])
for i in train_iter:
  for j in i:
    print(j)
    break
    if j != None:
      print(j[0][:,0])
      a = list(j[0][:,0]) # So, what I found out is that the articles are all located along the columns. So, each column is an article!
      print(len(a))
  break

0
<unk>
(tensor([[  84,  151,   29,  ...,  807,   54,   72],
        [   8,  173,   69,  ..., 1932,   63,   53],
        [   6,  356, 1510,  ...,   66,    8,  489],
        ...,
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1]]), tensor([4, 4, 4, 5, 2, 4, 5, 4, 4, 1, 5, 2, 2, 3]))


In [6]:
class BatchWrapper:
    #This takes care of the variable assignments.
    def __init__(self, dl, x_var, y_vars):
        self.dl, self.x_var, self.y_vars = dl, x_var, y_vars # we pass in the list of attributes for x and y

    def __iter__(self):
        for batch in self.dl:
            '''
            We use "getattr" here because we want to generalize our code. This function is similar to "batch.conversation". 
            But then we would need to change this line for different functions. getattr returns the value of an attribute of an object.
            '''
            x = getattr(batch, self.x_var) # we assume only one input in this wrapper
            
            if self.y_vars is not None: # we will concatenate y into a single tensor
                y = torch.cat([getattr(batch, feat).unsqueeze(1) for feat in self.y_vars], dim=1).float()
            else:
                y = torch.zeros((1))
            yield (x, y)

    #This returns the number of batches.
    def __len__(self):
        return len(self.dl)

train_dl = BatchWrapper(train_iter, "conversation", ["category"]) #(iterator, independent_variable, dependent_variable)
test_dl = BatchWrapper(test_iter, "conversation", ["category"])

for x,y in test_dl:
  print(x.shape)
  print(y.shape)
  break

torch.Size([10827, 64])
torch.Size([64, 1])


In [7]:
from torch.autograd import Variable
import torchtext

class ClassifierNet(torch.nn.Module):
    def __init__(self, glove, num_class):
        super().__init__()
        self.embedding = torch.nn.EmbeddingBag.from_pretrained(glove.vectors)
        self.fc = torch.nn.Linear(glove.dim, num_class)
        
    def forward(self, text):
        embedded = self.embedding(text)
        return self.fc(embedded)

glove = torchtext.vocab.GloVe(name="6B",dim=50)    
device = "cuda:0" if torch.cuda.is_available() else "cpu"

num_classes = 6
learning_rate = 0.01
num_epochs = 10

net = ClassifierNet(glove, num_classes)
net.to(device)

criterion = torch.nn.CrossEntropyLoss()
criterion.to(device)

optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)

print(net)

ClassifierNet(
  (embedding): EmbeddingBag(400000, 50, mode=mean)
  (fc): Linear(in_features=50, out_features=6, bias=True)
)


In [9]:
net.train()

for epoch in range(num_epochs):
    epoch_loss = 0
    # Loop over all batches
    for x, y in train_dl:
        optimizer.zero_grad()  # zero the gradient buffer.
        
        conversation, category = Variable(x), Variable(y)
                
        #Transposing the training data.
        conversation = conversation.t()
        
        outputs = net(conversation.to(device))
        
        # Note: The true category tensor has to always be a 1D tensor of values (labels) for CrossEntropy!
        loss = criterion(outputs.to(device), category.squeeze().long().to(device)) 
        
        epoch_loss += loss.item()
        
        loss.backward()
        optimizer.step()

    print(f'Epoch: {epoch} | Training Loss: {epoch_loss}')

Epoch: 0 | Training Loss: 1.6473338603973389
Epoch: 1 | Training Loss: 1.57392418384552
Epoch: 2 | Training Loss: 1.5211838483810425
Epoch: 3 | Training Loss: 1.4879330396652222
Epoch: 4 | Training Loss: 1.470410704612732
Epoch: 5 | Training Loss: 1.4631627798080444
Epoch: 6 | Training Loss: 1.460853934288025
Epoch: 7 | Training Loss: 1.4595469236373901
Epoch: 8 | Training Loss: 1.4569655656814575
Epoch: 9 | Training Loss: 1.452209711074829


In [15]:
# Test the Model
net.eval()

test_preds = []
true = []

for x, y in tqdm(test_dl):
    conversation, category = Variable(x), Variable(y)
    conversation = conversation.t()
    preds = net(conversation.to(device))
    _, predicted = torch.max(preds.data, 1)
    test_preds.extend(predicted.tolist())
    true.extend(y.squeeze().long().tolist())

total = len(true)
correct = 0
for i in range(total):
    if test_preds[i] == true[i]:
        correct += 1

print(f'Accuracy of the network on the {total} test articles: {100 * correct / total} %')

100%|██████████| 21/21 [00:00<00:00, 40.61it/s]

Accuracy of the network on the 1331 test articles: 19.459053343350863 %



