<a href="https://colab.research.google.com/github/ronitd/IntentClassifier/blob/main/IntentClassifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Loading data

In [76]:
import json
import torch
from torchtext import data
import numpy as np
import pandas as pd

Selecting the random 20 labels and creating csv files for train, validation and test set.

In [77]:
def save_csv(fileName, content):
  my_df = pd.DataFrame(content)
  my_df.to_csv(fileName+'.csv', index=False, header=["Text", "Label"])


np.random.seed(10)
with open( 'data_full.json', 'r' ) as f:
  org_data = json.load( f )
data_1 = np.array(org_data["train"])
print(np.unique(data_1[:, 1]).shape )
labels = np.unique(data_1[:, 1])
labels = np.random.choice(labels,20, replace=False)
print(labels)
print("Labels shape", labels.shape)


index = np.isin(data_1[:, 1], labels, assume_unique=True)
save_csv("train", data_1[index])

val_data = np.array(org_data["val"])
index = np.isin(val_data[:, 1], labels, assume_unique=True)
print("Val", val_data[index].shape)
save_csv("val", val_data[index])

test_data = np.array(org_data["test"])
index = np.isin(test_data[:, 1], labels, assume_unique=True)
print("Test", test_data[index].shape)
save_csv("test", test_data[index])




(150,)
['plug_type' 'shopping_list' 'book_hotel' 'pto_used' 'goodbye'
 'international_fees' 'mpg' 'meal_suggestion' 'exchange_rate'
 'ingredient_substitution' 'maybe' 'what_can_i_ask_you'
 'improve_credit_score' 'account_blocked' 'carry_on'
 'shopping_list_update' 'pin_change' 'do_you_have_pets' 'change_ai_name'
 'direct_deposit']
Labels shape (20,)
Val (400, 2)
Test (600, 2)


**Pre-Processing** Converting sentences to word using spacy, converting the data into lower case(I checked the few sentesnces all of them were lower case, this steps is for just to be sure.).

In [78]:
TEXT = data.Field(tokenize='spacy',lower=True, batch_first=True,include_lengths=True)
LABEL = data.LabelField(dtype = torch.float,batch_first=True)
fields = [('text', TEXT), ('label', LABEL)]  

train, val, test = data.TabularDataset.splits(
        path='./', train='train.csv',
        validation='val.csv', test='test.csv', format='csv',
        fields=fields, skip_header = True)
# #print preprocessed text
print(vars(train.examples[0]))

{'text': ['what', 'steps', 'should', 'i', 'take', 'to', 'improve', 'my', 'credit', 'score'], 'label': 'improve_credit_score'}


Building the vocabulary that is converting the tokens into vector representation which would be feature representation of the word. Using Glove word embeddings.

In [79]:
#initialize glove embeddings
TEXT.build_vocab(train,min_freq=3,vectors = "glove.6B.100d")  
LABEL.build_vocab(train)

#No. of unique tokens in text
print("Size of TEXT vocabulary:",len(TEXT.vocab))

#No. of unique tokens in label
print("Size of LABEL vocabulary:",len(LABEL.vocab))

#Commonly used words
print(TEXT.vocab.freqs.most_common(10))  

#Word dictionary
print(TEXT.vocab.stoi)

print("Labels", LABEL.vocab.freqs)

Size of TEXT vocabulary: 580
Size of LABEL vocabulary: 20
[('i', 947), ('my', 685), ('to', 675), ('what', 469), ('you', 451), ('the', 410), ('for', 404), ('a', 359), ('can', 330), ('on', 330)]
defaultdict(<function _default_unk_index at 0x7faf65e1aea0>, {'<unk>': 0, '<pad>': 1, 'i': 2, 'my': 3, 'to': 4, 'what': 5, 'you': 6, 'the': 7, 'for': 8, 'a': 9, 'can': 10, 'on': 11, 'in': 12, 'me': 13, 'do': 14, 'how': 15, 'of': 16, 'is': 17, 'account': 18, 'have': 19, 'list': 20, 'shopping': 21, 'are': 22, ',': 23, 'need': 24, 'from': 25, 'if': 26, 'it': 27, 'know': 28, 'many': 29, 'with': 30, 'credit': 31, 'use': 32, 'carry': 33, 'pin': 34, 'tell': 35, 'change': 36, 'this': 37, 'up': 38, "'s": 39, 'direct': 40, 'be': 41, 'get': 42, 'card': 43, 'deposit': 44, 'does': 45, 'please': 46, 'there': 47, 'bank': 48, 'score': 49, 'and': 50, 'your': 51, 'car': 52, 'name': 53, 'why': 54, 'dollars': 55, 'number': 56, 'would': 57, 'days': 58, 'like': 59, 'set': 60, 'hotel': 61, 'will': 62, 'fees': 63, 'pets

Using the Buket Iterator so that it forms the batch in a way that minimum padding is required.

In [80]:
#check whether cuda is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  

#Reproducing same results
SEED = 4

#Torch
torch.manual_seed(SEED)

#Cuda algorithms
torch.backends.cudnn.deterministic = True  

#set batch size
BATCH_SIZE = 128

#Load an iterator
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train, val, test), 
    batch_size = BATCH_SIZE,
    sort_key = lambda x: len(x.text),
    sort_within_batch = True,
    device = device)

Creating the Bi-directional Deep LSTM Model.

In [81]:
import torch.nn as nn

class classifier(nn.Module):
    
    #define all the layers used in model
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, 
                 bidirectional, dropout):
        
        #Constructor
        super().__init__()          
        
        #embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        #lstm layer
        self.lstm = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           bidirectional=bidirectional, 
                           dropout=dropout,
                           batch_first=True)
        
        #dense layer
        # self.fc1 = nn.Linear(hidden_dim * 2, 64)
        # self.fc2 = nn.Linear(64, output_dim)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        #activation function
        # self.act = nn.ReLU()
        self.act = nn.Sigmoid()
        
    def forward(self, text, text_lengths):
        
        #text = [batch size,sent_length]
        embedded = self.embedding(text)
        #embedded = [batch size, sent_len, emb dim]
      
        #packed sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths,batch_first=True)
        
        packed_output, (hidden, cell) = self.lstm(packed_embedded)
        #hidden = [batch size, num layers * num directions,hid dim]
        #cell = [batch size, num layers * num directions,hid dim]
        
        #concat the final forward and backward hidden state
        hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)
                
        #hidden = [batch size, hid dim * num directions]
        # dense_outputs1=self.act(self.fc1(hidden))
        # dense_outputs=self.fc2(dense_outputs1)
        dense_outputs=self.fc(hidden)
        #Final activation function
        outputs=self.act(dense_outputs)
        
        return outputs

Model parameters and initialization.

In [82]:
#define hyperparameters
size_of_vocab = len(TEXT.vocab)
embedding_dim = 100
num_hidden_nodes = 256
num_output_nodes = 20
num_layers = 2
bidirection = True
dropout = 0.25

#instantiate the model
model = classifier(size_of_vocab, embedding_dim, num_hidden_nodes,num_output_nodes, num_layers, 
                   bidirectional = bidirection, dropout = dropout)

Summary of the Model

In [83]:
#architecture
print(model)

#No. of trianable parameters
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
    
print(f'The model has {count_parameters(model):,} trainable parameters')

#Initialize the pretrained embedding
pretrained_embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)

print(pretrained_embeddings.shape)

classifier(
  (embedding): Embedding(580, 100)
  (lstm): LSTM(100, 256, num_layers=2, batch_first=True, dropout=0.25, bidirectional=True)
  (fc): Linear(in_features=512, out_features=20, bias=True)
  (act): Sigmoid()
)
The model has 2,378,404 trainable parameters
torch.Size([580, 100])


Defining optimizer and loss function

In [84]:
import torch.optim as optim

#define optimizer and loss
optimizer = optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()

    
#push to cuda if available
model = model.to(device)
criterion = criterion.to(device)

In [85]:
def train(model, iterator, optimizer, criterion):
    
    #initialize every epoch 
    epoch_loss = 0
    epoch_acc = 0
    
    #set the model in training phase
    model.train()  
    
    for batch in iterator:
        
        #resets the gradients after every batch
        optimizer.zero_grad()   
        
        #retrieve text and no. of words
        text, text_lengths = batch.text   
        
        #convert to 1D tensor
        predictions = model(text, text_lengths).squeeze()  
        
        #compute the loss
        # print("Pred Shape", predictions)
        # print("Label Shape", batch.label)
        # exit()
        loss = criterion(predictions, batch.label.long())        
        
        #compute the accuracy
        num_corrects = (torch.max(predictions, 1)[1].view(batch.label.size()).data == batch.label.data).sum()
        acc = 100.0 * num_corrects/len(batch)   
        #backpropage the loss and compute the gradients
        loss.backward()       
        
        #update the weights
        optimizer.step()      
        
        #loss and accuracy
        epoch_loss += loss.item()  
        epoch_acc += acc.item()    
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [86]:
def evaluate(model, iterator, criterion):
    
    #initialize every epoch
    epoch_loss = 0
    epoch_acc = 0

    #deactivating dropout layers
    model.eval()
    
    #deactivates autograd
    with torch.no_grad():
    
        for batch in iterator:
        
            #retrieve text and no. of words
            text, text_lengths = batch.text
            
            #convert to 1d tensor
            predictions = model(text, text_lengths).squeeze()
            
            #compute loss and accuracy
            loss = criterion(predictions, batch.label.long())
            num_corrects = (torch.max(predictions, 1)[1].view(batch.label.size()).data == batch.label.data).sum()
            # print("Num Corrects: ", len(batch))
            acc = 100.0 * num_corrects/len(batch)  
            
            #keep track of loss and accuracy
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [87]:
N_EPOCHS = 40
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
     
    #train the model
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    
    #evaluate the model
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    #save the best model
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'model.pt')
    
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc:.2f}%')

	Train Loss: 2.994 | Train Acc: 6.23%
	 Val. Loss: 2.992 |  Val. Acc: 7.03%
	Train Loss: 2.989 | Train Acc: 21.96%
	 Val. Loss: 2.987 |  Val. Acc: 35.94%
	Train Loss: 2.983 | Train Acc: 44.25%
	 Val. Loss: 2.980 |  Val. Acc: 44.73%
	Train Loss: 2.971 | Train Acc: 47.16%
	 Val. Loss: 2.962 |  Val. Acc: 41.41%
	Train Loss: 2.941 | Train Acc: 35.77%
	 Val. Loss: 2.885 |  Val. Acc: 36.52%
	Train Loss: 2.843 | Train Acc: 36.33%
	 Val. Loss: 2.751 |  Val. Acc: 47.07%
	Train Loss: 2.701 | Train Acc: 60.15%
	 Val. Loss: 2.613 |  Val. Acc: 64.84%
	Train Loss: 2.578 | Train Acc: 69.83%
	 Val. Loss: 2.533 |  Val. Acc: 72.07%
	Train Loss: 2.495 | Train Acc: 78.40%
	 Val. Loss: 2.476 |  Val. Acc: 67.19%
	Train Loss: 2.437 | Train Acc: 82.29%
	 Val. Loss: 2.420 |  Val. Acc: 71.48%
	Train Loss: 2.394 | Train Acc: 85.85%
	 Val. Loss: 2.398 |  Val. Acc: 79.88%
	Train Loss: 2.356 | Train Acc: 88.20%
	 Val. Loss: 2.360 |  Val. Acc: 79.49%
	Train Loss: 2.326 | Train Acc: 91.16%
	 Val. Loss: 2.337 |  Val. 

In [88]:
#load weights
path='/content/model.pt'
model.load_state_dict(torch.load(path));
model.eval();
test_loss, test_acc = evaluate(model, valid_iterator, criterion)
print(f'\tTest Loss: {test_loss:.3f} | Test Acc: {test_acc:.2f}%')

	Test Loss: 2.171 | Test Acc: 93.75%
