## Intent Classification With PyTorch
Previously, my focus in the notebooks was on obtaining labeled data for my chatbot. However, this current notebook is centered around utilizing PyTorch for the classification of intents within fresh, unseen user-generated data. The model has transitioned to a supervised learning approach, leveraging the labels derived from the unsupervised learning conducted in the preceding notebook.

### RASA Comparison

Rasa trains this intent classification step with an SVM and GridSearchCV because this lets them try different configurations ([source](https://medium.com/bhavaniravi/intent-classification-demystifying-rasanlu-part-4-685fc02f5c1d)). When deploying, the preprocessing pipeline should remain the same between training and testing.

In [11]:
# Standard 
import collections
import yaml
import re
import os

# Data science
import pandas as pd
print(f"Pandas: {pd.__version__}")
import numpy as np
print(f"Numpy: {np.__version__}")

# Machine Learning
import sklearn
print(f"Sklearn: {sklearn.__version__}")


# Deep Learning
from torch import nn
import torch.optim as optim

# Visualization 
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style="ticks", color_codes=True)



# Preprocessing and Torch
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torchtext.data.utils import get_tokenizer
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torchtext.vocab import build_vocab_from_iterator

# Reading in training data
# train = pd.read_pickle('objects/train.pkl')
# print(f'Training data: {train.head()}')

Pandas: 2.0.3
Numpy: 1.26.2
Sklearn: 1.3.2


## Torchtext Preprocessing

### Torchtext tokenizer 
- Add description later 

In [2]:
%pwd

'c:\\Sagar Study\\ML and Learning\\Projects\\customer-support-bot\\amazon_customer_support\\notebooks'

In [10]:
import torch
from torchtext.datasets import AG_NEWS

train_iter = iter(AG_NEWS(split="train"))

ModuleNotFoundError: Package `portalocker` is required to be installed to use this datapipe.Please use `pip install 'portalocker>=2.0.0'` or`conda install -c conda-forge 'portalocker>=2/0.0'`to install the package

In [8]:
# Module-level tokenizer; `create_data` passes this `tokenizer` name to CustomDataset.
tokenizer = get_tokenizer('basic_english')

In [None]:
import io # for encoding
def yield_tokens(file_path):
    """Yield the whitespace-separated tokens of every line in a text file.

    Args:
        file_path: Path of the file to read; it is decoded as UTF-8.

    Yields:
        One list of string tokens per line. A blank line yields an empty list.
    """
    with io.open(file_path, encoding='utf-8') as file:
        for line in file:
            # `split()` without arguments splits on runs of whitespace and
            # ignores leading/trailing whitespace, so no separate strip is needed.
            yield line.split()
# Build the vocabulary from the tokens of the file at `file_path`.
# NOTE(review): `file_path` is not assigned in any visible cell — define it before running this.
vocab = build_vocab_from_iterator(yield_tokens(file_path), specials=["<unk>"])
# Route out-of-vocabulary tokens to "<unk>"; without a default index, looking up
# a token that was not seen while building the vocabulary raises an error.
vocab.set_default_index(vocab["<unk>"])

In [None]:
# I use Torch's tokenizer API 
# Train-test split of 95% train and 5% test

In [None]:
# Configuration for training
# Change all of the following configurations as per the specifications in the original repo
import os
import random

import numpy as np

# One seed shared by every random number generator seeded below.
seed_value = 12321

# Export the hash seed through the environment.
os.environ['PYTHONHASHSEED'] = str(seed_value)

# Seed Python's built-in generator, then NumPy's global generator.
random.seed(seed_value)
np.random.seed(seed_value)

# Seed PyTorch last so the call stays the final expression of the cell.
torch.manual_seed(seed_value)

In [None]:
class MODEL_EVAL_METRIC:
    """String names of the evaluation metrics that can be selected in Config."""

    accuracy, f1_score = "accuracy", "f1_score"
    
class Config:
    """Class-level constants that configure data loading and training."""

    # Sizes
    VOCAB_SIZE = 0
    BATCH_SIZE = 512
    EMB_SIZE = 300
    OUT_SIZE = 2

    # Training schedule
    NUM_FOLDS = 5
    NUM_EPOCHS = 10
    NUM_WORKERS = 8

    # I want to use a pretrained embedding
    # I want to update the pretrained embedding weights during training process
    EMB_WT_UPDATE = True

    # Use the GPU when one is available, otherwise fall back to the CPU.
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # The right-hand side resolves to the module-level MODEL_EVAL_METRIC class,
    # so this attribute holds the metric name string.
    MODEL_EVAL_METRIC = MODEL_EVAL_METRIC.accuracy

    FAST_DEV_RUN = False
    PATIENCE = 6
    IS_BIDIRECTIONAL = True

    # Model hyperparameters
    MODEL_PARAMS = {
        "hidden_size": 128,
        "num_layers": 2,
        "drop_out": 0.4258,
        "lr": 0.000366,
        "weight_decay": 0.00001,
    }

In [None]:
# The dataset class for CSV/TSV files 
class CustomDataset(Dataset):
    """Dataset that turns (label, text) records into index tensors.

    Args:
        data: Indexable collection of records where item ``[0]`` is the label
            and item ``[1]`` is the raw text.
        tokenizer: Callable that maps a text to a list of tokens.
        vocab: Mapping used as ``vocab[token]`` to obtain a token's index.
        max_length: Maximum number of tokens kept per text; longer texts are
            truncated.
    """

    def __init__(self, data, tokenizer, vocab, max_length):
        self.data = data
        self.tokenizer = tokenizer
        self.vocab = vocab
        self.max_length = max_length

    def __len__(self):
        """Return the number of records."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(token_index_tensor, label_tensor)`` for record ``idx``."""
        record = self.data[idx]
        label, text = record[0], record[1]
        tokens = self.tokenizer(text)[:self.max_length]
        indices = [self.vocab[token] for token in tokens]
        # Force an integer dtype: an empty token list would otherwise become a
        # float32 tensor, which cannot be used as embedding indices.
        return (torch.tensor(indices, dtype=torch.long), torch.tensor(label))

In [None]:
# Create embedding matrix 
def create_embedding_matrix(word_index, embedding_dict=None, dim=100):
    """Build an embedding matrix whose rows are looked up from ``embedding_dict``.

    Args:
        word_index: Mapping of word -> row index. Row 0 is left for padding,
            so indices are expected to start from 1.
        embedding_dict: Mapping of word -> vector of length ``dim``. When it is
            ``None`` no vectors are available and the matrix stays all zeros.
        dim: Length of each embedding vector.

    Returns:
        A NumPy array of shape ``(len(word_index) + 1, dim)``. Rows of words
        missing from ``embedding_dict`` remain zero.
    """
    if embedding_dict is None:
        # The default used to crash on `.get`; treat it as "no pretrained vectors".
        embedding_dict = {}
    num_words = len(word_index) + 1  # +1 because index 0 is reserved for padding
    embedding_matrix = np.zeros((num_words, dim))
    for word, idx in word_index.items():
        embedding_vector = embedding_dict.get(word)
        if embedding_vector is not None:
            embedding_matrix[idx] = embedding_vector
    return embedding_matrix

In [None]:
# Get the training and validation data
def create_data(train_df, valid_df):
    """Build the training and validation DataLoaders.

    Args:
        train_df: DataFrame with a "text" and a "label" column.
        valid_df: DataFrame with a "text" and a "label" column.

    Returns:
        A ``(torch_train, torch_valid)`` tuple of DataLoaders that batch with
        ``pad_collate``.
    """
    # CustomDataset reads record[0] as the label and record[1] as the text,
    # so pair the two columns instead of passing the bare texts.
    train_records = list(zip(train_df["label"].values, train_df["text"].values))
    valid_records = list(zip(valid_df["label"].values, valid_df["text"].values))

    ds_train = CustomDataset(train_records, tokenizer, vocab, max_length=100)
    ds_valid = CustomDataset(valid_records, tokenizer, vocab, max_length=100)

    # The batch size lives on the Config class as BATCH_SIZE.
    torch_train = DataLoader(ds_train, batch_size=Config.BATCH_SIZE, collate_fn=pad_collate, num_workers=Config.NUM_WORKERS, shuffle=True)

    torch_valid = DataLoader(ds_valid, batch_size=Config.BATCH_SIZE, collate_fn=pad_collate, num_workers=Config.NUM_WORKERS, shuffle=True)

    return torch_train, torch_valid

In [None]:
# Pad the Input Sequence.  If the goal is to train with mini-batches, one needs to pad the sequences in batch. 
# In other words, given a mini-batch of size N, if the length of the largest sequence is L, 
# one needs to pad every sequence with a length of smaller than L with zeros and make their 
# lengths equal to L. Moreover, it is important that the sequences in the batch are in the 
# descending order.

from cProfile import label


def pad_collate(batch):
    """Collate ``(token_tensor, label)`` samples into one padded batch.

    The samples are sorted by token count in descending order, the token
    tensors are right-padded with 0 to the longest length, and the original
    lengths are kept so the padding can be removed later.

    Args:
        batch: List of ``(token_tensor, label)`` tuples.

    Returns:
        A ``(sequences_padded, seq_len, labels)`` tuple, all ordered like the
        sorted batch. ``seq_len`` and ``labels`` are int64 tensors.
    """
    sorted_batch = sorted(batch, key=lambda x: x[0].shape[0], reverse=True)
    sequences = [x[0] for x in sorted_batch]
    sequences_padded = pad_sequence(sequences, batch_first=True, padding_value=0)

    # Lengths are counts, so store them as int64 rather than float32; packing
    # utilities expect integer lengths.
    seq_len = torch.tensor([seq.shape[0] for seq in sequences], dtype=torch.long)
    # int() accepts both plain numbers and 0-dim label tensors.
    labels = torch.tensor([int(x[1]) for x in sorted_batch], dtype=torch.long)

    return sequences_padded, seq_len, labels

In [None]:
# Combine the input data into a TensorDataset (see what other types of data are available as well)
# NOTE(review): no tensors are passed yet, so this is only a placeholder — TODO supply the input tensors.
dataset = TensorDataset()

## Model Architecture 
- Create a neural network in Torch for intent classification 

In [None]:
from torch import lstm


class IntentModel(nn.Module):
    
    def __init__(self, vocab_size, embedding_dim, embedding_matrix, hidden_dim, output_dim, n_layers, dropout): 
        super().__init__()
        
        # Emebdding layer with pretrained weights 
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.embedding.weight.data.copy_(torch.from_numpy(embedding_matrix))
        self.embedding.weight.requires_grad = False
        
        # LSTM layer 
        self.lstm = nn.LSTM(embedding_dim, 
                            hidden_dim, 
                            num_layers=n_layers, 
                            bidirectional=True, 
                            dropout=dropout)
        
        # Dense layers 
        self.fc1 = nn.Linear(hidden_dim*2, 600)  # 2 for bidirectional 
        self.fc2 = nn.Linear(600, 600)
        
        # Dropout layer
        self.dropout = nn.Dropout(dropout)  
        
        # Output layer 
        self.out = nn.Linear(600, output_dim)
        
    def forward(self, text):
        # text = [batch_size, embed_length]
        embeddings = self.dropout(self.embedding(text))
        # embedded = [batch_size, sent_length, emb_dim]
        lstm_out, (hidden, cell) = self.lstm(embeddings)
        # Concat the final forward and hidden backward states 
        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1))
        # hidden = [batch size, num layers * num directions,hid dim]
        # cell = [batch size, num layers * num directions,hid dim]
        
        # concat the final forward and backward hidden state
        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
                
        # hidden = [batch size, hid dim * num directions]
        dense_outputs=self.fc1(hidden)