<a href="https://colab.research.google.com/github/rajesh-bhat/data-aisummit-2021-databricks-conversational-ai/blob/main/Pytorch_Intent_Classification_using_DistilBERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Install Libraries

In [None]:
!pip install pandas torch transformers tqdm



### Download Data

In [None]:
!gdown --id 1OlcvGWReJMuyYQuOZm149vHWwPtlboR6 --output train.csv
!gdown --id 1Oi5cRlTybuIF2Fl5Bfsr-KkqrXrdt77w --output valid.csv
!gdown --id 1ep9H6-HvhB4utJRLVcLzieWNUSG3P_uF --output test.csv

Downloading...
From: https://drive.google.com/uc?id=1OlcvGWReJMuyYQuOZm149vHWwPtlboR6
To: /content/train.csv
100% 799k/799k [00:00<00:00, 12.6MB/s]
Downloading...
From: https://drive.google.com/uc?id=1Oi5cRlTybuIF2Fl5Bfsr-KkqrXrdt77w
To: /content/valid.csv
100% 43.3k/43.3k [00:00<00:00, 67.8MB/s]
Downloading...
From: https://drive.google.com/uc?id=1ep9H6-HvhB4utJRLVcLzieWNUSG3P_uF
To: /content/test.csv
100% 43.1k/43.1k [00:00<00:00, 6.35MB/s]


### Read Data

In [None]:
import pandas as pd

# Fraction of each intent kept for training, and the seed that makes that
# subsample identical on every Restart & Run All.
SAMPLE_FRAC = 0.25
SAMPLE_SEED = 42

# train.csv and valid.csv are merged into one pool; ignore_index avoids
# duplicate row labels coming from the two files.
train_pool = pd.concat(
    [pd.read_csv(file) for file in ["train.csv", "valid.csv"]],
    ignore_index=True,
)

# Stratified subsample: the same fraction is drawn from every intent.
# The result is ordered by intent, not shuffled.
train = train_pool.groupby('intent').sample(frac=SAMPLE_FRAC, random_state=SAMPLE_SEED)
test = pd.read_csv("test.csv")
print(train.shape)
print(test.shape)
train.head()

(3447, 2)
(700, 2)


Unnamed: 0,text,intent
3046,add artist to my trance life group,AddToPlaylist
11842,i want to put this song in my new boots playlist,AddToPlaylist
11804,add suffer little children to this is racionai...,AddToPlaylist
11486,add vikku vinayakram to my this is nicky jam,AddToPlaylist
5078,incorporate a roberto parra sandoval track int...,AddToPlaylist


In [None]:
# Class balance of the sampled training set.
train['intent'].value_counts()

PlayMusic               504
GetWeather              499
BookRestaurant          495
RateBook                494
SearchScreeningEvent    488
SearchCreativeWork      487
AddToPlaylist           480
Name: intent, dtype: int64

In [None]:
# Map each intent name to an integer class id. The names are sorted first so
# the ids do not depend on the order in which rows happen to appear in `train`.
intent_names = sorted(train['intent'].unique().tolist())
intent_mapping = {name: idx for idx, name in enumerate(intent_names)}
train['target'] = train['intent'].map(intent_mapping)
train.head()

Unnamed: 0,text,intent,target
3046,add artist to my trance life group,AddToPlaylist,0
11842,i want to put this song in my new boots playlist,AddToPlaylist,0
11804,add suffer little children to this is racionai...,AddToPlaylist,0
11486,add vikku vinayakram to my this is nicky jam,AddToPlaylist,0
5078,incorporate a roberto parra sandoval track int...,AddToPlaylist,0


### Load libraries

In [None]:
import random
import numpy as np
from tqdm import tqdm
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer, AdamW

### Utilities

In [None]:
def set_seed(seed):
    """Seed the random number generators used in this notebook.

    Applies ``seed`` to Python's ``random``, NumPy and PyTorch (both the
    default and the CUDA seeding calls), then switches cuDNN to
    deterministic mode with benchmarking turned off.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    

class MyDataset(Dataset):
    """Dataset of (query, intent) pairs that tokenizes each query on access.

    ``queries`` and ``intents`` are parallel, index-addressable sequences.
    Every query is encoded with ``tokenizer.encode_plus`` and padded or
    truncated to ``max_len`` tokens.
    """

    def __init__(self, queries, intents, tokenizer, max_len):
        self.queries, self.intents = queries, intents
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self) -> int:
        """Return the number of queries held by the dataset."""
        return len(self.queries)

    def __getitem__(self, index: int):
        """Return the raw query, its label tensor and the encoded inputs for ``index``."""
        text, label = self.queries[index], self.intents[index]

        # Options handed to the Hugging Face tokenizer for a single sentence.
        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=False,
            return_tensors="pt",
        )
        encoded = self.tokenizer.encode_plus(text, **tokenizer_options)

        sample = {
            "query": text,
            "intent": torch.tensor(label, dtype=torch.long),
        }
        # Flatten each encoded field to 1-D before handing it to the collate step.
        for field in ("input_ids", "attention_mask"):
            sample[field] = encoded[field].flatten()
        return sample
    

def dataset_loader(queries, intents, tokenizer, max_len, batch_size,
                   shuffle=False, num_workers=4):
    """Wrap queries and intents in a ``MyDataset`` and return a ``DataLoader`` over it.

    Args:
        queries: object exposing ``to_numpy()`` that yields the query strings.
        intents: object exposing ``to_numpy()`` that yields the integer labels,
            aligned with ``queries``.
        tokenizer: tokenizer forwarded to ``MyDataset``.
        max_len: token length each query is padded/truncated to.
        batch_size: number of samples per batch.
        shuffle: whether the loader reshuffles the samples every epoch.
            Defaults to ``False``, the behaviour before this parameter existed.
        num_workers: upper bound on loader worker processes (default 4, the
            previously hard-coded value).

    Returns:
        A ``torch.utils.data.DataLoader`` over the tokenized dataset.
    """
    import os

    ds = MyDataset(
        queries=queries.to_numpy(),
        intents=intents.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len,
    )

    # Do not ask for more workers than the host has CPUs; requesting 4 on a
    # smaller machine makes DataLoader warn about over-subscription.
    available_cpus = os.cpu_count() or 1
    workers = max(0, min(num_workers, available_cpus))

    return DataLoader(ds, batch_size=batch_size, shuffle=shuffle, num_workers=workers)

### Model Training

In [None]:
# Training configuration.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
BATCH_SIZE = 64
MAX_LEN = 256
EPOCHS = 3
SEED = 42

set_seed(SEED)
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# `train` is ordered by intent (it comes out of groupby(...).sample()), and the
# loader built below iterates rows in the order it is given. Without this
# shuffle every batch would contain a single class, and the model would end
# each epoch having just trained on the last intent only.
train_shuffled = train.sample(frac=1.0, random_state=SEED)
train_dataloader = dataset_loader(queries=train_shuffled['text'],
                                  intents=train_shuffled['target'],
                                  tokenizer=tokenizer,
                                  max_len=MAX_LEN,
                                  batch_size=BATCH_SIZE)

# Size the classification head from the label mapping instead of a literal.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=len(intent_mapping))
model.to(DEVICE)
model.train()

optimizer = AdamW(model.parameters(), lr=5e-5)

for epoch in range(EPOCHS):
    running_loss = 0.0
    for batch in tqdm(train_dataloader):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        targets = batch['intent'].to(DEVICE)
        # Passing labels makes the model return the classification loss.
        outputs = model(input_ids=input_ids,
                        attention_mask=attention_mask,
                        labels=targets)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    # Report the mean of the per-batch losses, not just the final batch's loss.
    mean_loss = running_loss / max(len(train_dataloader), 1)
    print(f"Training loss in epoch {epoch+1}: {round(mean_loss,4)}")

  cpuset_checked))
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier

Training loss in epoch 1: 1.1031


100%|██████████| 54/54 [01:14<00:00,  1.38s/it]
  0%|          | 0/54 [00:00<?, ?it/s]

Training loss in epoch 2: 0.596


100%|██████████| 54/54 [01:14<00:00,  1.38s/it]

Training loss in epoch 3: 0.512





### Scoring

In [None]:
import torch.nn.functional as F

def to_numpy(tensor):
    """Convert ``tensor`` to a NumPy array on the CPU.

    Tensors that require grad are detached first, since ``numpy()`` cannot be
    called on them directly.
    """
    source = tensor.detach() if tensor.requires_grad else tensor
    return source.cpu().numpy()


def score(model, tokenizer, intent_mapping, query, max_len=None):
    """Predict the intent of a single query.

    Args:
        model: sequence-classification model whose output exposes ``.logits``.
        tokenizer: tokenizer providing ``encode_plus``.
        intent_mapping: dict from class id (int) to intent name.
        query: the text to classify.
        max_len: token length to pad/truncate to. When ``None`` the
            notebook-level ``MAX_LEN`` is used.

    Returns:
        dict with the original ``query`` and the ``predicted_intent`` name
        (``None`` if the predicted id is missing from ``intent_mapping``).
    """
    if max_len is None:
        max_len = MAX_LEN

    encoding = tokenizer.encode_plus(
        query,
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        truncation=True,
        return_token_type_ids=False,
        return_tensors="pt",
    )

    # Run the inputs on whatever device the model's weights live on.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")
    input_ids = encoding["input_ids"].to(device)
    attention_mask = encoding["attention_mask"].to(device)

    # Inference must run in eval mode (dropout off) and without autograd;
    # the caller's train/eval mode is restored afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            output = model(input_ids, attention_mask)
    finally:
        model.train(was_training)

    prediction = torch.argmax(output.logits, dim=1)
    return {
        "query": query,
        "predicted_intent": intent_mapping.get(prediction[0].item())
        }

In [None]:
# Invert intent -> id into id -> intent so the predicted class id can be decoded.
id_to_intent = {idx: name for name, idx in intent_mapping.items()}

# Score one held-out query on the CPU.
query = test.loc[3, 'text']
score(model=model.to('cpu'),
      tokenizer=tokenizer,
      intent_mapping=id_to_intent,
      query=query)

{'predicted_intent': 'SearchScreeningEvent',
 'query': 'will it snow in mt on june 13  2038'}