<a href="https://colab.research.google.com/github/vutt-ai-models/transformers_tutorials/blob/main/Intent_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%cd /content/drive/Shareddrives/NLP Team/Dungdevunaychim/data/Omer_bert/training-data-main

/content/drive/.shortcut-targets-by-id/1dYT25oewX-3C4ONpyGcKviv0b-Ah9oJZ/Dungdevunaychim/data/Omer_bert/training-data-main


In [None]:
# !pip install -q transformers

In [None]:
# !ls

In [None]:
import json
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [None]:
def load_jsonl(path):
    """Read a JSON Lines file into a list of parsed records.

    Args:
        path: Location of a text file holding one JSON document per line.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Explicit UTF-8 keeps decoding independent of the platform's default
    # locale. Blank lines (for example a stray empty line at the end of the
    # file) are skipped instead of crashing json.loads.
    with open(path, encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]


# One record per line in each split file.
train_data = load_jsonl("train_intents.json")
test_data = load_jsonl("test_intents.json")
valid_data = load_jsonl("valid_intents.json")

In [None]:
# --- Training hyperparameters -------------------------------------------
EPOCHS = 10       # number of full passes over the training split
BATCH_SIZE = 16   # examples per DataLoader batch
LR = 1e-5         # learning rate handed to the optimizer

# --- Label space ----------------------------------------------------------
# The classifier head is sized from every distinct intent string that occurs
# in any of the three splits.
total_data = [*train_data, *test_data, *valid_data]
unique_intentions = {record['intention'] for record in total_data}
NUM_LABEL = len(unique_intentions)

In [None]:
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one utterance per lookup.

    Args:
        data: Indexable collection of records; each record must expose its
            raw utterance under the ``'text'`` key.
        labels: Indexable collection of labels aligned index-for-index with
            ``data``.
        tokenizer: Callable invoked as ``tokenizer(text, padding='max_length',
            truncation=True, max_length=..., return_tensors='pt')``. It must
            return a mapping with ``'input_ids'`` and ``'attention_mask'``
            tensors whose first axis is a batch axis of size 1.
        max_length: Optional padding/truncation length forwarded to the
            tokenizer. ``None`` (the default) leaves the choice to the
            tokenizer, which is what happened before this parameter existed;
            Hugging Face tokenizers then pad to the model's maximum input
            length. Pass a smaller value to avoid padding short utterances
            that far.
    """

    def __init__(self, data, labels, tokenizer, max_length=None):
        self.data = data
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of records in the dataset."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(input_ids, attention_mask, label)`` for one record."""
        text = self.data[index]['text']
        encoded_input = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt',
        )
        # squeeze(0) removes only the batch axis. A bare squeeze() would also
        # collapse a sequence axis of length 1 and hand back a 0-d tensor.
        input_ids = encoded_input['input_ids'].squeeze(0)
        attention_mask = encoded_input['attention_mask'].squeeze(0)
        return input_ids, attention_mask, self.labels[index]

In [None]:
# Tokenizer and encoder both come from the same pretrained checkpoint.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Build the sequence-classification model with one output per intent, then
# move it onto the selected device.
model = RobertaForSequenceClassification.from_pretrained(
    'roberta-base',
    num_labels=NUM_LABEL,
)
model = model.to(device)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.dense.weight']
You should pr

In [None]:
# Pull the raw intent string out of every record, one list per split.
train_intentions = [item['intention'] for item in train_data]
valid_intentions = [item['intention'] for item in valid_data]
test_intentions = [item['intention'] for item in test_data]
# The label vocabulary must cover the test split as well: NUM_LABEL is counted
# over all three splits, and LabelEncoder.transform raises on a label it was
# not fitted on, so an intent seen only in the test set would crash below.
all_intentions = train_intentions + valid_intentions + test_intentions

# Encode intent strings as integer class ids.
label_encoder = LabelEncoder()
label_encoder.fit(all_intentions)
assert len(label_encoder.classes_) == NUM_LABEL, (
    "label encoder and classifier head disagree on the number of intents"
)
train_labels = label_encoder.transform(train_intentions)
valid_labels = label_encoder.transform(valid_intentions)
test_labels = label_encoder.transform(test_intentions)

# Wrap each split in a dataset that tokenizes on access.
train_dataset = CustomDataset(train_data, train_labels, tokenizer)
valid_dataset = CustomDataset(valid_data, valid_labels, tokenizer)
test_dataset = CustomDataset(test_data, test_labels, tokenizer)

# Only the training loader shuffles; evaluation order stays fixed.
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=BATCH_SIZE)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [None]:
# AdamW over every model parameter, stepping at the configured learning rate.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=LR)

# Cross-entropy loss applied to the classifier logits in the training loop.
loss_fn = torch.nn.CrossEntropyLoss()

In [None]:
# Fine-tune for EPOCHS passes, reporting the mean training loss and the
# validation accuracy after every pass.
for epoch in range(EPOCHS):
    # ---- Training pass ---------------------------------------------------
    model.train()
    total_loss = 0.0
    pbar = tqdm(train_dataloader)
    for input_ids, attention_mask, label in pbar:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        label = label.to(device)

        optimizer.zero_grad()

        # Labels are deliberately not passed to the model: doing so makes it
        # compute an internal loss, which was thrown away in favour of loss_fn.
        logits = model(input_ids, attention_mask=attention_mask).logits
        loss = loss_fn(logits.view(-1, NUM_LABEL), label.view(-1))
        loss.backward()
        optimizer.step()

        # Read the scalar once; formatting the tensor itself would print its
        # full repr (device, grad_fn) into the progress bar.
        batch_loss = loss.item()
        total_loss += batch_loss
        pbar.set_description(
            f"Updating loss {batch_loss:.4f} at epoch {epoch + 1}/{EPOCHS}"
        )

    average_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}/{EPOCHS} - Train Loss: {average_loss:.4f}")

    # ---- Evaluation on validation data -------------------------------------
    model.eval()
    # Local accumulators with their own names, so the module-level
    # `valid_labels` array of encoded labels is not overwritten.
    epoch_targets = []
    epoch_predictions = []
    with torch.no_grad():
        for input_ids, attention_mask, label in valid_dataloader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)

            logits = model(input_ids, attention_mask=attention_mask).logits
            predicted = logits.argmax(dim=1)

            epoch_targets.extend(label.tolist())
            epoch_predictions.extend(predicted.tolist())

    valid_accuracy = accuracy_score(epoch_targets, epoch_predictions)
    print(f"Epoch {epoch + 1}/{EPOCHS} - Valid Accuracy: {valid_accuracy:.4f}")

Updating 0.8602011799812317 at epoch 0: 100%|██████████| 263/263 [06:35<00:00,  1.50s/it]


Epoch 1/10 - Train Loss: 1.1714
Epoch 1/10 - Valid Accuracy: 0.8285


Updating 0.7097903490066528 at epoch 1: 100%|██████████| 263/263 [06:33<00:00,  1.50s/it]


Epoch 2/10 - Train Loss: 0.6168
Epoch 2/10 - Valid Accuracy: 0.8894


Updating 0.46397215127944946 at epoch 2: 100%|██████████| 263/263 [06:33<00:00,  1.50s/it]


Epoch 3/10 - Train Loss: 0.4006
Epoch 3/10 - Valid Accuracy: 0.9215


Updating 0.10035364329814911 at epoch 3: 100%|██████████| 263/263 [06:33<00:00,  1.50s/it]


Epoch 4/10 - Train Loss: 0.2676
Epoch 4/10 - Valid Accuracy: 0.9375


Updating 0.07214842736721039 at epoch 4: 100%|██████████| 263/263 [06:33<00:00,  1.50s/it]


Epoch 5/10 - Train Loss: 0.1924
Epoch 5/10 - Valid Accuracy: 0.9423


Updating 0.3033495545387268 at epoch 5: 100%|██████████| 263/263 [06:34<00:00,  1.50s/it]


Epoch 6/10 - Train Loss: 0.1329
Epoch 6/10 - Valid Accuracy: 0.9439


Updating 0.33996590971946716 at epoch 6: 100%|██████████| 263/263 [06:33<00:00,  1.50s/it]


Epoch 7/10 - Train Loss: 0.0975
Epoch 7/10 - Valid Accuracy: 0.9535


Updating 0.033895742148160934 at epoch 7:  62%|██████▏   | 164/263 [04:06<02:28,  1.50s/it]


KeyboardInterrupt: ignored

In [None]:
# Switch layers such as dropout to inference behaviour before scoring. The
# training cell only calls model.eval() after finishing an epoch, so an
# interrupted run leaves the model in training mode and would make this
# measurement noisy.
model.eval()

# Separate accumulators, so the module-level `test_labels` array of encoded
# labels is not overwritten.
test_targets = []
test_predictions = []
with torch.no_grad():
    for input_ids, attention_mask, label in test_dataloader:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)

        logits = model(input_ids, attention_mask=attention_mask).logits
        predicted = logits.argmax(dim=1)

        test_targets.extend(label.tolist())
        test_predictions.extend(predicted.tolist())

test_accuracy = accuracy_score(test_targets, test_predictions)
print(f"Test Accuracy: {test_accuracy:.4f}")

Test Accuracy: 0.9397


In [None]:
# Records 10 through 19 of the test split, used for a manual spot check:
# the utterances first, then their gold intents.
test_text = [record["text"] for record in test_data[10:20]]
true_intent = [record["intention"] for record in test_data[10:20]]

# Display both lists as the cell output.
(test_text, true_intent)

(['wanna speak to a customer service',
  'two-thousand one-hundred sixty seven three-thousand six-hundred seventy four six-hundred twenty six-hundred two',
  'date of transaction 30th november 2021',
  'five-hundred seven',
  'one-hundred ninety three five-thousand six-hundred nineteen',
  'seven-hundred seventy two',
  'to put a credit card on my account',
  'hi good evening',
  's m s',
  'for my reward'],
 ['agent',
  'no_intention',
  'no_intention',
  'no_intention',
  'no_intention',
  'no_intention',
  'register',
  'greetings',
  'no_intention',
  'no_intention'])

In [None]:
# Tokenize the texts with the tokenizer already loaded next to the model
# (no second download/initialisation). padding=True pads only to the longest
# utterance in this batch instead of the model's maximum input length.
encoded_inputs = tokenizer(test_text, padding=True, truncation=True, return_tensors='pt')

input_ids = encoded_inputs['input_ids'].to(device)
attention_mask = encoded_inputs['attention_mask'].to(device)

model.eval()
with torch.no_grad():
    logits = model(input_ids, attention_mask=attention_mask).logits
    # Per-class probabilities, kept available for interactive inspection.
    probabilities = torch.softmax(logits, dim=1)
    predicted_indices = logits.argmax(dim=1)

# Map class ids back to the original intent strings.
predicted_labels = label_encoder.inverse_transform(predicted_indices.cpu().numpy())

for text, pred, label in zip(test_text, predicted_labels, true_intent):
    print(f"Text: {text} - Predicted: {pred}, - True label: {label}")

Text: wanna speak to a customer service - Predicted: agent, - True label: agent
Text: two-thousand one-hundred sixty seven three-thousand six-hundred seventy four six-hundred twenty six-hundred two - Predicted: no_intention, - True label: no_intention
Text: date of transaction 30th november 2021 - Predicted: no_intention, - True label: no_intention
Text: five-hundred seven - Predicted: no_intention, - True label: no_intention
Text: one-hundred ninety three five-thousand six-hundred nineteen - Predicted: no_intention, - True label: no_intention
Text: seven-hundred seventy two - Predicted: no_intention, - True label: no_intention
Text: to put a credit card on my account - Predicted: purchase, - True label: register
Text: hi good evening - Predicted: greetings, - True label: greetings
Text: s m s - Predicted: no_intention, - True label: no_intention
Text: for my reward - Predicted: no_intention, - True label: no_intention
