In [3]:
import json
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import TextClassificationPipeline
import torch
def load_data(file_path):
    with open(file_path, 'r') as f:
        data = json.load(f)
    return data

def preprocess_data(data):
    input_sentences = []
    labels = []
    label_map = {}
    label_index = 0
    
    for intent in data['intents']:
        if intent['tag'] not in label_map:
            label_map[intent['tag']] = label_index
            label_index += 1

        for pattern in intent['patterns']:
            input_sentences.append(pattern)
            labels.append(label_map[intent['tag']])
    
    return input_sentences, labels, label_map

file_path = 'intents.json'
data = load_data(file_path)
input_sentences, labels, label_map = preprocess_data(data)

In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m45.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m87.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.28.1


In [4]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
tokenized_inputs = tokenizer(input_sentences, padding=True, truncation=True, return_tensors="pt")

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

In [5]:
from torch.utils.data import Dataset, DataLoader

class IntentDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

dataset = IntentDataset(encodings=tokenized_inputs, labels=labels)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)


In [6]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=len(label_map)).to(device)

optim = torch.optim.Adam(model.parameters(), lr=5e-5)

for epoch in range(3):
    for batch in dataloader:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optim.step()

Downloading pytorch_model.bin:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

In [7]:
def evaluate(model, dataloader):
    model.eval()
    correct = 0
    total = 0
    
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            predicted_indices = torch.argmax(logits, dim=1)
            
            total += labels.size(0)
            correct += (predicted_indices == labels).sum().item()
    
    return correct / total

In [8]:
accuracy = evaluate(model, dataloader)
print("Accuracy on the training dataset:", accuracy)

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Accuracy on the training dataset: 0.7108294930875576


In [12]:
import json
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import Dataset, DataLoader

# Load the testing data from the JSON file
def load_data(file_path):
    with open(file_path, 'r') as f:
        data = json.load(f)
    return data

test_data = load_data('intents_testing.json')

# Preprocess the testing data
def preprocess_data(data, label_map):
    input_sentences = []
    labels = []
    
    for intent in data['intents']:
        for pattern in intent['patterns']:
            input_sentences.append(pattern)
            labels.append(label_map[intent['tag']])
    
    return input_sentences, labels

test_inputs, test_labels = preprocess_data(test_data, label_map)

# Tokenize the testing input sentences using the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
test_encodings = tokenizer(test_inputs, truncation=True, padding=True)

# Create a IntentDataset object for the testing data
class IntentDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

test_dataset = IntentDataset(encodings=test_encodings, labels=test_labels)

# Create a DataLoader object for the testing dataset
test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# Evaluate the model on the testing dataset
def evaluate(model, dataloader):
    model.eval()
    correct = 0
    total = 0
    
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            predicted_indices = torch.argmax(logits, dim=1)
            
            total += labels.size(0)
            correct += (predicted_indices == labels).sum().item()
    
    return correct / total

test_accuracy = evaluate(model, test_dataloader)
print("Accuracy on the testing dataset:", test_accuracy)


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Accuracy on the testing dataset: 0.025974025974025976
