In [1]:
!pip install transformers datasets scikit-learn




[notice] A new release of pip is available: 24.0 -> 25.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [98]:
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, pipeline
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np
import json

In [142]:
with open("datasets/complete/intent_dataset_no_duplicates.json","r") as f:
    data = json.load(f)
print(len(data))


525


In [143]:
dupe_check = []
for i in data:
    if i['text'] not in dupe_check:
        dupe_check.append(i['text'])
        continue
    data.remove(i)
    print(i)
print(len(data))




525


In [None]:
with open("datasets/complete/intent_dataset_no_duplicates.json","w") as f:
    json.dump(data,f,indent=4)

In [144]:
dataset = Dataset.from_list(data)
dataset = dataset.train_test_split(test_size=0.2)

In [145]:
print(dataset["test"].shape)
print(dataset["train"].shape)

(105, 2)
(420, 2)


In [146]:
print(dataset["train"].features)

{'text': Value(dtype='string', id=None), 'intent': Value(dtype='string', id=None)}


In [147]:
labels = sorted(set(example["intent"] for example in data))
label2id = {label: i for i, label in enumerate(labels)}
id2label = {i: label for label, i in label2id.items()}

print(labels)
print(label2id)

['COMPARE', 'DEFAULT', 'GET_ALERTS', 'GREETING', 'MOST_ALERTED_LOCATION']
{'COMPARE': 0, 'DEFAULT': 1, 'GET_ALERTS': 2, 'GREETING': 3, 'MOST_ALERTED_LOCATION': 4}


In [148]:
def encode_labels(example):
    example["label"] = label2id[example["intent"]]
    return example

dataset = dataset.map(encode_labels)
dataset["train"][0]

Map: 100%|██████████| 420/420 [00:00<00:00, 14000.12 examples/s]
Map: 100%|██████████| 105/105 [00:00<00:00, 11665.66 examples/s]


{'text': 'When was the last missile alert in Regavim ?',
 'intent': 'GET_ALERTS',
 'label': 2}

### Tokenizer set up

In [149]:
model_checkpoint = "distilroberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [150]:
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)

tokenized_dataset = dataset.map(tokenize_function)

Map: 100%|██████████| 420/420 [00:00<00:00, 4259.14 examples/s]
Map: 100%|██████████| 105/105 [00:00<00:00, 4038.09 examples/s]


### Model set up

In [151]:
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [152]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {
        "accuracy": acc,
        "f1": f1,
        "precision": precision,
        "recall": recall
    }

### Trainer

In [153]:
training_args = TrainingArguments(
    output_dir="./intent_model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    save_total_limit=1

)



In [154]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
    
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2177,0.106751,0.952381,0.934817,0.918896,0.952381
2,0.0591,0.027092,0.990476,0.9906,0.991156,0.990476
3,0.0741,0.024826,0.990476,0.9906,0.991156,0.990476
4,0.1161,0.025751,0.990476,0.9906,0.991156,0.990476


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TrainOutput(global_step=212, training_loss=0.2408676002183402, metrics={'train_runtime': 263.8752, 'train_samples_per_second': 6.367, 'train_steps_per_second': 0.803, 'total_flos': 55639284019200.0, 'train_loss': 0.2408676002183402, 'epoch': 4.0})

In [155]:
model.save_pretrained("intent_model")
tokenizer.save_pretrained("intent_model")


('intent_model\\tokenizer_config.json',
 'intent_model\\special_tokens_map.json',
 'intent_model\\vocab.json',
 'intent_model\\merges.txt',
 'intent_model\\added_tokens.json',
 'intent_model\\tokenizer.json')

### Try out the model

In [156]:
model = AutoModelForSequenceClassification.from_pretrained("intent_model")
tokenizer = AutoTokenizer.from_pretrained("intent_model")

In [159]:

intent_classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)
text = "How many Xbox consoles droped this week?"
result = intent_classifier(text)
print(result)

Device set to use cpu


[{'label': 'DEFAULT', 'score': 0.9848260879516602}]
