In [1]:
import json
import pandas as pd
from transformers import XLMRobertaTokenizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
# Read the raw intent records from the JSON file on disk
with open("intent_data.json", mode="r", encoding="utf-8") as intent_file:
    intent_data = json.load(intent_file)

# Tabular view of the loaded records; later cells read its "intent" and "question" columns
df = pd.DataFrame(data=intent_data)

In [3]:
# Fit a LabelEncoder on the "intent" column and store the resulting integer ids
# in a "label" column. LabelEncoder is already imported in the first cell.
label_encoder = LabelEncoder()
df = df.assign(label=label_encoder.fit_transform(df["intent"]))

In [4]:
# Build the label-id -> intent lookup from the DataFrame rows and write it to JSON
intent_mapping = {
    label_id: intent_name
    for label_id, intent_name in zip(df["label"], df["intent"])
}
with open("intent_mapping.json", mode="w", encoding="utf-8") as mapping_file:
    json.dump(intent_mapping, mapping_file, ensure_ascii=False)

In [5]:
# Split questions and labels into training and validation parts (test_size=0.2);
# random_state is fixed so the split is the same on every run.
# train_test_split is already imported in the first cell.
all_questions = df["question"].tolist()
all_labels = df["label"].tolist()
train_texts, val_texts, train_labels, val_labels = train_test_split(
    all_questions,
    all_labels,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Tokenize both splits with the pretrained "xlm-roberta-base" tokenizer
tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")

# Settings shared by both calls: truncation and padding enabled, max_length of 64,
# results returned as PyTorch tensors ("pt")
tokenize_kwargs = dict(truncation=True, padding=True, max_length=64, return_tensors="pt")
train_encodings = tokenizer(train_texts, **tokenize_kwargs)
val_encodings = tokenizer(val_texts, **tokenize_kwargs)


In [7]:
# Create custom Dataset class
# torch is imported here because the class body uses it; previously the class only
# worked when a later cell had already imported torch before the first instantiation.
import torch
from torch.utils.data import Dataset


class IntentDataset(Dataset):
    """Dataset pairing tokenizer encodings with integer intent labels.

    Args:
        encodings: Mapping from field name to an indexable batch of values
            (anything exposing ``.items()`` whose values support ``value[idx]``).
        labels: Sequence of integer class ids, one per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        # Force int64 so labels arriving as int32 (e.g. some NumPy arrays) are still
        # stored as long tensors; a plain list of Python ints gives the same result
        # as before.
        self.labels = torch.as_tensor(labels, dtype=torch.long)

    def __len__(self):
        """Return the number of examples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict with every encoding field at ``idx`` plus a "labels" entry."""
        item = {key: val[idx] for key, val in self.encodings.items()}
        item["labels"] = self.labels[idx]
        return item

In [8]:
# Wrap each tokenized split together with its labels in an IntentDataset
import torch
train_dataset = IntentDataset(encodings=train_encodings, labels=train_labels)
val_dataset = IntentDataset(encodings=val_encodings, labels=val_labels)

In [9]:
# Load pretrained XLM-RoBERTa with a sequence-classification head sized to the intent set
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification, Trainer, TrainingArguments
num_labels = len(df["intent"].unique())

# Store the id <-> intent mapping in the model config so the model written by
# save_pretrained carries the intent names instead of generic label names.
# label_encoder.classes_ is indexed by the encoded label id.
id2label = {int(label_id): str(intent_name) for label_id, intent_name in enumerate(label_encoder.classes_)}
label2id = {intent_name: label_id for label_id, intent_name in id2label.items()}

model = XLMRobertaForSequenceClassification.from_pretrained(
    "xlm-roberta-base",
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
import evaluate
import numpy as np
import warnings

# Ignore every warning of category UserWarning from this point on
warnings.filterwarnings(action="ignore", category=UserWarning)

# Load the evaluation metrics (accuracy, precision, recall, f1), one evaluate.load call each
accuracy_metric, precision_metric, recall_metric, f1_metric = (
    evaluate.load(metric_name)
    for metric_name in ("accuracy", "precision", "recall", "f1")
)

def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1 for one evaluation pass.

    Args:
        eval_pred: A pair ``(logits, labels)``. The predicted class for each
            example is the argmax of ``logits`` over its last axis.

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1-score".
        Precision, recall and F1 are computed with ``average="weighted"``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # Arguments common to every metric, and the variant used by the averaged ones
    base_args = {"predictions": predictions, "references": labels}
    weighted_args = {**base_args, "average": "weighted"}

    return {
        "accuracy": accuracy_metric.compute(**base_args)["accuracy"],
        "precision": precision_metric.compute(**weighted_args)["precision"],
        "recall": recall_metric.compute(**weighted_args)["recall"],
        "f1-score": f1_metric.compute(**weighted_args)["f1"],
    }

In [24]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",  # Evaluate every epoch
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=30,
    learning_rate=3e-5,
    save_strategy="epoch",  # Checkpoint every epoch (must match the evaluation schedule for best-model loading)
    # Cap the number of checkpoints kept in output_dir. Without this, all 30
    # per-epoch checkpoints stay on disk.
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="f1-score",  # Key returned by compute_metrics
    greater_is_better=True
)



In [25]:
# Build the Trainer from the model, the training arguments and both dataset splits;
# compute_metrics supplies the scores reported at each evaluation.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

In [26]:
# Run training with the settings in training_args. The call is left as the cell's
# last expression so its return value (a TrainOutput) is displayed as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1-score
1,0.3672,0.941221,0.857143,0.833333,0.857143,0.839981
2,0.4611,0.803335,0.857143,0.827381,0.857143,0.840909
3,0.3582,0.82276,0.857143,0.833333,0.857143,0.839981
4,0.3523,0.687026,0.821429,0.797619,0.821429,0.804267
5,0.316,0.659446,0.821429,0.797619,0.821429,0.804267
6,0.2173,0.646422,0.857143,0.833333,0.857143,0.839981
7,0.3528,0.643469,0.857143,0.833333,0.857143,0.839981
8,0.2451,0.673809,0.857143,0.833333,0.857143,0.839981
9,0.2207,0.638238,0.821429,0.797619,0.821429,0.804267
10,0.2159,0.653801,0.857143,0.833333,0.857143,0.839981


TrainOutput(global_step=420, training_loss=0.1654609426856041, metrics={'train_runtime': 240.1756, 'train_samples_per_second': 13.74, 'train_steps_per_second': 1.749, 'total_flos': 57671781076800.0, 'train_loss': 0.1654609426856041, 'epoch': 30.0})

In [27]:
# Write the model first, then the tokenizer, into the same output directory
for artifact in (model, tokenizer):
    artifact.save_pretrained("./intent_model")

print("Model training complete! Model saved in ./intent_model")

Model training complete! Model saved in ./intent_model


In [28]:
# import json

# # Open the original JSON file
# with open('newdata.json', 'r', encoding='utf-8') as f:
#     intent_data = json.load(f)

# # Convert the 'question' list to a single string for each intent
# for entry in intent_data:
#     entry["question"] = " ".join(entry["question"])

# # Save the updated data to a new JSON file
# with open('updated_newdata.json', 'w', encoding='utf-8') as f:
#     json.dump(intent_data, f, ensure_ascii=False, indent=4)

# print("Data has been updated and saved to updated_newdata.json")


Data has been updated and saved to updated_newdata.json


In [29]:
# import json

# # Open the original JSON file
# with open('newdata.json', 'r', encoding='utf-8') as f:
#     intent_data = json.load(f)

# # Create a new list with only intent and question (joined into a single string)
# updated_data = [
#     {
#         "intent": entry["intent"],
#         "question": " ".join(entry["question"])  # Join all questions into one string
#     }
#     for entry in intent_data
# ]

# # Save the updated data to a new JSON file
# with open('intent_question.json', 'w', encoding='utf-8') as f:
#     json.dump(updated_data, f, ensure_ascii=False, indent=4)

# print("Data has been updated and saved to intent_and_question.json")


Data has been updated and saved to intent_and_question.json
