In [4]:
import pandas as pd
 
# Load the CSV file
file_path = "../../resources/intent_detection/20000-Utterances-Training-dataset-for-chatbots-virtual-assistant-Bitext-sample.csv"
df = pd.read_csv(file_path)

# Keep only the 'utterance' and 'intent' columns. The explicit .copy() makes
# `df` an independent frame, so assigning new columns to it in later cells
# does not raise pandas' SettingWithCopyWarning.
df = df[['utterance', 'intent']].copy()

# Check unique values in the 'intent' column
unique_intents = df['intent'].unique()
print("Unique intents:")
for intent in unique_intents:
    print(intent)

# Count occurrences of each unique intent (most frequent first)
intent_counts = df['intent'].value_counts()

# Print the count for each unique intent
print("Intent Counts:")
print(intent_counts)

# Last expression of the cell: displays the total number of rows.
len(df)


Unique intents:
create_account
delete_account
edit_account
recover_password
registration_problems
switch_account
check_cancellation_fee
contact_customer_service
contact_human_agent
delivery_options
delivery_period
complaint
review
check_invoices
get_invoice
newsletter_subscription
cancel_order
change_order
place_order
track_order
check_payment_methods
payment_issue
check_refund_policy
get_refund
track_refund
change_shipping_address
set_up_shipping_address
Intent Counts:
intent
payment_issue               4366
create_account              2122
contact_customer_service    2055
get_invoice                 1430
track_order                 1224
get_refund                  1150
contact_human_agent         1026
check_invoices              1013
recover_password             986
change_order                 926
delete_account               913
complaint                    746
review                       580
check_refund_policy          479
delivery_options             360
check_cancellation_fee 

21534

In [None]:
%pip install scikit-learn
 
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split

In [None]:
# Encode intents: give each distinct intent an integer id, numbered in order
# of first appearance in the data.
label_map = {intent: i for i, intent in enumerate(df['intent'].unique())}
# Inverse mapping (id -> intent name), stored in the model config below.
id2label = {i: intent for intent, i in label_map.items()}

# assign() returns a new frame instead of writing into `df`, which avoids
# SettingWithCopyWarning when `df` is a column subset of the loaded CSV.
df = df.assign(intent_label=df['intent'].map(label_map))

train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Seed torch before building the model so that any randomly initialized
# weights are repeatable between runs.
# NOTE(review): presumably the classification head is freshly initialized for
# this base checkpoint - confirm from the load-time warning.
torch.manual_seed(42)

# Passing id2label/label2id records the intent names in the model config, so
# a checkpoint saved from this model can be decoded without `label_map`.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label_map),
    id2label=id2label,
    label2id=label_map,
)
 

In [None]:
# Tokenization and retaining labels
def tokenize_and_preserve_labels(dataset, padding="longest"):
    """Tokenize the utterances of ``dataset`` and attach their labels.

    Args:
        dataset: DataFrame-like object with an 'utterance' column of text and
            an 'intent_label' column of integer class ids.
        padding: Padding strategy forwarded to the tokenizer. The default pads
            only up to the longest utterance in ``dataset``, which keeps the
            tensors small for short chatbot messages. Pass "max_length" to
            pad every row to the model's maximum length instead.

    Returns:
        The tokenizer output (PyTorch tensors) with an additional 'labels'
        entry holding the class ids, row-aligned with the tokenized inputs.
    """
    tokenized_inputs = tokenizer(
        dataset['utterance'].tolist(),
        padding=padding,
        truncation=True,  # clip anything longer than the model accepts
        return_tensors="pt",
    )
    tokenized_inputs['labels'] = torch.tensor(dataset['intent_label'].tolist())
    return tokenized_inputs
 
# Tokenize the training split first, then the held-out split, with the same helper.
train_data_tokenized, test_data_tokenized = (
    tokenize_and_preserve_labels(split) for split in (train_data, test_data)
)
 

In [None]:
class SimpleDataset(torch.utils.data.Dataset):
    """Map-style dataset over a dict-like of row-aligned, pre-tokenized fields."""

    def __init__(self, encodings):
        # Mapping of field name -> indexable holding one entry per example.
        self.encodings = encodings

    def __len__(self):
        # The number of examples is read off the 'input_ids' field.
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        # Pick entry `idx` out of every field and return them as one dict.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        return sample
 
# Wrap both tokenized splits in SimpleDataset so they can be indexed per example.
train_dataset, test_dataset = (
    SimpleDataset(encoded_split)
    for encoded_split in (train_data_tokenized, test_data_tokenized)
)

In [None]:
# Define training arguments
# Hyperparameters for the fine-tuning run, spelled out as a keyword mapping.
# NOTE(review): some transformers releases name the evaluation option
# `eval_strategy` - confirm against the installed version.
training_args = TrainingArguments(**{
    # Where checkpoints are written; existing contents may be overwritten.
    "output_dir": "../../resources/intent_detection/Intent_detection_fine_tuned",
    "overwrite_output_dir": True,
    # Schedule: three passes over the training set, evaluating after each one.
    "num_train_epochs": 3,
    "evaluation_strategy": "epoch",
    # Same batch size for training and evaluation.
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 16,
})
 
def compute_metrics(pred):
    """Score a batch of predictions with accuracy and weighted F1.

    Args:
        pred: Object exposing ``label_ids`` (true class ids) and
            ``predictions`` (per-class scores, class axis last).

    Returns:
        Dict with the keys "accuracy" and "f1".
    """
    y_true = pred.label_ids
    # The predicted class is the index of the highest score on the last axis.
    y_pred = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred, average="weighted"),
    }
 
 
# Wire the model, run configuration, both datasets and the metric callback
# into a single Trainer instance.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
)

In [None]:
# Fine-tune the model, then persist everything needed to reload the classifier.
trainer.train()

# Same directory that was configured as output_dir in the training arguments.
output_dir = trainer.args.output_dir
trainer.save_model(output_dir)

# The Trainer was constructed without the tokenizer, so it has no tokenizer to
# write out; save it alongside the model so the directory can be loaded on
# its own with from_pretrained().
tokenizer.save_pretrained(output_dir)

In [None]:
# Prediction function
def predict_intent(message):
    """Predict the intent name for a single user message.

    Args:
        message: Raw text of one user message.

    Returns:
        The key of ``label_map`` whose id equals the highest-scoring class.
    """
    # Truncate so an over-long message cannot exceed the model's input limit,
    # and move the tensors to the model's device: the tokenizer returns CPU
    # tensors, while the model may sit on another device after training.
    inputs = tokenizer(message, return_tensors="pt", truncation=True).to(model.device)

    # Inference only: switch off dropout and skip gradient bookkeeping.
    model.eval()
    with torch.no_grad():
        logits = model(**inputs).logits

    predicted_label = logits.argmax(dim=-1).item()

    # Invert label_map (intent -> id) for a direct id -> intent lookup.
    id_to_intent = {label: intent for intent, label in label_map.items()}
    return id_to_intent[predicted_label]
 
# Smoke-test the classifier on one hand-written customer message and report
# the intent it picks.
new_message = "Hi there is a problem. I have received damaged product. Can you cancel my order?"
predicted_intent = predict_intent(message=new_message)
print(f"Predicted intent for '{new_message}': {predicted_intent}")