In [3]:
import os
import torch
import pandas as pd
import ast
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from sklearn.metrics import accuracy_score
import numpy as np


# Keep Weights & Biases in offline mode so no run data is synced
os.environ.update({"WANDB_MODE": "offline"})

# Tokenizer for the same pretrained checkpoint that is fine-tuned further down
tokenizer = DistilBertTokenizer.from_pretrained(
    "distilbert-base-uncased",
)

# Basic text normalisation helper
def preprocess_text(text):
    """Return ``text`` lowercased with each whitespace run collapsed to one space.

    Leading and trailing whitespace disappears because the text is split on
    whitespace and re-joined with single spaces.
    """
    words = text.lower().split()
    return " ".join(words)

# Function to fix malformed lists in the 'act' column
def fix_malformed_list(x):
    """Rewrite a loosely separated bracketed list string into a parseable literal.

    Strings such as ``"[3 4 2]"`` (whitespace separated) are turned into
    ``"[3,4,2]"`` so that ``ast.literal_eval`` can parse them. Unlike a plain
    space-to-comma replacement, this also copes with inputs that already
    contain commas (``"[3, 4]"``), repeated or padding spaces (``"[ 1  10 ]"``),
    line breaks inside the brackets, and empty lists (``"[ ]"``) without
    producing empty elements such as ``"[3,,4]"``.

    Args:
        x: Any value. Only strings containing both ``[`` and ``]`` are changed.

    Returns:
        The normalised string, or ``x`` itself when it is not a bracketed string.
    """
    if not (isinstance(x, str) and "[" in x and "]" in x):
        return x

    # Treat commas as plain separators and isolate every bracket as its own
    # token, so all spellings of the same list tokenise identically.
    tokens = x.replace(",", " ").replace("[", " [ ").replace("]", " ] ").split()

    pieces = []
    for token in tokens:
        # A comma goes between two tokens unless a bracket was just opened or
        # is about to be closed.
        if pieces and pieces[-1] != "[" and token != "]":
            pieces.append(",")
        pieces.append(token)
    return "".join(pieces)

# Read the train / test / validation CSV splits
train_df = pd.read_csv("/content/train.csv")
test_df = pd.read_csv("/content/test.csv")
val_df = pd.read_csv("/content/validation.csv")

# Normalise the dialogue text and reduce every 'act' entry to a single label.
# Note: once the loop finishes, `df` stays bound to the last frame (val_df).
for df in (train_df, test_df, val_df):
    df['dialog'] = df['dialog'].apply(preprocess_text)

    # Repair the list syntax first, then parse string entries into Python objects
    parsed_acts = df['act'].apply(fix_malformed_list).apply(
        lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw
    )

    # Non-empty lists collapse to their most frequent element; anything else passes through
    df['act'] = parsed_acts.apply(
        lambda acts: max(set(acts), key=acts.count) if isinstance(acts, list) and acts else acts
    )

# Tokenization function
def tokenize_data(df, max_length=128):
    """Tokenize ``df['dialog']`` and store the encodings as new columns.

    Every dialogue is padded or truncated to exactly ``max_length`` tokens using
    the module-level ``tokenizer``.

    Args:
        df: DataFrame with a 'dialog' column of strings.
        max_length: Fixed sequence length for padding/truncation. Defaults to
            128, the value that used to be hard-coded.

    Returns:
        The same DataFrame object (modified in place) with two added columns,
        'input_ids' and 'attention_mask', each row holding a list of ints.
    """
    encoded = tokenizer(
        df['dialog'].tolist(),
        padding='max_length',
        truncation=True,
        max_length=max_length,
    )
    # Without `return_tensors` the tokenizer already yields plain nested lists,
    # so there is no need to build torch tensors only to call .tolist() on them.
    df['input_ids'] = encoded['input_ids']
    df['attention_mask'] = encoded['attention_mask']
    return df

# Encode the training and validation splits (training first, then validation)
train_df, val_df = (tokenize_data(split) for split in (train_df, val_df))

# Turn a tokenized frame into plain Python lists for the Dataset wrapper
def prepare_dataset(df):
    """Split ``df`` into model inputs and labels.

    Returns:
        A ``(features, labels)`` tuple. ``features`` has one
        ``[input_ids, attention_mask]`` pair per row, and ``labels`` is the
        'act' column as a list.
    """
    feature_columns = ['input_ids', 'attention_mask']
    features = df[feature_columns].to_numpy().tolist()
    labels = df['act'].tolist()
    return features, labels

# Separate each tokenized frame into inputs (X_*) and labels (y_*)
(X_train, y_train), (X_val, y_val) = (
    prepare_dataset(split) for split in (train_df, val_df)
)

# Dataset wrapper consumed by the Trainer
class DialogueDataset(torch.utils.data.Dataset):
    """Dataset pairing tokenized inputs with integer class labels.

    ``inputs[i]`` is indexed positionally: element 0 holds the token ids and
    element 1 the attention mask. ``labels[i]`` is the class id for sample ``i``.
    """

    def __init__(self, inputs, labels):
        self.inputs, self.labels = inputs, labels

    def __len__(self):
        # The dataset size follows the number of input rows
        return len(self.inputs)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of int64 tensors."""
        sample = self.inputs[idx]

        def as_long(values):
            return torch.tensor(values, dtype=torch.long)

        return {
            'input_ids': as_long(sample[0]),
            'attention_mask': as_long(sample[1]),
            'labels': as_long(self.labels[idx]),
        }

# Wrap the prepared splits for the Trainer
train_dataset, val_dataset = (
    DialogueDataset(X_train, y_train),
    DialogueDataset(X_val, y_val),
)

# Pretrained DistilBERT with a 5-class sequence-classification head
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=5,
)

# Evaluate and checkpoint once per epoch; the best checkpoint, judged by
# eval_loss, is reloaded when training ends.
training_args = TrainingArguments(
    output_dir='./results',
    logging_dir='./logs',
    num_train_epochs=10,
    per_device_train_batch_size=16,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)

# Metric callback handed to the Trainer
def compute_metrics(pred):
    """Compute accuracy for one evaluation pass.

    Args:
        pred: Object exposing ``predictions`` (per-class scores, argmax taken
            over axis 1) and ``label_ids`` (reference labels).

    Returns:
        ``{"accuracy": <score>}`` as computed by ``accuracy_score``.
    """
    predicted_classes = np.argmax(pred.predictions, axis=1)
    return {"accuracy": accuracy_score(pred.label_ids, predicted_classes)}

# Trainer wired with the accuracy metric and early stopping (patience 5)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)],
    compute_metrics=compute_metrics,
)

# Fine-tune
trainer.train()

# Score the validation split and report the metrics
evaluation_results = trainer.evaluate(eval_dataset=val_dataset)
print(
    f"Evaluation Results: {evaluation_results}",
    f"Validation Accuracy: {evaluation_results['eval_accuracy']:.2f}",
    sep="\n",
)


# Persist the model and its tokenizer side by side in one directory
model_path = "./dialogue_model_hmtl"
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)

print(f"Model saved to: {model_path}")


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6571,0.645926,0.713
2,0.5018,0.661276,0.707
3,0.3191,0.796255,0.715
4,0.2235,1.070207,0.722
5,0.1871,1.076708,0.707
6,0.1145,1.370105,0.706


Evaluation Results: {'eval_loss': 0.6459259986877441, 'eval_accuracy': 0.713, 'eval_runtime': 3.5521, 'eval_samples_per_second': 281.526, 'eval_steps_per_second': 35.191, 'epoch': 6.0}
Validation Accuracy: 0.71
Model saved to: ./dialogue_model_hmtl


In [5]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

# Constants
MODEL_PATH = "./dialogue_model_hmtl"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Class index -> dialogue act name (index 0 is the placeholder entry)
dialogue_acts = dict(enumerate(
    ["__dummy__", "inform", "question", "directive", "commissive"]
))

# Restore the saved tokenizer and model; the model is moved to DEVICE
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_PATH)
model = DistilBertForSequenceClassification.from_pretrained(MODEL_PATH)
model = model.to(DEVICE)

def predict_dialogue_act(input_text):
    """Classify ``input_text`` and return the name of the predicted dialogue act.

    The text is tokenized to a fixed length of 128 (padded or truncated), run
    through the module-level ``model`` on ``DEVICE`` without gradient tracking,
    and the arg-max class index is looked up in ``dialogue_acts``.

    Returns:
        The act name, or "Unknown" when the index is not in the mapping.
    """
    model.eval()

    encoded = tokenizer(
        input_text,
        max_length=128,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    ).to(DEVICE)

    with torch.no_grad():
        logits = model(**encoded).logits
        class_index = logits.argmax(dim=-1).item()

    return dialogue_acts.get(class_index, "Unknown")

if __name__ == "__main__":
    print("Interactive Dialogue Act Classifier", "Type 'exit' to quit.", sep="\n")

    # Classify each entered line until the user types 'exit' (any letter case)
    while True:
        user_input = input("Enter a dialogue: ").strip()
        if user_input.lower() != "exit":
            pred = predict_dialogue_act(user_input)
            print(f"Predicted Dialogue Act: {pred}")
            continue
        print("Exiting. Goodbye!")
        break


Interactive Dialogue Act Classifier
Type 'exit' to quit.
Enter a dialogue: i am going to hyd today
Predicted Dialogue Act: inform
Enter a dialogue: can you do this by tmro evening or night
Predicted Dialogue Act: directive
Enter a dialogue: are you awake?
Predicted Dialogue Act: question
Enter a dialogue: i am eating lunch
Predicted Dialogue Act: inform
Enter a dialogue: how are you?
Predicted Dialogue Act: question
Enter a dialogue: can you paint it red
Predicted Dialogue Act: directive
Enter a dialogue: exit
Exiting. Goodbye!


In [20]:
# Sanity check: shape of the tokenized validation split (rows x max_length).
# `val_df` is named explicitly instead of `df`, which was only the loop variable
# left over from the preprocessing loop and so depended on hidden kernel state.
print(tokenizer(val_df['dialog'].tolist(), padding='max_length', truncation=True, max_length=128, return_tensors='pt')['input_ids'].shape)


torch.Size([1000, 128])
