In [26]:
import pandas as pd

In [27]:
# Read the four question sets, one CSV file per category. The order of the
# file names matches the order of the variables they are unpacked into.
factual, recommendation, multimedia, unrelated = (
    pd.read_csv(csv_name)
    for csv_name in (
        "factual_questions_500.csv",
        "movie_recommendation_questions_500.csv",
        "multimedia_questions_500.csv",
        "unrelated_queries_500.csv",
    )
)

In [28]:
# Tag every row of each source frame with the integer class id of its category.
factual['Label'] = 0          # 0 for factual questions
recommendation['Label'] = 1   # 1 for recommendation questions
multimedia['Label'] = 2       # 2 for multimedia questions
unrelated['Label'] = 3        # 3 for unrelated questions

# Stack the four labelled frames into one; ignore_index=True renumbers the rows.
combined_data = pd.concat([factual, recommendation, multimedia, unrelated], ignore_index=True)

# Shuffle the rows so the classes are interleaved. The shuffle is seeded so that
# re-running the cell (or the whole notebook) reproduces the same row order.
combined_data = combined_data.sample(frac=1, random_state=42).reset_index(drop=True)

In [29]:
# Sanity check of the combined, shuffled frame (pandas prints a truncated
# head/tail view plus the overall shape for a frame of this size).
print(combined_data)

                                              Question  Label
0                  Show me a photo of Chris Hemsworth.      2
1                    Any good films with Meryl Streep?      1
2      Any historical thrillers with espionage themes?      1
3           What is the runtime of Lawrence of Arabia?      0
4     Recommend movies directed by Guillermo del Toro.      1
...                                                ...    ...
2303          I'd like to see a picture of Will Smith.      2
2304                    Any musicals set in the 1920s?      1
2305      Any animated films with classic fairy tales?      1
2306                 What does Marisa Tomei look like?      2
2307            What movies are similar to Casablanca?      1

[2308 rows x 2 columns]


In [31]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

import torch
from torch.utils.data import Dataset
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          Trainer, TrainingArguments)

# Single seed shared by the shuffle and the train/validation split so the
# whole data-preparation step is reproducible from a fresh kernel.
SEED = 42

# Load each question category from its own CSV file.
factual = pd.read_csv("factual_questions_500.csv")
recommendation = pd.read_csv("movie_recommendation_questions_500.csv")
multimedia = pd.read_csv("multimedia_questions_500.csv")
unrelated = pd.read_csv("unrelated_queries_500.csv")

# Tag every row with the integer class id of its category.
factual['Label'] = 0          # 0 for factual questions
recommendation['Label'] = 1   # 1 for recommendation questions
multimedia['Label'] = 2       # 2 for multimedia questions
unrelated['Label'] = 3        # 3 for unrelated questions

# Combine and shuffle the data.
# The shuffle must be seeded: train_test_split below selects rows by position,
# so an unseeded shuffle here would put different questions into the
# train/validation sets on every run even though the split itself is seeded.
combined_data = pd.concat([factual, recommendation, multimedia, unrelated], ignore_index=True)
combined_data = combined_data.sample(frac=1, random_state=SEED).reset_index(drop=True)

# NOTE(review): if the CSVs contain repeated questions, identical texts can land
# in both train and validation and inflate the validation scores -- TODO confirm
# (e.g. inspect combined_data['Question'].duplicated().sum()).

# Model inputs (raw question strings) and targets (integer class ids).
X = combined_data['Question'].values
y = combined_data['Label'].values

# Hold out 10% for validation; stratify keeps the class proportions equal in both parts.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.1, random_state=SEED, stratify=y)

class QuestionDataset(Dataset):
    """Map-style dataset that tokenizes one question per item for classification.

    Args:
        texts: Indexable collection of question texts; each item is converted
            with ``str()`` before tokenization.
        labels: Indexable collection of integer class ids, aligned with ``texts``.
        tokenizer: Callable tokenizer (e.g. one returned by
            ``AutoTokenizer.from_pretrained``) that accepts the keyword
            arguments used in ``__getitem__`` and returns a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len (int): Length every encoded sequence is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of questions in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the question at ``idx`` and return model-ready tensors.

        Returns:
            dict: ``'input_ids'`` and ``'attention_mask'`` flattened to 1-D
            tensors, and ``'labels'`` as a ``torch.long`` scalar tensor.
        """
        text = str(self.texts[idx])
        label = self.labels[idx]

        # Call the tokenizer directly: `encode_plus` is deprecated in
        # Transformers in favour of `__call__`, which takes the same arguments.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # return_tensors='pt' yields a leading batch dimension of size 1;
        # flatten it away so the DataLoader can stack items into a batch.
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Pretrained checkpoint that is fine-tuned for the 4-way question classifier.
model_name = 'distilbert-base-uncased'

# Tokenizer and classification model are both loaded from the same checkpoint;
# num_labels=4 matches the four Label values (0-3) assigned to the data.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=4)

# Every question is padded/truncated to this many tokens.
max_len = 128

# Wrap the train and validation splits with the same tokenizer and length.
train_dataset = QuestionDataset(X_train, y_train, tokenizer, max_len)
val_dataset = QuestionDataset(X_val, y_val, tokenizer, max_len)

training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir='./results',
    logging_dir='./logs',
    # Optimisation schedule.
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=50,
    weight_decay=0.01,
    # Log every 10 steps, evaluate every 50 steps, checkpoint every 100 steps.
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=50,
    save_steps=100,
    save_total_limit=1,
    # Reload the checkpoint with the best "accuracy" once training finishes.
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class for each
            example is the argmax of its logits over the last axis.

    Returns:
        dict: Keys ``'accuracy'``, ``'f1'``, ``'precision'`` and ``'recall'``.
    """
    raw_scores, gold = eval_pred
    predicted = np.argmax(raw_scores, axis=-1)

    # The fourth element returned by sklearn (support) is not reported.
    weighted = precision_recall_fscore_support(gold, predicted, average='weighted')
    precision, recall, f1 = weighted[0], weighted[1], weighted[2]

    return {
        'accuracy': accuracy_score(gold, predicted),
        'f1': f1,
        'precision': precision,
        'recall': recall,
    }

# Wire the model, training configuration, datasets and metric function together.
trainer = Trainer(
    model=model,                         # sequence-classification model loaded above
    args=training_args,                  # TrainingArguments defined above
    train_dataset=train_dataset,         # QuestionDataset over the training split
    eval_dataset=val_dataset,            # QuestionDataset over the validation split
    compute_metrics=compute_metrics      # accuracy / weighted precision, recall, F1
)

# Fine-tune the model; periodic evaluation and checkpointing follow training_args.
trainer.train()

# Final evaluation on the validation dataset passed to the Trainer.
eval_result = trainer.evaluate()
print(f"Evaluation results: {eval_result}")

# Save model and tokenizer into the same directory so both can later be
# reloaded from that single path with from_pretrained.
trainer.save_model('question_classifier_model')
tokenizer.save_pretrained('question_classifier_model')


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
50,0.4703,0.266831,0.95671,0.956273,0.963925,0.95671
100,0.0179,0.011836,1.0,1.0,1.0,1.0


Evaluation results: {'eval_loss': 0.011835739947855473, 'eval_accuracy': 1.0, 'eval_f1': 1.0, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_runtime': 0.6542, 'eval_samples_per_second': 353.079, 'eval_steps_per_second': 22.927, 'epoch': 1.0}


('question_classifier_model/tokenizer_config.json',
 'question_classifier_model/special_tokens_map.json',
 'question_classifier_model/vocab.txt',
 'question_classifier_model/added_tokens.json',
 'question_classifier_model/tokenizer.json')

In [24]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

class QueryRouter:
    """Classifies a user query as factual, recommendation, multimedia or unrelated."""

    def __init__(self, model_path=r'./question_classifier_model', max_len=128):
        """
        Initializes the QueryRouter with a fine-tuned model and tokenizer.

        Args:
            model_path (str): Path to the directory where the fine-tuned model and tokenizer are saved.
            max_len (int): Token length each query is padded/truncated to before
                it is passed to the model. Defaults to 128.
        """
        # Load the tokenizer and model from the specified directory
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_path)
        # The router is inference-only, so make evaluation mode explicit.
        self.model.eval()

        self.max_len = max_len

        # Mapping from label indices to question types
        self.label_map = {
            0: 'factual',
            1: 'recommendation',
            2: 'multimedia',
            3: 'unrelated'
        }

    def predict(self, query):
        """
        Classifies a single query into one of the predefined categories.

        Args:
            query (str): The input query to classify.

        Returns:
            str: The predicted category label as a string.

        Raises:
            KeyError: If the model predicts a class index that is not in
                ``self.label_map``.
        """
        # Tokenization and Encoding of Query.
        # The tokenizer is called directly; `encode_plus` is deprecated in
        # Transformers in favour of `__call__`, which takes the same arguments.
        inputs = self.tokenizer(
            query,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        # Prediction. Gradients are never needed here, so disable autograd to
        # avoid building a computation graph on every call.
        with torch.no_grad():
            outputs = self.model(**inputs)
        logits = outputs.logits
        predicted_class = torch.argmax(logits, dim=1).item()
        return self.label_map[predicted_class]

In [25]:
# Build the router with its default model directory ('./question_classifier_model').
qr = QueryRouter()

In [15]:
# Smoke test: route a small-talk greeting and display the predicted category.
qr.predict("hello how are you?")

'unrelated'