In [5]:
import pandas as pd

In [7]:
# Read the four question sets, one CSV file per category, in a fixed order.
factual, recommendation, multimedia, unrelated = [
    pd.read_csv(csv_name)
    for csv_name in (
        "factual_questions_500.csv",
        "movie_recommendation_questions_500.csv",
        "multimedia_questions_500.csv",
        "unrelated_queries_500.csv",
    )
]

In [8]:
# Tag every question with the integer class id of its category.
factual['Label'] = 0          # 0 for factual questions
recommendation['Label'] = 1   # 1 for recommendation questions
multimedia['Label'] = 2       # 2 for multimedia questions
unrelated['Label'] = 3        # 3 for unrelated questions

# Stack the four frames into a single table with a fresh 0..n-1 index.
combined_data = pd.concat([factual, recommendation, multimedia, unrelated], ignore_index=True)

# Shuffle the rows so the categories are interleaved. The fixed random_state
# makes the row order identical on every run, so re-executing the notebook
# reproduces the same frame instead of a new permutation each time.
combined_data = combined_data.sample(frac=1, random_state=42).reset_index(drop=True)

In [9]:
# Show the combined, shuffled frame (pandas abbreviates long frames in the printout).
print(combined_data)

                                               Question  Label
0     In which movies did Hans Zimmer create the sou...      0
1                    What year was La La Land released?      0
2                     What year was Gladiator released?      0
3                              Who directed Fight Club?      0
4                      How much does an elephant weigh?      3
...                                                 ...    ...
2439                                What is 12 plus 15?      3
2440              What does Jennifer Aniston look like?      2
2441             What does Jennifer Lawrence look like?      2
2442        Recommend films similar to American Psycho.      1
2443                            How do volcanoes erupt?      3

[2444 rows x 2 columns]


In [10]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

import torch
from torch.utils.data import Dataset
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          Trainer, TrainingArguments)

# Load one CSV of questions per category.
factual = pd.read_csv("factual_questions_500.csv")
recommendation = pd.read_csv("movie_recommendation_questions_500.csv")
multimedia = pd.read_csv("multimedia_questions_500.csv")
unrelated = pd.read_csv("unrelated_queries_500.csv")

# Attach the integer class id used as the training target.
factual['Label'] = 0          # 0 for factual questions
recommendation['Label'] = 1   # 1 for recommendation questions
multimedia['Label'] = 2       # 2 for multimedia questions
unrelated['Label'] = 3        # 3 for unrelated questions

# Combine and shuffle the data.
# The shuffle is seeded: train_test_split below is already seeded, but it
# permutes whatever row order it receives, so an unseeded shuffle here would
# still yield a different train/validation split on every run.
combined_data = pd.concat([factual, recommendation, multimedia, unrelated], ignore_index=True)
combined_data = combined_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Plain arrays: question texts as features, class ids as targets.
X = combined_data['Question'].values
y = combined_data['Label'].values

# Hold out 10% for validation; stratify so every class keeps its share.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.1, random_state=42, stratify=y)

class QuestionDataset(Dataset):
    """Map-style dataset that tokenizes one question per item, on access.

    Args:
        texts: Indexable, sized collection of question texts. Each item is
            converted with ``str()`` before tokenization.
        labels: Indexable, sized collection of integer class ids, aligned
            with ``texts`` position by position.
        tokenizer: Callable tokenizer. It is invoked as
            ``tokenizer(text, ...)`` and must return a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len (int): Length every example is truncated / padded to.

    Raises:
        ValueError: If ``texts`` and ``labels`` have different lengths.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        # Fail early instead of hitting an IndexError in the middle of training.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx``.

        Returns:
            dict: ``'input_ids'`` and ``'attention_mask'`` flattened to 1-D
            tensors, plus ``'labels'`` as a scalar ``torch.long`` tensor.
        """
        text = str(self.texts[idx])
        label = self.labels[idx]

        # Call the tokenizer directly: this is the supported entry point and
        # replaces the deprecated ``encode_plus`` with the same arguments.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            # flatten() drops the leading batch dimension added by return_tensors='pt'.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Checkpoint to fine-tune; the same name selects the matching tokenizer.
model_name = 'distilbert-base-uncased'

tokenizer = AutoTokenizer.from_pretrained(model_name)
# num_labels=4: one output per Label value (0-3) assigned to the question sets.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=4)

# Every question is padded / truncated to this many tokens.
max_len = 128

# Training and validation views over the split arrays.
train_dataset = QuestionDataset(X_train, y_train, tokenizer, max_len)
val_dataset = QuestionDataset(X_val, y_val, tokenizer, max_len)

# NOTE(review): newer transformers releases reportedly rename
# `evaluation_strategy` to `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir='./results',
    logging_dir='./logs',
    # Optimisation schedule.
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=50,
    weight_decay=0.01,
    # Logging / evaluation / checkpoint cadence, all measured in steps.
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=50,
    save_steps=100,
    save_total_limit=1,
    # Reload the checkpoint with the highest validation accuracy after training.
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision / recall / F1 for an evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``; ``logits`` holds one score per
            class along the last axis, ``labels`` holds the true class ids.

    Returns:
        dict: Keys ``'accuracy'``, ``'f1'``, ``'precision'`` and ``'recall'``.
    """
    logits, labels = eval_pred
    # Predicted class = index of the largest score along the last axis.
    predictions = np.argmax(logits, axis=-1)

    # zero_division=0 keeps the numeric result scikit-learn would produce anyway
    # (0 for a class that is never predicted) but without emitting an
    # UndefinedMetricWarning on early evaluation steps.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='weighted', zero_division=0)
    acc = accuracy_score(labels, predictions)

    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Wire the model, hyper-parameters, datasets and metric function together.
trainer = Trainer(
    model,
    training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Fine-tune.
trainer.train()

# Score the trained model on the validation set and report the metrics.
eval_result = trainer.evaluate()
print(f"Evaluation results: {eval_result}")

# Store model and tokenizer in the same directory so both can be reloaded from one path.
trainer.save_model('question_classifier_model')
tokenizer.save_pretrained('question_classifier_model')


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
50,0.4433,0.288876,0.922449,0.919568,0.924875,0.922449
100,0.0186,0.057417,0.987755,0.987732,0.988252,0.987755


Evaluation results: {'eval_loss': 0.0574173629283905, 'eval_accuracy': 0.9877551020408163, 'eval_f1': 0.9877324453751223, 'eval_precision': 0.9882515168229453, 'eval_recall': 0.9877551020408163, 'eval_runtime': 0.6788, 'eval_samples_per_second': 360.917, 'eval_steps_per_second': 23.57, 'epoch': 1.0}


('question_classifier_model/tokenizer_config.json',
 'question_classifier_model/special_tokens_map.json',
 'question_classifier_model/vocab.txt',
 'question_classifier_model/added_tokens.json',
 'question_classifier_model/tokenizer.json')

In [21]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

class QueryRouter:
    """Classifies a user query as factual, recommendation, multimedia or unrelated."""

    def __init__(self, model_path=r'./question_classifier_model', max_len=128):
        """
        Initializes the QueryRouter with a fine-tuned model and its tokenizer.

        Args:
            model_path (str): Path to the directory where the fine-tuned model and tokenizer are saved.
            max_len (int): Token length every query is truncated / padded to. Defaults to 128.
        """
        # Load the tokenizer and model from the specified directory
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_path)
        # Make inference mode explicit (disables dropout).
        self.model.eval()

        self.max_len = max_len

        # Mapping from label indices to question types
        self.label_map = {
            0: 'factual',
            1: 'recommendation',
            2: 'multimedia',
            3: 'unrelated'
        }

    def predict(self, query):
        """
        Classifies a single query into one of the predefined categories.

        Args:
            query (str): The input query to classify.

        Returns:
            str: The predicted category label as a string.
        """
        # Tokenization and Encoding of Query (direct call replaces the
        # deprecated ``encode_plus`` with identical arguments)
        inputs = self.tokenizer(
            query,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        # Prediction: no gradients are needed for inference, so skip building
        # the autograd graph to save memory and time.
        with torch.no_grad():
            outputs = self.model(**inputs)
        logits = outputs.logits
        predicted_class = torch.argmax(logits, dim=1).item()
        return self.label_map[predicted_class]

In [22]:
# Build the router; with no argument it loads from the default path './question_classifier_model'.
qr = QueryRouter()

In [23]:
# Probe: a plain fact lookup about a composer's filmography.
qr.predict("In which movies did Hans Zimmer compose the music?")

'factual'

In [24]:
# Probe: mentions a composer, but ends by asking for similar movies to watch.
qr.predict("I like movies where Hans Zimmer composed the movies such as for Inception. Are there similar movies I should watch?")

'recommendation'

In [25]:
# Probe: a stated preference for an actor followed by a request for what to watch next.
qr.predict("I like how Tom Holland acts. Given I like his style of acting, tell me what to watch next")

'recommendation'

In [26]:
# Probe: contains the word "recommended", but the actual question is a fact lookup.
qr.predict("I love how Hans Zimmer composes music, he is a greatly recommended composer. For which movie did he compose music?")

'factual'

In [27]:
# Who/which/what questions that each reference a well-known film by "like" / "similar to".
queries = [
    "Which directors have worked on films with themes like The Social Network?",
    "What actors have starred in movies with a narrative style similar to The Godfather?",
    "Which screenwriters have written stories like those in Pulp Fiction?",
    "Who acted in other psychological thrillers similar to Shutter Island?",
    "Which composers created soundtracks for films similar to Gladiator?",
    "What directors are known for making movies like Fight Club?",
    "Who starred in romantic films with a tone similar to La La Land?",
    "What filmmakers have worked on dystopian stories like Children of Men?",
    "Which actors appeared in historical dramas similar to Schindler’s List?",
    "Who directed other visually stunning movies like Mad Max: Fury Road?",
]

In [28]:
# Echo each query, then its predicted category, then a blank-line gap.
for question in queries:
    print(question)
    category = qr.predict(question)
    print(category)
    print("\n")

Which directors have worked on films with themes like The Social Network?
factual


What actors have starred in movies with a narrative style similar to The Godfather?
factual


Which screenwriters have written stories like those in Pulp Fiction?
factual


Who acted in other psychological thrillers similar to Shutter Island?
factual


Which composers created soundtracks for films similar to Gladiator?
factual


What directors are known for making movies like Fight Club?
factual


Who starred in romantic films with a tone similar to La La Land?
factual


What filmmakers have worked on dystopian stories like Children of Men?
factual


Which actors appeared in historical dramas similar to Schindler’s List?
factual


Who directed other visually stunning movies like Mad Max: Fury Road?
factual




In [29]:
# Requests that explicitly ask what to watch, each anchored on a film or genre.
recommendation_queries = [
    "If I liked The Matrix, what other movies should I watch?",
    "Can you suggest films similar to Pulp Fiction?",
    "What are some great movies for fans of La La Land?",
    "What movies should I watch if I loved The Godfather?",
    "Are there any films like Interstellar that you recommend?",
    "Can you recommend movies with a tone similar to Blade Runner 2049?",
    "What are some must-watch movies for fans of psychological thrillers?",
    "If I enjoyed Parasite, what other films should I check out?",
    "What are some good space exploration movies like Gravity?",
    "Can you suggest historical dramas as compelling as Schindler's List?",
]

# Echo each query, then its predicted category, then a blank-line gap.
for question in recommendation_queries:
    print(question)
    category = qr.predict(question)
    print(category)
    print("\n")

If I liked The Matrix, what other movies should I watch?
recommendation


Can you suggest films similar to Pulp Fiction?
recommendation


What are some great movies for fans of La La Land?
recommendation


What movies should I watch if I loved The Godfather?
recommendation


Are there any films like Interstellar that you recommend?
recommendation


Can you recommend movies with a tone similar to Blade Runner 2049?
recommendation


What are some must-watch movies for fans of psychological thrillers?
recommendation


If I enjoyed Parasite, what other films should I check out?
recommendation


What are some good space exploration movies like Gravity?
recommendation


Can you suggest historical dramas as compelling as Schindler's List?
recommendation




In [30]:
# Fact-style questions deliberately worded close to recommendation requests
# ("like", "similar to", "compared to") to probe the class boundary.
factual_queries_close_to_recommendations = [
    "Which directors are famous for making movies like The Godfather?",
    "What films are often compared to Inception in terms of style?",
    "Which screenwriters worked on stories similar to The Social Network?",
    "What actors frequently star in movies like Shutter Island?",
    "Who composed music for films in the same genre as Interstellar?",
    "What are the most critically acclaimed films similar to Parasite?",
    "Which filmmakers are associated with sci-fi epics like 2001: A Space Odyssey?",
    "What movies are considered pioneers of crime storytelling like Heat?",
    "Who are the leading directors of space-themed films similar to Gravity?",
    "Which producers worked on movies with a narrative style like Pulp Fiction?",
]

# Echo each query, then its predicted category, then a blank-line gap.
for question in factual_queries_close_to_recommendations:
    print(question)
    category = qr.predict(question)
    print(category)
    print("\n")


Which directors are famous for making movies like The Godfather?
factual


What films are often compared to Inception in terms of style?
recommendation


Which screenwriters worked on stories similar to The Social Network?
factual


What actors frequently star in movies like Shutter Island?
factual


Who composed music for films in the same genre as Interstellar?
factual


What are the most critically acclaimed films similar to Parasite?
recommendation


Which filmmakers are associated with sci-fi epics like 2001: A Space Odyssey?
factual


What movies are considered pioneers of crime storytelling like Heat?
recommendation


Who are the leading directors of space-themed films similar to Gravity?
factual


Which producers worked on movies with a narrative style like Pulp Fiction?
factual


