### Test the question type classification model

In [1]:
import pandas as pd
from src.general import read_csv_file, extract_json_string
from src.prompt_fetch import get_prompts
from src.agent import Agent


In [2]:
# Locations of the question and source-text CSV files in the data directory.
question_dir = "../data/all_mcqs.csv"
text_dir = "../data/all_texts.csv"

# Load both files with the project helper (questions first, then texts).
mcqs, texts = (read_csv_file(csv_path) for csv_path in (question_dir, text_dir))

# Wrap the texts in a pandas DataFrame so they can be filtered by column later.
texts_df = pd.DataFrame(data=texts)

In [None]:
# Peek at the question text of the record at index 1; "None" is the fallback
# when the record has no "Question" field.
sample_mcq = mcqs[1]
question = sample_mcq.get("Question", "None")
question

In [4]:
# Load the prompt definition used for question type classification.
prompt = get_prompts("question_type_classifier.yaml")

# Pull out both templates, defaulting to an empty string when one is missing.
system_prompt, user_prompt = (
    prompt.get(prompt_key, "") for prompt_key in ("system_prompt", "user_prompt")
)

In [None]:
subjects = []
chapters = []    
sections = []
sources  = []
questions = []
original_labels=[]   
classification_labels = []
reasonings = []


# test on the first 100 questions 
for i in range(100):
    print(f"-------------------------------Processing question {i+1}...---------------------------------")
    question = mcqs[i].get("Question", "None")
    
    # find the corresponding text for the question 
    subject = mcqs[i].get("Subject", "None")
    chapter = mcqs[i].get("Chapter", "None")
    section = mcqs[i].get("Section", "None")
    original_label = mcqs[i].get("Question_type", "None")
    
    # choose the text according to the subject, chapter, and section
    text_df = texts_df[(texts_df["Subject"] == subject) &      
                        (texts_df["Chapter"] == chapter) & 
                        (texts_df["Section"] == section)]
    text = text_df["Text"].values[0] if not text_df.empty else "None"

    if text == "None":
        print(f"No text found for question {i+1}: {question}")
        continue
    
    subjects.append(subject)
    chapters.append(chapter)
    sections.append(section)
    sources.append(text)
    questions.append(question)
    original_labels.append(original_label)
    
    formatted_user_prompt = user_prompt.format(question=question, source=text)
    # create an agent
    classification_agent = Agent(
                    model='gpt-4o',
                    system_prompt=system_prompt,
                    user_prompt=formatted_user_prompt
                )
    # run the agent to evaluate the question
    response = await classification_agent.completion_generation()

    result = extract_json_string(response)
    classification = result.get("classification", "None")
    reasoning = result.get("reasoning", "None")
    reasonings.append(reasoning)
    print(f">>>>>>>>Classification for question {i+1}: {classification}")
    if classification == "None":
        print(f"No classification found for question {i+1}: {question}")
    elif classification == "factual question":
        classification_labels.append("fact")
    elif classification == "inferential question":
        classification_labels.append("inference")
    elif classification == "main idea question":
        classification_labels.append("main_idea")
    else:
        classification_labels.append("Unknown")



    

    



In [6]:
from pathlib import Path

# create a dataframe to store the results; every column list must have the
# same length, one entry per classified question
results_df = pd.DataFrame({
    "Subject": subjects,
    "Chapter": chapters,
    "Section": sections,
    "Source": sources,
    "Question": questions,
    "Original_Label": original_labels,
    "Classification_Label": classification_labels,
    "Reasoning": reasonings
})

# save the results to a csv file
output_path = Path("../output/question_type_classification_results_updated.csv")
# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before writing rather than losing the collected results.
output_path.parent.mkdir(parents=True, exist_ok=True)
results_df.to_csv(output_path, index=False)
