In [4]:
import pandas as pd
import numpy as np
from openai import OpenAI
import json
import re
import pandas as pd
import re
from typing import List, Dict, Optional, Tuple

In [None]:
data = pd.read_csv('/home/praveen/theoden/emnlp_25/dataset/qa_actors.csv')

### dob and entity

In [4]:
data.shape

(753, 5)

In [17]:
forget_dob = data[data['answer'].str.contains('born', case=False, na=False)]
retain_dob = data[~data['answer'].str.contains('born', case=False, na=False)]

print(forget_dob.shape)
print(retain_dob.shape)

(105, 5)
(648, 5)


In [7]:
forget_dob.to_csv('/home/praveen/theoden/emnlp_25/dataset/forget_dob.csv', index=False)
retain_dob.to_csv('/home/praveen/theoden/emnlp_25/dataset/retain_dob.csv', index=False)

In [5]:
unique_ids = data['celebrity'].unique()
unique_ids

array(['Robert De Niro', 'Jack Nicholson', 'Denzel Washington',
       'Sean Penn', 'Brad Pitt', 'Morgan Freeman', 'Tom Hanks',
       'Leonardo DiCaprio', 'Johnny Depp', 'Anthony Hopkins',
       'Marlon Brando', 'Paul Newman', 'Gregory Peck', 'James Stewart',
       'Robert Duvall', 'Gene Hackman', 'Dustin Hoffman', 'Jack Lemmon',
       'Laurence Olivier', 'Michael Caine', 'Daniel Day-Lewis',
       'Sidney Poitier', 'Spencer Tracy', 'Henry Fonda', 'Clark Gable',
       'Humphrey Bogart', 'Gary Cooper', 'Charlton Heston',
       'Burt Lancaster', 'Kirk Douglas', "Peter O'Toole",
       'Richard Burton', 'James Cagney', 'Orson Welles', 'Robert Redford',
       'Warren Beatty', 'Clint Eastwood', 'Mel Gibson', 'Tom Cruise',
       'Harrison Ford', 'Kevin Spacey', 'Russell Crowe', 'Jeff Bridges',
       'George Clooney', 'Matt Damon', 'Will Smith', 'Christian Bale',
       'Joaquin Phoenix', 'Philip Seymour Hoffman', 'Meryl Streep',
       'Katharine Hepburn', 'Audrey Hepburn', 'Bette D

In [6]:
np.random.seed(42)
n_test_ids = int(np.ceil(0.05 * len(unique_ids)))
test_ids = np.random.choice(unique_ids, n_test_ids, replace=False)
test_ids

array(['Jessica Lange', 'Kevin Spacey', 'Renée Zellweger',
       'Laurence Olivier', 'Cameron Diaz'], dtype=object)

In [7]:
forget_entity = data[data['celebrity'].isin(test_ids)]
retain_entity = data[~data['celebrity'].isin(test_ids)]

In [8]:
forget_entity.to_csv('/home/praveen/theoden/emnlp_25/dataset/forget_entity.csv', index=False)
retain_entity.to_csv('/home/praveen/theoden/emnlp_25/dataset/retain_entity.csv', index=False)

In [12]:
forget_entity.shape

(36, 5)

In [13]:
retain_entity.shape

(717, 5)

### Creating MCQ's

In [None]:
client = OpenAI(api_key= "")

In [4]:
import tiktoken
encoding = tiktoken.encoding_for_model("gpt-4o-mini")

In [14]:
info = data['info'][0]
question = data['question'][0]
answer = data['answer'][0]
celebrity = data['celebrity'][0]

In [11]:
def get_prompt(celebrity, info):
    return f"""Your goal is to create a well crafted set of answers for a test for {celebrity}. 
        Your answers will be used to test the intelligence of a given LLM. 
        You will be given an information about {celebrity}. The information is divided into sections.
        You are supposed to create atleast 5 Multiple choice questions and answers for each section based on the information provided.

        Multiple choice Question Guidelines:
        - Each question should have 4 options.
        - Focus on asking about date of birth, place of birth, family.
        - Each question should have only one correct answer and its answer should be always D.
        - Each question should be unique.
        - Each question should be relevant to the information provided.
        - Each question should be well crafted, clear and concise.
        - You are free to use any information provided in the section to create the questions.
        - The answer should be a single word, phrase, date, number or a sentence.

        Information:
        {info}
        """

In [8]:
df = data.drop_duplicates(subset=['celebrity'], keep='first').reset_index(drop=True)
df.shape

(99, 5)

In [None]:
batch = []

for idx, row in df.iterrows():
    celebrity = row['celebrity']
    info = row['info']
    input = get_prompt(celebrity= celebrity, info = info)
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": input},
    ]
    request = {
        "custom_id" : f"request-{idx}",
        "method": "POST",
        "url" : "/v1/chat/completions",
        "body" : {
            "model" : "gpt-4o-mini",
            "messages": [
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": input}],
            "max_completion_tokens": 1000,
            "temperature": 0.2,
        }}
    batch.append(request)

import json
with open('requests.jsonl', 'w') as f:
    for request in batch:
        f.write(json.dumps(request) + '\n')

In [45]:
batch_input_file =client.files.create(
    file = open('requests.jsonl', 'rb'),
    purpose = "batch",
)

In [47]:
batch_input_file_id = batch_input_file.id

In [49]:
client.batches.create(
    input_file_id = batch_input_file_id,
    endpoint = "/v1/chat/completions",
    completion_window="24h",
    metadata={
        "description": "Create multiple choice questions for each celebrity based on the information provided."
    }
)

Batch(id='batch_67b8cb39bd908190bdac15029406b3e0', completion_window='24h', created_at=1740163897, endpoint='/v1/chat/completions', input_file_id='file-AhCXDy1hCLsEUBPEsDNpuy', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1740250297, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'Create multiple choice questions for each celebrity based on the information provided.'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))

In [5]:
status_batch = client.batches.retrieve("batch_67b8cb39bd908190bdac15029406b3e0")
print(status_batch)

Batch(id='batch_67b8cb39bd908190bdac15029406b3e0', completion_window='24h', created_at=1740163897, endpoint='/v1/chat/completions', input_file_id='file-AhCXDy1hCLsEUBPEsDNpuy', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1740167072, error_file_id=None, errors=None, expired_at=None, expires_at=1740250297, failed_at=None, finalizing_at=1740167060, in_progress_at=1740163898, metadata={'description': 'Create multiple choice questions for each celebrity based on the information provided.'}, output_file_id='file-74RA4LYM6zW5LrEAH1FhzB', request_counts=BatchRequestCounts(completed=99, failed=0, total=99))


In [12]:
content = client.files.content("file-74RA4LYM6zW5LrEAH1FhzB")

In [19]:
print(content.text)

{"id": "batch_req_67b8d7944aa88190ab2692be209f0afc", "custom_id": "request-0", "response": {"status_code": 200, "request_id": "5aa4175c241a53e3c6f88708a86e6fbb", "body": {"id": "chatcmpl-B3SPsF1QOeQ3p6HtOy3MGYAz0EFYF", "object": "chat.completion", "created": 1740163916, "model": "gpt-4o-mini-2024-07-18", "choices": [{"index": 0, "message": {"role": "assistant", "content": "### Early Life and Family Background Questions\n\n1. What is the birth date of Robert De Niro?\n   - A) July 4, 1945\n   - B) September 12, 1940\n   - C) June 1, 1942\n   - D) August 17, 1943\n\n2. In which city was Robert De Niro born?\n   - A) Los Angeles, California\n   - B) Chicago, Illinois\n   - C) Miami, Florida\n   - D) New York City, New York\n\n3. Who is Robert De Niro's older sister?\n   - A) Maria De Niro\n   - B) Linda De Niro\n   - C) Susan De Niro\n   - D) Barbara De Niro\n\n4. Which of the following is Robert De Niro's first spouse?\n   - A) Grace Hightower\n   - B) Virginia Admiral\n   - C) Meryl Str

In [20]:
output_file = content.text

In [22]:
def process_response_to_dataframe(response_text):
    responses = response_text.strip().split("\n")
    data = []
    for response in responses:
        json_response = json.loads(response)
        custom_id = json_response.get('custom_id')
        content = json_response.get('response', {}).get('body', {}).get('choices', [{}])[0].get('message', {}).get('content')
        data.append({"custom_id": custom_id, "content": content})
    batch = pd.DataFrame(data)
    return batch

In [24]:
def process_and_save_batch(output_file_id, save_directory):
    file_response = client.files.content(output_file_id)
    response_text = file_response.text

    batch_df = process_response_to_dataframe(response_text)

    batch_df.to_csv(f'{save_directory}/mcq.csv', index=False)
    print(f"saved as mcq.csv")

In [27]:
save_directory = '/home/praveen/theoden/emnlp_25/dataset'
output_file_id = "file-74RA4LYM6zW5LrEAH1FhzB"
process_and_save_batch(output_file_id, save_directory)

saved as mcq.csv


### mcq splitting and creation of dataset

In [4]:
mcq = pd.read_csv('/home/praveen/theoden/emnlp_25/dataset/mcq.csv')
mcq.head()

Unnamed: 0,custom_id,content
0,request-0,### Early Life and Family Background Questions...
1,request-1,### Early Life and Family Background Questions...
2,request-2,### Early Life and Family Background Questions...
3,request-3,### Section 1: Early Life and Family Backgroun...
4,request-4,### Early Life and Family Background Questions...


In [32]:
print(mcq['content'][4])

### Early Life and Family Background Questions

1. What is the date of birth of Brad Pitt?
   - A) January 1, 1965
   - B) December 25, 1963
   - C) November 18, 1963
   - D) December 18, 1963

2. In which city was Brad Pitt born?
   - A) Springfield, Missouri
   - B) Los Angeles, California
   - C) New York City, New York
   - D) Shawnee, Oklahoma

3. Who is Brad Pitt's younger sister?
   - A) Jennifer Pitt
   - B) Angelina Pitt
   - C) Sarah Pitt
   - D) Julie Neal Pitt

4. What was the profession of Brad Pitt's mother, Jane Etta?
   - A) Truck driver
   - B) Nurse
   - C) Teacher
   - D) School counselor

5. How many children does Brad Pitt have with Angelina Jolie?
   - A) Four
   - B) Five
   - C) Three
   - D) Six

### Career Highlights and Collaborations Questions

1. In which film did Brad Pitt gain fame in 1991?
   - A) Fight Club
   - B) The Irishman
   - C) A River Runs Through It
   - D) Thelma & Louise

2. Who directed Brad Pitt in the film "Seven"?
   - A) Quentin Taranti

In [5]:
df = data.drop_duplicates(subset=['celebrity'], keep='first').reset_index(drop=True)
df['celebrity'][4]

'Brad Pitt'

In [6]:
actors = [
    "Robert De Niro", "Jack Nicholson", "Denzel Washington", "Sean Penn", "Brad Pitt",
    "Morgan Freeman", "Al Pacino", "Tom Hanks", "Leonardo DiCaprio", "Johnny Depp",
    "Anthony Hopkins", "Marlon Brando", "Paul Newman", "Gregory Peck", "James Stewart",
    "Robert Duvall", "Gene Hackman", "Dustin Hoffman", "Jack Lemmon", "Laurence Olivier",
    "Michael Caine", "Daniel Day-Lewis", "Sidney Poitier", "Spencer Tracy", "Henry Fonda",
    "Clark Gable", "Humphrey Bogart", "Gary Cooper", "Charlton Heston", "Burt Lancaster",
    "Kirk Douglas", "Peter O'Toole", "Richard Burton", "James Cagney", "Orson Welles",
    "Robert Redford", "Warren Beatty", "Clint Eastwood", "Mel Gibson", "Tom Cruise",
    "Harrison Ford", "Kevin Spacey", "Russell Crowe", "Jeff Bridges", "George Clooney",
    "Matt Damon", "Will Smith", "Christian Bale", "Joaquin Phoenix", "Philip Seymour Hoffman"
]

actresses = [
    "Meryl Streep", "Katharine Hepburn", "Audrey Hepburn", "Bette Davis", "Elizabeth Taylor",
    "Ingrid Bergman", "Marilyn Monroe", "Julia Roberts", "Nicole Kidman", "Cate Blanchett",
    "Jodie Foster", "Susan Sarandon", "Diane Keaton", "Jessica Lange", "Glenn Close",
    "Helen Mirren", "Judi Dench", "Maggie Smith", "Vanessa Redgrave", "Sophia Loren",
    "Grace Kelly", "Jane Fonda", "Shirley MacLaine", "Faye Dunaway", "Sigourney Weaver",
    "Sally Field", "Sissy Spacek", "Diane Lane", "Michelle Pfeiffer", "Julianne Moore",
    "Emma Thompson", "Kate Winslet", "Natalie Portman", "Charlize Theron", "Sandra Bullock",
    "Reese Witherspoon", "Angelina Jolie", "Anne Hathaway", "Amy Adams", "Jennifer Lawrence",
    "Scarlett Johansson", "Emily Blunt", "Rachel Weisz", "Naomi Watts", "Penélope Cruz",
    "Marion Cotillard", "Renée Zellweger", "Halle Berry", "Cameron Diaz", "Kirsten Dunst"
]

In [7]:
df['content'] = mcq['content'].values

df.head()

Unnamed: 0,celebrity,info,question,answer,section,content
0,Robert De Niro,Robert De Niro is an acclaimed American actor ...,What is Robert De Niro's date of birth and pla...,"Robert De Niro was born on August 17, 1943, in...",Basic Information,### Early Life and Family Background Questions...
1,Jack Nicholson,Jack Nicholson is an iconic American actor and...,What is Jack Nicholson's birth date and where ...,"Jack Nicholson was born on April 22, 1937, in ...",Basic Information,### Early Life and Family Background Questions...
2,Denzel Washington,Denzel Washington is an acclaimed American act...,What is Denzel Washington's date of birth and ...,"Denzel Washington was born on December 28, 195...",Basic Information,### Early Life and Family Background Questions...
3,Sean Penn,"Sean Penn is an acclaimed American actor, dire...",What is the birth date and place of Sean Penn?,"Sean Penn was born on August 17, 1960, in Sant...",Basic Information,### Section 1: Early Life and Family Backgroun...
4,Brad Pitt,Brad Pitt is an acclaimed American actor and f...,What is Brad Pitt's birthplace and date of birth?,"Brad Pitt was born in Shawnee, Oklahoma, on De...",Basic Information,### Early Life and Family Background Questions...


In [44]:
def check_celebrity_content_match(df):
    """
    Check if celebrity names appear in their corresponding content.
    
    Parameters:
    df (pandas.DataFrame): DataFrame with 'celebrity' and 'content' columns
    
    Returns:
    tuple: Lists of matching and non-matching celebrity names, and DataFrame with results
    """
    wrong_rows = []
    right_rows = []
    
    # Create a copy of the DataFrame to add results
    result_df = df.copy()
    result_df['match_found'] = False
    
    for i, row in df.iterrows():
        try:
            # Convert to lowercase and strip whitespace
            celebrity = str(row['celebrity']).lower().strip()
            content = str(row['content']).lower().strip()
            
            # Check if celebrity name appears in content
            if celebrity in content:
                right_rows.append(celebrity)
                result_df.at[i, 'match_found'] = True
            else:
                wrong_rows.append(celebrity)
        except AttributeError as e:
            print(f"Error processing row {i}: {e}")
        except Exception as e:
            print(f"Unexpected error in row {i}: {e}")
    
    return right_rows, wrong_rows, result_df

In [49]:
right_rows, wrong_rows, df2 = check_celebrity_content_match(df)

In [8]:
def merge_content_to_qa(df1, df2):
    """
    Merges content from df2 into df1 based on finding names within content text.
    
    Parameters:
    df1 (pandas.DataFrame): DataFrame with columns 'name', 'question', 'answer'
    df2 (pandas.DataFrame): DataFrame with columns 'id', 'content'
    
    Returns:
    pandas.DataFrame: df1 with added 'content' column
    """
    # Create a copy of df1 to store results
    result = df1.copy()
    
    # Initialize content column with None
    result['content'] = None
    
    # For each row in df1
    for idx, row in result.iterrows():
        name = row['celebrity']
        # Find matching content where name appears in the content text
        mask = df2['content'].str.contains(name, case=False, na=False)
        matching_content = df2.loc[mask, 'content']
        
        # If a match is found, use the first matching content
        if not matching_content.empty:
            result.at[idx, 'content'] = matching_content.iloc[0]
    
    return result


In [35]:
result_df = merge_content_to_qa(data, df)


In [36]:
result_df.shape

(753, 6)

In [43]:
def standardize_section_names(sections: List[str]) -> Dict[str, str]:
    """
    Standardize section names by removing duplicates and creating a mapping.
    """
    # Convert to lowercase and remove special characters for comparison
    clean_sections = {s: re.sub(r'[^\w\s]', '', s.lower().strip()) for s in sections}
    
    # Create mapping for similar sections
    mapping = {}
    standardized = {
        'early life and family background': 'Early Life',
        'basic information': 'Basic Info',
        'basic information the first lineparagraph': 'Basic Info',
        'career highlights and collaborations': 'Career',
        'important events and recognitions': 'Recognitions',
        'legacy': 'Legacy'
    }
    
    for original, cleaned in clean_sections.items():
        for standard_clean, standard_name in standardized.items():
            if cleaned == standard_clean:
                mapping[original] = standard_name
                break
    
    return mapping

def extract_questions_by_section(content: str) -> Dict[str, List[Dict]]:
    """
    Extract questions grouped by their sections from content text.
    
    Args:
        content: String containing questions and options
    Returns:
        Dictionary with section names as keys and lists of question dictionaries as values
    """
    # Split content into sections
    sections = re.split(r'###\s+', content)
    questions_by_section = {}
    
    for section in sections:
        if not section.strip():
            continue
            
        # Extract section name
        section_lines = section.split('\n')
        section_name = section_lines[0].replace('Questions', '').strip()
        
        # Standardize the section name for matching
        clean_section_name = re.sub(r'[^\w\s]', '', section_name.lower().strip())
        
        # Extract questions
        questions_text = '\n'.join(section_lines[1:])
        question_blocks = re.split(r'\d+\.\s+', questions_text)[1:]  # Skip empty first split
        
        section_questions = []
        for question in question_blocks:
            # Split into question and options
            parts = question.split('-')
            if len(parts) < 2:
                continue
                
            question_text = parts[0].strip()
            options = [opt.strip() for opt in parts[1:]]
            
            # Create clean options list without option letters
            clean_options = [re.sub(r'^[A-D]\)\s*', '', opt) for opt in options]
            
            section_questions.append({
                'question': question_text,
                'options': clean_options,
                'raw_options': options,
                'answer': None  # You can modify this if answers are provided
            })
        
        questions_by_section[clean_section_name] = section_questions
    
    return questions_by_section

def process_mcq_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """
    Process dataframe containing sections and content to create structured MCQ data.
    Creates separate rows for each question, matched to its correct section.
    Preserves rows even when no questions are found for a section.
    
    Args:
        df: DataFrame with 'section' and 'content' columns, plus other columns to preserve
    Returns:
        Processed DataFrame with one row per question or original row if no questions found
    """
    # Standardize section names
    section_mapping = standardize_section_names(df['section'].unique())
    df['standardized_section'] = df['section'].map(section_mapping)
    
    result_rows = []
    
    # Process each row independently to keep questions associated with the correct entity
    for _, row in df.iterrows():
        # Get the content and extract questions specific to this row
        content = row['content']
        questions_by_section = extract_questions_by_section(content)
        
        # Get the standardized name of the current section
        current_section = re.sub(r'[^\w\s]', '', row['section'].lower().strip())
        
        # Find matching questions for this section in THIS ROW'S content only
        matching_questions = questions_by_section.get(current_section, [])
        
        if matching_questions:
            # Create a new row for each question in this section
            for q in matching_questions:
                # Create a new dictionary with all original columns
                question_data = row.to_dict()
                
                # Add the MCQ-specific data
                question_data.update({
                    'standardized_section': row['standardized_section'],
                    'mcq_question': q['question'],
                    'mcq_options': q['options'],
                    'mcq_raw_options': q['raw_options'],
                    'mcq_answer': q['answer']
                })
                result_rows.append(question_data)
        else:
            # No questions found for this section, preserve the original row
            question_data = row.to_dict()
            
            # Add empty MCQ-specific data
            question_data.update({
                'standardized_section': row['standardized_section'],
                'mcq_question': None,
                'mcq_options': None,
                'mcq_raw_options': None,
                'mcq_answer': None
            })
            result_rows.append(question_data)
    
    # Create new DataFrame
    result_df = pd.DataFrame(result_rows)
    
    # Ensure consistent column ordering with original columns first
    original_columns = df.columns.tolist()
    new_columns = [col for col in result_df.columns if col not in original_columns]
    result_df = result_df[original_columns + new_columns]
    
    return result_df

In [44]:
processed_df = process_mcq_dataframe(result_df)

In [56]:
processed_df['mcq_answer'] = processed_df['mcq_options'].apply(lambda x: x[-1] if isinstance(x, list) and len(x) > 0 else None)
processed_df['mcq_answer_option'] = processed_df['mcq_options'].apply(lambda x: 'D' if isinstance(x, list) and len(x) > 0 else None)

In [63]:
processed_df.shape

(2041, 12)

In [58]:
processed_df.to_csv('/home/praveen/theoden/emnlp_25/dataset/mcq_full_data.csv', index=False)

In [59]:
early_life = processed_df.loc[(processed_df['standardized_section'] == 'Early Life') & (processed_df['celebrity'] == 'Robert De Niro')]
early_life.shape

(10, 12)

In [65]:
#early_life

# it seems like there are duplicates for mcq questions. But if I remvoe it we will also loose the information about the section. So, I will keep it as it is and save this way. 
# I will make another dataset with only celebrity, info, standardized section, content, mcq_question, mcq_options, mcq_raw_options, mcq_answer

In [75]:
mcq_data = processed_df.drop_duplicates(subset= ['celebrity', 'info', 'standardized_section', 'content', 'mcq_question'], keep='first').reset_index(drop=True)

In [76]:
mcq_data.drop(columns= ['question', 'answer', 'section'], inplace= True)

In [77]:
mcq_data.head()

Unnamed: 0,celebrity,info,content,standardized_section,mcq_question,mcq_options,mcq_raw_options,mcq_answer,mcq_answer_option
0,Robert De Niro,Robert De Niro is an acclaimed American actor ...,### Early Life and Family Background Questions...,Basic Info,,,,,
1,Robert De Niro,Robert De Niro is an acclaimed American actor ...,### Early Life and Family Background Questions...,Early Life,What is the birth date of Robert De Niro?,"[July 4, 1945, September 12, 1940, June 1, 194...","[A) July 4, 1945, B) September 12, 1940, C) Ju...","August 17, 1943",D
2,Robert De Niro,Robert De Niro is an acclaimed American actor ...,### Early Life and Family Background Questions...,Early Life,In which city was Robert De Niro born?,"[Los Angeles, California, Chicago, Illinois, M...","[A) Los Angeles, California, B) Chicago, Illin...","New York City, New York",D
3,Robert De Niro,Robert De Niro is an acclaimed American actor ...,### Early Life and Family Background Questions...,Early Life,Who is Robert De Niro's older sister?,"[Maria De Niro, Linda De Niro, Susan De Niro, ...","[A) Maria De Niro, B) Linda De Niro, C) Susan ...",Barbara De Niro,D
4,Robert De Niro,Robert De Niro is an acclaimed American actor ...,### Early Life and Family Background Questions...,Early Life,Which of the following is Robert De Niro's fir...,"[Grace Hightower, Virginia Admiral, Meryl Stre...","[A) Grace Hightower, B) Virginia Admiral, C) M...",Diahnne Abbott,D


In [79]:
mcq_data.dropna(inplace=True)

In [80]:
basic_words = ['born', 'birth date', 'place of birth', 'date of birth']

for i, row in mcq_data.iterrows():
    if any(word.lower() in row['mcq_question'].lower() for word in basic_words):
        mcq_data.at[i, 'standardized_section'] = 'Basic Info'

In [81]:
mcq_data.shape

(1021, 9)

In [83]:
mcq_data.to_csv('/home/praveen/theoden/emnlp_25/dataset/mcq_data.csv', index=False)

In [84]:
basic_info = mcq_data.loc[mcq_data['standardized_section'] == 'Basic Info']
basic_info.shape

(101, 9)

In [85]:
basic_info.head(10)

Unnamed: 0,celebrity,info,content,standardized_section,mcq_question,mcq_options,mcq_raw_options,mcq_answer,mcq_answer_option
1,Robert De Niro,Robert De Niro is an acclaimed American actor ...,### Early Life and Family Background Questions...,Basic Info,What is the birth date of Robert De Niro?,"[July 4, 1945, September 12, 1940, June 1, 194...","[A) July 4, 1945, B) September 12, 1940, C) Ju...","August 17, 1943",D
2,Robert De Niro,Robert De Niro is an acclaimed American actor ...,### Early Life and Family Background Questions...,Basic Info,In which city was Robert De Niro born?,"[Los Angeles, California, Chicago, Illinois, M...","[A) Los Angeles, California, B) Chicago, Illin...","New York City, New York",D
27,Denzel Washington,Denzel Washington is an acclaimed American act...,### Early Life and Family Background Questions...,Basic Info,What is the date of birth of Denzel Washington?,"[January 15, 1955, December 25, 1954, November...","[A) January 15, 1955, B) December 25, 1954, C)...","December 28, 1954",D
28,Denzel Washington,Denzel Washington is an acclaimed American act...,### Early Life and Family Background Questions...,Basic Info,In which city was Denzel Washington born?,"[New York City, Brooklyn, Albany, Mount Vernon]","[A) New York City, B) Brooklyn, C) Albany, D) ...",Mount Vernon,D
67,Brad Pitt,Brad Pitt is an acclaimed American actor and f...,### Early Life and Family Background Questions...,Basic Info,What is the date of birth of Brad Pitt?,"[January 1, 1965, December 25, 1963, November ...","[A) January 1, 1965, B) December 25, 1963, C) ...","December 18, 1963",D
68,Brad Pitt,Brad Pitt is an acclaimed American actor and f...,### Early Life and Family Background Questions...,Basic Info,In which city was Brad Pitt born?,"[Springfield, Missouri, Los Angeles, Californi...","[A) Springfield, Missouri, B) Los Angeles, Cal...","Shawnee, Oklahoma",D
73,Morgan Freeman,"Morgan Freeman is an acclaimed American actor,...",### Early Life and Family Background Questions...,Basic Info,What is the date of birth of Morgan Freeman?,"[June 1, 1940, June 1, 1935, June 1, 1938, Jun...","[A) June 1, 1940, B) June 1, 1935, C) June 1, ...","June 1, 1937",D
74,Morgan Freeman,"Morgan Freeman is an acclaimed American actor,...",### Early Life and Family Background Questions...,Basic Info,In which city was Morgan Freeman born?,"[Chicago, New York, Los Angeles, Memphis, Tenn...","[A) Chicago, B) New York, C) Los Angeles, D) M...","Memphis, Tennessee",D
99,Leonardo DiCaprio,Leonardo DiCaprio is an acclaimed American act...,### Early Life and Family Background Questions...,Basic Info,What is the birth date of Leonardo DiCaprio?,"[October 10, 1975, November 11, 1975, November...","[A) October 10, 1975, B) November 11, 1975, C)...","November 11, 1974",D
100,Leonardo DiCaprio,Leonardo DiCaprio is an acclaimed American act...,### Early Life and Family Background Questions...,Basic Info,In which city was Leonardo DiCaprio born?,"[New York City, Chicago, San Francisco, Los An...","[A) New York City, B) Chicago, C) San Francisc...","Los Angeles, California",D


### Cloze task

In [None]:
client = OpenAI(api_key= "")

In [6]:
import tiktoken
encoding = tiktoken.encoding_for_model("gpt-4o-mini")

In [28]:
system_prompt = """Your goal is to create a well crafted set of cloze tests (using the underline "___" as the mask), by extracting information from the given information.
This is to test the model's memory ability regarding the information given.
The information is divided into 5 sections, and each section can have upto 5 cloze tests.
Note that questions should be factually correct. They can be tricky and challenging.
The difficulty level of the questions can be easy (1 cloze solution), medium (2 cloze solutions) or hard (3 cloze solutions).
Your response should follow this JSON format.
```
{"probes":[
{
    "question": "___", # A question
    "answer": "___", # The correctanswer to the question
    "section": "___" # The section the question is from
    "level": "___" # The difficulty level of the question
]}
```
"""

In [22]:
def get_prompt(celebrity, info):
    return f"""
create several cloze based tasks on the given information about {celebrity}.
Information:
### Basic Info:
{info}
        """

In [31]:
df = data.drop_duplicates(subset=['celebrity'], keep='first').reset_index(drop=True)
df.shape

(99, 5)

In [41]:
batch = []

for idx, row in df.iterrows():
    celebrity = row['celebrity']
    info = row['info']
    prompt = get_prompt(celebrity= celebrity, info = info)

    request = {
        "custom_id" : f"request-{idx}",
        "method": "POST",
        "url" : "/v1/chat/completions",
        "body" : {
            "model" : "gpt-4o-mini",
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": prompt}],
            "max_completion_tokens": 1200,
            "temperature": 0.2,
        }}
    batch.append(request)

In [42]:
import json
with open('requests2.jsonl', 'w') as f:
    for request in batch:
        f.write(json.dumps(request) + '\n')

In [43]:
batch_input_file =client.files.create(
    file = open('requests2.jsonl', 'rb'),
    purpose = "batch",
)

In [44]:
batch_input_file_id = batch_input_file.id

In [45]:
client.batches.create(
    input_file_id = batch_input_file_id,
    endpoint = "/v1/chat/completions",
    completion_window="24h",
    metadata={
        "description": "Create multiple choice questions for each celebrity based on the information provided."
    }
)

Batch(id='batch_67be41b5d07c8190b7e58a4d7bc552e3', completion_window='24h', created_at=1740521909, endpoint='/v1/chat/completions', input_file_id='file-AHUF6VLDUMKfG1PvVJPHf5', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1740608309, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'Create multiple choice questions for each celebrity based on the information provided.'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))

In [6]:
status_batch = client.batches.retrieve("batch_67be41b5d07c8190b7e58a4d7bc552e3")
print(status_batch)

Batch(id='batch_67be41b5d07c8190b7e58a4d7bc552e3', completion_window='24h', created_at=1740521909, endpoint='/v1/chat/completions', input_file_id='file-AHUF6VLDUMKfG1PvVJPHf5', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1740525684, error_file_id=None, errors=None, expired_at=None, expires_at=1740608309, failed_at=None, finalizing_at=1740525673, in_progress_at=1740521910, metadata={'description': 'Create multiple choice questions for each celebrity based on the information provided.'}, output_file_id='file-6jX9Pav3zXeoZXZ4jwZvsL', request_counts=BatchRequestCounts(completed=99, failed=0, total=99))


In [7]:
content = client.files.content("file-6jX9Pav3zXeoZXZ4jwZvsL")# put the output_file_id

In [9]:
print(content.text)

{"id": "batch_req_67be5069911c819099eebb6b6aaa8c7f", "custom_id": "request-0", "response": {"status_code": 200, "request_id": "14693b709f1ac007bc6460f143326407", "body": {"id": "chatcmpl-B4y86lzaaWv3lZ5416FaQYovKtkaF", "object": "chat.completion", "created": 1740524150, "model": "gpt-4o-mini-2024-07-18", "choices": [{"index": 0, "message": {"role": "assistant", "content": "```json\n{\"probes\":[\n    {\n        \"question\": \"Robert De Niro was born on ___ in New York City, New York.\",\n        \"answer\": \"August 17, 1943\",\n        \"section\": \"Basic Info\",\n        \"level\": \"easy\"\n    },\n    {\n        \"question\": \"Robert De Niro's father, Robert De Niro Sr., was a painter and sculptor of ___ descent.\",\n        \"answer\": \"Italian\",\n        \"section\": \"Early life and family background\",\n        \"level\": \"easy\"\n    },\n    {\n        \"question\": \"De Niro was married to ___ from 1976 to 1988.\",\n        \"answer\": \"Diahnne Abbott\",\n        \"sec

In [10]:
def process_response_to_dataframe(response_text):
    responses = response_text.strip().split("\n")
    data = []
    for response in responses:
        json_response = json.loads(response)
        custom_id = json_response.get('custom_id')
        content = json_response.get('response', {}).get('body', {}).get('choices', [{}])[0].get('message', {}).get('content')
        data.append({"custom_id": custom_id, "content": content})
    batch = pd.DataFrame(data)
    return batch

In [11]:
def process_and_save_batch(output_file_id, save_directory):
    file_response = client.files.content(output_file_id)
    response_text = file_response.text

    batch_df = process_response_to_dataframe(response_text)

    batch_df.to_csv(f'{save_directory}/cloze.csv', index=False)
    print(f"saved as cloze.csv")

In [14]:
save_directory = '/home/praveen/theoden/emnlp_25/dataset'
output_file_id = "file-6jX9Pav3zXeoZXZ4jwZvsL"
process_and_save_batch(output_file_id, save_directory)

saved as cloze.csv


In [5]:
cloze = pd.read_csv('/home/praveen/theoden/emnlp_25/dataset/cloze.csv')
cloze.head()

Unnamed: 0,custom_id,content
0,request-0,"```json\n{""probes"":[\n {\n ""question..."
1,request-1,"```json\n{""probes"":[\n {\n ""question..."
2,request-2,"```json\n{""probes"":[\n {\n ""question..."
3,request-3,"```json\n{""probes"":[\n {\n ""question..."
4,request-4,"```json\n{""probes"":[\n {\n ""question..."


In [6]:
print(cloze['content'][0])

```json
{"probes":[
    {
        "question": "Robert De Niro was born on ___ in New York City, New York.",
        "answer": "August 17, 1943",
        "section": "Basic Info",
        "level": "easy"
    },
    {
        "question": "Robert De Niro's father, Robert De Niro Sr., was a painter and sculptor of ___ descent.",
        "answer": "Italian",
        "section": "Early life and family background",
        "level": "easy"
    },
    {
        "question": "De Niro was married to ___ from 1976 to 1988.",
        "answer": "Diahnne Abbott",
        "section": "Early life and family background",
        "level": "easy"
    },
    {
        "question": "Robert De Niro has ___ children, including Drena and Raphael.",
        "answer": "six",
        "section": "Early life and family background",
        "level": "medium"
    },
    {
        "question": "De Niro is known for his collaborations with director ___, appearing in films like 'Mean Streets' and 'The Irishman'.",
        "an

In [15]:
def process_content(content):
    # Remove the three backticks at the start and end, then the 'json' label and strip leading/trailing whitespace
    cleaned_content = content.replace('```json', '').replace('```', '').strip()
    
    if cleaned_content:  # Check if the string is not empty
        try:
            # Parse the JSON string
            parsed_data = json.loads(cleaned_content)
            # Extract probes (this will remove the outer json/probes keys and leave the list)
            return parsed_data.get("probes", [])
        except json.JSONDecodeError:
            print(f"Error decoding JSON for content: {cleaned_content[:100]}...")  # Show part of the content for debugging
            return []
    return []



In [16]:
# Apply the function to the 'content' column and create a new column with the processed data
cloze['processed_content'] = cloze['content'].apply(process_content)

In [27]:
cloze.shape

(99, 3)

In [28]:
data = pd.read_csv('/home/praveen/theoden/emnlp_25/dataset/qa_actors.csv')
df = data.drop_duplicates(subset=['celebrity'], keep='first').reset_index(drop=True)
df.shape

(99, 5)

In [29]:
df['cloze_content'] = cloze['processed_content'].values

In [33]:
def add_name_to_probes(row):
    name = row['celebrity']
    probes_with_name = []
    for probe in row['cloze_content']:
        probe['celebrity'] = name  
        probes_with_name.append(probe)
    return probes_with_name

In [34]:
df['processed_with_name'] = df.apply(add_name_to_probes, axis=1)

In [38]:
standardized = {
    'early life and family background': 'Early Life',
    'basic information': 'Basic Info',
    'basic information the first lineparagraph': 'Basic Info',
    'career highlights and collaborations': 'Career',
    'important events and recognitions': 'Recognitions',
    'legacy': 'Legacy'
}

In [39]:
def clean_and_map_section(section):
    # Clean the section name (remove non-alphanumeric characters and make it lowercase)
    cleaned_section = re.sub(r'[^\w\s]', '', section.lower()).strip()
    # Return the standardized section if it exists in the mapping, otherwise return the original
    return standardized.get(cleaned_section, section)

In [40]:
def add_name_and_clean_sections(row):
    name = row['celebrity']
    probes_with_name_and_section = []
    for probe in row['cloze_content']:
        # Clean and map the section name
        probe['celebrity'] = name
        probe['section'] = clean_and_map_section(probe['section'])
        probes_with_name_and_section.append(probe)
    return probes_with_name_and_section

# Apply the function to include name and clean section names in each probe
df['processed_with_name_and_section'] = df.apply(add_name_and_clean_sections, axis=1)

In [41]:
cloze_cont = pd.json_normalize(df['cloze_content'].explode())

In [45]:
cloze_cont.to_csv('/home/praveen/theoden/emnlp_25/dataset/cloze_data.csv')