In [33]:
import pandas as pd
import json
from pprint import pprint
import random

# Load the 'Data' sheet of the clinical-features workbook into a DataFrame.
file_path = './Clinical_and_Other_Features.xlsx'
data = pd.read_excel(file_path, sheet_name='Data')

# Column labels are selected by their position in the sheet, not by name.
columns = data.columns
race_and_ethnicity_col = columns[21]        # single column label
tumor_characteristics_col = columns[23:48]  # Index of labels, positions 23..47
mri_findings_col = columns[48:53]           # Index of labels, positions 48..52

# Debug aids (uncomment to inspect what was selected):
# print(data.columns)
# print(data[mri_findings_col])

In [43]:
# Debug aid: look at row 0 of the sheet.
# first_row_descriptions = data.iloc[0]
# print(first_row_descriptions)

# ---------------------------------------------------------------------------
# Answer tables: numeric code found in the sheet -> text used in the answer.
# The list position of each label is its code (dict(enumerate(...))).
# ---------------------------------------------------------------------------

# Race and Ethnicity (column 21); legend as noted from the sheet:
'''{0 = N/A,1 = white,2 = black,3 = asian,4 = native,
5 = hispanic,6 = multi,7 = hawa,8 = amer indian}'''
race_mapping = dict(enumerate([
    'Unknown',          # 0
    'White',            # 1
    'Black',            # 2
    'Asian',            # 3
    'Native American',  # 4
    'Hispanic',         # 5
    'Multiple Races',   # 6
    'Native Hawaiian',  # 7
    'American Indian',  # 8
]))

# Tumor Characteristics (columns 23-26)
# ER and PR share the same two codes but are kept as independent tables.
tc_er_mapping = dict(enumerate(['Negative', 'Positive']))
tc_pr_mapping = dict(enumerate(['Negative', 'Positive']))
# HER2 has an extra 'Borderline' code.
tc_her2_mapping = dict(enumerate(['Negative', 'Positive', 'Borderline']))
# Molecular Subtype; legend as noted from the sheet:
'''{0 = luminal-like,1 = ER/PR pos, HER2 pos,2 = her2,3 = trip neg}'''
tc_molecular_subtype_mapping = dict(enumerate([
    'Luminal-like',                   # 0
    'ER/PR Positive, HER2 Positive',  # 1
    'HER2',                           # 2
    'Triple Negative',                # 3
]))
# Oncotype score - open (no code table is defined for it here).

# ---------------------------------------------------------------------------
# Question phrasings: one of these is picked at random for each generated
# question, so every list entry must ask for the same attribute.
# ---------------------------------------------------------------------------

# Race and Ethnicity
race_questions_variants = [
    "What is the patient's race and ethnicity?",
    "Can you tell me the patient's race and ethnicity?",
    "Please provide the patient's race and ethnicity.",
]

# Tumor Characteristics: ER
tc_er_questions = [
    "What is the patient's ER status?",
    "Can you tell me the patient's ER status?",
    "Please provide the patient's ER status.",
]

# Tumor Characteristics: PR
tc_pr_questions = [
    "What is the patient's PR status?",
    "Can you tell me the patient's PR status?",
    "Please provide the patient's PR status.",
]

# Tumor Characteristics: HER2
tc_her2_questions = [
    "What is the patient's HER2 status?",
    "Can you tell me the patient's HER2 status?",
    "Please provide the patient's HER2 status.",
]

# Tumor Characteristics: Molecular Subtype
# NOTE(review): the molecular-subtype table and questions do not appear to be
# consumed by row_to_vqa_json as currently written - confirm whether that is
# intentional.
tc_molecular_subtype_questions = [
    "What is the patient's molecular subtype?",
    "Can you tell me the patient's molecular subtype?",
    "Please provide the patient's molecular subtype.",
    "Tell me the patient's molecular subtype.",
]

In [None]:
def question_generator(qid, patient_id, image_path, user_prompt, gpt_response, answer_type, column_name, column_idx):
    """Assemble one VQA record as a plain, JSON-serialisable dict.

    Args:
        qid: Identifier stored under "qid".
        patient_id: Identifier stored under "patient_id".
        image_path: Stored under "image"; any falsy value (e.g. "" or None)
            is replaced by the placeholder "to-do".
        user_prompt: Text of the "user" conversation turn.
        gpt_response: Text of the "gpt" conversation turn.
        answer_type: Stored under metadata "answer_type" (callers in this
            notebook pass "CLOSE").
        column_name: Stored under metadata "column_name".
        column_idx: Stored under metadata "column_idx".

    Returns:
        dict with keys, in this order: "qid", "patient_id", "image", "bbox"
        (always a fresh empty list), "conversation" (user turn followed by
        gpt turn) and "metadata".
    """
    image = image_path if image_path else "to-do"

    user_turn = {"from": "user", "value": user_prompt}
    gpt_turn = {"from": "gpt", "value": gpt_response}

    # The first three metadata entries are the same for every record.
    metadata = {
        "dataset": "Duke-Breast-Cancer-MRI",
        "location": "Breast",
        "base_type": "vqa",
        "answer_type": answer_type,
        "column_name": column_name,
        "column_idx": column_idx,
    }

    # Key order is kept stable because it determines the layout of the
    # dumped JSON.
    return {
        "qid": qid,
        "patient_id": patient_id,
        "image": image,
        "bbox": [],
        "conversation": [user_turn, gpt_turn],
        "metadata": metadata,
    }

def row_to_vqa_json(row, patient_id, qid):
    """Create the closed-ended VQA records for one patient row.

    One record is produced, in this order, for Race and Ethnicity, ER, PR
    and HER2. For each, a question wording is drawn with random.choice and
    the row's code is translated through the matching label table; a code
    missing from the table is answered as 'Unknown'. Every answer ends with
    a period.

    Args:
        row: Mapping-like row indexable by column label (the notebook passes
            a pandas Series taken from the data sheet).
        patient_id: Identifier copied into every record.
        qid: Id assigned to the first record; each following record gets the
            next integer.

    Returns:
        (records, next_qid): the list of generated records and the first id
        that was not used, so the caller can continue numbering.
    """
    # One entry per question:
    # (question wordings, code -> label table, column label, column_name, column_idx)
    specs = (
        (race_questions_variants, race_mapping, race_and_ethnicity_col, "Race and Ethnicity", 21),
        (tc_er_questions, tc_er_mapping, tumor_characteristics_col[0], "ER", 23),
        (tc_pr_questions, tc_pr_mapping, tumor_characteristics_col[1], "PR", 24),
        (tc_her2_questions, tc_her2_mapping, tumor_characteristics_col[2], "HER2", 25),
    )

    records = []
    for wordings, labels, column, column_name, column_idx in specs:
        # Draw the wording before reading the row so the sequence of
        # random.choice calls is the same for every row.
        prompt = random.choice(wordings)
        answer = labels.get(row[column], 'Unknown') + '.'
        records.append(
            question_generator(qid, patient_id, "", prompt, answer, "CLOSE", column_name, column_idx)
        )
        qid += 1

    return records, qid


In [46]:
# Generate the VQA records for every patient row and save them as one JSON file.

# Index of the first row that is processed; rows 0 and 1 are skipped.
# NOTE(review): presumably those rows hold column names / value legends
# rather than patients - confirm against the sheet.
FIRST_PATIENT_ROW = 2

# row_to_vqa_json picks each question wording with random.choice. Seeding the
# global RNG here makes Restart & Run All write the same file every time.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

questions_list = []
global_qid = 1  # question ids start at 1 and keep increasing across patients
for i in range(FIRST_PATIENT_ROW, len(data)):
    row = data.iloc[i]
    # The row is a Series indexed by column *label*, so the first cell must be
    # read by position with .iloc; row[0] relies on the deprecated
    # integer-as-position fallback of Series.__getitem__.
    patient_id = row.iloc[0]
    questions, global_qid = row_to_vqa_json(row, patient_id, global_qid)
    questions_list.extend(questions)

output_path = './duke_vqa.json'
with open(output_path, 'w') as f:
    json.dump(questions_list, f, indent=4)
# Report only after the file has been closed (and therefore flushed).
print(f"Saved to {output_path}")

Saved to ./duke_vqa.json
