In [1]:
import pandas as pd
import json
from pprint import pprint
import random

In [2]:
# Load the 'Data' sheet of the clinical-features workbook into a DataFrame.
file_path = './Clinical_and_Other_Features.xlsx'
data = pd.read_excel(file_path, sheet_name='Data')

# print("Columns in the dataset:")
# print(data.columns)

# Columns are picked by position rather than by name, so these indices
# depend on the workbook keeping its exact column order.
# Single column label at position 21 (race and ethnicity).
race_and_ethnicity_col = data.columns[21]
# Column labels at positions 23-26 (tumor characteristics); the question
# builder further down reads the first two of these as ER and PR.
tumor_characteristics_col = data.columns[23:27]
# Column labels at positions 48-52 (presumably MRI findings, judging by the
# variable name — TODO confirm against the sheet).
mri_findings_col = data.columns[48:53]

# Print
# try:
#     print(data[mri_findings_col])
# except KeyError as e:
#     print(f"KeyError: {e}. Please check if the column names match exactly.")

In [3]:
# first_row_descriptions = data.iloc[0]
# print(first_row_descriptions)

# Race and Ethnicity (column 21)
# Legend from the sheet:
#   {0 = N/A,1 = white,2 = black,3 = asian,4 = native,
#    5 = hispanic,6 = multi,7 = hawa,8 = amer indian}
# The integer code of each label is its position in the list below.
race_mapping = dict(enumerate([
    'Unknown',          # 0
    'White',            # 1
    'Black',            # 2
    'Asian',            # 3
    'Native American',  # 4
    'Hispanic',         # 5
    'Multiple Races',   # 6
    'Native Hawaiian',  # 7
    'American Indian',  # 8
]))

# Tumor Characteristics (columns 23-26)
# ER and PR share the same two codes but are kept as separate dict objects.
tc_er_mapping = dict(enumerate(('Negative', 'Positive')))
tc_pr_mapping = dict(enumerate(('Negative', 'Positive')))
# HER2 adds a third code on top of negative/positive.
tc_her2_mapping = dict(enumerate(('Negative', 'Positive', 'Borderline')))
# Molecular Subtype
# Legend from the sheet:
#   {0 = luminal-like,1 = ER/PR pos, HER2 pos,2 = her2,3 = trip neg}
tc_molecular_subtype_mapping = dict(enumerate([
    'Luminal-like',                    # 0
    'ER/PR Positive, HER2 Positive',   # 1
    'HER2 Positive',                   # 2
    'Triple Negative',                 # 3
]))

In [None]:
def _make_question(qid, prompt_variants, answer):
    """Build one question entry: an id plus a User/Gpt conversation pair.

    One prompt is drawn at random from ``prompt_variants``; ``answer`` is
    stored with a trailing period appended.
    """
    return {
        "Qid": qid,
        "Conversation": {
            "User": random.choice(prompt_variants),
            "Gpt": answer + '.'
        }
    }


def row_to_vqa_json(row, patient_id, qid):
    """Turn one patient row into a VQA record with race, ER and PR questions.

    Relies on module-level state defined in earlier cells:
    ``race_and_ethnicity_col``, ``tumor_characteristics_col``,
    ``race_mapping``, ``tc_er_mapping`` and ``tc_pr_mapping``.

    Args:
        row: Mapping-like row (e.g. a pandas Series) indexable by the column
            labels above; each cell is a code looked up in the matching
            mapping. Codes missing from a mapping are answered 'Unknown'.
        patient_id: Identifier stored under "Patient id" in the record.
        qid: Id assigned to the first question; the following questions get
            consecutive ids.

    Returns:
        Tuple ``(vqa_json, last_qid)`` where ``last_qid`` is the id of the
        LAST question emitted (``qid + 2``), not the next free id. The
        caller is expected to add 1 before processing the next row.
    """
    # (prompt variants, code -> label mapping, column holding the code).
    # Order matters: it fixes both the question ids and the order in which
    # random.choice is consumed.
    question_specs = [
        (
            [
                "What is the patient's race and ethnicity?",
                "Can you tell me the patient's race and ethnicity?",
                "Please provide the patient's race and ethnicity."
            ],
            race_mapping,
            race_and_ethnicity_col,
        ),
        (
            [
                "What is the patient's ER status?",
                "Can you tell me the patient's ER status?",
                "Please provide the patient's ER status."
            ],
            tc_er_mapping,
            tumor_characteristics_col[0],
        ),
        (
            [
                "What is the patient's PR status?",
                "Can you tell me the patient's PR status?",
                "Please provide the patient's PR status."
            ],
            tc_pr_mapping,
            tumor_characteristics_col[1],
        ),
    ]

    questions = []
    last_qid = qid
    for offset, (prompt_variants, mapping, column) in enumerate(question_specs):
        last_qid = qid + offset
        answer = mapping.get(row[column], 'Unknown')
        # Every entry uses the same "Qid" key; previously the PR question
        # was written with a lowercase "qid", giving an inconsistent schema.
        questions.append(_make_question(last_qid, prompt_variants, answer))

    vqa_json = {
        "Patient id": patient_id,
        "Questions": questions,
        #"Caption": {},
        # NOTE(review): the column fields below name only the race column
        # although the record also carries ER and PR questions — confirm
        # whether consumers rely on this.
        "meta": {
            "dataset": "Duke-Breast-Cancer-MRI",
            "location": "Breast",
            "base_type": "vqa",
            "answer_type": "CLOSE",
            "column_name": "Race and Ethnicity",
            "column_idx": 21
        }
    }

    return vqa_json, last_qid


In [72]:
# Build one VQA record per patient row, numbering questions consecutively
# across the whole dataset.
vqa_json_list = []
global_qid = 1
# Rows 0 and 1 are skipped — presumably non-patient description rows at the
# top of the sheet; TODO confirm against the workbook.
for i in range(2, len(data)):
    row = data.iloc[i]
    # The row is a Series indexed by column labels, so the first cell must be
    # read with .iloc: plain row[0] relies on a positional fallback that
    # pandas has deprecated.
    patient_id = row.iloc[0]
    vqa_json, qid = row_to_vqa_json(row, patient_id, global_qid)
    # row_to_vqa_json returns the last id it used, so the next row starts
    # one past it.
    global_qid = qid + 1
    vqa_json_list.append(vqa_json)

In [73]:
# pprint(vqa_json_list)
# Write every patient record to a single JSON file, indented by 4 spaces.
output_path = './duke_vqa.json'
with open(output_path, mode='w') as json_file:
    json.dump(vqa_json_list, json_file, indent=4)
# Reached only when the dump above completed without raising.
print(f"Saved to {output_path}")

Saved to ./duke_vqa.json
