In [34]:
import pandas as pd

file_path = './tcia-diagnosis-data-2012-04-20.xlsx'
data = pd.read_excel(file_path)

# rename the columns
data.columns = ['pid', 'diagnosis', 'method', 'tumor', 'n1', 'n1_method', 'n2', 'n2_method', 'n3', 'n3_method', 'n4', 'n4_method', 'n5', 'n5_method']

diagnosis_dict = {
    0: 'unknown',
    1: 'benign or non-malignant disease',
    2: 'malignant, primary lung cancer',
    3: 'malignant metastatic'
}

diagnosis_method_dict = {
    0: 'unknown',
    1: 'review of radiological images to show 2 years of stable nodule',
    2: 'biopsy',
    3: 'surgical resection',
    4: 'progression or response'
}

nodule_diagnosis_dict = {
    0: 'unknown',
    1: 'benign or non-malignant disease',
    2: 'malignant, primary lung cancer',
    3: 'malignant metastatic'
}

nodule_method_dict = {
    0: 'unknown',
    1: 'review of radiological images to show 2 years of stable nodule',
    2: 'biopsy',
    3: 'surgical resection',
    4: 'progression or response'
}

In [35]:
def ordinal(n):
    ordinals = {1: 'first', 2: 'second', 3: 'third', 4: 'fourth', 5: 'fifth'}
    return ordinals.get(n, f"{n}th")

def generate_report(row):
    diagnosis = diagnosis_dict.get(row['diagnosis'], 'unknown')
    diagnosis_method = diagnosis_method_dict.get(row['method'], 'unknown')
    primary_tumor_site = row['tumor'] if pd.notna(row['tumor']) else 'unknown'
    
    report = (f"The patient was diagnosed with {diagnosis}, ")
    if diagnosis_method != 'unknown':
        report += (f"which was determined by {diagnosis_method}, ")

    report += (f"The primary tumor site for metastatic disease is {primary_tumor_site}.")
    
    for i in range(1, 6):
        nodule = row.get(f'n{i}', None)
        nodule_method = row.get(f'n{i}_method', None)
        
        if pd.notna(nodule):
            nodule_diagnosis = nodule_diagnosis_dict.get(nodule, 'unknown')
            nodule_diagnosis_method = nodule_method_dict.get(nodule_method, 'unknown')
            report += f" The {ordinal(i)} nodule was diagnosed as {nodule_diagnosis}, which was determined by {nodule_diagnosis_method}."

    return report

data['report'] = data.apply(generate_report, axis=1)

In [36]:
for report in data['report'].head():
    print(report)

The patient was diagnosed with malignant metastatic, which was determined by progression or response, The primary tumor site for metastatic disease is Head & Neck Cancer. The first nodule was diagnosed as malignant metastatic, which was determined by progression or response.
The patient was diagnosed with malignant metastatic, which was determined by review of radiological images to show 2 years of stable nodule, The primary tumor site for metastatic disease is Head & Neck . The first nodule was diagnosed as benign or non-malignant disease, which was determined by review of radiological images to show 2 years of stable nodule.
The patient was diagnosed with malignant, primary lung cancer, which was determined by progression or response, The primary tumor site for metastatic disease is Lung Cancer. The first nodule was diagnosed as benign or non-malignant disease, which was determined by progression or response.
The patient was diagnosed with malignant metastatic, The primary tumor site

In [37]:
output_file = 'patient_report.xlsx'
data[['pid', 'report']].to_excel(output_file, index=False)

print(f"Reports have been successfully generated and saved to {output_file}")

Reports have been successfully generated and saved to patient_report.xlsx
