In [6]:
import pandas as pd

file_path = './tcia-diagnosis-data-2012-04-20.xlsx'
data = pd.read_excel(file_path)

# Replace the spreadsheet headers with short names: four patient-level
# columns, then an (n<i>, n<i>_method) pair for each of nodules 1 to 5.
data.columns = ['pid', 'diagnosis', 'method', 'tumor'] + [
    name
    for i in range(1, 6)
    for name in (f'n{i}', f'n{i}_method')
]

print(data.head())

              pid  diagnosis  method               tumor   n1  n1_method  n2   
0  LIDC-IDRI-0068          3       4  Head & Neck Cancer  3.0        4.0 NaN  \
1  LIDC-IDRI-0071          3       1        Head & Neck   1.0        1.0 NaN   
2  LIDC-IDRI-0072          2       4         Lung Cancer  1.0        4.0 NaN   
3  LIDC-IDRI-0088          3       0      Uterine Cancer  0.0        0.0 NaN   
4  LIDC-IDRI-0090          2       3               NSCLC  2.0        3.0 NaN   

   n2_method   n3 n3_method  n4  n4_method  n5  n5_method  
0        NaN  NaN       NaN NaN        NaN NaN        NaN  
1        NaN  NaN       NaN NaN        NaN NaN        NaN  
2        NaN  NaN       NaN NaN        NaN NaN        NaN  
3        NaN  NaN       NaN NaN        NaN NaN        NaN  
4        NaN  NaN       NaN NaN        NaN NaN        NaN  


In [15]:
# Patient-level diagnosis: integer code -> wording used in the report text.
diagnosis_dict = dict(enumerate((
    'unknown',                          # code 0
    'benign or non-malignant disease',  # code 1
    'malignant, primary lung cancer',   # code 2
    'malignant metastatic',             # code 3
)))

# How the patient-level diagnosis was established: integer code -> wording.
diagnosis_method_dict = dict(enumerate((
    'unknown',                                                         # code 0
    'review of radiological images to show 2 years of stable nodule',  # code 1
    'biopsy',                                                          # code 2
    'surgical resection',                                              # code 3
    'progression or response',                                         # code 4
)))

# The nodule-level tables hold the same code -> wording pairs as the
# patient-level ones. They are independent copies, so changing one table
# never alters the other.
nodule_diagnosis_dict = dict(diagnosis_dict)
nodule_method_dict = dict(diagnosis_method_dict)


def generate_report(row):
    """Build a one-paragraph English diagnosis summary for one patient row.

    Parameters
    ----------
    row : mapping-like (e.g. a ``pandas.Series``)
        Must provide the keys ``'diagnosis'``, ``'method'`` and ``'tumor'``,
        plus ``'n<i>'`` for each nodule i in 1..5. ``'n<i>_method'`` is read
        for every nodule whose ``'n<i>'`` cell is present.

    Returns
    -------
    str
        A sentence for the patient-level diagnosis and how it was determined,
        a sentence for the primary tumor site, and then one sentence for each
        nodule whose diagnosis cell is not missing.

    Notes
    -----
    Codes are translated through the module-level lookup dicts. A missing or
    unrecognised code is worded as ``'unknown'``, as is a missing tumor site.
    """
    diagnosis = diagnosis_dict.get(row['diagnosis'], 'unknown')
    diagnosis_method = diagnosis_method_dict.get(row['method'], 'unknown')
    primary_tumor_site = row['tumor'] if pd.notna(row['tumor']) else 'unknown'

    report = (f"The patient was diagnosed as having {diagnosis}, "
              f"which was determined by {diagnosis_method}. "
              f"The primary tumor site was found to be {primary_tumor_site}.")

    ordinals = ('first', 'second', 'third', 'fourth', 'fifth')
    for index, ordinal in enumerate(ordinals, start=1):
        nodule = row[f'n{index}']
        # Test for a missing cell, not for truthiness: code 0 ('unknown') is
        # falsy, so a truthiness check would drop a nodule that was recorded
        # with an unknown diagnosis.
        if pd.notna(nodule):
            nodule_method = row[f'n{index}_method']
            # A missing method cell is simply not a key of the lookup table,
            # so .get() falls back to 'unknown'.
            report += (
                f" The {ordinal} nodule was diagnosed as "
                f"{nodule_diagnosis_dict.get(nodule, 'unknown')}, "
                f"which was determined by "
                f"{nodule_method_dict.get(nodule_method, 'unknown')}."
            )

    return report

# Build the report text for every row: with axis=1, apply passes each row
# to generate_report, and the results are stored in a new 'Report' column.
data['Report'] = data.apply(generate_report, axis=1)

# Optional preview of the first few generated reports (left disabled):
# for report in data['Report'].head():
#     print(report)

The patient was diagnosed as having malignant metastatic, which was determined by progression or response. The primary tumor site was found to be Head & Neck Cancer. The first nodule was diagnosed as malignant metastatic, which was determined by progression or response.
The patient was diagnosed as having malignant metastatic, which was determined by review of radiological images to show 2 years of stable nodule. The primary tumor site was found to be Head & Neck . The first nodule was diagnosed as benign or non-malignant disease, which was determined by review of radiological images to show 2 years of stable nodule.
The patient was diagnosed as having malignant, primary lung cancer, which was determined by progression or response. The primary tumor site was found to be Lung Cancer. The first nodule was diagnosed as benign or non-malignant disease, which was determined by progression or response.
The patient was diagnosed as having malignant metastatic, which was determined by unknown.

In [None]:
# Print the 'Report' column. Note: pandas' text repr may shorten the output
# for a long Series, so this is a quick look rather than a full listing.
print(data['Report'])

In [None]:
output_file = 'patient_report_with_text.xlsx'

# The identifier column was renamed to 'pid' when the spreadsheet was loaded,
# so selecting 'Patient_ID' from `data` raises KeyError. Select 'pid' and
# restore the 'Patient_ID' header only in the exported sheet.
(
    data[['pid', 'Report']]
    .rename(columns={'pid': 'Patient_ID'})
    .to_excel(output_file, index=False)
)

print(f"Reports have been successfully generated and saved to {output_file}")