In [13]:
import os
import pandas as pd

# --- Step 1: read the combined structured + text table ---
df = pd.read_csv("LUAD_multimodal_dataset.csv")

print("üìÑ Loaded dataset:", df.shape)
print("Columns:", list(df.columns))

# --- Step 2: show the working directory and a sample of the .nii files in it ---
print("\nüìÇ Current directory:", os.getcwd())
print(
    "üîé Example files:",
    list(filter(lambda name: name.endswith(".nii"), os.listdir()))[:10],
)

# --- 3. Define a robust matcher for image paths ---
def find_image_path(pid, base_dir=None):
    """Locate the ``.nii`` file that belongs to a patient ID.

    Candidates are tried from strictest to loosest, and the first hit wins:

    1. The file stem equals the ID (case-insensitive).
    2. The ID occurs inside the file name as a whole token, i.e. it is not
       preceded by a letter/digit and not followed by a digit. This keeps
       ``R_004`` from being resolved to ``R_0040.nii``.
    3. For ``R``-style IDs (``R0004``, ``R_004``), the stem is also R-style
       and carries the same number, whatever the zero padding or underscore.

    Parameters
    ----------
    pid : object
        Patient identifier. It is converted with ``str()``, stripped and
        upper-cased before matching. ``None``, NaN and blank IDs never match.
    base_dir : str or os.PathLike, optional
        Directory to search. Defaults to the current working directory.

    Returns
    -------
    str or None
        ``os.path.join(base_dir, filename)`` for the matched file, or ``None``
        when nothing matches.
    """
    import re

    # A missing ID must not be stringified: "NAN" or "" would otherwise be
    # treated as a pattern and could match an unrelated file.
    if pid is None or (isinstance(pid, float) and pid != pid):
        return None
    pid = str(pid).strip().upper()
    if not pid:
        return None

    base_dir = os.getcwd() if base_dir is None else os.fspath(base_dir)

    # Sorted so the result does not depend on the order os.listdir() returns.
    nii_files = sorted(
        f for f in os.listdir(base_dir) if f.lower().endswith(".nii")
    )

    # --- 1) Exact stem match ---
    for f in nii_files:
        if os.path.splitext(f)[0].upper() == pid:
            return os.path.join(base_dir, f)

    # --- 2) ID embedded in a longer name (e.g. a suffix after the ID) ---
    # The lookarounds stop the ID from matching in the middle of a longer
    # identifier or number.
    bounded = re.compile(r"(?<![A-Z0-9])" + re.escape(pid) + r"(?![0-9])")
    for f in nii_files:
        if bounded.search(f.upper()):
            return os.path.join(base_dir, f)

    # --- 3) Same number, different formatting (handles R0004 <-> R_004) ---
    r_style = re.compile(r"R_?(\d+)")
    pid_match = r_style.fullmatch(pid)
    if pid_match:
        pid_num = int(pid_match.group(1))
        for f in nii_files:
            file_match = r_style.fullmatch(os.path.splitext(f)[0].upper())
            if file_match and int(file_match.group(1)) == pid_num:
                return os.path.join(base_dir, f)

    return None


# --- Step 4: resolve an image path for every patient row ---
df["ImagePath"] = df["PatientID"].apply(find_image_path)

# --- Step 5: persist the table together with the new column ---
df.to_csv("LUAD_multimodal_dataset_with_paths.csv", index=False)
print("\n‚úÖ Saved ‚Üí LUAD_multimodal_dataset_with_paths.csv")

# --- Step 6: summarise how many rows received a path ---
matched = df["ImagePath"].notna().sum()
print(f"\nüß† Matched images: {matched} out of {len(df)}")
if matched >= len(df):
    print("üéâ All image paths successfully matched!")
else:
    # Collect the IDs whose lookup returned nothing so they can be inspected.
    missing = df.loc[df["ImagePath"].isna(), "PatientID"].tolist()
    print("\n‚ö†Ô∏è Missing matches for:", missing)


üìÑ Loaded dataset: (40, 9)
Columns: ['PatientID', 'survival_label', 'Longest.Diameter..mm.', 'Short.Axis..mm.', 'Mean..HU.', 'StdDev..HU.', 'Volume..cm..', 'Report', 'ImagePath']

üìÇ Current directory: c:\Users\sohel\Downloads\Ai_health\LUAD_CT_SURVIVAL_segmen
üîé Example files: ['QIN-LSC-0009.nii', 'QIN-LSC-0014.nii', 'QIN-LSC-0064.nii', 'R_004.nii', 'R_013.nii', 'R_017.nii', 'R_018.nii', 'R_019.nii', 'R_022.nii', 'R_033.nii']

‚úÖ Saved ‚Üí LUAD_multimodal_dataset_with_paths.csv

üß† Matched images: 40 out of 40
üéâ All image paths successfully matched!


In [None]:
python merge_multimodal_data.py


In [2]:
import random

import pandas as pd

# Read the LUAD-CT-Survival feature table from disk.
df = pd.read_csv("FeaturesWithLabels_1.csv")



In [3]:
# Preview the first five rows of the feature table loaded in the previous cell.
df.head()

Unnamed: 0,RID,survival_label,Longest.Diameter..mm.,Short.Axis...Longest.Diameter..mm..,Short.Axis..mm.,Mean..HU.,StdDev..HU.,Volume..cm..,X5a_3D_MacSpic_NumberOf,X8a_3D_Is_Attached_To_Pleural_Wall,...,X3D.Wavelet.decomposition...P1.L2.C3.Layer.1,X3D.Wavelet.decomposition...P1.L2.C4.Layer.1,X3D.Wavelet.decomposition...P1.L2.C5.Layer.1,X3D.Wavelet.decomposition...P1.L2.C6.Layer.1,X3D.Wavelet.decomposition...P1.L2.C7.Layer.1,X3D.Wavelet.decomposition...P1.L2.C8.Layer.1,Convexity_Mean,Conv_Area_Perim,IsAttached,Std_Conv
0,R0004,Long,45.5,1520.73,33.42,-20.13,136.8,33.06,0,0,...,922.503735,7055.561581,25833.09876,8449.116111,737.184747,6172.425054,0.8635,0.642984,0.8635,0.097923
1,R0013,Long,41.82,617.67,14.77,-9.56,119.63,14.78,0,1,...,717.342696,4189.960074,15237.07917,4561.750068,613.539088,3699.644891,0.845099,0.866819,0.80782,0.027609
2,R0017,Long,18.95,240.46,12.69,-73.58,151.36,1.76,0,1,...,11.057679,15.946198,1585.785644,2164.498445,59.819492,491.460387,0.931435,0.918455,0.931435,0.01392
3,R0018,Short,35.04,1086.57,31.01,28.45,93.49,16.85,0,1,...,494.811919,3947.60138,12430.67292,3005.110061,400.538246,3257.48375,0.917999,0.897987,0.917999,0.02573
4,R0019,Short,31.87,613.64,19.25,-30.0,146.4,10.97,0,1,...,89.74234,571.684059,8299.075649,2753.117372,89.089983,381.985943,0.931737,0.946039,0.860215,0.039337


In [5]:
# List every column name; the head() preview elides the middle columns.
df.columns.tolist()

['RID',
 'survival_label',
 'Longest.Diameter..mm.',
 'Short.Axis...Longest.Diameter..mm..',
 'Short.Axis..mm.',
 'Mean..HU.',
 'StdDev..HU.',
 'Volume..cm..',
 'X5a_3D_MacSpic_NumberOf',
 'X8a_3D_Is_Attached_To_Pleural_Wall',
 'X8b_3D_Relative_Border_To_Lung',
 'X8c_3D_Relative_Border_To_PleuralWall',
 'X8d_3D_Ratio_Free_To_Attached',
 'X9a_3D_FractionalAnisotropy',
 'X9b_3D_Circularity',
 'X9c_3D_Compactness',
 'X9d_3D_AV_Dist_COG_To_Border_.mm.',
 'X9e_3D_SD_Dist_COG_To_Border_.mm.',
 'X9f_3D_MIN_Dist_COG_To_Border_.mm.',
 'X9g_3D_MAX_Dist_COG_To_Border_.mm.',
 'X10a_3D_Relative_Volume_AirSpaces',
 'X10b_3D_Number_AirSpaces',
 'X10c_3D_Av_Volume_AirSpaces_.mm..',
 'X10d_3D_SD_Volume_AirSpaces_.mm..',
 'Asymmetry',
 'Compactness',
 'Density',
 'Elliptic.Fit',
 'Main.direction',
 'Radius.of.largest.enclosed.ellipse',
 'Radius.of.smallest.enclosing.ellipse',
 'Shape.index',
 'Roundness',
 'Rectangular.Fit',
 'Area..Pxl.',
 'Volume..Pxl.',
 'Number.of.pixels',
 'Width..Pxl.',
 'Thicknes

In [None]:
# Rename RID -> PatientID. The non-mutating form returns a new frame instead
# of modifying the one loaded earlier.
df = df.rename(columns={"RID": "PatientID"})

# The reports below are drawn at random; fixing the seed makes every run of
# this cell and the cells after it produce the same text.
REPORT_SEED = 42
random.seed(REPORT_SEED)

# Sentence skeletons; the placeholders are filled in by make_report().
templates = [
    "CT scan shows {location} {size} mm mass, Stage {stage}, {response} to treatment.",
    "{location} lesion ({size} mm) consistent with Stage {stage} adenocarcinoma. {response}.",
    "Stage {stage} {location} tumor measuring {size} mm. {response} observed."
]

# Candidate values for the {location} and {response} placeholders.
locations = ["left upper lobe", "right lower lobe", "left hilum", "right upper lobe"]
responses = ["partial response", "stable disease", "disease progression", "complete remission"]

def make_report(row, rng=None):
    """Build one synthetic radiology sentence for a patient row.

    The lesion size is read from the ``"Longest.Diameter..mm."`` entry of
    ``row``. Stage, location, response and sentence template are all chosen
    at random; the stage is random because the dataset has no stage column.

    Parameters
    ----------
    row : mapping-like
        Anything with a ``.get(key)`` method, e.g. the Series that
        ``DataFrame.apply(..., axis=1)`` passes in, or a plain dict.
    rng : random.Random, optional
        Source of randomness. Defaults to the module-level ``random``
        generator, so ``random.seed(...)`` controls the output. Pass a
        dedicated ``random.Random(seed)`` for isolated, repeatable results.

    Returns
    -------
    str
        The formatted report sentence.
    """
    if rng is None:
        rng = random

    size = row.get("Longest.Diameter..mm.")
    # Fall back to a random size only when there is no usable measurement.
    # pd.isna covers both a missing key (None) and a NaN cell, which would
    # otherwise be rendered as "nan mm".
    if pd.isna(size):
        size = rng.randint(20, 50)

    # Randomly assign stage since dataset has no explicit stage column
    stage = rng.choice(["II", "III", "IV"])
    location = rng.choice(locations)
    response = rng.choice(responses)
    template = rng.choice(templates)
    return template.format(size=size, stage=stage, location=location, response=response)

# Generate one synthetic report per row.
df["Report"] = df.apply(make_report, axis=1)

# Write only the ID and report columns to disk.
df.to_csv("radiology_reports.csv", columns=["PatientID", "Report"], index=False)

print("‚úÖ File saved ‚Üí radiology_reports.csv")


In [6]:
# Feature table, with its ID column renamed to the key used by the reports file.
df_rad = pd.read_csv("FeaturesWithLabels_1.csv")
df_rad = df_rad.rename(columns={"RID": "PatientID"})

# Synthetic report text keyed by PatientID.
df_txt = pd.read_csv("radiology_reports.csv")

# Keep only patients present in both tables.
df = pd.merge(df_rad, df_txt, how="inner", on="PatientID")

print(df.head())


  PatientID survival_label  Longest.Diameter..mm.  \
0     R0004           Long                  45.50   
1     R0013           Long                  41.82   
2     R0017           Long                  18.95   
3     R0018          Short                  35.04   
4     R0019          Short                  31.87   

   Short.Axis...Longest.Diameter..mm..  Short.Axis..mm.  Mean..HU.  \
0                              1520.73            33.42     -20.13   
1                               617.67            14.77      -9.56   
2                               240.46            12.69     -73.58   
3                              1086.57            31.01      28.45   
4                               613.64            19.25     -30.00   

   StdDev..HU.  Volume..cm..  X5a_3D_MacSpic_NumberOf  \
0       136.80         33.06                        0   
1       119.63         14.78                        0   
2       151.36          1.76                        0   
3        93.49         16.85    