In [9]:
import os
import cv2
import json
import pandas as pd
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

In [10]:
# Directory tree that is walked recursively for .dcm files (renaming and listing).
BASE_DIR = "CMMD/manifest-1616439774456/CMMD"
# Excel workbook with the clinical data; read with pd.read_excel further down.
XLSX_PATH = "CMMD/CMMD_clinicaldata_revision.xlsx"  # <- change this


In [11]:

def rename_dicoms_with_patient_id(base_dir):
    """Prefix every ``.dcm`` file under ``base_dir`` with its patient ID.

    The patient ID is the name of the first-level sub-directory of
    ``base_dir`` that contains the file, so ``base_dir/<ID>/.../x.dcm`` is
    renamed to ``base_dir/<ID>/.../<ID>___x.dcm``. Files that already start
    with ``<ID>___`` are left alone, so the function can be run repeatedly.

    Files located directly in ``base_dir`` are not inside any patient folder
    and are left untouched. A rename whose target name already exists is
    skipped rather than overwriting the existing file.

    Args:
        base_dir: Root directory whose immediate sub-directories are named
            after patient IDs.

    Returns:
        None. Every rename (and every skipped collision) is printed.
    """
    for root, _, files in os.walk(base_dir):
        relative_path = os.path.relpath(root, base_dir)
        if relative_path == os.curdir:
            # At the walk root the relative path is "."; using it as an ID
            # would produce ".___<name>" files, so skip this level's files.
            continue

        # First path component below base_dir = patient ID.
        patient_id = relative_path.split(os.sep)[0]
        prefix = patient_id + "___"

        for file in files:
            if not file.endswith(".dcm") or file.startswith(prefix):
                continue

            old_path = os.path.join(root, file)
            new_path = os.path.join(root, prefix + file)

            if os.path.exists(new_path):
                # os.rename replaces an existing file silently on POSIX;
                # refuse to clobber data that is already there.
                print(f"Skipped (target exists): {old_path} -> {new_path}")
                continue

            os.rename(old_path, new_path)
            print(f"Renamed: {old_path} -> {new_path}")

In [12]:
def load_dicom(BASE_DIR):
    """Return the paths of all ``.dcm`` files found anywhere under ``BASE_DIR``.

    Args:
        BASE_DIR: Directory to search recursively.

    Returns:
        A list of paths (each one ``os.path.join`` of the containing folder
        and the file name), in the order ``os.walk`` visits them. The list is
        empty when nothing matches.
    """
    return [
        os.path.join(folder, name)
        for folder, _, names in os.walk(BASE_DIR)
        for name in names
        if name.endswith(".dcm")
    ]

In [13]:
# Renames .dcm files on disk in place; files already carrying the
# "<patient_id>___" prefix are skipped by the function.
rename_dicoms_with_patient_id(BASE_DIR)

In [15]:
# Collect the path of every .dcm file under BASE_DIR and report how many were found.
print("Loading DICOM files...")
dicom_files = load_dicom(BASE_DIR)
print(f"Found {len(dicom_files)} DICOM files.")


Loading DICOM files...
Found 5202 DICOM files.


In [19]:
# NOTE(review): pandas is already imported in the notebook's first cell; this
# repeat is redundant under Run All and is kept only so the cell is unchanged
# in behavior when executed on its own.
import pandas as pd

# Read the clinical workbook and report its dimensions.
print("Loading clinical data...")
clinical_data = pd.read_excel(XLSX_PATH, sheet_name="Sheet1")
print(f"Loaded clinical data with {clinical_data.shape[0]} rows and {clinical_data.shape[1]} columns.")
print("Processing 'both' values in 5th column...")

# The column is selected by position (index 4 = 5th column), not by name.
column_name = clinical_data.columns[4]

# Partition the rows: those labelled 'both' and everything else.
both_rows = clinical_data.loc[clinical_data[column_name].eq('both')]
non_both_rows = clinical_data.loc[clinical_data[column_name].ne('both')]

# Every 'both' row is emitted twice: once as 'calcification', once as 'mass'.
calcification_rows = both_rows.assign(**{column_name: 'calcification'})
mass_rows = both_rows.assign(**{column_name: 'mass'})

# Stack the untouched rows first, then the two relabelled copies, with a fresh index.
clinical_data_expanded = pd.concat(
    [non_both_rows, calcification_rows, mass_rows],
    ignore_index=True,
)

# Write the expanded table to a new Excel file (the input workbook is not modified).
output_path = "CMMD/CMMD_clinicaldata_revision_sanitized.xlsx"
clinical_data_expanded.to_excel(output_path, index=False)

print(f"Expanded clinical data saved to: {output_path}")


Loading clinical data...
Loaded clinical data with 1872 rows and 7 columns.
Processing 'both' values in 5th column...
Expanded clinical data saved to: CMMD/CMMD_clinicaldata_revision_sanitized.xlsx


In [18]:
# Build sample dictionary
sample_dict = {}
for _, row in clinical_data.iterrows():
    patient_id = row['ID1']
    sample_dict[patient_id] = row.to_dict()
# Display the first few entries of the sample dictionary
print("Sample dictionary created with patient IDs as keys.")
# Display the first few entries
print(list(sample_dict.items())[:5])

Sample dictionary created with patient IDs as keys.
[('D1-0001', {'ID1': 'D1-0001', 'LeftRight': 'R', 'Age': 44, 'number': 2, 'abnormality': 'calcification', 'classification': 'Benign', 'subtype': nan}), ('D1-0002', {'ID1': 'D1-0002', 'LeftRight': 'L', 'Age': 40, 'number': 2, 'abnormality': 'calcification', 'classification': 'Benign', 'subtype': nan}), ('D1-0003', {'ID1': 'D1-0003', 'LeftRight': 'L', 'Age': 39, 'number': 2, 'abnormality': 'calcification', 'classification': 'Benign', 'subtype': nan}), ('D1-0004', {'ID1': 'D1-0004', 'LeftRight': 'L', 'Age': 41, 'number': 2, 'abnormality': 'calcification', 'classification': 'Benign', 'subtype': nan}), ('D1-0005', {'ID1': 'D1-0005', 'LeftRight': 'R', 'Age': 42, 'number': 2, 'abnormality': 'calcification', 'classification': 'Benign', 'subtype': nan})]
