In [None]:
import os
import re
from datetime import datetime
import pandas as pd

def parse_dicom_filename(file_name):
    # Regex to extract datetime in format YYYYMMDDHHMMSS
    datetime_pattern = r"(\d{8})(\d{6})"  # Matches YYYYMMDDHHMMSS

    # Split the filename by underscores ('_')
    parts = file_name.split('_')

    # Extract patient ID
    patient_id = parts[1] + "_" + parts[2]  # e.g., 003_S_6644

    # Extract scan type
    scan_type = parts[3] if len(parts) >= 4 else None

    # Use regex to find the datetime in the filename
    match = re.search(datetime_pattern, file_name)
    if match:
        date_part = match.group(1)  # YYYYMMDD
        time_part = match.group(2)  # HHMMSS
        datetime_str = f"{date_part[:4]}-{date_part[4:6]}-{date_part[6:8]} {time_part[:2]}:{time_part[2:4]}:{time_part[4:6]}"
        try:
            datetime_obj = datetime.strptime(datetime_str, '%Y-%m-%d %H:%M:%S')
        except ValueError:
            datetime_obj = None
    else:
        datetime_obj = None

    return {
        'patient_id': patient_id,
        'scan_type': scan_type,
        'datetime': datetime_obj,
        'filename': file_name
    }

def process_folders(base_dir):
    # List to store parsed data
    all_data = []

    # Traverse directories with os.walk()
    for root, dirs, files in os.walk(base_dir):
        if 'I' in os.path.basename(root):  # Only look at folders containing DICOM files
            # print(f'    Processing directory: {root}')
            
            # Loop through files and try parsing the DICOM filenames
            for file_name in files:
                if file_name.endswith('.dcm'):
                    file_path = os.path.join(root, file_name)
                    
                    # Parse the filename and append to the data list
                    parsed_data = parse_dicom_filename(file_name)
                    if parsed_data:
                        parsed_data['file_path'] = file_path  # Add full file path to the data
                        all_data.append(parsed_data)

    # Convert the data into a DataFrame
    df = pd.DataFrame(all_data)

    # Save the DataFrame to a CSV file for further analysis
    df.to_csv("dicom_metadata.csv", index=False)
    print("Metadata saved to 'dicom_metadata.csv'")

    return df

# Set the base directory
base_dir = "D:/DL_DATASET/test_folder/ADNI"  # Adjust this to your actual directory

# Process the directories and get the DataFrame
df = process_folders(base_dir)

# Display the first few rows
print(df.head())


    Processing directory: D:/DL_DATASET/test_folder/ADNI
    Processing directory: D:/DL_DATASET/test_folder/ADNI\003_S_6644\3_Plane_Localizer\2018-12-04_14_13_04.0\I1083043
    Processing directory: D:/DL_DATASET/test_folder/ADNI\005_S_4168\3_Plane_Localizer\2011-08-17_13_55_20.0\I251359
    Processing directory: D:/DL_DATASET/test_folder/ADNI\005_S_4168\3_Plane_Localizer\2012-03-28_11_44_12.0\I294035
    Processing directory: D:/DL_DATASET/test_folder/ADNI\005_S_4168\3_Plane_Localizer\2013-09-12_13_05_35.0\I389814
    Processing directory: D:/DL_DATASET/test_folder/ADNI\005_S_4168\Axial_FLAIR
    Processing directory: D:/DL_DATASET/test_folder/ADNI\005_S_4168\Axial_FLAIR\2011-11-30_11_27_21.0\I269263
    Processing directory: D:/DL_DATASET/test_folder/ADNI\005_S_4168\Axial_FLAIR\2015-09-14_13_55_42.0\I511907
    Processing directory: D:/DL_DATASET/test_folder/ADNI\005_S_4168\Calibration_Scan\2012-09-06_09_46_29.0\I331874
    Processing directory: D:/DL_DATASET/test_folder/ADNI\002_S_

In [26]:
import pandas as pd

# Load metadata CSV into pandas dataframe (the provided dataset)
metadata_df = pd.read_csv("D:/DL_DATASET/Cohort_4_MRI_Images_02Dec2024.csv")

# Load the dicom_metadata CSV (already extracted from filenames)
dicom_metadata_path = "dicom_metadata.csv"
dicom_metadata_df = pd.read_csv(dicom_metadata_path)

# List to hold the matched results
matches = []

# Iterate over all DICOM files in the dicom_metadata.csv
for index, dicom_row in dicom_metadata_df.iterrows():
    # Extract image_id from the dicom file
    dicom_file = dicom_row['filename']
    image_id = dicom_file.split("I")[-1].split(".")[0]
    
    # Match the image_id with metadata
    metadata_row = metadata_df[metadata_df['image_id'] == int(image_id)]
    
    if not metadata_row.empty:
        # Extract relevant details
        patient_id = metadata_row['subject_id'].values[0]
        mri_date = metadata_row['mri_date'].values[0]
        mri_acq_plane = metadata_row['mri_acq_plane'].values[0]
        mri_description = metadata_row['mri_description'].values[0]
        mri_type = metadata_row['mri_type'].values[0]
        mri_sequence = metadata_row['mri_sequence'].values[0]
        mri_field_str = metadata_row['mri_field_str'].values[0]
        
        # Add matched data to the list
        matches.append({
            'dicom_file': dicom_file,
            'patient_id': patient_id,
            'mri_date': mri_date,
            'mri_acq_plane': mri_acq_plane,
            'mri_description': mri_description,
            'mri_type': mri_type,
            'mri_sequence': mri_sequence,
            'mri_field_str': mri_field_str
        })
    else:
        print(f"No metadata found for {dicom_file}")

# Convert matches to a DataFrame and save to CSV
matched_df = pd.DataFrame(matches)

# Specify the path for the merged file
merged_metadata_path = "merged_mri_metadata.csv"

# Save the matched data
matched_df.to_csv(merged_metadata_path, index=False)

# Show a preview of the merged data
print(f"Merged data saved to {merged_metadata_path}")
print(matched_df.head())


Merged data saved to merged_mri_metadata.csv
                                          dicom_file  patient_id    mri_date  \
0  ADNI_003_S_6644_MR_3_Plane_Localizer__raw_2018...  003_S_6644  2018-12-04   
1  ADNI_005_S_4168_MR_3_Plane_Localizer__br_raw_2...  005_S_4168  2011-08-17   
2  ADNI_005_S_4168_MR_3_Plane_Localizer__br_raw_2...  005_S_4168  2011-08-17   
3  ADNI_005_S_4168_MR_3_Plane_Localizer__br_raw_2...  005_S_4168  2011-08-17   
4  ADNI_005_S_4168_MR_3_Plane_Localizer__br_raw_2...  005_S_4168  2012-03-28   

  mri_acq_plane    mri_description mri_type mri_sequence  mri_field_str  
0      SAGITTAL  3 Plane Localizer       2D           GR            3.0  
1      SAGITTAL  3 Plane Localizer       2D           GR            3.0  
2      SAGITTAL  3 Plane Localizer       2D           GR            3.0  
3      SAGITTAL  3 Plane Localizer       2D           GR            3.0  
4       CORONAL  3 Plane Localizer       2D           GR            3.0  


In [33]:
import pydicom
import cv2
import numpy as np
import pandas as pd
from tqdm import tqdm

# Load the merged metadata file
metadata_df = pd.read_csv('dicom_metadata.csv')

# Function to load DICOM images and preprocess them
def load_and_preprocess_dicom(file_path, target_size=(256, 256)):
    # Read DICOM file
    dicom_data = pydicom.dcmread(file_path)
    
    # Convert DICOM pixel data to numpy array (assuming the pixel data is in the 'PixelData' attribute)
    img = dicom_data.pixel_array
    
    # Resize the image by padding if necessary to maintain the aspect ratio
    h, w = img.shape
    scale = min(target_size[0] / h, target_size[1] / w)
    new_w = int(w * scale)
    new_h = int(h * scale)
    
    # Resize the image to the new dimensions
    resized_img = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_LINEAR)
    
    # Add padding to make it the target size
    top = (target_size[0] - new_h) // 2
    bottom = target_size[0] - new_h - top
    left = (target_size[1] - new_w) // 2
    right = target_size[1] - new_w - left
    
    padded_img = cv2.copyMakeBorder(resized_img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=0)
    
    return padded_img

# Prepare the data
processed_images = []

# Loop through the metadata and process each file
for idx, row in tqdm(metadata_df.iterrows(), total=metadata_df.shape[0]):
    dicom_file = row['file_path']
    
    # Check if the file exists (ensure you have valid paths)
    try:
        processed_img = load_and_preprocess_dicom(dicom_file)
        processed_images.append(processed_img)
    except Exception as e:
        print(f"Error processing {dicom_file}: {e}")

# Convert list of images into numpy array
processed_images_array = np.array(processed_images)

# Save the preprocessed images as .npy file
np.save("D:/DL_DATASET/processed_images.npy", processed_images_array)

print("Preprocessing complete. Images saved as .npy.")


100%|██████████| 2503/2503 [00:30<00:00, 80.83it/s] 


Preprocessing complete. Images saved as .npy.


In [None]:
import matplotlib.pyplot as plt
# Select 10 random indices from the processed images to visualize
indices = np.random.choice(len(processed_images_array), 5, replace=False)

# Plot the original vs processed images
fig, axes = plt.subplots(5, 2, figsize=(10, 30))

for i, idx in enumerate(indices):
    # Load original DICOM image
    dicom_file = metadata_df.iloc[idx]['file_path']
    dicom_data = pydicom.dcmread(dicom_file)
    original_img = dicom_data.pixel_array
    
    # Get the processed image
    processed_img = processed_images_array[idx]
    
    # Plot original image
    axes[i, 0].imshow(original_img, cmap='gray')
    axes[i, 0].set_title(f"Original Image {idx+1}")
    axes[i, 0].axis('off')
    
    # Plot processed image
    axes[i, 1].imshow(processed_img, cmap='gray')
    axes[i, 1].set_title(f"Processed Image {idx+1}")
    axes[i, 1].axis('off')

plt.tight_layout()
plt.show()
