In [12]:
import os
import re
from datetime import datetime
import pandas as pd

def parse_dicom_filename(file_name):
    # Regex to extract datetime in format YYYYMMDDHHMMSS
    datetime_pattern = r"(\d{8})(\d{6})"  # Matches YYYYMMDDHHMMSS

    # Split the filename by underscores ('_')
    parts = file_name.split('_')

    # Extract patient ID
    patient_id = parts[1] + "_" + parts[2]  # e.g., 003_S_6644

    # Extract scan type
    scan_type = parts[3] if len(parts) >= 4 else None

    # Use regex to find the datetime in the filename
    match = re.search(datetime_pattern, file_name)
    if match:
        date_part = match.group(1)  # YYYYMMDD
        time_part = match.group(2)  # HHMMSS
        datetime_str = f"{date_part[:4]}-{date_part[4:6]}-{date_part[6:8]} {time_part[:2]}:{time_part[2:4]}:{time_part[4:6]}"
        try:
            datetime_obj = datetime.strptime(datetime_str, '%Y-%m-%d %H:%M:%S')
        except ValueError:
            datetime_obj = None
    else:
        datetime_obj = None

    return {
        'patient_id': patient_id,
        'scan_type': scan_type,
        'datetime': datetime_obj,
        'filename': file_name
    }

def process_folders(base_dir):
    # List to store parsed data
    all_data = []

    # Traverse directories with os.walk()
    for root, dirs, files in os.walk(base_dir):
        if 'I' in os.path.basename(root):  # Only look at folders containing DICOM files
            # print(f'    Processing directory: {root}')
            
            # Loop through files and try parsing the DICOM filenames
            for file_name in files:
                if file_name.endswith('.dcm'):
                    file_path = os.path.join(root, file_name)
                    
                    # Parse the filename and append to the data list
                    parsed_data = parse_dicom_filename(file_name)
                    if parsed_data:
                        parsed_data['file_path'] = file_path  # Add full file path to the data
                        all_data.append(parsed_data)

    # Convert the data into a DataFrame
    df = pd.DataFrame(all_data)

    # Save the DataFrame to a CSV file for further analysis
    df.to_csv(r"C:\Shivangi\college\Sem 5\Deep Learning\DL project\zip7_metadata_from_dcm.csv", index=False)
    print("Metadata saved")

    return df

# Set the base directory
base_dir = "D:\DL_DATASET\Cohort_4_MRI_7\ADNI"  # Adjust this to your actual directory

# Process the directories and get the DataFrame
df = process_folders(base_dir)

# Display the first few rows
print(df.head())

Metadata saved
  patient_id scan_type            datetime  \
0      027_S      0074 2005-12-15 13:57:02   
1      027_S      0074 2005-12-15 13:57:03   
2      027_S      0074 2005-12-15 13:57:04   
3      027_S      0074 2005-12-15 13:57:05   
4      027_S      0074 2005-12-15 13:57:06   

                                            filename  \
0  ADNI_027_S_0074_MR_3-plane_localizer__br_raw_2...   
1  ADNI_027_S_0074_MR_3-plane_localizer__br_raw_2...   
2  ADNI_027_S_0074_MR_3-plane_localizer__br_raw_2...   
3  ADNI_027_S_0074_MR_3-plane_localizer__br_raw_2...   
4  ADNI_027_S_0074_MR_3-plane_localizer__br_raw_2...   

                                           file_path  
0  D:\DL_DATASET\Cohort_4_MRI_7\ADNI\027_S_0074\3...  
1  D:\DL_DATASET\Cohort_4_MRI_7\ADNI\027_S_0074\3...  
2  D:\DL_DATASET\Cohort_4_MRI_7\ADNI\027_S_0074\3...  
3  D:\DL_DATASET\Cohort_4_MRI_7\ADNI\027_S_0074\3...  
4  D:\DL_DATASET\Cohort_4_MRI_7\ADNI\027_S_0074\3...  


In [13]:
import pandas as pd

# Load metadata CSV into pandas dataframe (the provided dataset)
metadata_df = pd.read_csv("D:/DL_DATASET/Cohort_4_MRI_Images_02Dec2024.csv")

# Load the dicom_metadata CSV (already extracted from filenames)
dicom_metadata_path = "C:\Shivangi\college\Sem 5\Deep Learning\DL project\zip1_metadata_from_dcm.csv"
dicom_metadata_df = pd.read_csv(dicom_metadata_path)

# List to hold the matched results
matches = []

# Iterate over all DICOM files in the dicom_metadata.csv
for index, dicom_row in dicom_metadata_df.iterrows():
    # Extract image_id from the dicom file
    dicom_file = dicom_row['filename']
    image_id = dicom_file.split("I")[-1].split(".")[0]
    
    # Match the image_id with metadata
    metadata_row = metadata_df[metadata_df['image_id'] == int(image_id)]
    
    if not metadata_row.empty:
        # Extract relevant details
        patient_id = metadata_row['subject_id'].values[0]
        mri_date = metadata_row['mri_date'].values[0]
        mri_acq_plane = metadata_row['mri_acq_plane'].values[0]
        mri_description = metadata_row['mri_description'].values[0]
        mri_type = metadata_row['mri_type'].values[0]
        mri_sequence = metadata_row['mri_sequence'].values[0]
        mri_field_str = metadata_row['mri_field_str'].values[0]
        
        # Add matched data to the list
        matches.append({
            'dicom_file': dicom_file,
            'patient_id': patient_id,
            'mri_date': mri_date,
            'mri_acq_plane': mri_acq_plane,
            'mri_description': mri_description,
            'mri_type': mri_type,
            'mri_sequence': mri_sequence,
            'mri_field_str': mri_field_str
        })
    else:
        print(f"No metadata found for {dicom_file}")

# Convert matches to a DataFrame and save to CSV
matched_df = pd.DataFrame(matches)

# Specify the path for the merged file
merged_metadata_path = "merged_mri_metadata_zip1.csv"

# Save the matched data
matched_df.to_csv(merged_metadata_path, index=False)

# Show a preview of the merged data
print(f"Merged data saved to {merged_metadata_path}")
print(matched_df.head())


Merged data saved to merged_mri_metadata_zip1.csv
                                          dicom_file  patient_id    mri_date  \
0  ADNI_011_S_0021_MR_Axial_PD-T2_TSE__br_raw_200...  011_S_0021  2005-10-10   
1  ADNI_011_S_0021_MR_Axial_PD-T2_TSE__br_raw_200...  011_S_0021  2005-10-10   
2  ADNI_011_S_0021_MR_Axial_PD-T2_TSE__br_raw_200...  011_S_0021  2005-10-10   
3  ADNI_011_S_0021_MR_Axial_PD-T2_TSE__br_raw_200...  011_S_0021  2005-10-10   
4  ADNI_011_S_0021_MR_Axial_PD-T2_TSE__br_raw_200...  011_S_0021  2005-10-10   

  mri_acq_plane  mri_description mri_type mri_sequence  mri_field_str  
0         AXIAL  Axial PD-T2 TSE       2D           SE          1.494  
1         AXIAL  Axial PD-T2 TSE       2D           SE          1.494  
2         AXIAL  Axial PD-T2 TSE       2D           SE          1.494  
3         AXIAL  Axial PD-T2 TSE       2D           SE          1.494  
4         AXIAL  Axial PD-T2 TSE       2D           SE          1.494  


In [1]:
import pydicom
import cv2
import numpy as np
import pandas as pd
from tqdm import tqdm
import os

# Load the merged metadata file
# Use a raw string to handle backslashes in the file path
metadata_df = pd.read_csv(r'C:\Shivangi\college\Sem 5\Deep Learning\DL project\zip2_metadata_from_dcm.csv')

def preprocess_single_frame(img, target_size):
    # Resize the image by padding if necessary to maintain the aspect ratio
    h, w = img.shape
    scale = min(target_size[0] / h, target_size[1] / w)
    new_w = int(w * scale)
    new_h = int(h * scale)
    
    # Resize the image to the new dimensions
    resized_img = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_LINEAR)
    
    # Add padding to make it the target size
    top = (target_size[0] - new_h) // 2
    bottom = target_size[0] - new_h - top
    left = (target_size[1] - new_w) // 2
    right = target_size[1] - new_w - left
    
    padded_img = cv2.copyMakeBorder(resized_img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=0)
    
    return padded_img

# Function to load DICOM images and preprocess them
def load_and_preprocess_dicom(file_path, target_size=(256, 256)):
    try:
        # Read DICOM file
        dicom_data = pydicom.dcmread(file_path)
        
        # Convert DICOM pixel data to numpy array
        img = dicom_data.pixel_array
        
        # Handle multi-frame DICOMs
        if img.ndim == 3:
            # Assuming the first dimension is frames
            frames = img.shape[0]
            processed_frames = []
            for i in range(frames):
                single_frame = img[i]
                # If the frame has multiple channels, convert to grayscale
                if single_frame.ndim > 2:
                    single_frame = np.mean(single_frame, axis=-1).astype(single_frame.dtype)
                processed_frame = preprocess_single_frame(single_frame, target_size)
                processed_frames.append(processed_frame)
            return processed_frames  # Return list of processed frames
        elif img.ndim == 2:
            # Single-frame DICOM
            processed_img = preprocess_single_frame(img, target_size)
            return [processed_img]  # Return as a list for consistency
        else:
            raise ValueError(f"Unsupported image dimensions: {img.shape}")
    except Exception as e:
        # Raise the exception to be caught in the main loop
        raise RuntimeError(f"Failed to process {file_path}: {e}")

# Prepare the data
processed_images = []
failed_files = []

# Loop through the metadata and process each file
for idx, row in tqdm(metadata_df.iterrows(), total=metadata_df.shape[0], desc="Processing DICOM files"):
    dicom_file = row['file_path']
    
    # Check if the file exists
    if not os.path.exists(dicom_file):
        print(f"File does not exist: {dicom_file}")
        failed_files.append((dicom_file, "File not found"))
        continue
    
    try:
        processed_imgs = load_and_preprocess_dicom(dicom_file)
        processed_images.extend(processed_imgs)  # Add all frames to the list
    except Exception as e:
        print(e)
        failed_files.append((dicom_file, str(e)))

# Convert list of images into numpy array
if processed_images:
    processed_images_array = np.array(processed_images)
    
    # Save the preprocessed images as .npy file
    np.save("D:/DL_DATASET/processed_images_zip1.npy", processed_images_array)
    print("Preprocessing complete. Images saved as 'processed_images_zip2.npy'.")
else:
    print("No images were processed successfully.")

# Optionally, save the list of failed files for review
if failed_files:
    failed_df = pd.DataFrame(failed_files, columns=['file_path', 'error'])
    failed_df.to_csv("D:/DL_DATASET/failed_files.csv", index=False)
    print(f"{len(failed_files)} files failed to process. Details saved in 'failed_files.csv'.")

Processing DICOM files: 100%|██████████| 88088/88088 [25:50<00:00, 56.82it/s]  


Preprocessing complete. Images saved as 'processed_images_zip2.npy'.


In [None]:
import matplotlib.pyplot as plt
# Select 10 random indices from the processed images to visualize
indices = np.random.choice(len(processed_images_array), 5, replace=False)

# Plot the original vs processed images
fig, axes = plt.subplots(5, 2, figsize=(10, 30))

for i, idx in enumerate(indices):
    # Load original DICOM image
    dicom_file = metadata_df.iloc[idx]['file_path']
    dicom_data = pydicom.dcmread(dicom_file)
    original_img = dicom_data.pixel_array
    
    # Get the processed image
    processed_img = processed_images_array[idx]
    
    # Plot original image
    axes[i, 0].imshow(original_img, cmap='gray')
    axes[i, 0].set_title(f"Original Image {idx+1}")
    axes[i, 0].axis('off')
    
    # Plot processed image
    axes[i, 1].imshow(processed_img, cmap='gray')
    axes[i, 1].set_title(f"Processed Image {idx+1}")
    axes[i, 1].axis('off')

plt.tight_layout()
plt.show()
