In [11]:
import pandas as pd
import os
import cv2
import numpy as np

# Root folder of the dataset; every input and output location lives beneath it
_dataset_root = r'G:\Major Project\MRI\Folder2\Alzheimer MRI Disease Classification Dataset'

# Parquet shards holding the encoded train / test images
train_parquet_path = _dataset_root + r'\Data\train-00000-of-00001-c08a401c53fe5312.parquet'
test_parquet_path = _dataset_root + r'\Data\test-00000-of-00001-44110b9df98c5585.parquet'

# Destination folders for the extracted image files
output_train_dir = _dataset_root + r'\Output\train'
output_test_dir = _dataset_root + r'\Output\test'

# Integer label -> class folder name (a name's position in the tuple is its label)
categories = dict(enumerate((
    'Mild_Demented',
    'Moderate_Demented',
    'Non_Demented',
    'Very_Mild_Demented',
)))

# Function to decode image and save it
def save_image(image_data, label, output_dir, index):
    """
    Decode one encoded image and save it as <output_dir>/<category>/<index>.png.

    Args:
    - image_data (dict): Record holding the encoded image under the 'bytes' key.
    - label: Key into the module-level `categories` mapping; selects the sub-folder.
    - output_dir (str): Root directory under which the category folder is created.
    - index: Value used as the file name (without extension).

    Returns:
    - None. Records that cannot be read, decoded or written are reported with
      print() and skipped, so one bad record does not stop the caller's loop.

    Raises:
    - KeyError: if `label` is not a key of `categories`.
    """
    # Extract the byte string from the 'bytes' field in the dictionary
    if isinstance(image_data, dict) and 'bytes' in image_data:
        image_data = image_data['bytes']
    else:
        print(f"Error: 'bytes' field not found for image {index}")
        return

    # A missing or zero-length payload cannot be decoded; skip it before
    # handing anything to NumPy / OpenCV.
    if image_data is None or len(image_data) == 0:
        print(f"Error: empty image data for image {index}")
        return

    # Convert the byte string to a numpy array and decode as an image
    nparr = np.frombuffer(image_data, np.uint8)
    img = cv2.imdecode(nparr, cv2.IMREAD_GRAYSCALE)

    # imdecode signals undecodable data by returning None rather than raising;
    # passing that on to imwrite would fail, so skip the record here.
    if img is None:
        print(f"Error: could not decode image {index}")
        return

    # Create the label folder if it doesn't exist
    label_folder = os.path.join(output_dir, categories[label])
    os.makedirs(label_folder, exist_ok=True)

    # Define the file path to save the image
    image_path = os.path.join(label_folder, f'{index}.png')

    # Save the image; imwrite reports failure through its return value
    if not cv2.imwrite(image_path, img):
        print(f"Error: failed to write image {index} to {image_path}")

# Extract every image stored in a Parquet file into per-label folders
def process_parquet(parquet_path, output_dir):
    """
    Load a Parquet file and pass each of its rows to save_image().

    Args:
    - parquet_path (str): Path to the Parquet file to read.
    - output_dir (str): Root directory passed to save_image() for every row.

    Each row is expected to provide an 'image' and a 'label' column; the row's
    DataFrame index value is passed to save_image() as the image index.
    """
    frame = pd.read_parquet(parquet_path)

    # One save_image() call per row, keyed by the row's index value
    for row_id, record in frame.iterrows():
        save_image(record['image'], record['label'], output_dir, row_id)

# Extract the training split into its output folder
process_parquet(parquet_path=train_parquet_path, output_dir=output_train_dir)

# Extract the test split into its output folder
process_parquet(parquet_path=test_parquet_path, output_dir=output_test_dir)


In [10]:
import pandas as pd

# Load the training Parquet file so its structure can be inspected
df = pd.read_parquet(path=train_parquet_path)

# Show the first five rows as plain text
print(df.head(n=5))


                                               image  label
0  {'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...      2
1  {'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...      0
2  {'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...      3
3  {'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...      3
4  {'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...      2


In [2]:
import os
import cv2
import numpy as np
from tqdm import tqdm

# Root folder of the dataset; all image folders below hang off it
_dataset_root = r'G:\Major Project\MRI\Folder2\Alzheimer MRI Disease Classification Dataset'

# Folders holding the extracted (not yet preprocessed) images
train_dir = rf'{_dataset_root}\Output\train'
test_dir = rf'{_dataset_root}\Output\test'

# Folders that receive the preprocessed images
preprocessed_train_dir = rf'{_dataset_root}\Preprocessed\train'
preprocessed_test_dir = rf'{_dataset_root}\Preprocessed\test'

# Target (width, height) used when resizing
IMG_SIZE = (128, 128)

def preprocess_images(input_dir, output_dir, img_size):
    """
    Resize every image under input_dir and save it under output_dir.

    The category sub-folder layout of input_dir is mirrored in output_dir and
    file names are kept. Images are read as grayscale and written back as
    8-bit images, so pixel values stay in the 0-255 range; scaling to [0, 1]
    has to happen when the images are loaded for training.

    Args:
    - input_dir (str): Path to the input directory (one sub-folder per category).
    - output_dir (str): Path to the output directory.
    - img_size (tuple): Target image size (width, height).

    Returns:
    - None. Unreadable images and failed writes are reported with print()
      and skipped.
    """
    # Iterate through each category folder (e.g., Mild_Demented, Moderate_Demented)
    for category in os.listdir(input_dir):
        category_path = os.path.join(input_dir, category)

        # Stray files next to the category folders (e.g. Thumbs.db) are not
        # categories: skip them instead of creating an output folder for them
        # and then failing when trying to list their contents.
        if not os.path.isdir(category_path):
            continue

        # Create the category output folder if it doesn't exist
        output_category_path = os.path.join(output_dir, category)
        os.makedirs(output_category_path, exist_ok=True)

        # Process each image in the category folder
        for img_name in tqdm(os.listdir(category_path), desc=f"Processing {category}"):
            img_path = os.path.join(category_path, img_name)

            # Read the image
            img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)

            # Check if the image is valid
            if img is None:
                print(f"Skipping invalid image: {img_path}")
                continue

            # Resize the image
            img_resized = cv2.resize(img, img_size)

            # Save the resized 8-bit image as-is. Dividing by 255 and then
            # multiplying by 255 again before the uint8 cast only reproduced
            # the same pixel values, so that round trip is omitted.
            output_path = os.path.join(output_category_path, img_name)
            if not cv2.imwrite(output_path, img_resized):
                print(f"Failed to write image: {output_path}")

            
# Resize the extracted training images into the preprocessed train folder
preprocess_images(input_dir=train_dir, output_dir=preprocessed_train_dir, img_size=IMG_SIZE)

# Resize the extracted test images into the preprocessed test folder
preprocess_images(input_dir=test_dir, output_dir=preprocessed_test_dir, img_size=IMG_SIZE)


Processing Mild_Demented: 100%|██████████████████████████████████████████████████████| 724/724 [00:16<00:00, 43.90it/s]
Processing Moderate_Demented: 100%|████████████████████████████████████████████████████| 49/49 [00:01<00:00, 24.56it/s]
Processing Non_Demented: 100%|█████████████████████████████████████████████████████| 2566/2566 [00:52<00:00, 48.68it/s]
Processing Very_Mild_Demented: 100%|███████████████████████████████████████████████| 1781/1781 [00:40<00:00, 44.34it/s]
Processing Mild_Demented: 100%|██████████████████████████████████████████████████████| 172/172 [00:05<00:00, 31.23it/s]
Processing Moderate_Demented: 100%|████████████████████████████████████████████████████| 15/15 [00:00<00:00, 16.20it/s]
Processing Non_Demented: 100%|███████████████████████████████████████████████████████| 634/634 [00:15<00:00, 41.35it/s]
Processing Very_Mild_Demented: 100%|█████████████████████████████████████████████████| 459/459 [00:09<00:00, 47.23it/s]
