In [3]:
# Mount Google Drive at /content/drive so the notebook can read and write files stored there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Path of the dataset folder on the mounted Drive.
# NOTE(review): this spells the Drive folder 'My Drive', while input_dir in the
# main cell uses 'MyDrive'; dataset_dir is not referenced by the other visible
# cells -- confirm which spelling is intended before relying on it.
dataset_dir = '/content/drive/My Drive/Dataset'


In [5]:
import os
import shutil
import random
import pandas as pd
from PIL import Image
# The dataset folder already contains a .txt and a .csv file; they are not images, so skip them up front.
def is_valid_image(file_path):
    """Return True if ``file_path`` can be opened and verified as an image by Pillow.

    Files whose name ends in ``.csv`` or ``.txt`` (any letter case) are
    rejected immediately without being opened.

    Args:
        file_path: Path of the file to check, as a string.

    Returns:
        bool: True when ``Image.open`` followed by ``verify()`` succeeds;
        False for the skipped extensions or when any exception is raised.
    """
    if file_path.lower().endswith(('.csv', '.txt')):
        return False
    try:
        # The context manager closes the underlying file handle even when
        # verify() raises, so scanning a large folder does not leak handles.
        with Image.open(file_path) as img:
            img.verify()  # integrity check of the file contents, not a format conversion
        return True
    except Exception:
        # Best-effort check: any failure to open or verify means "not a valid image".
        return False


In [6]:
import os
import shutil
import random
import pandas as pd
from PIL import Image

# Function to check if an image is valid
def is_valid_image(file_path):
    """Return True if ``file_path`` can be opened and verified as an image by Pillow.

    This definition replaces the one from the previous cell. To stay
    consistent with it, files ending in ``.csv`` or ``.txt`` (any letter
    case) are rejected up front without being opened.

    Args:
        file_path: Path of the file to check, as a string.

    Returns:
        bool: True when ``Image.open`` followed by ``verify()`` succeeds;
        False for the skipped extensions or when any exception is raised.
    """
    if file_path.lower().endswith(('.csv', '.txt')):
        return False
    try:
        # Use the image as a context manager so its file handle is always
        # released, including when verify() raises on a corrupted file.
        with Image.open(file_path) as img:
            img.verify()  # Verify the image integrity
        return True
    except Exception:
        # Best-effort check: any failure to open or verify means "not a valid image".
        return False

# Function to log invalid (corrupted or unreadable) image files
def log_invalid_files(base_dir):
    """Return the paths of all files under ``base_dir`` that fail ``is_valid_image``.

    The directory tree is traversed recursively with ``os.walk``; every file
    found is checked, and the full paths of the failing ones are returned in
    traversal order.

    Args:
        base_dir: Root directory to scan.

    Returns:
        list[str]: Paths for which ``is_valid_image`` returned a falsy value.
    """
    # Lazily enumerate every file path in the tree, then keep only the failures.
    candidates = (
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(base_dir)
        for name in names
    )
    return [path for path in candidates if not is_valid_image(path)]

# Function to collect file paths by label and modality
def collect_file_paths_by_label(base_dir):
    """Collect image paths under ``base_dir`` together with modality and label.

    The expected layout is ``base_dir/<modality folder>/<label folder>/<files>``.
    A modality folder whose name contains ``'MRI'`` is recorded as ``'MRI'``;
    every other folder is recorded as ``'CT Scan'``. A label folder whose
    lower-cased name contains ``'healthy'`` is recorded as ``'healthy'``;
    every other folder is recorded as ``'tumor'``.

    Only files ending in ``.png``, ``.jpg`` or ``.jpeg`` are collected. The
    extension match ignores letter case, so files such as ``scan.JPG`` are
    not dropped. Directory listings are sorted so the row order is the same
    on every run (``os.listdir`` itself returns entries in arbitrary order).

    Args:
        base_dir: Root directory containing the modality folders.

    Returns:
        dict: Three parallel lists under the keys ``'Image Path'``,
        ``'Modality'`` and ``'Label'``, one entry per collected image.
    """
    image_exts = ('.png', '.jpg', '.jpeg')
    data = {'Image Path': [], 'Modality': [], 'Label': []}

    # Iterating through modalities (MRI and CT Scan folders)
    for modality_folder in sorted(os.listdir(base_dir)):
        modality_path = os.path.join(base_dir, modality_folder)
        if not os.path.isdir(modality_path):
            continue  # stray files next to the modality folders
        modality = 'MRI' if 'MRI' in modality_folder else 'CT Scan'

        # Iterating through healthy and tumor subfolders
        for label_folder in sorted(os.listdir(modality_path)):
            label_path = os.path.join(modality_path, label_folder)
            if not os.path.isdir(label_path):
                continue  # e.g. a source-note text file inside the modality folder
            label = 'healthy' if 'healthy' in label_folder.lower() else 'tumor'

            # Collecting all image file paths
            for file_name in sorted(os.listdir(label_path)):
                if file_name.lower().endswith(image_exts):
                    data['Image Path'].append(os.path.join(label_path, file_name))
                    data['Modality'].append(modality)
                    data['Label'].append(label)

    return data

# Function to calculate dataset statistics
def dataset_statistics(base_dir):
    """Count the valid images in every ``<modality>/<label>`` folder of ``base_dir``.

    Non-directory entries are ignored at both the modality and label levels.
    Each file inside a label folder is counted only if ``is_valid_image``
    accepts it.

    Args:
        base_dir: Root directory containing the modality folders.

    Returns:
        dict: ``{modality folder name: {label folder name: valid image count}}``.
    """
    counts = {}
    for modality_name in os.listdir(base_dir):
        modality_dir = os.path.join(base_dir, modality_name)
        if os.path.isdir(modality_dir):
            per_label = counts[modality_name] = {}
            for label_name in os.listdir(modality_dir):
                label_dir = os.path.join(modality_dir, label_name)
                if os.path.isdir(label_dir):
                    # Tally the entries that pass the image validity check.
                    per_label[label_name] = sum(
                        1
                        for entry in os.listdir(label_dir)
                        if is_valid_image(os.path.join(label_dir, entry))
                    )
    return counts

# Function to split the dataset into train/val/test sets
def split_dataset(input_dir, output_dir, split_ratios=(0.7, 0.15, 0.15), seed=42):
    """Copy the valid images of ``input_dir`` into train/val/test folders.

    The expected input layout is ``input_dir/<modality>/<label>/<files>``.
    For each label folder the valid images are shuffled and copied to
    ``output_dir/<split>/<modality>/<label>/`` where ``<split>`` is one of
    ``train``, ``val`` or ``test``.

    The split is reproducible: directory listings are sorted before being
    shuffled and the shuffle uses a private RNG seeded with ``seed``. Running
    the function again on unchanged input therefore copies each file into
    the same split instead of scattering copies of one image across
    train/val/test. If the input files change between runs, copies from the
    earlier run remain in ``output_dir``; use a fresh ``output_dir`` then.

    Args:
        input_dir: Root directory containing the modality folders.
        output_dir: Directory under which the split folders are created.
        split_ratios: Fractions for train and val (first two items). The test
            split receives whatever remains, so the third item is not read.
        seed: Seed for the shuffle. Pass ``None`` to shuffle with the global
            ``random`` module state instead (non-reproducible unless the
            caller seeded it).

    Returns:
        None. Files are copied on disk as a side effect.
    """
    # A private generator keeps the split reproducible without touching the
    # global `random` state used elsewhere in the notebook.
    rng = random.Random(seed) if seed is not None else random

    for modality in sorted(os.listdir(input_dir)):
        modality_path = os.path.join(input_dir, modality)
        if not os.path.isdir(modality_path):
            continue

        for label in sorted(os.listdir(modality_path)):
            label_path = os.path.join(modality_path, label)
            if not os.path.isdir(label_path):
                continue

            # Collect valid image files; sorted because os.listdir order is
            # arbitrary and the shuffle result depends on the starting order.
            files = sorted(
                f for f in os.listdir(label_path)
                if is_valid_image(os.path.join(label_path, f))
            )

            # Shuffle files for randomness
            rng.shuffle(files)

            # Compute split indices
            train_split = int(len(files) * split_ratios[0])
            val_split = train_split + int(len(files) * split_ratios[1])

            # Divide files into splits
            splits = {
                'train': files[:train_split],
                'val': files[train_split:val_split],
                'test': files[val_split:],
            }

            # Copy files into respective folders
            for split_name, split_files in splits.items():
                split_path = os.path.join(output_dir, split_name, modality, label)
                os.makedirs(split_path, exist_ok=True)
                for file in split_files:
                    shutil.copy(os.path.join(label_path, file), os.path.join(split_path, file))

# Main Execution
if __name__ == "__main__":
    # Source dataset and destination of the train/val/test copies (Google Drive).
    input_dir = "/content/drive/MyDrive/Dataset"
    output_dir = "/content/drive/MyDrive/Split_Dataset"

    # Step 1: report every file under input_dir that fails image validation
    # (in this dataset these are the .txt and .csv files).
    print("Checking for invalid files...")
    invalid_files = log_invalid_files(input_dir)
    if not invalid_files:
        print("No invalid files detected.")
    else:
        print("Invalid files detected:")
        for file in invalid_files:
            print(file)

    # Step 2: count the valid images per modality and label folder.
    print("\nCalculating dataset statistics...")
    stats = dataset_statistics(input_dir)
    print("Dataset Statistics:")
    for modality, labels in stats.items():
        print(f"  {modality}:")
        for label, count in labels.items():
            print(f"    {label}: {count} images")

    # Step 3: copy the images into train/val/test folders under output_dir.
    print("\nSplitting dataset into train/val/test...")
    split_dataset(input_dir, output_dir)
    print("Dataset split completed.")

    # Step 4: build a table of image path / modality / label from the
    # original (unsplit) dataset.
    print("\nCollecting file paths...")
    file_paths = collect_file_paths_by_label(input_dir)
    df = pd.DataFrame(file_paths)

    # Write the table as a CSV inside the dataset folder on Google Drive.
    csv_path = '/content/drive/MyDrive/Dataset/brain_tumor_dataset.csv'
    df.to_csv(csv_path, index=False)
    print(f"CSV file created and saved at: {csv_path}")


Checking for invalid files...
Invalid files detected:
/content/drive/MyDrive/Dataset/brain_tumor_dataset.csv
/content/drive/MyDrive/Dataset/Brain Tumor CT scan Images/CT image source.txt
/content/drive/MyDrive/Dataset/Brain Tumor MRI images/MRI image source.txt

Calculating dataset statistics...
Dataset Statistics:
  Brain Tumor CT scan Images:
    Tumor: 100 images
    Healthy: 987 images
  Brain Tumor MRI images:
    Healthy: 103 images
    Tumor: 100 images

Splitting dataset into train/val/test...
Dataset split completed.

Collecting file paths...
CSV file created and saved at: /content/drive/MyDrive/Dataset/brain_tumor_dataset.csv
