Stratified K-Fold Split

In [None]:
import os
import shutil
import pandas as pd
from sklearn.model_selection import StratifiedKFold

# Dataset details
base_dir = r"D:\3rd sem Project\Secondary dataset\CXR collection\17000 Dataset\Dataset"
output_dir = r"E:\Modified Dataset"
train_dir = os.path.join(base_dir, "train")
test_dir = os.path.join(base_dir, "test")  # Fixed test dataset directory
classes = ["NORMAL", "PNEUMONIA", "ABNORMAL"]

# Create a list of all images and their labels (from the training directory).
# os.listdir() returns entries in arbitrary order, so the listing is sorted:
# otherwise the row order of `data` -- and therefore the seeded split below --
# could differ between runs or machines. Non-file entries (e.g. stray
# sub-folders) are skipped because shutil.copy() cannot copy a directory.
image_paths = []
labels = []

for cls in classes:
    cls_dir = os.path.join(train_dir, cls)
    for img in sorted(os.listdir(cls_dir)):
        img_path = os.path.join(cls_dir, img)
        if not os.path.isfile(img_path):
            continue
        image_paths.append(img_path)
        labels.append(cls)

# Convert to a DataFrame (one row per training image)
data = pd.DataFrame({"image_path": image_paths, "label": labels})

# The fixed test set is the same for every fold, so list it once up front
# as (class, source path) pairs instead of re-reading the folders per fold.
test_files = []
for cls in classes:
    cls_test_dir = os.path.join(test_dir, cls)
    for img in sorted(os.listdir(cls_test_dir)):
        test_src = os.path.join(cls_test_dir, img)
        if os.path.isfile(test_src):
            test_files.append((cls, test_src))

# Stratified K-Fold Cross-Validation configuration
n_splits = 5  # Number of folds
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# NOTE: the plain K-Fold cell of this notebook also writes to "Fold_<n>"
# under the same output_dir. Running both without changing output_dir mixes
# the two different splits inside the same Train/Validation folders.

# Loop through each fold
for fold, (train_idx, val_idx) in enumerate(skf.split(data, data["label"])):
    print(f"Processing Fold {fold + 1}...")
    fold_dir = os.path.join(output_dir, f"Fold_{fold + 1}")
    fold_train_dir = os.path.join(fold_dir, "Train")
    fold_val_dir = os.path.join(fold_dir, "Validation")
    fold_test_dir = os.path.join(fold_dir, "Test")

    # Create directories for Train, Validation, and Test
    for folder in [fold_train_dir, fold_val_dir, fold_test_dir]:
        for cls in classes:
            os.makedirs(os.path.join(folder, cls), exist_ok=True)

    # Split data into training and validation sets for the current fold
    train_data = data.iloc[train_idx]
    val_data = data.iloc[val_idx]

    # Copy training and validation images into <subset>/<label>/<file name>
    for subset_dir, subset in ((fold_train_dir, train_data), (fold_val_dir, val_data)):
        for src, label in zip(subset["image_path"], subset["label"]):
            dst = os.path.join(subset_dir, label, os.path.basename(src))
            shutil.copy(src, dst)

    # Copy fixed testing dataset (identical content in every fold)
    for cls, src in test_files:
        dst = os.path.join(fold_test_dir, cls, os.path.basename(src))
        shutil.copy(src, dst)

# Report the configured output location rather than a hard-coded copy of it
print(f"Stratified K-Fold Cross-Validation dataset splits created in {output_dir}.")


# Monte Carlo Cross-Validation

In [None]:
import os
import shutil
import pandas as pd
from sklearn.model_selection import train_test_split

# Dataset details
base_dir = r"D:\3rd sem Project\Secondary dataset\CXR collection\17000 Dataset\Dataset"
output_dir = r"E:\Modified Dataset"
train_dir = os.path.join(base_dir, "train")
test_dir = os.path.join(base_dir, "test")  # Fixed test dataset directory
classes = ["NORMAL", "PNEUMONIA", "ABNORMAL"]

# Create a list of all images and their labels (from the training directory).
# os.listdir() returns entries in arbitrary order, so the listing is sorted:
# otherwise the row order of `data` -- and therefore every seeded split
# below -- could differ between runs or machines. Non-file entries (e.g.
# stray sub-folders) are skipped because shutil.copy() cannot copy a directory.
image_paths = []
labels = []

for cls in classes:
    cls_dir = os.path.join(train_dir, cls)
    for img in sorted(os.listdir(cls_dir)):
        img_path = os.path.join(cls_dir, img)
        if not os.path.isfile(img_path):
            continue
        image_paths.append(img_path)
        labels.append(cls)

# Convert to a DataFrame (one row per training image)
data = pd.DataFrame({"image_path": image_paths, "label": labels})

# The fixed test set is the same for every split, so list it once up front
# as (class, source path) pairs instead of re-reading the folders per split.
test_files = []
for cls in classes:
    cls_test_dir = os.path.join(test_dir, cls)
    for img in sorted(os.listdir(cls_test_dir)):
        test_src = os.path.join(cls_test_dir, img)
        if os.path.isfile(test_src):
            test_files.append((cls, test_src))

# Monte Carlo Cross-Validation configuration
n_splits = 5  # Number of iterations (splits)
train_ratio = 0.8  # Ratio of data to be used for training

# Loop through each Monte Carlo split
for split in range(1, n_splits + 1):
    print(f"Processing Split {split}...")
    split_dir = os.path.join(output_dir, f"Split_{split}")
    split_train_dir = os.path.join(split_dir, "Train")
    split_val_dir = os.path.join(split_dir, "Validation")
    split_test_dir = os.path.join(split_dir, "Test")

    # Create directories for Train, Validation, and Test
    for folder in [split_train_dir, split_val_dir, split_test_dir]:
        for cls in classes:
            os.makedirs(os.path.join(folder, cls), exist_ok=True)

    # Independent stratified random split for this iteration; the seed is
    # derived from the split number so each iteration draws a different split.
    train_data, val_data = train_test_split(
        data, test_size=1 - train_ratio, stratify=data["label"], random_state=split * 42
    )

    # Copy training and validation images into <subset>/<label>/<file name>
    for subset_dir, subset in ((split_train_dir, train_data), (split_val_dir, val_data)):
        for src, label in zip(subset["image_path"], subset["label"]):
            dst = os.path.join(subset_dir, label, os.path.basename(src))
            shutil.copy(src, dst)

    # Copy fixed testing dataset (identical content in every split)
    for cls, src in test_files:
        dst = os.path.join(split_test_dir, cls, os.path.basename(src))
        shutil.copy(src, dst)

# Report the configured output location rather than a hard-coded copy of it
print(f"Monte Carlo Cross-Validation dataset splits created in {output_dir}.")


In [None]:
import os
import shutil
import pandas as pd
from sklearn.utils import resample

# Dataset details
base_dir = r"D:\3rd sem Project\Secondary dataset\CXR collection\17000 Dataset\Dataset"
output_dir = r"E:\Modified Dataset"
train_dir = os.path.join(base_dir, "train")
test_dir = os.path.join(base_dir, "test")  # Fixed test dataset directory
classes = ["NORMAL", "PNEUMONIA", "ABNORMAL"]

# Create a list of all images and their labels (from the training directory).
# os.listdir() returns entries in arbitrary order, so the listing is sorted:
# otherwise the row order of `data` -- and therefore every seeded sample
# below -- could differ between runs or machines. Non-file entries (e.g.
# stray sub-folders) are skipped because shutil.copy() cannot copy a directory.
image_paths = []
labels = []

for cls in classes:
    cls_dir = os.path.join(train_dir, cls)
    for img in sorted(os.listdir(cls_dir)):
        img_path = os.path.join(cls_dir, img)
        if not os.path.isfile(img_path):
            continue
        image_paths.append(img_path)
        labels.append(cls)

# Convert to a DataFrame (one row per training image)
data = pd.DataFrame({"image_path": image_paths, "label": labels})

# The fixed test set is the same for every iteration, so list it once up
# front as (class, source path) pairs instead of re-reading the folders.
test_files = []
for cls in classes:
    cls_test_dir = os.path.join(test_dir, cls)
    for img in sorted(os.listdir(cls_test_dir)):
        test_src = os.path.join(cls_test_dir, img)
        if os.path.isfile(test_src):
            test_files.append((cls, test_src))

# Bootstrap Sampling configuration
n_iterations = 5  # Number of bootstrap iterations
sample_ratio = 0.8  # Ratio of data to be used for sampling

# Loop through each bootstrap iteration
for iteration in range(1, n_iterations + 1):
    print(f"Processing Bootstrap Iteration {iteration}...")
    iteration_dir = os.path.join(output_dir, f"Iteration_{iteration}")
    iteration_train_dir = os.path.join(iteration_dir, "Train")
    iteration_val_dir = os.path.join(iteration_dir, "Validation")
    iteration_test_dir = os.path.join(iteration_dir, "Test")

    # Create directories for Train, Validation, and Test
    for folder in [iteration_train_dir, iteration_val_dir, iteration_test_dir]:
        for cls in classes:
            os.makedirs(os.path.join(folder, cls), exist_ok=True)

    # Bootstrap sampling for training data. resample() draws with replacement
    # by default, so the same image can appear several times in train_data.
    train_data = resample(data, n_samples=int(len(data) * sample_ratio), stratify=data["label"], random_state=iteration * 42)
    # Validation set = rows that were never drawn for this iteration
    val_data = data.drop(train_data.index)

    # Copy training images. Repeated draws of one image would all map to the
    # same destination file name and silently overwrite each other, leaving
    # only the unique images on disk. Every repeat therefore gets a
    # "_dup<k>" suffix so the folder really holds the bootstrap sample.
    times_copied = {}
    for src, label in zip(train_data["image_path"], train_data["label"]):
        repeat = times_copied.get(src, 0)
        times_copied[src] = repeat + 1
        file_name = os.path.basename(src)
        if repeat:
            stem, ext = os.path.splitext(file_name)
            file_name = f"{stem}_dup{repeat}{ext}"
        dst = os.path.join(iteration_train_dir, label, file_name)
        shutil.copy(src, dst)

    # Copy validation images (no duplicates: each remaining row occurs once)
    for src, label in zip(val_data["image_path"], val_data["label"]):
        dst = os.path.join(iteration_val_dir, label, os.path.basename(src))
        shutil.copy(src, dst)

    # Copy fixed testing dataset (identical content in every iteration)
    for cls, src in test_files:
        dst = os.path.join(iteration_test_dir, cls, os.path.basename(src))
        shutil.copy(src, dst)

# Report the configured output location rather than a hard-coded copy of it
print(f"Bootstrap dataset splits created in {output_dir}.")


In [None]:
import os
import shutil
import pandas as pd
from sklearn.model_selection import KFold

# Dataset details
base_dir = r"D:\3rd sem Project\Secondary dataset\CXR collection\17000 Dataset\Dataset"
output_dir = r"E:\Modified Dataset"
train_dir = os.path.join(base_dir, "train")
test_dir = os.path.join(base_dir, "test")  # Fixed test dataset directory
classes = ["NORMAL", "PNEUMONIA", "ABNORMAL"]

# Create a list of all images and their labels (from the training directory).
# os.listdir() returns entries in arbitrary order, so the listing is sorted:
# otherwise the row order of `data` -- and therefore the seeded split below --
# could differ between runs or machines. Non-file entries (e.g. stray
# sub-folders) are skipped because shutil.copy() cannot copy a directory.
image_paths = []
labels = []

for cls in classes:
    cls_dir = os.path.join(train_dir, cls)
    for img in sorted(os.listdir(cls_dir)):
        img_path = os.path.join(cls_dir, img)
        if not os.path.isfile(img_path):
            continue
        image_paths.append(img_path)
        labels.append(cls)

# Convert to a DataFrame (one row per training image)
data = pd.DataFrame({"image_path": image_paths, "label": labels})

# The fixed test set is the same for every fold, so list it once up front
# as (class, source path) pairs instead of re-reading the folders per fold.
test_files = []
for cls in classes:
    cls_test_dir = os.path.join(test_dir, cls)
    for img in sorted(os.listdir(cls_test_dir)):
        test_src = os.path.join(cls_test_dir, img)
        if os.path.isfile(test_src):
            test_files.append((cls, test_src))

# K-Fold Cross-Validation configuration (not stratified: folds are drawn
# without regard to the class label)
n_splits = 5  # Number of folds
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# NOTE: the stratified K-Fold cell of this notebook also writes to "Fold_<n>"
# under the same output_dir. Running both without changing output_dir mixes
# the two different splits inside the same Train/Validation folders.

# Loop through each fold
for fold, (train_idx, val_idx) in enumerate(kf.split(data)):
    print(f"Processing Fold {fold + 1}...")
    fold_dir = os.path.join(output_dir, f"Fold_{fold + 1}")
    fold_train_dir = os.path.join(fold_dir, "Train")
    fold_val_dir = os.path.join(fold_dir, "Validation")
    fold_test_dir = os.path.join(fold_dir, "Test")

    # Create directories for Train, Validation, and Test
    for folder in [fold_train_dir, fold_val_dir, fold_test_dir]:
        for cls in classes:
            os.makedirs(os.path.join(folder, cls), exist_ok=True)

    # Split data into training and validation sets for the current fold
    train_data = data.iloc[train_idx]
    val_data = data.iloc[val_idx]

    # Copy training and validation images into <subset>/<label>/<file name>
    for subset_dir, subset in ((fold_train_dir, train_data), (fold_val_dir, val_data)):
        for src, label in zip(subset["image_path"], subset["label"]):
            dst = os.path.join(subset_dir, label, os.path.basename(src))
            shutil.copy(src, dst)

    # Copy fixed testing dataset (identical content in every fold)
    for cls, src in test_files:
        dst = os.path.join(fold_test_dir, cls, os.path.basename(src))
        shutil.copy(src, dst)

# Report the configured output location rather than a hard-coded copy of it
print(f"K-Fold Cross-Validation dataset splits created in {output_dir}.")
