In [3]:
import os
import shutil
from sklearn.model_selection import train_test_split

In [4]:
# Per-class image folders for every split of the chest X-ray dataset.
# All paths are relative to the notebook's working directory.
train_normal, train_pneumonia = (
    "../data/chest_xray/train/NORMAL",
    "../data/chest_xray/train/PNEUMONIA",
)

test_normal, test_pneumonia = (
    "../data/chest_xray/test/NORMAL",
    "../data/chest_xray/test/PNEUMONIA",
)

val_normal, val_pneumonia = (
    "../data/chest_xray/val/NORMAL",
    "../data/chest_xray/val/PNEUMONIA",
)

In [5]:
# Class balance of the original training split: list each class folder once
# and reuse the counts for the per-class lines and the total.
train_normal_count = len(os.listdir(train_normal))
train_pneumonia_count = len(os.listdir(train_pneumonia))
print('There are', train_normal_count, 'normal images in the training set')
print('There are', train_pneumonia_count, 'pnemonia images in the training set')
print('Making a total of', train_normal_count + train_pneumonia_count, 'in the training set')

There are 1341 normal images in the training set
There are 3875 pnemonia images in the training set
Making a total of 5216 in the training set


In [6]:
# Class balance of the original testing split (each folder listed once).
test_normal_count = len(os.listdir(test_normal))
test_pneumonia_count = len(os.listdir(test_pneumonia))
print('There are', test_normal_count, 'normal images in the testing set')
print('There are', test_pneumonia_count, 'pnemonia images in the testing set')
print('Making a total of', test_normal_count + test_pneumonia_count, 'in the testing set')

There are 234 normal images in the testing set
There are 390 pnemonia images in the testing set
Making a total of 624 in the testing set


In [7]:
# Class balance of the original validation split (each folder listed once).
val_normal_count = len(os.listdir(val_normal))
val_pneumonia_count = len(os.listdir(val_pneumonia))
print('There are', val_normal_count, 'normal images in the validation set')
print('There are', val_pneumonia_count, 'pnemonia images in the validation set')
print('Making a total of', val_normal_count + val_pneumonia_count, 'in the validation set')

There are 8 normal images in the validation set
There are 8 pnemonia images in the validation set
Making a total of 16 in the validation set


Based on this distribution (the validation set holds only 16 images), I want to redistribute the data into an 80-10-10 train/validation/test split.

In [9]:
# Base directory
base_dir = "../data/chest_xray"

# Original data directories
train_dir = os.path.join(base_dir, "train")
test_dir = os.path.join(base_dir, "test")
val_dir = os.path.join(base_dir, "val")

# Define new directories for the redistributed dataset
new_train_dir = os.path.join(base_dir, "new_train")
new_val_dir = os.path.join(base_dir, "new_val")
new_test_dir = os.path.join(base_dir, "new_test")

# Create each new split directory together with its two class sub-folders.
# exist_ok=True keeps the cell re-runnable and, unlike a single
# "does the split folder exist?" check, also restores a class sub-folder
# that is missing from an already existing split folder.
for directory in [new_train_dir, new_val_dir, new_test_dir]:
    for label in ["NORMAL", "PNEUMONIA"]:
        os.makedirs(os.path.join(directory, label), exist_ok=True)


In [10]:
def collect_and_combine_data(source_dirs, dest_dir):
    """Copy all images of each class from several split folders into one folder per class.

    Args:
        source_dirs: Sequence of directories, each containing ``NORMAL`` and
            ``PNEUMONIA`` sub-folders. It is iterated once per class, so it
            must be re-iterable (e.g. a list).
        dest_dir: Directory that receives the combined ``NORMAL`` and
            ``PNEUMONIA`` sub-folders; they are created when missing.

    Raises:
        FileNotFoundError: If a source directory lacks one of the class
            sub-folders (raised by ``os.listdir``).
    """
    for label in ["NORMAL", "PNEUMONIA"]:
        combined_dir = os.path.join(dest_dir, label)
        # Without an existing target directory shutil.copy would write a single
        # *file* named after the label and overwrite it on every iteration.
        os.makedirs(combined_dir, exist_ok=True)

        # Names already written during this call, used to detect files from
        # different source folders that share a basename.
        used_names = set()
        for dir_path in source_dirs:
            label_dir = os.path.join(dir_path, label)
            # Sorted so that any collision renaming below is deterministic.
            for name in sorted(os.listdir(label_dir)):
                src = os.path.join(label_dir, name)
                if not os.path.isfile(src):
                    # shutil.copy cannot copy directories; skip them.
                    continue

                # Give a clashing basename a numeric suffix instead of
                # silently overwriting the earlier image.
                stem, ext = os.path.splitext(name)
                target_name = name
                suffix = 1
                while target_name in used_names:
                    target_name = "{}_{}{}".format(stem, suffix, ext)
                    suffix += 1
                used_names.add(target_name)

                shutil.copy(src, os.path.join(combined_dir, target_name))

# Collect and combine all data into a temporary directory
temp_combined_dir = os.path.join(base_dir, "temp_combined")
# exist_ok=True makes the cell re-runnable and also recreates a class
# sub-folder that is missing from an already existing temp directory.
for label in ["NORMAL", "PNEUMONIA"]:
    os.makedirs(os.path.join(temp_combined_dir, label), exist_ok=True)

collect_and_combine_data([train_dir, test_dir, val_dir], temp_combined_dir)


In [12]:
def split_and_distribute_data(source_dir, train_dir, val_dir, test_dir, split_ratio=(0.8, 0.1, 0.1)):
    """Split each class folder of ``source_dir`` into train/val/test and copy the files.

    The split is done separately for ``NORMAL`` and ``PNEUMONIA``, so every
    class is divided with the same proportions.

    Args:
        source_dir: Directory containing ``NORMAL`` and ``PNEUMONIA`` sub-folders.
        train_dir: Destination root for the training files.
        val_dir: Destination root for the validation files.
        test_dir: Destination root for the testing files.
        split_ratio: ``(train, val, test)`` fractions. Only the val and test
            entries are used; the training set receives whatever remains.
    """
    holdout_fraction = split_ratio[1] + split_ratio[2]

    for label in ["NORMAL", "PNEUMONIA"]:
        label_dir = os.path.join(source_dir, label)
        # os.listdir returns entries in arbitrary order. Sorting makes the
        # fixed random_state produce the same split on every run and machine,
        # so a re-run cannot place one image in two different splits.
        files = sorted(
            os.path.join(label_dir, f)
            for f in os.listdir(label_dir)
            if os.path.isfile(os.path.join(label_dir, f))
        )

        # First carve off the combined val+test portion, then divide it.
        train_files, holdout_files = train_test_split(
            files, test_size=holdout_fraction, random_state=42
        )
        val_files, test_files = train_test_split(
            holdout_files, test_size=split_ratio[2] / holdout_fraction, random_state=42
        )

        # Distribute files
        for subset, dest_dir in (
            (train_files, train_dir),
            (val_files, val_dir),
            (test_files, test_dir),
        ):
            target_dir = os.path.join(dest_dir, label)
            # Ensure the class folder exists; otherwise shutil.copy would
            # write a single file named after the label.
            os.makedirs(target_dir, exist_ok=True)
            for file in subset:
                shutil.copy(file, target_dir)

# Split the combined images and copy them into the new train/val/test folders.
split_and_distribute_data(
    source_dir=temp_combined_dir,
    train_dir=new_train_dir,
    val_dir=new_val_dir,
    test_dir=new_test_dir,
)


In [None]:
# Delete the temporary combined folder. The guard keeps the cell re-runnable:
# an unguarded rmtree raises FileNotFoundError once the folder is gone.
if os.path.isdir(temp_combined_dir):
    shutil.rmtree(temp_combined_dir)

In [14]:
# New training split: list each class folder once and reuse the counts.
new_train_normal = "../data/chest_xray/new_train/NORMAL"
new_train_pneumonia = "../data/chest_xray/new_train/PNEUMONIA"
new_train_normal_count = len(os.listdir(new_train_normal))
new_train_pneumonia_count = len(os.listdir(new_train_pneumonia))
print('There are', new_train_normal_count, 'normal images in the new training set')
print('There are', new_train_pneumonia_count, 'pnemonia images in the new training set')
print('Making a total of', new_train_normal_count + new_train_pneumonia_count, 'in the new training set')
print('')

# New testing split.
new_test_normal = "../data/chest_xray/new_test/NORMAL"
new_test_pneumonia = "../data/chest_xray/new_test/PNEUMONIA"
new_test_normal_count = len(os.listdir(new_test_normal))
new_test_pneumonia_count = len(os.listdir(new_test_pneumonia))
print('There are', new_test_normal_count, 'normal images in the new testing set')
print('There are', new_test_pneumonia_count, 'pnemonia images in the new testing set')
print('Making a total of', new_test_normal_count + new_test_pneumonia_count, 'in the new testing set')
print('')

# New validation split.
new_val_normal = "../data/chest_xray/new_val/NORMAL"
new_val_pneumonia = "../data/chest_xray/new_val/PNEUMONIA"
new_val_normal_count = len(os.listdir(new_val_normal))
new_val_pneumonia_count = len(os.listdir(new_val_pneumonia))
print('There are', new_val_normal_count, 'normal images in the new validation set')
print('There are', new_val_pneumonia_count, 'pnemonia images in the new validation set')
print('Making a total of', new_val_normal_count + new_val_pneumonia_count, 'in the new validation set')

There are 1266 normal images in the new training set
There are 3418 pnemonia images in the new training set
Making a total of 4684 in the new training set

There are 159 normal images in the new testing set
There are 428 pnemonia images in the new testing set
Making a total of 587 in the new testing set

There are 158 normal images in the new validation set
There are 427 pnemonia images in the new validation set
Making a total of 585 in the new validation set
