In [2]:
import os
import shutil
import random


In [3]:

# Configuration
# Source folders, one per class: the fold-building loop labels records taken
# from positive_dir as 1 and records taken from negative_dir as 0.
positive_dir = "/home/physionet/project/data/samitropTrue"
negative_dir = "/home/physionet/project/data/ptbxlFalse"

# Folds are written to <output_base>/fold_<k>/{train,test}.
output_base, n_folds = "fold", 5

# Size and class mix of every test split: 5% positives, the rest negatives.
test_total = 4880
test_pos = int(test_total * 0.05)  # 244
test_neg = test_total - test_pos  # 4636

# Helper to get all .hea file bases from a directory
def get_record_bases(folder):
    """Return the sorted base names of the ``.hea`` entries directly inside ``folder``.

    A base name is the entry name with its trailing ``.hea`` stripped.
    Only the top level of ``folder`` is listed; nested directories are not
    searched.
    """
    suffix = ".hea"
    bases = []
    for entry in os.listdir(folder):
        if entry.endswith(suffix):
            # Slice off exactly the suffix so the rest of the name is untouched.
            bases.append(entry[: -len(suffix)])
    bases.sort()
    return bases

# Collect all records (sorted base names, one list per class)
positive_bases = get_record_bases(positive_dir)
negative_bases = get_record_bases(negative_dir)

# Sanity check: each class must hold at least as many records as one test
# split draws from it, otherwise sampling without replacement would fail.
assert test_pos <= len(positive_bases), "Not enough positive samples for each fold."
assert test_neg <= len(negative_bases), "Not enough negative samples for each fold."

# Seed the global `random` generator once so the fold contents are
# reproducible from run to run.
random.seed(42)

def copy_records(samples, source_dir, target_dir, prefix):
    """Copy the ``.hea``/``.dat`` pair of every sample into ``target_dir``.

    Files keep their original names. A source file that does not exist is
    reported with a printed warning and skipped.

    Args:
        samples: iterable of ``(base_name, label)`` tuples. Label ``1`` is read
            from ``positive_dir``; any other label is read from ``negative_dir``.
        source_dir: unused; kept so existing calls keep working.
        target_dir: existing directory that receives the copies.
        prefix: unused; kept so existing calls keep working.
    """
    for base_name, label in samples:
        # The label, not `source_dir`, decides which class folder is read.
        src_folder = positive_dir if label == 1 else negative_dir
        for ext in (".hea", ".dat"):
            src = os.path.join(src_folder, base_name + ext)
            dst = os.path.join(target_dir, base_name + ext)
            if os.path.exists(src):
                shutil.copyfile(src, dst)
            else:
                print(f"Warning: Missing file {src}")


# NOTE(review): every fold draws its test split independently from the full
# record lists, so test sets of different folds may overlap; this is repeated
# random sub-sampling rather than a disjoint k-fold partition.
# NOTE(review): existing fold directories are reused as-is (exist_ok=True) and
# old files are never removed, so files left by an earlier run with different
# settings would remain next to the new ones.
for fold in range(1, n_folds + 1):
    print(f"Processing fold {fold}...")

    # Create fold directories
    fold_dir = os.path.join(output_base, f"fold_{fold}")
    train_dir = os.path.join(fold_dir, "train")
    test_dir = os.path.join(fold_dir, "test")
    os.makedirs(train_dir, exist_ok=True)
    os.makedirs(test_dir, exist_ok=True)

    # Randomly sample for test set
    test_pos_samples = random.sample(positive_bases, test_pos)
    test_neg_samples = random.sample(negative_bases, test_neg)
    test_samples = [(name, 1) for name in test_pos_samples] + [(name, 0) for name in test_neg_samples]
    random.shuffle(test_samples)

    # Everything else goes to training. Membership is tested against sets so
    # filtering is linear in the number of records; iterating the original
    # lists keeps the element order (and hence the shuffle result) unchanged.
    test_pos_set = set(test_pos_samples)
    test_neg_set = set(test_neg_samples)
    remaining_pos = [r for r in positive_bases if r not in test_pos_set]
    remaining_neg = [r for r in negative_bases if r not in test_neg_set]
    train_samples = [(name, 1) for name in remaining_pos] + [(name, 0) for name in remaining_neg]
    random.shuffle(train_samples)

    # Copy files to fold
    copy_records(test_samples, positive_dir, test_dir, "rec_test")
    copy_records(train_samples, positive_dir, train_dir, "rec_train")

print(f"✅ All {n_folds} folds prepared.")

Processing fold 1...
Processing fold 2...
Processing fold 3...
Processing fold 4...
Processing fold 5...
✅ All 5 folds prepared.


# Check the folders

In [4]:
# Count how many files are in the fold output directory
def count_files_in_directory(directory):
    """Return the number of files under ``directory``, nested sub-directories included."""
    return sum(len(file_names) for _root, _subdirs, file_names in os.walk(directory))
# Settings for the checks below
output_dir = "fold"
n_folds = 5

# Total number of files across the whole output directory
total_files = count_files_in_directory(output_dir)
print(f"Total files in {output_dir}: {total_files}")

# Number of files in the train and test split of each fold
for fold in range(1, n_folds + 1):
    fold_dir = os.path.join(output_dir, f"fold_{fold}")
    # The generator is consumed in order, so "train" is counted before "test".
    train_count, test_count = (
        count_files_in_directory(os.path.join(fold_dir, split)) for split in ("train", "test")
    )
    print(f"Fold {fold}: Train files: {train_count}, Test files: {test_count}")
# Check the number of positive and negative samples in each fold
def count_samples_in_directory(directory):
    """Count positive and negative ``.hea`` files under ``directory``.

    Walks ``directory`` recursively, printing each visited directory. For every
    file ending in ``.hea`` only the first line containing ``# Chagas label:``
    is inspected: it counts as positive if it contains ``True``, otherwise as
    negative if it contains ``False``. Files with no such line, or whose label
    line contains neither word, are not counted.

    Returns:
        A ``(pos_count, neg_count)`` tuple.
    """
    label_marker = "# Chagas label:"
    positives = 0
    negatives = 0
    for root, _subdirs, file_names in os.walk(directory):
        print(f"Checking directory: {root}")
        for file_name in file_names:
            if not file_name.endswith(".hea"):
                continue
            with open(os.path.join(root, file_name), "r") as header:
                # Reading stops at the first label line; later lines are ignored.
                label_line = next((text for text in header if label_marker in text), None)
            if label_line is None:
                continue
            if "True" in label_line:
                positives += 1
            elif "False" in label_line:
                negatives += 1
    return positives, negatives

# Report the positive/negative record counts of the train and test split of each fold
for fold in range(1, n_folds + 1):
    fold_dir = os.path.join(output_dir, f"fold_{fold}")
    # The generator is consumed in order, so "train" is scanned before "test".
    (train_pos_count, train_neg_count), (test_pos_count, test_neg_count) = (
        count_samples_in_directory(os.path.join(fold_dir, split)) for split in ("train", "test")
    )
    print(f"Fold {fold}: Train Positive: {train_pos_count}, Train Negative: {train_neg_count}, Test Positive: {test_pos_count}, Test Negative: {test_neg_count}")

Total files in fold: 226140
Fold 1: Train files: 35468, Test files: 9760
Fold 2: Train files: 35468, Test files: 9760
Fold 3: Train files: 35468, Test files: 9760
Fold 4: Train files: 35468, Test files: 9760
Fold 5: Train files: 35468, Test files: 9760
Checking directory: fold/fold_1/train
Checking directory: fold/fold_1/test
Fold 1: Train Positive: 571, Train Negative: 17163, Test Positive: 244, Test Negative: 4636
Checking directory: fold/fold_2/train
Checking directory: fold/fold_2/test
Fold 2: Train Positive: 571, Train Negative: 17163, Test Positive: 244, Test Negative: 4636
Checking directory: fold/fold_3/train
Checking directory: fold/fold_3/test
Fold 3: Train Positive: 571, Train Negative: 17163, Test Positive: 244, Test Negative: 4636
Checking directory: fold/fold_4/train
Checking directory: fold/fold_4/test
Fold 4: Train Positive: 571, Train Negative: 17163, Test Positive: 244, Test Negative: 4636
Checking directory: fold/fold_5/train
Checking directory: fold/fold_5/test
Fold