In [1]:
import os
import shutil
import pandas as pd
from sklearn.model_selection import train_test_split

In [7]:
# Define the base directory where the training subfolders are located
base_dir = r'D:\MIET_HeartSound\Dataset\training'

# Define the target dataset directory
dataset_dir = r'D:\MIET_HeartSound\Dataset\Dataset'
train_dir = os.path.join(dataset_dir, 'train')
val_dir = os.path.join(dataset_dir, 'val')

# Define subdirectories for healthy and unhealthy in train and val
train_healthy = os.path.join(train_dir, 'healthy')
train_unhealthy = os.path.join(train_dir, 'unhealthy')
val_healthy = os.path.join(val_dir, 'healthy')
val_unhealthy = os.path.join(val_dir, 'unhealthy')

# Create these directories
os.makedirs(train_healthy, exist_ok=True)
os.makedirs(train_unhealthy, exist_ok=True)
os.makedirs(val_healthy, exist_ok=True)
os.makedirs(val_unhealthy, exist_ok=True)


In [8]:
# Collect all reference data from all subfolders
all_data = pd.DataFrame(columns=['filename', 'class'])
subfolders = ['training-a', 'training-b', 'training-c', 'training-d', 'training-e', 'training-f']

In [9]:
for subfolder in subfolders:
    ref_path = os.path.join(base_dir, subfolder, 'REFERENCE.csv')
    if os.path.exists(ref_path):
        sub_data = pd.read_csv(ref_path, header=None, names=['filename', 'class'])
        sub_data['filepath'] = subfolder  # Add subfolder info to keep track of file locations
        all_data = pd.concat([all_data, sub_data], ignore_index=True)
    else:
        print(f"No reference file in {subfolder}. Skipping.")

No reference file in training-a. Skipping.
No reference file in training-b. Skipping.
No reference file in training-c. Skipping.
No reference file in training-f. Skipping.


In [10]:
# Split the data into train and validation sets
train_data, val_data = train_test_split(all_data, test_size=0.3, stratify=all_data['class'], random_state=42)

In [11]:
# Function to copy files based on dataset split
def copy_files(data, train_or_val_dir):
    for _, row in data.iterrows():
        source_path = os.path.join(base_dir, row['filepath'], f"{row['filename']}.wav")
        target_dir = os.path.join(train_or_val_dir, 'healthy' if row['class'] == '-1' else 'unhealthy')
        if os.path.exists(source_path):
            shutil.copy(source_path, target_dir)
        else:
            print(f"File {row['filename']}.wav not found in {row['filepath']}. Skipping.")


In [12]:
train_data, val_data

(     filename class    filepath
 851    e00797    -1  training-e
 467    e00413    -1  training-e
 374    e00320    -1  training-e
 1684   e01630    -1  training-e
 206    e00152     1  training-e
 ...       ...   ...         ...
 160    e00106    -1  training-e
 360    e00306    -1  training-e
 170    e00116    -1  training-e
 961    e00907    -1  training-e
 1535   e01481    -1  training-e
 
 [1537 rows x 3 columns],
      filename class    filepath
 771    e00717    -1  training-e
 870    e00816    -1  training-e
 243    e00189    -1  training-e
 273    e00219    -1  training-e
 1958   e01904    -1  training-e
 ...       ...   ...         ...
 757    e00703     1  training-e
 30      d0031    -1  training-d
 1409   e01355    -1  training-e
 1888   e01834    -1  training-e
 610    e00556    -1  training-e
 
 [659 rows x 3 columns])

In [13]:
# Copy training data to the appropriate directories
copy_files(train_data, train_dir)

In [21]:
# Copy validation data to the appropriate directories
copy_files(val_data, val_dir)

File filename.wav not found in training-f. Skipping.
File filename.wav not found in training-c. Skipping.
