In [1]:
from google.colab import drive
drive.mount('/content/drive',force_remount=True)

import os
import pandas as pd

# Load the initial labeled CSV file
csv_path = '/content/drive/MyDrive/Data Science AUEB/Deep Learning/datasets/Other Useful/labeled wavs/Initial_wav_labeled.csv'
labels_df = pd.read_csv(csv_path)

# Directory containing the augmented wav files
augmented_wav_dir = '/content/drive/MyDrive/Data Science AUEB/Deep Learning/datasets/Other Useful/WAV Files/Augmented WAV Files'

# Get a list of augmented wav filenames in the directory
augmented_wav_files = [f for f in os.listdir(augmented_wav_dir) if f.endswith('.wav')]

# Prepare a list to hold the data for augmented files
augmented_data = []

# Process each augmented wav file
for wav_file in augmented_wav_files:
    # Extract the base filename (before '_snippet')
    base_filename = wav_file.split('_snippet')[0] + '.wav'

    # Check if the base filename exists in the labeled data
    label_row = labels_df[labels_df['Filename'] == base_filename]

    if not label_row.empty:
        # If found, create a dictionary with the augmented filename and all columns from the original row
        row_dict = label_row.iloc[0].to_dict()
        row_dict['Filename'] = wav_file  # Overwrite 'Filename' with the augmented filename
        augmented_data.append(row_dict)
    else:
        # If not found, create a dictionary for the augmented file with 'Unknown' for Diagnosis and other columns as None/NaN
        # You might want to adjust how 'Unknown' or missing values are handled for other columns
        unknown_row = {'Filename': wav_file, 'Diagnosis': 'Unknown'}
        # Add other columns from labels_df with NaN or None if they are not in the 'unknown_row'
        for col in labels_df.columns:
            if col not in unknown_row:
                unknown_row[col] = None # or pd.NA depending on your preference
        augmented_data.append(unknown_row)


# Convert the data to a DataFrame
augmented_df = pd.DataFrame(augmented_data)

# Ensure the columns in augmented_df match the order of labels_df and include all of them
# This handles cases where some augmented files might not have a base filename match
augmented_df = augmented_df[labels_df.columns.tolist()]


# Save the result to a new CSV file
augmented_df_path = '/content/drive/MyDrive/Data Science AUEB/Deep Learning/datasets/Other Useful/labeled wavs/augmented_labels2.csv'
augmented_df.to_csv(augmented_df_path, index=False)

# Display the path for the generated file
print(f"Augmented labels saved to: {augmented_df_path}")

Mounted at /content/drive
Augmented labels saved to: /content/drive/MyDrive/Data Science AUEB/Deep Learning/datasets/Other Useful/labeled wavs/augmented_labels2.csv


In [2]:
labels_df

Unnamed: 0,WAV File ID,Patient ID,Filename,Diagnosis
0,1.0,1.0,101_1b1_Al_sc_Meditron.wav,URTI
1,2.0,1.0,101_1b1_Pr_sc_Meditron.wav,URTI
2,3.0,2.0,102_1b1_Ar_sc_Meditron.wav,Healthy
3,4.0,3.0,103_2b2_Ar_mc_LittC2SE.wav,Asthma
4,5.0,4.0,104_1b1_Al_sc_Litt3200.wav,COPD
...,...,...,...,...
1698,1756.0,280.0,H050_R2.wav,COPD
1699,1757.0,280.0,H050_R3.wav,COPD
1700,1758.0,280.0,H050_R4.wav,COPD
1701,1759.0,280.0,H050_R5.wav,COPD


In [3]:
augmented_df


Unnamed: 0,WAV File ID,Patient ID,Filename,Diagnosis
0,919.0,126.0,226_1b1_Ll_sc_Meditron_snippet_6.wav_stretch_0...,Pneumonia
1,919.0,126.0,226_1b1_Ll_sc_Meditron_snippet_6.wav_stretch_0...,Pneumonia
2,919.0,126.0,226_1b1_Ll_sc_Meditron_snippet_6.wav_pitch_-2.wav,Pneumonia
3,919.0,126.0,226_1b1_Ll_sc_Meditron_snippet_6.wav_pitch_1.wav,Pneumonia
4,919.0,126.0,226_1b1_Ll_sc_Meditron_snippet_6.wav_noise_0.0...,Pneumonia
...,...,...,...,...
10635,475.0,64.0,164_1b1_Ll_sc_Meditron_snippet_4.wav_stretch_0...,URTI
10636,475.0,64.0,164_1b1_Ll_sc_Meditron_snippet_4.wav_stretch_0...,URTI
10637,475.0,64.0,164_1b1_Ll_sc_Meditron_snippet_4.wav_pitch_-2.wav,URTI
10638,475.0,64.0,164_1b1_Ll_sc_Meditron_snippet_4.wav_pitch_1.wav,URTI


In [4]:
final_df=pd.concat([labels_df,augmented_df],ignore_index=True)


In [5]:
import pandas as pd
import re
from tqdm import tqdm # Assuming tqdm is now installed

# --- Data Loading ---
# Load the main combined DataFrame containing spectrogram information and initial labels
df = final_df.copy()

# Load individual domain DataFrames (these contain original filenames and their domains)
kauh_domain_df = pd.read_csv('/content/drive/MyDrive/Data Science AUEB/Deep Learning/datasets/Other Useful/labeled wavs/KAUH_labeled_wav.csv')
tr_domain_df = pd.read_csv('/content/drive/MyDrive/Data Science AUEB/Deep Learning/datasets/Other Useful/labeled wavs/TR_labeled_wav.csv')
icbhi_domain_df = pd.read_csv('/content/drive/MyDrive/Data Science AUEB/Deep Learning/datasets/Other Useful/labeled wavs/ICBHI_labeled_wav.csv')

print("Initial 'df' head:")
print(df.head())
print("\nInitial 'icbhi_domain_df' head:")
print(icbhi_domain_df.head())

# --- PREPROCESSING FILENAMES FOR ROBUST MERGING ---
def clean_filename(filename):
    """
    Cleans a filename string for standardization.
    Removes common audio/image extensions, converts to lowercase, and strips whitespace.
    """
    if isinstance(filename, str):
        filename = filename.lower().strip()
        # Remove common extensions (e.g., .wav, .mp3, .png, .ogg) from the end
        filename = re.sub(r'\.(wav|mp3|png|ogg)$', '', filename)
        return filename
    return str(filename).lower().strip() # Ensure it's a string even if it's not initially


# Apply cleaning to all relevant filename columns in both main and domain DFs
df['Filename_Clean'] = df['Filename'].apply(clean_filename)

kauh_domain_df['Filename_Clean'] = kauh_domain_df['Filename'].apply(clean_filename)
tr_domain_df['Filename_Clean'] = tr_domain_df['Filename'].apply(clean_filename)
icbhi_domain_df['Filename_Clean'] = icbhi_domain_df['Filename'].apply(clean_filename)


# Add a 'Domain' column to each individual DataFrame for consistent merging
kauh_domain_df['Domain'] = 'KAUH'
tr_domain_df['Domain'] = 'TR'
icbhi_domain_df['Domain'] = 'ICBHI'

# Combine the individual domain DFs into a single 'all_domain_dfs'
# 'Patient ID' columns have been removed as per the request.
all_domain_dfs = pd.concat([
    kauh_domain_df[['Filename_Clean', 'Domain']],
    tr_domain_df[['Filename_Clean', 'Domain']],
    icbhi_domain_df[['Filename_Clean', 'Domain']]
], ignore_index=True)

# Drop duplicates based on the cleaned filename.
all_domain_dfs.drop_duplicates(subset=['Filename_Clean'], inplace=True)


# --- PERFORMING SUBSTRING MATCHING FOR DOMAIN ASSIGNMENT ---
print("\nAttempting domain assignment using substring matching...")
initial_df_rows = len(df)
df['Domain'] = None # Initialize a new 'Domain' column
# 'Patient_ID' column initialization has been removed.

# Create dictionaries for faster lookup: original_cleaned_filename -> domain_name
domain_lookup = dict(zip(all_domain_dfs['Filename_Clean'], all_domain_dfs['Domain']))
# 'patient_id_lookup' dictionary creation has been removed.


for idx, row in tqdm(df.iterrows(), total=len(df), desc="Assigning domains"):
    current_spectrogram_filename = row['Filename_Clean']
    assigned_domain = None

    # First, try a direct exact match (most efficient)
    if current_spectrogram_filename in domain_lookup:
        assigned_domain = domain_lookup[current_spectrogram_filename]
    else:
        # If no direct match, iterate through known original filenames and check if they are
        # a substring of the current spectrogram filename.
        for original_base_filename, domain_name in domain_lookup.items():
            if original_base_filename in current_spectrogram_filename:
                assigned_domain = domain_name
                break # Found a match, move to the next spectrogram

    df.at[idx, 'Domain'] = assigned_domain
    # 'Patient_ID' assignment has been removed.


print(f"Initial rows in df: {initial_df_rows}")
print(f"Rows in df after domain assignment: {len(df)}")


# --- DEBUGGING ASSIGNMENT RESULTS ---
unmatched_filenames = df[df['Domain'].isnull()]
if not unmatched_filenames.empty:
    print("\n--- WARNING: Unmatched Filenames (after substring matching) ---")
    print(f"{len(unmatched_filenames)} filenames in 'spectrograms_labeled.csv' still did not match any domain.")
    print("Top 5 unmatched filenames (original column):")
    print(unmatched_filenames['Filename'].head())
    print("Top 5 unmatched filenames (cleaned for matching):")
    print(unmatched_filenames['Filename_Clean'].head())

    print("\nSample cleaned original filenames from ICBHI_domain_df (for comparison):")
    print(icbhi_domain_df['Filename_Clean'].sample(min(5, len(icbhi_domain_df))).tolist())


# Basic validation: ensure the 'Domain' column was populated
if df['Domain'].isnull().all():
    raise ValueError("Error: 'Domain' column is entirely NaN after domain assignment. Please check filename consistency and matching logic.")

# Convert string domain names to numerical IDs (e.g., KAUH:0, TR:1, ICBHI:2)
domain_id_map = {domain: i for i, domain in enumerate(df['Domain'].dropna().unique())}
df['Domain_ID'] = df['Domain'].map(domain_id_map)

# Handle any remaining unmatched files by assigning a specific ID (e.g., -1).
print("Assigning '-1' as Domain_ID for unmatched filenames (NaN in 'Domain' column).")
df['Domain_ID'] = df['Domain_ID'].fillna(-1)
df['Domain_ID'] = df['Domain_ID'].astype(int) # Ensure integer type

# The final DataFrame
final_df = df.copy()
final_df = final_df[final_df['Domain_ID'] != -1]
final_df = final_df.drop(columns=['Filename_Clean'])

print("\nDataFrame Head with new 'Domain' and 'Domain_ID' columns:")
print(final_df.head())
print("\nDistribution of 'Domain' in the combined DataFrame:")
print(final_df['Domain'].value_counts())
print("\nDistribution of 'Domain_ID' in the combined DataFrame:")
print(final_df['Domain_ID'].value_counts())
print("\nActual Domain ID Mapping:")
print(domain_id_map)
# The print statement for 'Patient_ID' distribution has been removed.


Initial 'df' head:
   WAV File ID  Patient ID                    Filename Diagnosis
0          1.0         1.0  101_1b1_Al_sc_Meditron.wav      URTI
1          2.0         1.0  101_1b1_Pr_sc_Meditron.wav      URTI
2          3.0         2.0  102_1b1_Ar_sc_Meditron.wav   Healthy
3          4.0         3.0  103_2b2_Ar_mc_LittC2SE.wav    Asthma
4          5.0         4.0  104_1b1_Al_sc_Litt3200.wav      COPD

Initial 'icbhi_domain_df' head:
   Patient ID                    Filename Diagnosis
0           1  101_1b1_Al_sc_Meditron.wav      URTI
1           1  101_1b1_Pr_sc_Meditron.wav      URTI
2           2  102_1b1_Ar_sc_Meditron.wav   Healthy
3           3  103_2b2_Ar_mc_LittC2SE.wav    Asthma
4           4  104_1b1_Al_sc_Litt3200.wav      COPD

Attempting domain assignment using substring matching...


Assigning domains: 100%|██████████| 12343/12343 [00:01<00:00, 6641.17it/s]

Initial rows in df: 12343
Rows in df after domain assignment: 12343
Assigning '-1' as Domain_ID for unmatched filenames (NaN in 'Domain' column).

DataFrame Head with new 'Domain' and 'Domain_ID' columns:
   WAV File ID  Patient ID                    Filename Diagnosis Domain  \
0          1.0         1.0  101_1b1_Al_sc_Meditron.wav      URTI  ICBHI   
1          2.0         1.0  101_1b1_Pr_sc_Meditron.wav      URTI  ICBHI   
2          3.0         2.0  102_1b1_Ar_sc_Meditron.wav   Healthy  ICBHI   
3          4.0         3.0  103_2b2_Ar_mc_LittC2SE.wav    Asthma  ICBHI   
4          5.0         4.0  104_1b1_Al_sc_Litt3200.wav      COPD  ICBHI   

   Domain_ID  
0          0  
1          0  
2          0  
3          0  
4          0  

Distribution of 'Domain' in the combined DataFrame:
Domain
KAUH     7029
ICBHI    4810
TR        504
Name: count, dtype: int64

Distribution of 'Domain_ID' in the combined DataFrame:
Domain_ID
1    7029
0    4810
2     504
Name: count, dtype: int64

Act




In [6]:
final_df

Unnamed: 0,WAV File ID,Patient ID,Filename,Diagnosis,Domain,Domain_ID
0,1.0,1.0,101_1b1_Al_sc_Meditron.wav,URTI,ICBHI,0
1,2.0,1.0,101_1b1_Pr_sc_Meditron.wav,URTI,ICBHI,0
2,3.0,2.0,102_1b1_Ar_sc_Meditron.wav,Healthy,ICBHI,0
3,4.0,3.0,103_2b2_Ar_mc_LittC2SE.wav,Asthma,ICBHI,0
4,5.0,4.0,104_1b1_Al_sc_Litt3200.wav,COPD,ICBHI,0
...,...,...,...,...,...,...
12338,475.0,64.0,164_1b1_Ll_sc_Meditron_snippet_4.wav_stretch_0...,URTI,ICBHI,0
12339,475.0,64.0,164_1b1_Ll_sc_Meditron_snippet_4.wav_stretch_0...,URTI,ICBHI,0
12340,475.0,64.0,164_1b1_Ll_sc_Meditron_snippet_4.wav_pitch_-2.wav,URTI,ICBHI,0
12341,475.0,64.0,164_1b1_Ll_sc_Meditron_snippet_4.wav_pitch_1.wav,URTI,ICBHI,0


In [7]:
final_df.to_csv('/content/drive/MyDrive/Data Science AUEB/Deep Learning/datasets/Other Useful/labeled wavs/final_labels.csv', index=False)

In [None]:
# prompt: find the domains and drop the TR

# Find the domains
domains = final_df['Domain'].unique()
print("Original Domains:", domains)

# Drop rows where the domain is 'TR'
final_df_no_tr = final_df[final_df['Domain'] != 'TR'].copy()

# Display the updated domain distribution
print("\nDomain Distribution after dropping 'TR':")
print(final_df_no_tr['Domain'].value_counts())

# Update the domain_id_map based on the remaining domains
# Get the unique remaining domains
remaining_domains = final_df_no_tr['Domain'].dropna().unique()
# Create a new mapping for the remaining domains
new_domain_id_map = {domain: i for i, domain in enumerate(remaining_domains)}

# Update the 'Domain_ID' column based on the new mapping
final_df_no_tr['Domain_ID'] = final_df_no_tr['Domain'].map(new_domain_id_map)

# Display the updated domain ID distribution and mapping
print("\nDomain_ID Distribution after dropping 'TR':")
print(final_df_no_tr['Domain_ID'].value_counts())
print("\nUpdated Domain ID Mapping:")
print(new_domain_id_map)

final_df_no_tr.to_csv('/content/drive/MyDrive/Data Science AUEB/Deep Learning/datasets/Other Useful/labeled wavs/final_labels_no_tr.csv', index=False)

Original Domains: ['ICBHI' 'KAUH' 'TR']

Domain Distribution after dropping 'TR':
Domain
KAUH     7029
ICBHI    4810
Name: count, dtype: int64

Domain_ID Distribution after dropping 'TR':
Domain_ID
1    7029
0    4810
Name: count, dtype: int64

Updated Domain ID Mapping:
{'ICBHI': 0, 'KAUH': 1}
