In [None]:
# Data preprocessing: Coswara (cough vs. non-cough recordings)

import os
import librosa
import numpy as np
import soundfile as sf
import shutil

# --- AUDIO FORMAT ---
# Every clip is resampled to this rate (Hz) and forced to this many samples.
target_sr = 16000
target_length = int(2.0 * target_sr)  # 2 seconds

# --- FOLDER PATHS ---
# Root scanned for cough recordings (one sub-folder level deep).
kaggle_cough_folder = r"D:\ML\Kaggle\coswara_data\kaggle_data"
# Root scanned for speech / breathing / counting recordings.
original_non_cough_folder = r"D:\ML\COSWARA\20220224\20220224"
# Output root (presumably where processed clips are written; not used in this cell).
output_folder = r"D:\ML\COSWARA\processed_output"

# --- AUDIO PROCESSING ---

def load_and_process_wav(fpath):
    """Load an audio file and force it to exactly ``target_length`` samples.

    The file is read with ``librosa.load`` at ``target_sr``. A signal shorter
    than ``target_length`` is padded at the end with ``np.pad`` (default
    constant mode, i.e. zeros); anything else is cut to the first
    ``target_length`` samples.

    Args:
        fpath: Path of the audio file to read.

    Returns:
        The loaded signal, padded or truncated to ``target_length`` samples.
    """
    samples, _sr = librosa.load(fpath, sr=target_sr)
    shortfall = target_length - len(samples)
    if shortfall > 0:
        return np.pad(samples, (0, shortfall))
    return samples[:target_length]

# --- GATHER FILES ---

def _collect_wav_files(root, keywords):
    """Collect .wav files exactly one folder level below *root*.

    A file is kept when its lower-cased name ends with ".wav" and contains at
    least one of *keywords*. Both checks are case-insensitive, so "Cough.WAV"
    is matched as well as "cough.wav".

    Args:
        root: Directory whose immediate sub-directories are scanned.
        keywords: Lower-case substrings; a file needs to contain any one.

    Returns:
        List of full file paths. Folders and files are visited in sorted
        order so the result is reproducible (os.listdir order is arbitrary).
    """
    matches = []
    for folder in sorted(os.listdir(root)):
        folder_path = os.path.join(root, folder)
        if not os.path.isdir(folder_path):
            continue  # loose files directly under root are ignored
        for file in sorted(os.listdir(folder_path)):
            name = file.lower()
            if name.endswith(".wav") and any(key in name for key in keywords):
                matches.append(os.path.join(folder_path, file))
    return matches


# Kaggle Cough
cough_files = _collect_wav_files(kaggle_cough_folder, ("cough",))

# Original Non-Cough
non_cough_files = _collect_wav_files(
    original_non_cough_folder,
    ("speech", "breath", "sentence", "counting"),
)

print(f" Found {len(cough_files)} cough files")
print(f" Found {len(non_cough_files)} non-cough files")

In [None]:
# Extract cough and no_event segments from FluSense audio — mono, 16 kHz, padded/clipped to 2 s
import os
import pandas as pd
import textgrid
import librosa
import soundfile as sf
from pathlib import Path
import numpy as np

# --- PATHS ---

# Audio root: holds the "FluSense audio-*" folders (001, 002, ...)
AUDIO_PARENT = Path("D:/Project/Data/ZIP FILES/Flusense")

# Directory containing the .TextGrid annotation files
TEXTGRID_DIR = Path("C:/Users/DELL/FluSense-data/flusense_data")

# Output root, with one sub-folder per class (created up front if missing)
OUTPUT_BASE = Path("D:/ML/FluSense-processed")
(OUTPUT_BASE / "cough").mkdir(parents=True, exist_ok=True)
(OUTPUT_BASE / "no_event").mkdir(parents=True, exist_ok=True)

# --- SETTINGS ---

# Clip format: 16 kHz, 2 seconds → 32000 samples
target_sr = 16000
target_length = int(2.0 * target_sr)

# Annotation labels treated as the positive class
TARGET_LABELS = {"cough"}
NO_EVENT_LABEL = "no_event"

# --- EXTRACT FUNCTION ---

def _find_audio_file(base_name):
    """Return the first existing "<base_name>.wav" under AUDIO_PARENT, or None.

    Looks in "<AUDIO_PARENT>/FluSense audio-*/FluSense audio/".
    """
    for subfolder in AUDIO_PARENT.glob("FluSense audio-*"):
        candidate = subfolder / "FluSense audio" / (base_name + ".wav")
        if candidate.exists():
            return candidate
    return None


def extract_segments():
    """Cut labelled intervals out of the FluSense recordings into 2-second clips.

    For every ``*.TextGrid`` file in TEXTGRID_DIR the matching ``.wav`` is
    located, the first tier named "label", "word" or "words" is read, and each
    interval whose label is in TARGET_LABELS (written to ``cough/``) or is
    "silence"/"speech" (written to ``no_event/``) is saved under OUTPUT_BASE
    as a ``target_sr`` clip padded or clipped to ``target_length`` samples.
    Intervals with any other label are ignored.

    Files with missing audio or no matching tier are reported and skipped.
    Any other failure while processing a file is reported and the run
    continues with the next file (best effort).

    Returns:
        dict with the number of clips written per class:
        ``{"cough": int, "no_event": int}``.
    """
    counts = {"cough": 0, "no_event": 0}

    for tg_file in TEXTGRID_DIR.glob("*.TextGrid"):
        # Keep only the part of the file name before the first dot
        base_name = tg_file.stem.split(".")[0]

        wav_file = _find_audio_file(base_name)
        if wav_file is None:
            print(f" Missing audio for {base_name}")
            continue

        try:
            tg = textgrid.TextGrid.fromFile(str(tg_file))
            tier = next((t for t in tg.tiers if t.name.lower() in {"label", "word", "words"}), None)
            if tier is None:
                print(f" No matching tier in {tg_file.name}")
                continue

            # Decode only after the tier check: loading and resampling is the
            # expensive step and is pointless when there is nothing to cut.
            audio, _ = librosa.load(wav_file, sr=target_sr, mono=True)  # Force mono, resample to 16kHz

            for i, interval in enumerate(tier.intervals):
                label = interval.mark.strip().lower()

                # Decide the destination first so unused labels cost nothing
                if label in TARGET_LABELS:
                    out_dir, out_name = "cough", f"{base_name}_{label}_{i}.wav"
                elif label in {"silence", "speech"}:
                    out_dir, out_name = "no_event", f"{base_name}_no_event_{i}.wav"
                else:
                    continue

                start_sample = int(interval.minTime * target_sr)
                end_sample = int(interval.maxTime * target_sr)
                segment = audio[start_sample:end_sample]
                if len(segment) == 0:
                    # Interval is empty or lies outside the decoded audio;
                    # saving it would produce a clip made purely of padding.
                    continue

                # Pad/clip to 2 seconds
                if len(segment) < target_length:
                    segment = np.pad(segment, (0, target_length - len(segment)))
                else:
                    segment = segment[:target_length]

                # Save file
                sf.write(OUTPUT_BASE / out_dir / out_name, segment, target_sr)
                counts[out_dir] += 1

        except Exception as e:
            # Best effort: one unreadable file must not abort the whole run
            print(f" Failed to process {tg_file.name}: {e}")

    return counts

# --- RUN EXTRACTION ---

counts = extract_segments()

# --- DISPLAY COUNTS ---
# One row per class, with a single "clips" column holding the count
df_counts = pd.Series(counts, name="clips").to_frame()
print("\n Extracted Segment Counts:")
display(df_counts)