In [20]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/diagnosis-data/patient_diagnosis.csv
/kaggle/input/test-data/EP76_N,N,A L U,31,M.wav
/kaggle/input/test-data/EP93_N,N,P R M,75,M.wav
/kaggle/input/test-data/BP13_Heart Failure,C,P L L,55,M.wav
/kaggle/input/test-data/BP23_Lung Fibrosis,Crep,P R L ,50,M.wav
/kaggle/input/test-data/EP67_heart failure,Crep,P R L ,24,F.wav
/kaggle/input/test-data/EP79_asthma,E W,A R U,46,M.wav
/kaggle/input/test-data/BP24_heart failure,Crep,P R L ,76,F.wav
/kaggle/input/test-data/DP32_N,N,A R L,30,M.wav
/kaggle/input/test-data/DP83_N,N,A R U,18,F.wav
/kaggle/input/test-data/EP58_asthma,E W,P L M,40,M.wav
/kaggle/input/test-data/BP50_N,N,P R L ,27,M.wav
/kaggle/input/test-data/DP36_pneumonia,Crep,P R M,36,F.wav
/kaggle/input/test-data/EP21_BRON,Crep,P R L ,20,M.wav
/kaggle/input/test-data/BP69_pneumonia,Bronchial,P R L ,64,M.wav
/kaggle/input/test-data/BP16_heart failure,C,P R U,56,M.wav
/kaggle/input/test-data/EP44_asthma,E W,P R U,40,M.wav
/kaggle/input/test-data/DP97_Asthma,E W,P R U,24,M.w

In [10]:
import os
import librosa
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder, StandardScaler
import warnings

warnings.filterwarnings("ignore", category=UserWarning)

In [11]:
def preprocess_audio(file_path, target_duration=5.0, sr=22050):
    """
    Preprocesses the audio file:
    - Trims or pads to a fixed duration.
    - Focuses on respiratory cycles with crackles/wheezes.
    """
    # Load the audio
    audio, _ = librosa.load(file_path, sr=sr)
    
    # Trim silence
    audio, _ = librosa.effects.trim(audio)
    
    # Pad or truncate to target duration
    target_samples = int(target_duration * sr)
    if len(audio) > target_samples:
        audio = audio[:target_samples]
    elif len(audio) < target_samples:
        audio = np.pad(audio, (0, target_samples - len(audio)), 'constant')
    
    return audio

In [12]:
def extract_features(audio, sr=22050):
    """
    Extract multiple audio features:
    - Mel-frequency cepstral coefficients (MFCCs)
    - Zero-crossing rate (ZCR)
    - Chroma feature
    - Spectral contrast
    """
    try:
        # Compute MFCCs
        mfccs = np.mean(librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13).T, axis=0)
        # Compute zero-crossing rate
        zcr = np.mean(librosa.feature.zero_crossing_rate(y=audio))
        # Compute chroma features
        chroma = np.mean(librosa.feature.chroma_stft(y=audio, sr=sr).T, axis=0)
        # Compute spectral contrast
        spectral_contrast = np.mean(librosa.feature.spectral_contrast(y=audio, sr=sr).T, axis=0)
        
        return np.concatenate([mfccs, [zcr], chroma, spectral_contrast])
    except Exception as e:
        print(f"Feature extraction error: {e}")
        return None

In [13]:
def parse_annotations(annotation_file):
    """
    Parse annotations to identify respiratory cycles with crackles/wheezes.
    """
    data = pd.read_csv(annotation_file, sep='\s+', header=None, 
                       names=["start", "end", "crackles", "wheezes"])
    if data.empty:
        return None, None
    return data["crackles"].sum(), data["wheezes"].sum()

In [14]:
def main_pipeline(audio_dir, annotation_dir, diagnosis_file):
    # Load the diagnosis data
    diagnosis_df = pd.read_csv(diagnosis_file)
    
    features, labels = [], []
    
    for file in os.listdir(audio_dir):
        if file.endswith(".wav"):
            audio_file = os.path.join(audio_dir, file)
            annotation_file = os.path.join(annotation_dir, f"{os.path.splitext(file)[0]}.txt")
            
            if not os.path.exists(annotation_file):
                print(f"Annotation file missing for {file}, skipping...")
                continue
            try:
                # Preprocess audio
                audio = preprocess_audio(audio_file)
                
                # Extract features
                feature = extract_features(audio)

                # Parse annotation
                crackles, wheezes = parse_annotations(annotation_file)
                if crackles is None or wheezes is None:
                    print(f"No valid annotations for {file}, skipping...")
                    continue
                # Get the patient ID and diagnosis
                patient_id = file.split('_')[0]  # Assuming patient ID is the prefix
                diagnosis = diagnosis_df.loc[diagnosis_df["Patient ID"] == int(patient_id), "Diagnosis"]
                if diagnosis.empty:
                    print(f"No diagnosis found for {file}, skipping...")
                    continue
                
                # Store features and labels
                features.append(feature)
                labels.append(diagnosis.values[0])
            except Exception as e:
                print(f"Error processing {file}: {e}")
                continue
    
    return np.array(features), np.array(labels)

In [15]:
# Train and classify
def classify(features, labels):
    """
    Train RandomForestClassifier and evaluate on test data.
    """
    le = LabelEncoder()
    labels = le.fit_transform(labels)
    X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)
    
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    
    # Adjust the target_names to match only those present in y_test
    unique_classes = np.unique(y_test)  # Classes actually in the test set
    target_names = [le.classes_[i] for i in unique_classes]
    
    print("Classification Report:")
    print(classification_report(y_test, y_pred, labels=unique_classes, target_names=target_names))
    print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
    
    return clf, scaler, le

In [17]:
def predict_new_file(clf, scaler, le, audio_path):
    """
    Predicts the diagnosis for a given audio file.
    """
    audio = preprocess_audio(audio_path)
    feature = extract_features(audio).reshape(1, -1)
    feature = scaler.transform(feature)
    prediction = clf.predict(feature)
    return le.inverse_transform(prediction)

In [18]:
import joblib

def save_model(clf, scaler, le, model_path="/kaggle/working/lung_disease_model.joblib"):
    """
    Save the trained model, scaler, and label encoder to a file.
    """
    model_data = {
        "classifier": clf,
        "scaler": scaler,
        "label_encoder": le
    }
    joblib.dump(model_data, model_path)
    print(f"Model saved to {model_path}")


In [22]:
if __name__ == "__main__":
    AUDIO_DIR = "/kaggle/input/dsp-project/Seperated files/audio files"  # Update with actual path
    ANNOTATION_DIR = "/kaggle/input/dsp-project/Seperated files/txt files"  # Update with actual path
    DIAGNOSIS_FILE = "/kaggle/input/diagnosis-data/patient_diagnosis.csv"
    
    features, labels = main_pipeline(AUDIO_DIR, ANNOTATION_DIR, DIAGNOSIS_FILE)
    if len(features) > 0 and len(labels) > 0:
        clf, scaler, le = classify(features, labels)
        
        # Save the trained model
        save_model(clf, scaler, le)
        
        # Test with a new file
        test_audio_path = "/kaggle/input/test-data/BP107_Asthma,E W,P L U,59,F.wav"  # Replace with an actual audio file path
        prediction = predict_new_file(clf, scaler, le, test_audio_path)
        print(f"Prediction for {test_audio_path}: {prediction[0]}")
    else:
        print("No valid data processed.")


Classification Report:
                precision    recall  f1-score   support

        Asthma       0.00      0.00      0.00         1
Bronchiectasis       1.00      0.33      0.50         3
 Bronchiolitis       0.00      0.00      0.00         3
          COPD       0.94      0.99      0.96       160
       Healthy       0.75      0.60      0.67        10
     Pneumonia       0.20      0.25      0.22         4
          URTI       0.00      0.00      0.00         3

      accuracy                           0.91       184
     macro avg       0.41      0.31      0.34       184
  weighted avg       0.87      0.91      0.89       184

Accuracy: 0.907608695652174
Model saved to /kaggle/working/lung_disease_model.joblib
Prediction for /kaggle/input/test-data/BP107_Asthma,E W,P L U,59,F.wav: COPD
