# Steps for Data Preprocessing
- Convert all audio files in the input directory to WAV format
- Clean background noise from wav files
- Match audio and JSON files
- Load annotations
- Extract audio segments
- Extract features (MFCC) from audio segments
- Try the SMOTEENN, SMOTE, and random data-balancing strategies
- Evaluate the strategies and select the best one
- Split the dataset and save it

## Import required libraries

In [1]:
import os
from pydub import AudioSegment
import json
import librosa
import numpy as np
from pydub.silence import detect_nonsilent, split_on_silence
from scipy.io import wavfile
import noisereduce as nr



## Convert all audio files to 'wav' format

In [2]:
# Define the input directory containing audio files and output directory
input_directory = 'data'  # Change this to your input directory
output_directory = 'output/wav_files'

# Extensions that are converted; add other formats as needed
convertible_extensions = ('.mp4', '.mp3', '.ogg', '.flac', '.m4a', '.wma', '.mkv')

# Create output directory if it doesn't exist
# (exist_ok avoids the separate check-then-create step)
os.makedirs(output_directory, exist_ok=True)

# Convert all audio files in the input directory to WAV format
for filename in os.listdir(input_directory):
    # Compare case-insensitively so files such as "TAKE1.MP3" are not silently skipped
    if not filename.lower().endswith(convertible_extensions):
        continue

    file_path = os.path.join(input_directory, filename)
    try:
        # Load the audio file
        audio = AudioSegment.from_file(file_path)
        # Output keeps the input's base name and swaps the extension for .wav
        output_file_path = os.path.join(output_directory, f"{os.path.splitext(filename)[0]}.wav")
        # Export as WAV
        audio.export(output_file_path, format='wav')
        print(f"Converted {filename} to {output_file_path}")
    except Exception as e:
        # Best effort: one unreadable file must not stop the rest of the batch
        print(f"Failed to convert {filename}: {e}")

Converted Voice 002 (2).m4a to output/wav_files/Voice 002 (2).wav
Converted English 1st conversation.m4a to output/wav_files/English 1st conversation.wav
Converted progga.m4a to output/wav_files/progga.wav
Converted keralas calture.mp3 to output/wav_files/keralas calture.wav
Converted SS practice.mp3 to output/wav_files/SS practice.wav
Converted R10.m4a to output/wav_files/R10.wav
Converted Oct hhjh.mp3 to output/wav_files/Oct hhjh.wav
Converted Voice 001_sd.m4a to output/wav_files/Voice 001_sd.wav
Converted 2024-02-27 21-05-26.mkv to output/wav_files/2024-02-27 21-05-26.wav
Converted cricket toppic.m4a to output/wav_files/cricket toppic.wav
Converted Voice 002.m4a to output/wav_files/Voice 002.wav
Converted sanjid meyad.mp3 to output/wav_files/sanjid meyad.wav
Converted R7.m4a to output/wav_files/R7.wav
Converted Voice 003.m4a to output/wav_files/Voice 003.wav
Converted tamanna.m4a to output/wav_files/tamanna.wav
Converted Voice 001 (2).m4a to output/wav_files/Voice 001 (2).wav
Conver

In [3]:
def remove_noise(audio_file, output_folder):
    """
    Strip silent stretches from an audio file and save the result as WAV.

    Despite its name, this function performs no spectral noise reduction: the
    recording is downmixed to mono, split wherever the level stays below the
    silence threshold for at least 500 ms, and the remaining chunks are joined
    back together.

    Args:
        audio_file: Path of the audio file to process.
        output_folder: Directory that receives the cleaned file (this function
            does not create it). The output keeps the input's base name but
            always carries a ``.wav`` extension, matching the exported format.
    """
    # Load the audio file
    audio = AudioSegment.from_file(audio_file)

    # Convert to mono for processing (if needed)
    if audio.channels > 1:
        audio = audio.set_channels(1)

    # Split the audio into non-silent chunks; the threshold is relative to the
    # recording's own average loudness (14 dB below it)
    nonsilent_chunks = split_on_silence(
        audio,
        min_silence_len=500,  # Minimum silence length in ms
        silence_thresh=audio.dBFS - 14  # Threshold for silence (adjust as needed)
    )

    # Reconstruct the audio by concatenating chunks
    cleaned_audio = AudioSegment.empty()
    for chunk in nonsilent_chunks:
        cleaned_audio += chunk

    # Name the output after the input's stem so that, e.g., an .mp3 input is not
    # written as WAV data under an .mp3 file name
    stem = os.path.splitext(os.path.basename(audio_file))[0]
    output_path = os.path.join(output_folder, f"{stem}.wav")
    cleaned_audio.export(output_path, format="wav")
    print(f"Processed and saved: {output_path}")

def process_audio_folder(input_folder, output_folder):
    """
    Run remove_noise on every .wav/.mp3 file found directly in input_folder.

    The output folder is created when missing. Entries whose names do not end
    in ".wav" or ".mp3" are reported on stdout and left untouched.
    """
    audio_suffixes = (".wav", ".mp3")

    # Make sure there is somewhere to write the results
    os.makedirs(output_folder, exist_ok=True)

    for entry in os.listdir(input_folder):
        # Anything that is not a recognised audio file is only reported
        if not entry.endswith(audio_suffixes):
            print(f"Skipping non-audio file: {entry}")
            continue

        remove_noise(os.path.join(input_folder, entry), output_folder)

# Example usage
# NOTE(review): the conversion step earlier in this file writes to 'output/wav_files',
# while this reads from "wav_files" — confirm this is the intended input folder.
input_folder = "wav_files"  # Replace with your input folder path
output_folder = "output/cleaned_wav_files"  # Replace with your output folder path

# Process every .wav/.mp3 file in input_folder and save the results to output_folder
process_audio_folder(input_folder, output_folder)

Processed and saved: output/cleaned_wav_files/Voice 007_sd.wav
Processed and saved: output/cleaned_wav_files/R1.wav
Processed and saved: output/cleaned_wav_files/Voice 004.wav
Processed and saved: output/cleaned_wav_files/omg dj.wav
Processed and saved: output/cleaned_wav_files/practice .wav
Processed and saved: output/cleaned_wav_files/Voice 006.wav
Processed and saved: output/cleaned_wav_files/Voice 001_sd (1).wav
Processed and saved: output/cleaned_wav_files/Sep 25, 1.56 sa.wav
Processed and saved: output/cleaned_wav_files/R3.wav
Processed and saved: output/cleaned_wav_files/R2.wav
Processed and saved: output/cleaned_wav_files/Voice 007.wav
Processed and saved: output/cleaned_wav_files/hjjjhhggh.wav
Processed and saved: output/cleaned_wav_files/tamanna.wav
Processed and saved: output/cleaned_wav_files/Voice 003.wav
Processed and saved: output/cleaned_wav_files/Oct 8, 1.45 PM.wav
Skipping non-audio file: .DS_Store
Processed and saved: output/cleaned_wav_files/R7.wav
Processed and sav

In [None]:
# Paths to JSON and audio files
json_folder = 'JSON'
audio_folder = 'output/cleaned_wav_files'

# Match audio and JSON files by base name (file name without its extension).
# os.listdir returns entries in arbitrary order, so the listings are sorted to keep
# the order of matched_files, and of everything built from it, the same on every run.
json_files = {
    os.path.splitext(f)[0]: os.path.join(json_folder, f)
    for f in sorted(os.listdir(json_folder))
    if f.endswith('.json')
}
audio_files = {
    os.path.splitext(f)[0]: os.path.join(audio_folder, f)
    for f in sorted(os.listdir(audio_folder))
    if f.endswith(('.wav', '.m4a', '.mp3'))
}

# Keep only the names that have both an annotation file and an audio file;
# each value is a (json_path, audio_path) pair
matched_files = {name: (json_files[name], audio_files[name]) for name in json_files if name in audio_files}

# Function to load annotations from a JSON file
def load_annotations(json_file):
    """
    Read labelled time spans from an annotation JSON file.

    Only the first annotation of the first top-level entry is read. Within it,
    every result item whose 'type' is 'labels' contributes one tuple.

    Args:
        json_file: Path to the JSON file.

    Returns:
        A list of (start, end, label) tuples in file order, where label is the
        first entry of the item's 'labels' list. The list is empty when the
        file has no entries or the first entry has no annotations.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default encoding
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # A file with no entries, or an entry without annotations, yields nothing
    # instead of failing with IndexError
    if not data or not data[0]['annotations']:
        return []

    return [
        (item['value']['start'], item['value']['end'], item['value']['labels'][0])
        for item in data[0]['annotations'][0]['result']
        if item['type'] == 'labels'
    ]

# Function to extract audio segments based on annotations
def extract_audio_segments(audio_file, annotations, sr=16000):
    """
    Cut the annotated spans out of an audio file.

    The file is loaded with librosa at sampling rate `sr`; each
    (start, end, label) annotation is turned into sample indices by
    multiplying start and end by `sr` and truncating to int.

    Returns:
        A list of (samples, label) pairs, one per annotation, in input order.
    """
    waveform, _ = librosa.load(audio_file, sr=sr)
    return [
        (waveform[int(begin * sr):int(finish * sr)], tag)
        for begin, finish, tag in annotations
    ]

# Build the dataset: one (segment, label) pair per annotation of every matched file
dataset = []
files_loaded = []  # JSON paths that contributed at least one annotation

for name, (json_path, audio_path) in matched_files.items():
    # Parse each annotation file once and reuse the result
    annotations = load_annotations(json_path)
    if not annotations:
        continue  # nothing labelled in this file

    files_loaded.append(json_path)
    audio_segments = extract_audio_segments(audio_path, annotations)
    dataset.extend(audio_segments)

# Function to extract features (MFCC) from audio segments
def extract_features(segments, n_mfcc=40, max_length=300, *, sr=16000, n_fft=2048):
    """
    Extract fixed-size MFCC features and binary labels from audio segments.

    Args:
        segments: Iterable of (samples, label) pairs.
        n_mfcc: Number of MFCC coefficients computed per frame.
        max_length: Number of frames kept per segment; shorter results are
            zero-padded at the end, longer ones are truncated.
        sr: Sampling rate handed to librosa. The default matches the default
            rate used by extract_audio_segments.
        n_fft: FFT window size. Segments with fewer samples than this are
            skipped with a message.

    Returns:
        A (features, labels) tuple of NumPy arrays. Each kept segment
        contributes one (max_length, n_mfcc) feature matrix and one label:
        0 when the label is exactly 'Field pause', 1 otherwise. Segments that
        are too short or that fail during extraction are left out of both.
    """
    features, labels = [], []
    for i, (segment, label) in enumerate(segments):
        try:
            # A segment shorter than one FFT window cannot fill a single frame
            if len(segment) < n_fft:
                print(f"Skipping segment {i} due to insufficient length: {len(segment)} samples")
                continue

            mfcc = librosa.feature.mfcc(y=segment, sr=sr, n_mfcc=n_mfcc, n_fft=n_fft)
        except Exception as e:
            # Best effort: report the failing segment and keep going
            print(f"Error processing segment {i}: {e}")
            continue

        # Handle variable lengths (pad if short, truncate if long)
        n_frames = mfcc.shape[1]
        if n_frames < max_length:
            padded_mfcc = np.pad(mfcc, ((0, 0), (0, max_length - n_frames)), mode='constant')
        else:
            padded_mfcc = mfcc[:, :max_length]

        # Store frames along the first axis: (max_length, n_mfcc)
        features.append(padded_mfcc.T)
        # NOTE(review): every label other than the exact string 'Field pause' maps
        # to 1 — confirm the spelling against the annotation files.
        labels.append(0 if label == 'Field pause' else 1)

    return np.array(features), np.array(labels)

# Extract features and labels from the dataset:
# X holds the per-segment MFCC arrays and y the 0/1 labels returned by extract_features
X, y = extract_features(dataset)