In [1]:
# Install the third-party audio dependencies for this notebook.
# NOTE(review): pydub and scipy are not imported in the cell visible below — confirm
# they are still needed; pinning versions would make re-runs reproducible.
!pip install numpy scipy pydub noisereduce librosa soundfile

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting noisereduce
  Downloading noisereduce-3.0.3-py3-none-any.whl.metadata (14 kB)
Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Downloading noisereduce-3.0.3-py3-none-any.whl (22 kB)
Installing collected packages: pydub, noisereduce
Successfully installed noisereduce-3.0.3 pydub-0.25.1


In [2]:
import os
import zipfile
import shutil
import librosa
import numpy as np
import soundfile as sf
from noisereduce import reduce_noise

def unzip_file(zip_file, extract_to):
    """Unpack every member of a zip archive into a directory.

    Args:
        zip_file: Path of the zip archive to read.
        extract_to: Directory that receives the extracted members.
    """
    archive = zipfile.ZipFile(zip_file, mode='r')
    # The context manager closes the archive handle even if extraction fails.
    with archive:
        archive.extractall(path=extract_to)

def preprocess_audio(file_path, output_path, sample_rate=16000, target_duration=3.0):
    """Denoise, resample, trim, normalize and length-fix one audio file.

    The stages run in this order: load -> noise reduction -> resample (only if
    the file's rate differs from `sample_rate`) -> trim -> amplitude
    normalization -> pad/truncate to a fixed length -> write to disk.

    Args:
        file_path: Path of the audio file to read.
        output_path: Path the processed audio is written to.
        sample_rate: Sample rate (Hz) of the written audio.
        target_duration: Length in seconds of the written audio; shorter
            signals are zero-padded at the end, longer ones are cut.

    Returns:
        None. The result is written to `output_path`.
    """
    # sr=None asks librosa to keep the file's own sample rate on load.
    signal, native_sr = librosa.load(file_path, sr=None)

    # Noise reduction runs at the native rate, before any resampling.
    denoised = reduce_noise(y=signal, sr=native_sr, prop_decrease=0.8)

    if native_sr == sample_rate:
        resampled = denoised
    else:
        resampled = librosa.resample(denoised, orig_sr=native_sr, target_sr=sample_rate)

    # trim() returns (audio, index); only the audio is needed here.
    trimmed, _ = librosa.effects.trim(resampled, top_db=20)
    normalized = librosa.util.normalize(trimmed)

    # Force every clip to exactly `target_length` samples.
    target_length = int(sample_rate * target_duration)
    shortfall = target_length - len(normalized)
    if shortfall > 0:
        fixed = np.pad(normalized, (0, shortfall), mode='constant')
    else:
        fixed = normalized[:target_length]

    sf.write(output_path, fixed, sample_rate)

def process_audio_files(input_folder, output_folder, sample_rate=16000, target_duration=3.0):
    """Run `preprocess_audio` on every WAV file directly inside a folder.

    Each processed file is written to `output_folder` under the same file
    name. Sub-directories of `input_folder` are not descended into.

    Args:
        input_folder: Directory containing the source WAV files.
        output_folder: Directory for the processed files; created if missing.
        sample_rate: Forwarded to `preprocess_audio`.
        target_duration: Forwarded to `preprocess_audio`.

    Returns:
        None.
    """
    os.makedirs(output_folder, exist_ok=True)
    # Sorted so the processing order is the same on every run.
    for file_name in sorted(os.listdir(input_folder)):
        # Match the extension case-insensitively so files such as "clip.WAV"
        # are not silently left out of the processed dataset.
        if not file_name.lower().endswith('.wav'):
            continue
        file_path = os.path.join(input_folder, file_name)
        output_path = os.path.join(output_folder, file_name)
        preprocess_audio(file_path, output_path, sample_rate, target_duration)

def preprocess_and_save(input_folder, output_folder,
                        output_zip="/content/drive/MyDrive/tamil_finaled_test_audio_preprocessed4.zip"):
    """Unzip the raw dataset, preprocess its audio, and zip the results.

    NOTE: the archive to unpack and its extraction directory are NOT
    parameters. They are read from the module-level names `zip_file` and
    `extract_to`, which must be defined before this function is called.

    Args:
        input_folder: Directory holding the extracted WAV files to process.
        output_folder: Directory the processed files are written to and
            whose contents are then archived.
        output_zip: Path of the zip archive to create. A trailing ".zip"
            is optional; the archive always ends in ".zip".

    Returns:
        None. Prints the path of the created archive.
    """
    # Step 1: Unzip files (source and destination come from module globals).
    unzip_file(zip_file, extract_to)

    # Step 2: Preprocess files and save
    process_audio_files(input_folder, output_folder)

    # Step 3: Zip the output folder.
    # make_archive() appends ".zip" itself, so strip only the final extension.
    # str.replace(".zip", "") would also remove ".zip" from directory names
    # earlier in the path and write the archive to the wrong location.
    archive_base, extension = os.path.splitext(output_zip)
    if extension.lower() != ".zip":
        archive_base = output_zip
    shutil.make_archive(archive_base, 'zip', output_folder)

    print(f"Preprocessed audio dataset saved at {archive_base}.zip.")

# Main workflow
# NOTE: `zip_file` and `extract_to` are not passed as arguments below;
# preprocess_and_save() reads them as module-level globals, so both names
# must be assigned before the call.
zip_file = "/content/drive/MyDrive/Dravidian-2025/Tamil/Audio/Raw/audio_ta_test.zip"
extract_to = "extracted_files"
output_folder = "preprocessed_audio"

# Input is the "audio" subfolder of the extraction directory — presumably the
# archive unpacks into an audio/ folder; verify against the zip's layout.
preprocess_and_save(f"{extract_to}/audio", output_folder)


Preprocessed audio dataset saved at /content/drive/MyDrive/tamil_finaled_test_audio_preprocessed4.zip.
