In [5]:
!pip install numpy scipy pydub noisereduce librosa soundfile

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting noisereduce
  Downloading noisereduce-3.0.3-py3-none-any.whl.metadata (14 kB)
Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Downloading noisereduce-3.0.3-py3-none-any.whl (22 kB)
Installing collected packages: pydub, noisereduce
Successfully installed noisereduce-3.0.3 pydub-0.25.1


In [6]:
import os
import zipfile
import shutil
import librosa
import numpy as np
import soundfile as sf
from sklearn.utils import resample
from noisereduce import reduce_noise
import pandas as pd

# Function to unzip files
def unzip_file(zip_file, extract_to):
    """Extract every member of a zip archive into a directory.

    Args:
        zip_file: Path to the zip archive to read.
        extract_to: Directory the archive members are extracted into.
    """
    with zipfile.ZipFile(zip_file) as archive:  # default mode is read-only
        archive.extractall(path=extract_to)

# Advanced audio preprocessing
def preprocess_audio(file_path, output_path, sample_rate=16000, target_duration=3.0):
    """Clean one audio file and write a fixed-length copy of it.

    The steps run in this order: load, noise reduction, resampling (only when
    the file's rate differs from ``sample_rate``), trimming of leading and
    trailing silence, amplitude normalization, then padding or cutting to
    ``target_duration`` seconds.

    Args:
        file_path: Path of the audio file to read.
        output_path: Path the processed audio is written to.
        sample_rate: Sampling rate of the written file, in Hz.
        target_duration: Length of the written clip, in seconds.
    """
    # sr=None asks librosa to keep the sampling rate stored in the file.
    raw_signal, native_sr = librosa.load(file_path, sr=None)

    # Denoise at the file's own rate, before any resampling.
    denoised = reduce_noise(y=raw_signal, sr=native_sr, prop_decrease=0.8)

    signal = (
        denoised
        if native_sr == sample_rate
        else librosa.resample(denoised, orig_sr=native_sr, target_sr=sample_rate)
    )

    # Drop leading/trailing silence, then normalize the amplitude.
    signal, _ = librosa.effects.trim(signal, top_db=20)
    signal = librosa.util.normalize(signal)

    # Force every clip to exactly n_target samples: zero-pad the tail of short
    # clips, cut the tail off long ones.
    n_target = int(sample_rate * target_duration)
    shortfall = n_target - len(signal)
    if shortfall > 0:
        signal = np.pad(signal, (0, shortfall), mode='constant')
    else:
        signal = signal[:n_target]

    sf.write(output_path, signal, sample_rate)

# Classify audio files based on file name
def classify_audio(file_name):
    """Derive the class and subclass of a recording from its file name.

    Names starting with "H" are "Hate"; their subclass is decided by the first
    of the markers "_G_", "_P_", "_R_", "_C_" (checked in that order) found
    anywhere in the name, or None when none is present. Names starting with
    "NH" are "Not Hate". Everything else is "Unknown".

    Args:
        file_name: Name of the audio file.

    Returns:
        A ``(main_class, subclass)`` tuple; ``subclass`` is None unless the
        file is "Hate" and carries a known marker.
    """
    if file_name.startswith("H"):
        # Order matters: the first marker that occurs in the name wins.
        markers = (
            ("_G_", "Gender"),
            ("_P_", "Political"),
            ("_R_", "Religious"),
            ("_C_", "Personal Defamation"),
        )
        matched = next((label for tag, label in markers if tag in file_name), None)
        return "Hate", matched

    if file_name.startswith("NH"):
        return "Not Hate", None

    return "Unknown", None

# Balance dataset by oversampling
def _oversample_to(frame, target_count, random_state=42):
    """Grow ``frame`` to ``target_count`` rows by repeating randomly chosen rows.

    Every original row is kept; only the shortfall is drawn, with replacement.
    Frames that already have at least ``target_count`` rows are returned
    unchanged, and so are empty frames (there is nothing to draw from).
    """
    shortfall = target_count - len(frame)
    if shortfall <= 0 or frame.empty:
        return frame
    extra = frame.sample(n=shortfall, replace=True, random_state=random_state)
    return pd.concat([frame, extra])


def balance_subclasses_and_not_hate(hate_data, not_hate_data, target_count_per_subclass, target_count_not_hate):
    """Oversample each "Hate" subclass and the "Not Hate" class to target sizes.

    Groups smaller than their target are topped up with randomly repeated rows
    (fixed seed, so the result is repeatable for a given row order). Groups at
    or above their target are passed through untouched; nothing is ever
    down-sampled. Original index labels are kept, so the result's index
    contains duplicates.

    Args:
        hate_data: DataFrame of "Hate" rows; must have a "subclass" column.
        not_hate_data: DataFrame of "Not Hate" rows.
        target_count_per_subclass: Minimum row count for each distinct value
            of ``hate_data["subclass"]``. Rows with a missing subclass form
            one group of their own.
        target_count_not_hate: Minimum row count for ``not_hate_data``.

    Returns:
        A DataFrame with the balanced "Hate" groups followed by the balanced
        "Not Hate" rows.
    """
    subclass_col = hate_data["subclass"]
    balanced_parts = []
    missing_handled = False

    for subclass in subclass_col.unique():
        if pd.isna(subclass):
            # `== None` / `== NaN` never matches any row, so a missing subclass
            # has to be selected with isna(). unique() can list None and NaN
            # separately; handle the combined group only once.
            if missing_handled:
                continue
            missing_handled = True
            mask = subclass_col.isna()
        else:
            mask = subclass_col == subclass
        balanced_parts.append(_oversample_to(hate_data[mask], target_count_per_subclass))

    # Appending to the same list keeps pd.concat from being called with an
    # empty list when hate_data has no rows.
    balanced_parts.append(_oversample_to(not_hate_data, target_count_not_hate))

    return pd.concat(balanced_parts)

# Process audio files and collect metadata
def process_audio_files(input_folder, output_folder, sample_rate=16000, target_duration=3.0):
    """Preprocess every ``.wav`` file in a folder and describe the results.

    Each file is run through ``preprocess_audio`` and written to
    ``output_folder`` under its original name, then labelled with
    ``classify_audio``. Only direct children of ``input_folder`` whose name
    ends in ``.wav`` are considered.

    Args:
        input_folder: Directory containing the raw audio files.
        output_folder: Directory the processed files are written to; created
            if it does not exist.
        sample_rate: Passed through to ``preprocess_audio``.
        target_duration: Passed through to ``preprocess_audio``.

    Returns:
        A DataFrame with one row per processed file and the columns
        "file_name", "label", "subclass" and "path" (path of the processed
        file). The columns are present even when no file was processed.
    """
    os.makedirs(output_folder, exist_ok=True)
    metadata = []

    # os.listdir returns names in arbitrary order. Sorting fixes the row order,
    # so any seeded sampling applied to the result is reproducible.
    for file_name in sorted(os.listdir(input_folder)):
        if not file_name.endswith('.wav'):
            continue

        file_path = os.path.join(input_folder, file_name)
        output_path = os.path.join(output_folder, file_name)

        preprocess_audio(file_path, output_path, sample_rate, target_duration)

        # Classify the file
        main_class, subclass = classify_audio(file_name)
        metadata.append({"file_name": file_name, "label": main_class, "subclass": subclass, "path": output_path})

    # Explicit columns keep lookups such as df["label"] valid for an empty folder.
    return pd.DataFrame(metadata, columns=["file_name", "label", "subclass", "path"])

# Zip folder
def create_zip(folder_path, zip_name):
    """Write all files below a folder into a deflate-compressed zip archive.

    Archive member names are the file paths relative to ``folder_path``, so
    the folder itself does not appear as a prefix inside the archive.

    Args:
        folder_path: Root directory whose files are archived (recursively).
        zip_name: Path of the zip file to create; overwritten if present.
    """
    archive = zipfile.ZipFile(zip_name, mode='w', compression=zipfile.ZIP_DEFLATED)
    with archive:
        for current_dir, _subdirs, names in os.walk(folder_path):
            for name in names:
                absolute = os.path.join(current_dir, name)
                relative = os.path.relpath(absolute, start=folder_path)
                archive.write(absolute, arcname=relative)

# Display record counts
def display_record_counts(metadata_df):
    """Print row counts per label and, for "Hate" rows, per subclass.

    Args:
        metadata_df: DataFrame with at least the columns "label" and
            "subclass".
    """
    # Rows per top-level class.
    per_label = metadata_df.groupby("label").size()
    print("Number of records per class:")
    print(per_label)

    # Rows per subclass, restricted to the "Hate" class.
    is_hate = metadata_df["label"] == "Hate"
    per_subclass = metadata_df.loc[is_hate].groupby("subclass").size()
    print("\nNumber of records per subclass within 'Hate':")
    print(per_subclass)

# Main workflow
# Paths: the raw archive is read from, and the final archive written to,
# /content/drive/MyDrive (presumably a mounted Google Drive). The intermediate
# folders are relative to the current working directory.
zip_file = "/content/drive/MyDrive/Dravidian-2025/Tamil/Audio/Raw/audio_ta_train.zip"
extract_to = "extracted_files"
output_folder = "preprocessed_audio"
final_output_folder = "final_dataset"
output_zip = "/content/drive/MyDrive/tamil_train_audio_preprocessed4.zip"

# Step 1: Unzip files
unzip_file(zip_file, extract_to)

# Step 2: Preprocess files and collect metadata
# The recordings are read from the "audio" directory inside the extracted archive.
metadata_df = process_audio_files(f"{extract_to}/audio", output_folder)

# Step 3: Separate data by label and subclass
hate_data = metadata_df[metadata_df["label"] == "Hate"]
not_hate_data = metadata_df[metadata_df["label"] == "Not Hate"]

# Debug: Print counts before balancing
print(f"Hate data records: {len(hate_data)}")
print(f"Not Hate data records: {len(not_hate_data)}")

# Step 4: Balance the data
target_count_per_subclass = 122  # Target number of records per subclass in Hate
target_count_not_hate = target_count_per_subclass * 4  # Four Hate subclasses, so Not Hate matches the Hate total

final_metadata_df = balance_subclasses_and_not_hate(hate_data, not_hate_data, target_count_per_subclass, target_count_not_hate)

# The balancing call either returns a DataFrame or raises, so reaching this
# line means it succeeded.
print("Final metadata DataFrame created successfully.")

# Display record counts
display_record_counts(final_metadata_df)

# Step 5: Save final dataset and zip
shutil.rmtree(final_output_folder, ignore_errors=True)
os.makedirs(final_output_folder, exist_ok=True)

# Oversampling repeats rows, so several rows share one file_name. Copying them
# all to the same destination would overwrite the file, and the zipped dataset
# would contain only the unique recordings instead of the balanced counts
# printed above. Each repeat therefore gets its own name; the suffix goes
# before the extension, so the class prefix at the start of the name is kept.
times_copied = {}
for _, row in final_metadata_df.iterrows():
    source_name = row["file_name"]
    repeat = times_copied.get(source_name, 0)
    times_copied[source_name] = repeat + 1
    if repeat == 0:
        dest_name = source_name
    else:
        stem, ext = os.path.splitext(source_name)
        dest_name = f"{stem}_dup{repeat}{ext}"
    shutil.copy(row["path"], os.path.join(final_output_folder, dest_name))

# One file on disk per metadata row, otherwise the archive is not balanced.
assert len(os.listdir(final_output_folder)) == len(final_metadata_df), (
    "final dataset folder does not contain one file per balanced metadata row"
)

create_zip(final_output_folder, output_zip)

print(f"Preprocessed and balanced audio dataset saved at {output_zip}.")


Hate data records: 222
Not Hate data records: 287
Final metadata DataFrame created successfully.
Number of records per class:
label
Hate        488
Not Hate    488
dtype: int64

Number of records per subclass within 'Hate':
subclass
Gender                 122
Personal Defamation    122
Political              122
Religious              122
dtype: int64
Preprocessed and balanced audio dataset saved at /content/drive/MyDrive/tamil_train_audio_preprocessed4.zip.
