In [None]:
!pip install audiomentations
!pip install imbalanced-learn

Collecting audiomentations
  Downloading audiomentations-0.37.0-py3-none-any.whl.metadata (11 kB)
Collecting numpy-minmax<1,>=0.3.0 (from audiomentations)
  Downloading numpy_minmax-0.3.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.0 kB)
Collecting numpy-rms<1,>=0.4.2 (from audiomentations)
  Downloading numpy_rms-0.4.2-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.5 kB)
Collecting scipy<1.13,>=1.4 (from audiomentations)
  Downloading scipy-1.12.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.4/60.4 kB[0m [31m786.7 kB/s[0m eta [36m0:00:00[0m
Downloading audiomentations-0.37.0-py3-none-any.whl (80 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m80.5/80.5 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy_minmax-0.3.1-cp310

In [None]:
import pandas as pd
import os
import librosa
import numpy as np
from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Shift
from sklearn.model_selection import train_test_split

In [None]:
# Mount Google Drive at /content/drive so files stored on Drive can be read
# through the local filesystem. `google.colab` is only importable inside Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Where the clip metadata (TSV) and the audio files live on the mounted Drive
train_data_path = '/content/drive/MyDrive/data/extracted_files-3/en/validated_new.tsv'
audio_base_path = '/content/drive/MyDrive/data/extracted_files-3/en/input_audio_files'

# The metadata file is tab-separated; preview the first five rows
df = pd.read_csv(train_data_path, delimiter='\t')
df.head()

Unnamed: 0,client_id,path,sentence_id,sentence,sentence_domain,up_votes,down_votes,age,gender,accents,variant,locale,segment,broader_accents
0,01e8ea298cdecf26e273f5baac3915eb992c493f229686...,common_voice_en_39751075.mp3,e5e7d4694b7160add018a08876327f254690c1ab4c39ea...,Madin was a significant figure of post-war Bir...,,2,0,,,American English Accent,,en,,North American English
1,03b62f72067ec967c423852bef03d1b61e63c156d86f6e...,common_voice_en_40087973.mp3,e90c361c9684d01d31bc6e8df3060bc97e536ca707bef4...,No runoff was necessary.,,2,0,teens,transgender,British English,,en,,British English
2,05d33ad00cc2754da8e542a33a5255f9346535ef1d8619...,common_voice_en_40117514.mp3,e9475052b6e625f8c5890389e4ffc17a1078dec1483592...,It was a sickening sight.,,2,0,twenties,male_masculine,Australian English,,en,,British English
3,08072f2de4dcc2bfec5058dca41eb9535b61ccd193ecc4...,common_voice_en_39603786.mp3,e4657d8d47be955eb14e04cd1c2a2b9ef89d310f639678...,It is made by mounting a sidecar to a regular ...,,2,0,sixties,male_masculine,American English Accent,,en,,North American English
4,083af8bc921baf15ad5d8c8c876f4ecaf4f52bf6370161...,common_voice_en_39603175.mp3,e443f322884c5440d7f5072f21c5b0e1f0433ba6147471...,"Within his genre, Di Giorgio is respected for ...",,2,0,,,British English,,en,,British English


In [None]:
# Report how many rows (clips) the metadata table contains
num_samples = len(df)
print(f"Number of samples in df: {num_samples}")

Number of samples in df: 1643


In [None]:
# Hold out a small evaluation set: a fixed number of clips per accent group.
# The draw is seeded so "Restart & Run All" always produces the same hold-out;
# an unseeded sample() changes the training pool (and every metric) on each run.
HOLDOUT_PER_ACCENT = 10
HOLDOUT_SEED = 42

# 1. Filter for 'North American English'
df_na = df[df['broader_accents'] == 'North American English']

# 2. Filter for 'British English'
df_br = df[df['broader_accents'] == 'British English']

# 3. Sample 10 random rows from each (reproducibly)
df_na_sample = df_na.sample(HOLDOUT_PER_ACCENT, random_state=HOLDOUT_SEED)
df_br_sample = df_br.sample(HOLDOUT_PER_ACCENT, random_state=HOLDOUT_SEED)

# 4. Combine both samples into one DataFrame
df2 = pd.concat([df_na_sample, df_br_sample])
# 5. Drop the rows in df that are present in df2
# NOTE: this rebinds `df`, so the cell is not idempotent — re-running it without
# reloading the data removes a further 20 rows from the training pool.
df = df.drop(df2.index)


In [None]:
# Row count of the remaining pool after the hold-out rows were removed
num_samples = len(df)
print(f"Number of samples in df: {num_samples}")

# Row count of the hold-out set
num_samples = len(df2)
print(f"Number of samples in df2: {num_samples}")

Number of samples in df: 1623
Number of samples in df2: 20


In [None]:
# Give every distinct accent an integer id, numbered in order of first appearance,
# and store that id next to each row.
accent_labels = df['broader_accents'].unique()
accent_to_id = dict(zip(accent_labels, range(len(accent_labels))))
df['accent_id'] = [accent_to_id[accent] for accent in df['broader_accents']]

# Audio preprocessing helper
def preprocess_audio(file_path, sr=16000):
    """Load one audio file, trim silence from both ends, and normalize it.

    Args:
        file_path: Path of the audio file to read.
        sr: Sample rate passed to ``librosa.load``.

    Returns:
        The waveform after ``librosa.effects.trim`` and
        ``librosa.util.normalize``, both with librosa's default settings.
    """
    waveform = librosa.load(file_path, sr=sr)[0]
    trimmed = librosa.effects.trim(waveform)[0]
    return librosa.util.normalize(trimmed)

# Preprocess every clip listed in the metadata. A clip that fails to load is
# reported and skipped, so `processed_audio` and `accent_targets` stay aligned.
processed_audio = []
accent_targets = []
for clip_name, accent_id in zip(df['path'], df['accent_id']):
    file_path = os.path.join(audio_base_path, clip_name)
    try:
        clip = preprocess_audio(file_path)
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        continue
    processed_audio.append(clip)
    accent_targets.append(accent_id)

# Length (in samples) of the longest preprocessed clip; every clip is padded up to it
max_length = max(map(len, processed_audio))
def pad_audio_sequence(audio, max_length):
    """Return `audio` fitted to exactly `max_length` samples.

    Shorter clips are right-padded with zeros. Clips that are already
    `max_length` or longer are cut to `max_length` instead of failing:
    `np.pad` rejects negative pad widths, which previously surfaced as an
    opaque ValueError.

    Args:
        audio: 1-D sequence of samples.
        max_length: Target number of samples.

    Returns:
        A new 1-D numpy array of length `max_length`.
    """
    audio = np.asarray(audio)
    if len(audio) >= max_length:
        # Copy so the result never aliases the caller's buffer (np.pad also copies)
        return audio[:max_length].copy()
    return np.pad(audio, (0, max_length - len(audio)), 'constant')

# Stack the equal-length clips into one 2-D array, with labels in the same order
padded_audio = [pad_audio_sequence(clip, max_length) for clip in processed_audio]
X = np.asarray(padded_audio)
y = np.asarray(accent_targets)


In [None]:
# Augmentation pipeline: each transform is applied independently with p=0.5,
# in the order listed.
augment = Compose(
    transforms=[
        AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),
        TimeStretch(min_rate=0.8, max_rate=1.25, p=0.5),
        PitchShift(min_semitones=-4, max_semitones=4, p=0.5),
        Shift(min_shift=-0.5, max_shift=0.5, p=0.5),
    ]
)

# Split the ORIGINAL clips first, then augment only the training portion.
# Augmenting before splitting lets a clip land in one split and its augmented
# near-duplicate in the other, which leaks validation data into training and
# inflates validation accuracy. Validation stays un-augmented originals.
# NOTE(review): clips from the same speaker can still fall on both sides of the
# split; consider a group split on `client_id` if speaker independence matters.
clip_indices = np.arange(len(X))
train_idx, val_idx = train_test_split(clip_indices, test_size=0.2, random_state=42)

# Apply augmentation to the training clips only
augmented_audio = [augment(samples=processed_audio[i], sample_rate=16000) for i in train_idx]

# Fit augmented clips to the same width as X so the arrays can be concatenated.
# Slice before padding: a transform that lengthens a clip would otherwise give
# a negative pad width.
max_length_augmented = X.shape[1]
padded_augmented_audio = [
    pad_audio_sequence(audio[:max_length_augmented], max_length_augmented)
    for audio in augmented_audio
]

# Training set = original + augmented training clips (labels duplicated to match)
X_train = np.concatenate((X[train_idx], np.asarray(padded_augmented_audio)), axis=0)
y_train = np.concatenate((y[train_idx], y[train_idx]), axis=0)
X_val, y_val = X[val_idx], y[val_idx]

# Kept so any later cell that still refers to the combined arrays keeps working
X_combined = np.concatenate((X_train, X_val), axis=0)
y_combined = np.concatenate((y_train, y_val), axis=0)


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torch.optim import Adam
from torch.nn.utils.rnn import pad_sequence
from torch.optim.lr_scheduler import StepLR

# Dataset wrapper around precomputed features
class AccentDataset(Dataset):
    """In-memory dataset pairing per-clip feature arrays with integer labels.

    Every feature array is converted to a float32 tensor and the labels to a
    single int64 tensor at construction time, so item access is a plain lookup.
    """

    def __init__(self, audio_inputs, labels):
        converted = []
        for features in audio_inputs:
            converted.append(torch.tensor(features, dtype=torch.float32))
        self.audio_inputs = converted
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        """Number of (features, label) pairs."""
        return self.labels.shape[0]

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at position ``idx``."""
        sample = self.audio_inputs[idx]
        target = self.labels[idx]
        return sample, target

# Batch assembly for variable-length sequences
def collate_fn(batch):
    """Merge ``(sequence, label)`` samples into one padded batch.

    Sequences are zero-padded at the end to the longest one in the batch and
    stacked batch-first; labels are gathered into a single tensor.

    Returns:
        ``(padded_sequences, labels)``
    """
    sequences = [sample[0] for sample in batch]
    targets = [sample[1] for sample in batch]
    return pad_sequence(sequences, batch_first=True), torch.tensor(targets)

# MFCC feature extraction (20 coefficients per frame by default)
def extract_mfcc_features(audio_data, sr=16000, n_mfcc=20):
    """Compute an MFCC matrix for every waveform in `audio_data`.

    Args:
        audio_data: Iterable of 1-D waveforms.
        sr: Sample rate handed to ``librosa.feature.mfcc``.
        n_mfcc: Number of coefficients to compute per frame.

    Returns:
        A list with one transposed MFCC matrix per waveform, so the time axis
        comes first and the coefficient axis last.
    """
    features = []
    for waveform in audio_data:
        coefficients = librosa.feature.mfcc(y=waveform, sr=sr, n_mfcc=n_mfcc)
        features.append(coefficients.T)
    return features

# MFCC features for the training and validation waveforms
X_train_mfcc, X_val_mfcc = (extract_mfcc_features(split) for split in (X_train, X_val))

# Wrap the features in Datasets; only the training loader shuffles
train_dataset, val_dataset = (
    AccentDataset(X_train_mfcc, y_train),
    AccentDataset(X_val_mfcc, y_val),
)
train_loader = DataLoader(train_dataset, batch_size=32, collate_fn=collate_fn, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, collate_fn=collate_fn, shuffle=False)

# Define LSTM model with Dropout and Batch Normalization
class AccentNormalizationModel(nn.Module):
    """Two-layer bidirectional LSTM classifier over feature sequences.

    The sequence is summarised by the top layer's final hidden state of each
    direction, then passed through batch norm, dropout and a linear layer.

    Args:
        input_size: Number of features per time step.
        hidden_size: Hidden units per LSTM direction.
        num_classes: Number of output classes (logits per sample).
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True, num_layers=2, dropout=0.3, bidirectional=True)
        self.bn = nn.BatchNorm1d(hidden_size * 2)  # Batch normalization for bidirectional
        self.fc = nn.Linear(hidden_size * 2, num_classes)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        """Map a batch of shape (batch, seq_len, input_size) to (batch, num_classes) logits."""
        _, (h_n, _) = self.lstm(x)
        # h_n is (num_layers * 2, batch, hidden); its last two entries are the
        # top layer's forward and backward final states. Indexing the output
        # sequence at the last time step instead would take the backward
        # direction after it has processed only a single frame.
        summary = torch.cat((h_n[-2], h_n[-1]), dim=1)
        out = self.bn(summary)  # Batch normalization
        out = self.dropout(out)  # Apply dropout
        out = self.fc(out)
        return out

# Build the classifier and its training objects.
# input_size (20) has to equal the number of MFCC coefficients per frame.
lstm_model = AccentNormalizationModel(20, 128, len(accent_labels))
criterion = nn.CrossEntropyLoss()
optimizer = Adam(params=lstm_model.parameters(), lr=0.001)
# Halve the learning rate every 2 scheduler steps
scheduler = StepLR(optimizer=optimizer, step_size=2, gamma=0.5)

# Training loop with Early Stopping and Learning Rate Scheduler
num_epochs = 20
best_val_accuracy = 0
patience = 5  # epochs without a new best validation accuracy before stopping
early_stop_counter = 0
best_model_state = None  # weights from the epoch with the best validation accuracy

for epoch in range(num_epochs):
    lstm_model.train()
    running_loss = 0.0
    seen_samples = 0
    for features, labels in train_loader:
        outputs = lstm_model(features)  # Input shape: (batch_size, sequence_length, input_size)
        loss = criterion(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Accumulate a sample-weighted sum so the logged loss is the epoch mean,
        # not the loss of whichever mini-batch happened to come last.
        running_loss += loss.item() * labels.size(0)
        seen_samples += labels.size(0)
    epoch_loss = running_loss / seen_samples

    # Validation
    lstm_model.eval()
    val_correct = 0
    val_total = 0
    with torch.no_grad():
        for features, labels in val_loader:
            outputs = lstm_model(features)
            _, predicted = torch.max(outputs, 1)
            val_total += labels.size(0)
            val_correct += (predicted == labels).sum().item()

    val_accuracy = 100 * val_correct / val_total
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}, Validation Accuracy: {val_accuracy:.2f}%')

    # Check for Early Stopping
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        early_stop_counter = 0
        # Snapshot the weights; clone because state_dict() returns live references
        best_model_state = {
            name: tensor.detach().clone()
            for name, tensor in lstm_model.state_dict().items()
        }
    else:
        early_stop_counter += 1

    if early_stop_counter >= patience:
        print("Early stopping triggered.")
        break

    # Adjust learning rate
    scheduler.step()

# Leave `lstm_model` holding the best-validation weights rather than the
# weights of the last epoch that happened to run.
if best_model_state is not None:
    lstm_model.load_state_dict(best_model_state)


Epoch [1/20], Loss: 1.3123, Validation Accuracy: 62.31%
Epoch [2/20], Loss: 0.6218, Validation Accuracy: 58.15%
Epoch [3/20], Loss: 1.4150, Validation Accuracy: 64.62%
Epoch [4/20], Loss: 2.1234, Validation Accuracy: 65.08%
Epoch [5/20], Loss: 1.1759, Validation Accuracy: 67.38%
Epoch [6/20], Loss: 0.5170, Validation Accuracy: 66.46%
Epoch [7/20], Loss: 0.6161, Validation Accuracy: 66.77%
Epoch [8/20], Loss: 0.4660, Validation Accuracy: 68.92%
Epoch [9/20], Loss: 0.5569, Validation Accuracy: 68.00%
Epoch [10/20], Loss: 0.6468, Validation Accuracy: 69.23%
Epoch [11/20], Loss: 1.5506, Validation Accuracy: 68.31%
Epoch [12/20], Loss: 0.4683, Validation Accuracy: 68.46%
Epoch [13/20], Loss: 0.7820, Validation Accuracy: 68.62%
Epoch [14/20], Loss: 0.8592, Validation Accuracy: 69.54%
Epoch [15/20], Loss: 0.9784, Validation Accuracy: 68.62%
Epoch [16/20], Loss: 1.4186, Validation Accuracy: 69.38%
Epoch [17/20], Loss: 1.2207, Validation Accuracy: 68.31%
Epoch [18/20], Loss: 0.8947, Validation 