In [None]:
# Mount Google Drive inside the Colab runtime; the data paths used later in
# this notebook all live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install torchaudio
!pip install audiomentations

Collecting audiomentations
  Downloading audiomentations-0.37.0-py3-none-any.whl.metadata (11 kB)
Collecting numpy-minmax<1,>=0.3.0 (from audiomentations)
  Downloading numpy_minmax-0.3.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.0 kB)
Collecting numpy-rms<1,>=0.4.2 (from audiomentations)
  Downloading numpy_rms-0.4.2-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.5 kB)
Collecting scipy<1.13,>=1.4 (from audiomentations)
  Downloading scipy-1.12.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.4/60.4 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
Downloading audiomentations-0.37.0-py3-none-any.whl (80 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m80.5/80.5 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy_minmax-0.3.1-cp310-c

In [None]:
import torch
import torchaudio
from torch.utils.data import Dataset, DataLoader
import os
from torch.nn import functional as F
import torch.nn as nn
import torch.optim as optim
import numpy as np
from scipy.io.wavfile import write
from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Shift
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer

# Step 1: Data augmentation pipeline.
# Each transform is applied independently with probability p=0.5.
augment = Compose(
    transforms=[
        AddGaussianNoise(p=0.5, min_amplitude=0.001, max_amplitude=0.015),
        TimeStretch(p=0.5, min_rate=0.8, max_rate=1.25),
        PitchShift(p=0.5, min_semitones=-4, max_semitones=4),
        Shift(p=0.5, min_shift=-0.5, max_shift=0.5),
    ]
)

# Custom Dataset for Loading Audio Files with Augmentation
class SpeechDataset(Dataset):
    """Fixed-length waveform dataset built from the ``.wav`` files in a directory.

    Each item is a ``(waveform, file_name)`` pair. The waveform is downmixed to
    a single channel, resampled to ``sample_rate`` when the file's own rate
    differs, padded or truncated to ``target_length`` samples, optionally
    augmented and finally passed through ``transform``.

    Args:
        data_dir: Directory that is scanned (non-recursively) for ``.wav`` files.
        transform: Optional callable applied to the waveform as the last step.
        target_length: Number of samples every returned waveform has.
        num_files: Maximum number of files to use (``None`` keeps all of them).
        apply_augmentation: When true, ``augment_audio`` is applied to each item.
        sample_rate: Rate the audio is resampled to; ``None`` disables resampling.
            Note that ``augment_audio`` is called without a rate, so it should
            stay consistent with whatever rate that function assumes.

    Raises:
        ValueError: If ``data_dir`` contains no ``.wav`` files.
    """

    def __init__(self, data_dir, transform=None, target_length=80000, num_files=100,
                 apply_augmentation=False, sample_rate=16000):
        self.data_dir = data_dir
        self.transform = transform
        self.target_length = target_length
        self.apply_augmentation = apply_augmentation
        self.sample_rate = sample_rate
        # os.listdir returns entries in arbitrary order; sort so that the
        # ``num_files`` subset is the same on every run.
        self.audio_files = sorted(
            f for f in os.listdir(data_dir) if f.endswith('.wav')
        )[:num_files]

        if len(self.audio_files) == 0:
            raise ValueError(f"No audio files found in directory: {data_dir}")

    def __len__(self):
        """Return the number of audio files in the dataset."""
        return len(self.audio_files)

    def __getitem__(self, idx):
        """Load, standardize and return the ``idx``-th clip with its file name.

        Raises:
            FileNotFoundError: If the file disappeared after the dataset was built.
            RuntimeError: Re-raised from ``torchaudio.load`` after logging the path.
        """
        file_name = self.audio_files[idx]
        wav_path = os.path.join(self.data_dir, file_name)
        if not os.path.exists(wav_path):
            raise FileNotFoundError(f"Audio file not found: {wav_path}")

        try:
            waveform, sr = torchaudio.load(wav_path)
        except RuntimeError as e:
            print(f"Error loading audio file: {wav_path}, Error: {e}")
            raise

        # Downmix multi-channel audio so every item has shape (1, samples).
        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0, keepdim=True)

        # Bring every file to one common rate before fixing the length.
        if self.sample_rate is not None and sr != self.sample_rate:
            waveform = torchaudio.functional.resample(waveform, sr, self.sample_rate)

        # Pad or truncate to the target length
        if waveform.shape[1] < self.target_length:
            padding = self.target_length - waveform.shape[1]
            waveform = F.pad(waveform, (0, padding))
        else:
            waveform = waveform[:, :self.target_length]

        # Apply Augmentation
        if self.apply_augmentation:
            waveform = augment_audio(waveform)

        # Apply normalization if any
        if self.transform:
            waveform = self.transform(waveform)

        return waveform, file_name

# Function to Apply Augmentation to the Audio
def augment_audio(audio):
    """Run the module-level ``augment`` pipeline on a waveform tensor.

    The tensor is converted to a NumPy array, augmented with a fixed sample
    rate of 16000, and the result is copied into a new tensor.
    """
    samples = audio.numpy()
    processed = augment(samples=samples, sample_rate=16000)
    return torch.tensor(processed)

# Define Normalization Transform
def normalize_waveform(waveform, eps=1e-8):
    """Standardize a waveform to zero mean and unit standard deviation.

    Args:
        waveform: Tensor of audio samples.
        eps: Smallest standard deviation that is still divided by.

    Returns:
        ``(waveform - mean) / std``. When the standard deviation is below
        ``eps`` or not finite (e.g. a silent / all-padding clip, or a single
        sample), the centered waveform is returned unscaled instead of
        producing NaN/Inf values that would poison training.
    """
    centered = waveform - waveform.mean()
    std = waveform.std()
    if not torch.isfinite(std) or std < eps:
        return centered
    return centered / std

# Location of the source recordings on the mounted Drive
data_dir_A = "/content/drive/MyDrive/data/extracted_files-3/en/North_American_English_W/"

# Every clip is padded or truncated to this many samples
target_length = 80000

# Dataset limited to at most 100 files, with augmentation and normalization enabled
dataset_A = SpeechDataset(
    data_dir=data_dir_A,
    transform=normalize_waveform,
    target_length=target_length,
    num_files=100,
    apply_augmentation=True,
)

# DataLoader yielding one shuffled clip per batch
dataloader_A = DataLoader(dataset=dataset_A, batch_size=1, shuffle=True)

# Define VAE with Transformer Components: Encoder, Decoder
class TransformerVAE(nn.Module):
    """Variational autoencoder with a Transformer encoder and a linear decoder.

    The flattened input is projected to ``d_model``, passed through a
    Transformer encoder as a length-1 sequence, pooled, and mapped to the mean
    and log-variance of a ``latent_dim``-dimensional Gaussian. Decoding is a
    single linear layer from the latent vector back to ``input_dim``.

    Args:
        input_dim: Number of values per flattened input sample.
        latent_dim: Size of the latent vector.
        d_model: Transformer model width.
        nhead: Number of attention heads.
        num_layers: Number of Transformer encoder layers.
    """

    def __init__(self, input_dim=80000, latent_dim=64, d_model=512, nhead=8, num_layers=6):
        super().__init__()
        self.fc_in = nn.Linear(input_dim, d_model)
        # batch_first=True: encode() feeds tensors shaped (batch, seq=1, d_model).
        # With the default (sequence-first) layout the batch axis would be read
        # as the sequence axis and samples of one batch would attend to each other.
        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, batch_first=True),
            num_layers=num_layers,
        )
        self.fc_mean = nn.Linear(d_model, latent_dim)
        self.fc_logvar = nn.Linear(d_model, latent_dim)
        self.fc_out = nn.Linear(latent_dim, input_dim)
        self.latent_dim = latent_dim

    def encode(self, x):
        """Return ``(mean, logvar)``, each of shape ``(batch, latent_dim)``."""
        x = x.flatten(start_dim=1)  # (batch, input_dim); also works for non-contiguous input
        x = self.fc_in(x)  # Linear transformation to match d_model
        x = x.unsqueeze(1)  # (batch, 1, d_model): sequence of length one
        x = self.transformer_encoder(x)
        x = x.mean(dim=1)  # Global average pooling over the sequence length
        mean = self.fc_mean(x)
        logvar = self.fc_logvar(x)
        return mean, logvar

    def reparameterize(self, mean, logvar):
        """Sample ``mean + eps * std`` with ``eps`` drawn from a standard normal."""
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mean + eps * std

    def decode(self, z):
        """Map latent vectors to reconstructions of shape ``(batch, input_dim)``."""
        return self.fc_out(z)

    def forward(self, x):
        """Return ``(reconstruction, mean, logvar)`` for the input batch."""
        mean, logvar = self.encode(x)
        z = self.reparameterize(mean, logvar)
        return self.decode(z), mean, logvar

    def loss_function(self, recon_x, x, mean, logvar):
        """Return the scalar VAE loss: mean-squared error plus KL divergence.

        ``x`` is reshaped to ``recon_x``'s shape before the comparison. The
        decoder emits ``(batch, input_dim)`` while inputs usually arrive as
        ``(batch, 1, input_dim)``; comparing those directly makes MSELoss
        broadcast to ``(batch, batch, input_dim)`` for batches larger than one.
        The KL term is summed over latent dimensions and averaged over the
        batch so that both terms are per-sample quantities (for a batch of one
        this equals the plain sum).
        """
        target = x.reshape(recon_x.shape)
        recon_loss = F.mse_loss(recon_x, target)
        kl_loss = -0.5 * torch.sum(1 + logvar - mean.pow(2) - logvar.exp()) / mean.size(0)
        return recon_loss + kl_loss

# Pick the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Build the model and place it on the device, then create its optimizer
model = TransformerVAE()
model.to(device)
optimizer = optim.Adam(params=model.parameters(), lr=0.0002)

# Function to save audio data correctly
def save_audio(file_path, audio_tensor, sample_rate=16000):
    """Write one signal from ``audio_tensor`` to ``file_path`` as 16-bit PCM WAV.

    Leading dimensions (batch, channel, ...) are reduced by taking their first
    entry, so only the first signal of a batched or multi-channel tensor is
    written. The signal is peak-normalized before conversion to int16.

    Args:
        file_path: Destination path of the WAV file.
        audio_tensor: Tensor holding the samples in its last dimension.
        sample_rate: Sample rate stored in the WAV header.

    Raises:
        ValueError: If the selected signal contains no samples.
    """
    audio_np = audio_tensor.cpu().detach().numpy()

    # Select the first signal instead of squeeze(0), which raises whenever the
    # leading dimension is not exactly one (batch size > 1, or a 1-D signal).
    while audio_np.ndim > 1:
        audio_np = audio_np[0]
    audio_np = np.atleast_1d(audio_np)

    if audio_np.size == 0:
        raise ValueError("Cannot save an empty audio signal")

    # Non-finite samples have no defined int16 value; treat them as silence.
    audio_np = np.nan_to_num(audio_np, nan=0.0, posinf=0.0, neginf=0.0)

    # Peak-normalize the signal that is actually written; the small constant
    # keeps the division defined for an all-zero signal.
    audio_np = audio_np / np.max(np.abs(audio_np) + 1e-6)
    audio_np = np.clip(audio_np, -1, 1)
    audio_np = (audio_np * 32767).astype(np.int16)

    write(file_path, sample_rate, audio_np)

# Training Loop for Transformer-based VAE
def train_vae(dataloader_A, num_epochs=5):
    """Train the module-level ``model`` and save its reconstructions per epoch.

    For every batch the VAE loss is minimized with the module-level
    ``optimizer``. Each reconstruction in the batch is written to a per-epoch
    folder under its source file name, and ``evaluate_vae`` is called at the
    end of each epoch with the accumulated file-name mapping.

    Args:
        dataloader_A: Iterable yielding ``(waveform_batch, file_names)`` pairs.
        num_epochs: Number of passes over ``dataloader_A``.
    """
    filename_mapping_A = {}  # Mapping for evaluation
    output_root = "/content/drive/MyDrive/data/extracted_files-3/en"

    for epoch in range(num_epochs):
        model.train()

        # The output folder only depends on the epoch, so create it once here
        # rather than on every batch.
        epoch_dir = f"{output_root}/vae_converted_epoch_{epoch}"
        os.makedirs(epoch_dir, exist_ok=True)

        for batch_count, (real_A, file_A) in enumerate(dataloader_A, start=1):
            real_A = real_A.to(device)

            # Forward pass through VAE
            reconstructed_A, mean, logvar = model(real_A)

            # Compute Reconstruction Loss
            loss = model.loss_function(reconstructed_A, real_A, mean, logvar)

            # Update Model
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Save every item of the batch (not just the first one) so that
            # batch sizes above one neither drop files nor hand a multi-row
            # tensor to save_audio.
            for item_idx, file_name in enumerate(file_A):
                converted_A_path = f"{epoch_dir}/{file_name}"
                save_audio(
                    converted_A_path,
                    reconstructed_A[item_idx:item_idx + 1],
                    sample_rate=16000,
                )

                # Update Mappings for Evaluation
                filename_mapping_A[file_name] = file_name  # Map original to new

            print(f"Epoch [{epoch+1}/{num_epochs}], Batch [{batch_count}], Loss: {loss.item():.4f}")

        print(f"Epoch [{epoch+1}/{num_epochs}] completed.")
        evaluate_vae(epoch, filename_mapping_A)

    print("Training completed successfully!")

# Evaluation function for VAE
_ASR_CACHE = {}


def _load_asr(model_name):
    """Return the ``(asr_model, tokenizer)`` pair for ``model_name``, loading it once."""
    if model_name not in _ASR_CACHE:
        # Load Wav2Vec2 model and tokenizer from HuggingFace
        asr_model = Wav2Vec2ForCTC.from_pretrained(model_name)
        tokenizer = Wav2Vec2Tokenizer.from_pretrained(model_name)

        # Set the model to evaluation mode
        asr_model.eval()
        _ASR_CACHE[model_name] = (asr_model, tokenizer)
    return _ASR_CACHE[model_name]


def evaluate_vae(epoch, mapping_A):
    """Evaluate the audio files written during ``epoch`` against their originals.

    The ASR checkpoint is loaded on the first call and reused afterwards
    instead of being re-instantiated every epoch; the trade-off is that the
    model stays in memory between epochs.

    Args:
        epoch: Zero-based epoch index; selects the ``vae_converted_epoch_{epoch}`` folder.
        mapping_A: Dict mapping original file names to converted file names.
    """
    original_dir_A = "/content/drive/MyDrive/data/extracted_files-3/en/North_American_English_W/"
    converted_dir_A = f"/content/drive/MyDrive/data/extracted_files-3/en/vae_converted_epoch_{epoch}"

    print(f"Evaluating VAE performance after Epoch {epoch+1}")

    asr_model, tokenizer = _load_asr("facebook/wav2vec2-large-960h")

    evaluate_metrics(original_dir=original_dir_A, converted_dir=converted_dir_A, mapping=mapping_A, asr_model=asr_model, tokenizer=tokenizer)

# Function to Compute MCD
def compute_mcd(orig_audio, conv_audio, sample_rate=16000, n_mels=40, n_mcep=13,
                n_fft=512, hop_length=128):
    """Return a deterministic MFCC-based mel-cepstral distortion between two signals.

    This replaces the earlier placeholder that returned a random number. Both
    signals are downmixed to mono and converted to log-mel cepstra (DCT-II of
    log mel-filterbank energies, 0th coefficient dropped). The distortion
    ``(10 / ln 10) * sqrt(2 * sum_d (c1_d - c2_d)^2)`` is then averaged over
    the frames both signals have in common.

    Limitations: no time alignment (DTW) is performed, and the cepstra are
    MFCC-style rather than vocoder-derived, so values are suitable for
    comparing runs of this notebook but are not directly comparable to MCD
    figures reported with other toolchains.

    Args:
        orig_audio: Reference samples; a tensor or array with samples in the
            last dimension (any leading dimensions are averaged).
        conv_audio: Converted samples, same layout as ``orig_audio``.
        sample_rate: Sample rate both signals are assumed to share.
        n_mels: Number of mel filterbank channels.
        n_mcep: Number of cepstral coefficients compared (excluding the 0th).
        n_fft: STFT window and FFT length in samples.
        hop_length: Hop between successive STFT frames in samples.

    Returns:
        The mean distortion as a Python float (0.0 for identical signals).
    """
    from scipy.fft import dct
    from scipy.signal import stft

    def _mono(audio):
        # Accept torch tensors as well as anything NumPy can convert.
        if hasattr(audio, "detach"):
            audio = audio.detach().cpu().numpy()
        signal = np.asarray(audio, dtype=np.float64)
        if signal.ndim > 1:
            signal = signal.reshape(-1, signal.shape[-1]).mean(axis=0)
        signal = np.atleast_1d(signal)
        # Guarantee at least one full analysis frame.
        if signal.size < n_fft:
            signal = np.pad(signal, (0, n_fft - signal.size))
        return signal

    def _mel_filterbank():
        # Triangular filters whose edges are evenly spaced on the mel scale.
        max_mel = 2595.0 * np.log10(1.0 + (sample_rate / 2.0) / 700.0)
        edges_mel = np.linspace(0.0, max_mel, n_mels + 2)
        edges_hz = 700.0 * (10.0 ** (edges_mel / 2595.0) - 1.0)
        fft_freqs = np.linspace(0.0, sample_rate / 2.0, n_fft // 2 + 1)[None, :]
        lower = edges_hz[:-2, None]
        center = edges_hz[1:-1, None]
        upper = edges_hz[2:, None]
        rising = (fft_freqs - lower) / (center - lower)
        falling = (upper - fft_freqs) / (upper - center)
        return np.clip(np.minimum(rising, falling), 0.0, None)

    def _cepstra(signal, bank):
        _, _, spectrum = stft(
            signal,
            fs=sample_rate,
            nperseg=n_fft,
            noverlap=n_fft - hop_length,
            boundary=None,
            padded=False,
        )
        power = np.abs(spectrum) ** 2  # (freq_bins, frames)
        log_mel = np.log(bank @ power + 1e-10)  # floor avoids log(0)
        coeffs = dct(log_mel, type=2, axis=0, norm="ortho")
        return coeffs[1:n_mcep + 1].T  # (frames, n_mcep)

    bank = _mel_filterbank()
    orig_cep = _cepstra(_mono(orig_audio), bank)
    conv_cep = _cepstra(_mono(conv_audio), bank)

    # Compare only the frames present in both signals.
    frames = min(len(orig_cep), len(conv_cep))
    diff = orig_cep[:frames] - conv_cep[:frames]
    per_frame = np.sqrt(2.0 * np.sum(diff ** 2, axis=1))
    return float((10.0 / np.log(10.0)) * np.mean(per_frame))

# Evaluation function for Mean Mel-Cepstral Distortion (MCD)
def evaluate_metrics(original_dir, converted_dir, mapping, asr_model, tokenizer):
    """Compute and print the mean MCD over all original/converted file pairs.

    Pairs whose files are missing or cannot be loaded are reported and
    skipped. ``asr_model`` and ``tokenizer`` are accepted for interface
    compatibility but are not used here: no word error rate is computed.

    Args:
        original_dir: Directory containing the original audio files.
        converted_dir: Directory containing the converted audio files.
        mapping: Dict mapping original file names to converted file names.
        asr_model: Unused.
        tokenizer: Unused.

    Returns:
        The mean MCD as a float, or ``None`` when no pair could be scored.
    """
    mcd_scores = []

    for orig_file, conv_file in mapping.items():
        orig_path = os.path.join(original_dir, orig_file)
        conv_path = os.path.join(converted_dir, conv_file)

        if not os.path.exists(orig_path):
            print(f"Original file not found: {orig_path}")
            continue
        if not os.path.exists(conv_path):
            print(f"Converted file not found: {conv_path}")
            continue

        try:
            orig_audio, orig_sr = torchaudio.load(orig_path)
            conv_audio, conv_sr = torchaudio.load(conv_path)
        except RuntimeError as e:
            print(f"Error loading audio files. Original: {orig_path}, Converted: {conv_path}, Error: {e}")
            continue

        # A frame-wise spectral comparison is only meaningful when both files
        # share a sample rate; surface a mismatch instead of hiding it.
        if orig_sr != conv_sr:
            print(f"Warning: sample rate mismatch ({orig_sr} vs {conv_sr}) for {orig_file}; MCD may be unreliable.")

        # Compute MCD
        mcd_scores.append(compute_mcd(orig_audio, conv_audio))

    if not mcd_scores:
        print("No valid MCD scores computed due to missing or corrupt files.")
        return None

    mean_mcd = float(np.mean(mcd_scores))
    print(f"Mean MCD: {mean_mcd:.4f}")
    return mean_mcd


# Kick off VAE training on the source-speaker dataloader (5 epochs)
train_vae(dataloader_A, num_epochs=5)


  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [1/5], Batch [1], Loss: 16.7536
Epoch [1/5], Batch [2], Loss: 23.0881
Epoch [1/5], Batch [3], Loss: 20.9909
Epoch [1/5], Batch [4], Loss: 16.4448
Epoch [1/5], Batch [5], Loss: 17.5795
Epoch [1/5], Batch [6], Loss: 10.7707
Epoch [1/5], Batch [7], Loss: 13.0960
Epoch [1/5], Batch [8], Loss: 13.8197
Epoch [1/5], Batch [9], Loss: 6.6172
Epoch [1/5], Batch [10], Loss: 12.4846
Epoch [1/5], Batch [11], Loss: 6.2364
Epoch [1/5], Batch [12], Loss: 5.6347
Epoch [1/5], Batch [13], Loss: 10.7249
Epoch [1/5], Batch [14], Loss: 5.1101
Epoch [1/5], Batch [15], Loss: 4.8211
Epoch [1/5], Batch [16], Loss: 3.4947
Epoch [1/5], Batch [17], Loss: 4.4625
Epoch [1/5], Batch [18], Loss: 4.4735
Epoch [1/5], Batch [19], Loss: 4.0209
Epoch [1/5], Batch [20], Loss: 3.1144
Epoch [1/5], Batch [21], Loss: 3.1910
Epoch [1/5], Batch [22], Loss: 3.0468
Epoch [1/5], Batch [23], Loss: 3.0244
Epoch [1/5], Batch [24], Loss: 3.0435
Epoch [1/5], Batch [25], Loss: 2.6453
Epoch [1/5], Batch [26], Loss: 2.8086
Epoch [1/5]

Some weights of the model checkpoint at facebook/wav2vec2-large-960h were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You s

Mean MCD: 0.5068
Epoch [2/5], Batch [1], Loss: 2.0945
Epoch [2/5], Batch [2], Loss: 2.1077
Epoch [2/5], Batch [3], Loss: 2.3619
Epoch [2/5], Batch [4], Loss: 2.1151
Epoch [2/5], Batch [5], Loss: 2.3647
Epoch [2/5], Batch [6], Loss: 2.4124
Epoch [2/5], Batch [7], Loss: 1.9996
Epoch [2/5], Batch [8], Loss: 2.4676
Epoch [2/5], Batch [9], Loss: 2.4489
Epoch [2/5], Batch [10], Loss: 2.0158
Epoch [2/5], Batch [11], Loss: 2.2881
Epoch [2/5], Batch [12], Loss: 2.5560
Epoch [2/5], Batch [13], Loss: 2.0537
Epoch [2/5], Batch [14], Loss: 2.0618
Epoch [2/5], Batch [15], Loss: 2.1426
Epoch [2/5], Batch [16], Loss: 2.1696
Epoch [2/5], Batch [17], Loss: 2.2122
Epoch [2/5], Batch [18], Loss: 2.2209
Epoch [2/5], Batch [19], Loss: 2.2541
Epoch [2/5], Batch [20], Loss: 2.1755
Epoch [2/5], Batch [21], Loss: 2.2046
Epoch [2/5], Batch [22], Loss: 2.2963
Epoch [2/5], Batch [23], Loss: 2.2795
Epoch [2/5], Batch [24], Loss: 2.2208
Epoch [2/5], Batch [25], Loss: 2.0877
Epoch [2/5], Batch [26], Loss: 2.1373
Epoc

Some weights of the model checkpoint at facebook/wav2vec2-large-960h were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You s

Mean MCD: 0.4764
Epoch [3/5], Batch [1], Loss: 2.3282
Epoch [3/5], Batch [2], Loss: 2.3881
Epoch [3/5], Batch [3], Loss: 2.4003
Epoch [3/5], Batch [4], Loss: 2.0933
Epoch [3/5], Batch [5], Loss: 2.2804
Epoch [3/5], Batch [6], Loss: 2.0237
Epoch [3/5], Batch [7], Loss: 2.1199
Epoch [3/5], Batch [8], Loss: 2.3370
Epoch [3/5], Batch [9], Loss: 2.4247
Epoch [3/5], Batch [10], Loss: 1.9710
Epoch [3/5], Batch [11], Loss: 1.9335
Epoch [3/5], Batch [12], Loss: 2.0873
Epoch [3/5], Batch [13], Loss: 2.0807
Epoch [3/5], Batch [14], Loss: 2.0557
Epoch [3/5], Batch [15], Loss: 2.2736
Epoch [3/5], Batch [16], Loss: 2.0743
Epoch [3/5], Batch [17], Loss: 1.9128
Epoch [3/5], Batch [18], Loss: 2.2234
Epoch [3/5], Batch [19], Loss: 2.0809
Epoch [3/5], Batch [20], Loss: 2.2078
Epoch [3/5], Batch [21], Loss: 2.1399
Epoch [3/5], Batch [22], Loss: 2.1775
Epoch [3/5], Batch [23], Loss: 2.1124
Epoch [3/5], Batch [24], Loss: 2.0715
Epoch [3/5], Batch [25], Loss: 2.3158
Epoch [3/5], Batch [26], Loss: 2.2649
Epoc

Some weights of the model checkpoint at facebook/wav2vec2-large-960h were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You s

Mean MCD: 0.4794
Epoch [4/5], Batch [1], Loss: 2.0208
Epoch [4/5], Batch [2], Loss: 1.7928
Epoch [4/5], Batch [3], Loss: 2.1036
Epoch [4/5], Batch [4], Loss: 2.1246
Epoch [4/5], Batch [5], Loss: 1.9031
Epoch [4/5], Batch [6], Loss: 2.0710
Epoch [4/5], Batch [7], Loss: 2.0033
Epoch [4/5], Batch [8], Loss: 2.2478
Epoch [4/5], Batch [9], Loss: 2.0887
Epoch [4/5], Batch [10], Loss: 2.0267
Epoch [4/5], Batch [11], Loss: 1.9358
Epoch [4/5], Batch [12], Loss: 2.0599
Epoch [4/5], Batch [13], Loss: 2.0222
Epoch [4/5], Batch [14], Loss: 2.0552
Epoch [4/5], Batch [15], Loss: 2.2446
Epoch [4/5], Batch [16], Loss: 1.8394
Epoch [4/5], Batch [17], Loss: 1.9930
Epoch [4/5], Batch [18], Loss: 2.0111
Epoch [4/5], Batch [19], Loss: 2.0570
Epoch [4/5], Batch [20], Loss: 2.2422
Epoch [4/5], Batch [21], Loss: 1.9759
Epoch [4/5], Batch [22], Loss: 2.6287
Epoch [4/5], Batch [23], Loss: 2.0961
Epoch [4/5], Batch [24], Loss: 2.1728
Epoch [4/5], Batch [25], Loss: 2.3226
Epoch [4/5], Batch [26], Loss: 2.2980
Epoc

Some weights of the model checkpoint at facebook/wav2vec2-large-960h were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You s

Mean MCD: 0.5508
Epoch [5/5], Batch [1], Loss: 2.2024
Epoch [5/5], Batch [2], Loss: 1.9032
Epoch [5/5], Batch [3], Loss: 2.2313
Epoch [5/5], Batch [4], Loss: 2.1101
Epoch [5/5], Batch [5], Loss: 2.1686
Epoch [5/5], Batch [6], Loss: 2.2402
Epoch [5/5], Batch [7], Loss: 2.0763
Epoch [5/5], Batch [8], Loss: 1.8953
Epoch [5/5], Batch [9], Loss: 2.1071
Epoch [5/5], Batch [10], Loss: 1.8374
Epoch [5/5], Batch [11], Loss: 1.9385
Epoch [5/5], Batch [12], Loss: 1.8549
Epoch [5/5], Batch [13], Loss: 1.9775
Epoch [5/5], Batch [14], Loss: 1.8536
Epoch [5/5], Batch [15], Loss: 1.9347
Epoch [5/5], Batch [16], Loss: 2.0964
Epoch [5/5], Batch [17], Loss: 2.0638
Epoch [5/5], Batch [18], Loss: 1.7773
Epoch [5/5], Batch [19], Loss: 1.9139
Epoch [5/5], Batch [20], Loss: 1.9367
Epoch [5/5], Batch [21], Loss: 1.8481
Epoch [5/5], Batch [22], Loss: 2.2183
Epoch [5/5], Batch [23], Loss: 1.9093
Epoch [5/5], Batch [24], Loss: 2.0522
Epoch [5/5], Batch [25], Loss: 2.1442
Epoch [5/5], Batch [26], Loss: 2.1221
Epoc

Some weights of the model checkpoint at facebook/wav2vec2-large-960h were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You s

Mean MCD: 0.4433
Training completed successfully!
