In [None]:
# Mount Google Drive inside the Colab runtime; later cells read and write
# files under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install torchaudio
!pip install audiomentations

Collecting audiomentations
  Downloading audiomentations-0.37.0-py3-none-any.whl.metadata (11 kB)
Collecting numpy-minmax<1,>=0.3.0 (from audiomentations)
  Downloading numpy_minmax-0.3.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.0 kB)
Collecting numpy-rms<1,>=0.4.2 (from audiomentations)
  Downloading numpy_rms-0.4.2-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.5 kB)
Collecting scipy<1.13,>=1.4 (from audiomentations)
  Downloading scipy-1.12.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.4/60.4 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Downloading audiomentations-0.37.0-py3-none-any.whl (80 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m80.5/80.5 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy_minmax-0.3.1-cp310-c

In [None]:
import torch
import torchaudio
from torch.utils.data import Dataset, DataLoader
import os
from torch.nn import functional as F
import torch.nn as nn
import torch.optim as optim
import numpy as np
from scipy.io.wavfile import write
from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Shift

# Step 1: Waveform augmentation pipeline.
# Every transform below is configured with p=0.5.
augment = Compose([
    # Additive Gaussian noise, amplitude range 0.001 .. 0.015
    AddGaussianNoise(
        p=0.5,
        min_amplitude=0.001,
        max_amplitude=0.015,
    ),
    # Time stretching, rate range 0.8 .. 1.25
    TimeStretch(
        p=0.5,
        min_rate=0.8,
        max_rate=1.25,
    ),
    # Pitch shifting, -4 .. +4 semitones
    PitchShift(
        p=0.5,
        min_semitones=-4,
        max_semitones=4,
    ),
    # Shifting, range -0.5 .. 0.5 (unit is the library default)
    Shift(
        p=0.5,
        min_shift=-0.5,
        max_shift=0.5,
    ),
])

# Custom Dataset for Loading Audio Files with Augmentation
class SpeechDataset(Dataset):
    """Fixed-length waveform dataset backed by the ``.wav`` files of one directory.

    Args:
        data_dir: Directory that is scanned (non-recursively) for ``.wav`` files.
        transform: Optional callable applied to the waveform last (e.g. a
            normalization function).
        target_length: Number of samples every returned waveform is padded or
            truncated to along axis 1.
        num_files: Upper bound on how many files are used; the first
            ``num_files`` names in sorted order are kept.
        apply_augmentation: When true, ``augment_audio`` is applied after
            padding/truncation and before ``transform``.

    Raises:
        ValueError: If the directory contains no ``.wav`` file.
    """

    def __init__(self, data_dir, transform=None, target_length=80000, num_files=100, apply_augmentation=False):
        self.data_dir = data_dir
        self.transform = transform
        self.target_length = target_length
        self.apply_augmentation = apply_augmentation
        # os.listdir() yields entries in arbitrary order. Sort first so that the
        # num_files subset and the index -> file mapping are the same on every run.
        wav_names = sorted(f for f in os.listdir(data_dir) if f.endswith('.wav'))
        self.audio_files = wav_names[:num_files]

        if len(self.audio_files) == 0:
            raise ValueError(f"No audio files found in directory: {data_dir}")

    def __len__(self):
        """Return the number of audio files in the dataset."""
        return len(self.audio_files)

    def __getitem__(self, idx):
        """Load, length-normalize, optionally augment and transform one file.

        Returns:
            A ``(waveform, file_name)`` tuple. ``waveform`` is 2-D with
            ``target_length`` entries along axis 1 (axis 0 is whatever
            ``torchaudio.load`` returned, presumably channels).

        Raises:
            FileNotFoundError: If the file disappeared since the directory scan.
            RuntimeError: Re-raised from ``torchaudio.load`` after logging.
        """
        wav_path = os.path.join(self.data_dir, self.audio_files[idx])
        if not os.path.exists(wav_path):
            raise FileNotFoundError(f"Audio file not found: {wav_path}")

        try:
            # The file's own sample rate is read but not used further here.
            waveform, sr = torchaudio.load(wav_path)
        except RuntimeError as e:
            print(f"Error loading audio file: {wav_path}, Error: {e}")
            # Bare raise keeps the original traceback intact.
            raise

        # Pad (with trailing zeros) or truncate to the target length
        if waveform.shape[1] < self.target_length:
            padding = self.target_length - waveform.shape[1]
            waveform = F.pad(waveform, (0, padding))
        else:
            waveform = waveform[:, :self.target_length]

        # Apply Augmentation
        if self.apply_augmentation:
            waveform = augment_audio(waveform)

        # Apply normalization if any
        if self.transform:
            waveform = self.transform(waveform)

        return waveform, self.audio_files[idx]

# Function to Apply Augmentation to the Audio
def augment_audio(audio, sample_rate=16000):
    """Run the module-level ``augment`` pipeline on a waveform tensor.

    Args:
        audio: Waveform tensor; it is detached and moved to CPU before being
            converted to a NumPy array for the pipeline.
        sample_rate: Sample rate in Hz handed to the pipeline. Defaults to
            16000, the value that used to be hard-coded.

    Returns:
        A new tensor built from the array the pipeline returned.
    """
    # .numpy() raises on tensors that track gradients or live on another
    # device, so detach and move to CPU first (no-ops for plain CPU tensors).
    samples = audio.detach().cpu().numpy()
    augmented_samples = augment(samples=samples, sample_rate=sample_rate)
    return torch.tensor(augmented_samples)

# Define Normalization Transform
def normalize_waveform(waveform, eps=1e-8):
    """Standardize a waveform to zero mean and unit standard deviation.

    Mean and standard deviation are computed over all elements of the tensor.

    Args:
        waveform: Floating-point tensor to normalize.
        eps: Lower bound applied to the standard deviation. Without it a
            silent or constant clip (std == 0) would divide by zero and
            produce NaN values.

    Returns:
        Tensor with the same shape as ``waveform``.
    """
    centered = waveform - waveform.mean()
    # clamp_min leaves any std >= eps untouched, so ordinary inputs are
    # normalized exactly as before.
    scale = waveform.std().clamp_min(eps)
    return centered / scale

# Directory on the mounted Drive that holds the input recordings
data_dir_A = "/content/drive/MyDrive/data/extracted_files-3/en/North_American_English_W/"

# Number of samples every clip is padded or truncated to
target_length = 80000

# Dataset: at most 100 files, augmentation enabled, normalization as the transform
dataset_A = SpeechDataset(
    data_dir_A,
    num_files=100,
    target_length=target_length,
    apply_augmentation=True,
    transform=normalize_waveform,
)

# DataLoader: one clip per batch, order shuffled
dataloader_A = DataLoader(
    dataset_A,
    shuffle=True,
    batch_size=1,
)

# Define VAE without Transformer Components: Encoder, Decoder
class SimpleVAE(nn.Module):
    """Fully connected variational autoencoder over flattened inputs.

    Encoder: ``input_dim -> 512 -> (mu, logvar)`` with ``latent_dim`` entries each.
    Decoder: ``latent_dim -> 512 -> input_dim`` followed by a sigmoid.

    Args:
        input_dim: Length of one flattened input vector.
        latent_dim: Size of the latent code.
    """

    def __init__(self, input_dim=80000, latent_dim=64):
        super().__init__()
        # Remember the input size so encode() and loss_function() flatten to the
        # configured width instead of a hard-coded 80000.
        self.input_dim = input_dim
        self.latent_dim = latent_dim
        self.fc1 = nn.Linear(input_dim, 512)
        self.fc21 = nn.Linear(512, latent_dim)  # mean head
        self.fc22 = nn.Linear(512, latent_dim)  # log-variance head
        self.fc3 = nn.Linear(latent_dim, 512)
        self.fc4 = nn.Linear(512, input_dim)

    def encode(self, x):
        """Flatten ``x`` to ``(-1, input_dim)`` and return ``(mu, logvar)``."""
        h1 = F.relu(self.fc1(x.reshape(-1, self.input_dim)))
        return self.fc21(h1), self.fc22(h1)

    def reparameterize(self, mu, logvar):
        """Sample ``z = mu + eps * std`` with ``eps ~ N(0, I)``."""
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + eps * std

    def decode(self, z):
        """Map latent codes to reconstructions of shape ``(-1, input_dim)``."""
        h3 = F.relu(self.fc3(z))
        # NOTE(review): sigmoid restricts the output to (0, 1). Targets that
        # contain negative values (e.g. zero-mean waveforms) cannot be
        # reconstructed exactly - confirm this activation is intended.
        return torch.sigmoid(self.fc4(h3))

    def forward(self, x):
        """Return ``(reconstruction, mu, logvar)`` for input ``x``."""
        mu, logvar = self.encode(x)
        z = self.reparameterize(mu, logvar)
        return self.decode(z), mu, logvar

    def loss_function(self, recon_x, x, mu, logvar):
        """Return mean-squared reconstruction error plus the KL divergence term.

        NOTE(review): the reconstruction term is a mean over elements while the
        KL term is a sum over batch and latent dimensions, so the two are on
        different scales - confirm the intended weighting.
        """
        recon_loss = F.mse_loss(recon_x, x.reshape(-1, self.input_dim))
        kl_loss = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
        return recon_loss + kl_loss

# Select the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize the model and move it to the target device *before* the optimizer
# is constructed, so the optimizer is built from the parameters that are
# actually trained (the order recommended by the torch.optim documentation).
model = SimpleVAE().to(device)
optimizer = optim.Adam(model.parameters(), lr=0.0002)

# Function to save audio data correctly
def save_audio(file_path, audio_tensor, sample_rate=16000):
    """Write the first waveform of ``audio_tensor`` as a mono 16-bit PCM WAV file.

    Args:
        file_path: Destination path of the WAV file.
        audio_tensor: Tensor whose last axis holds the samples. Any leading
            axes (batch, channel, ...) are collapsed and only the first
            waveform is written.
        sample_rate: Sample rate in Hz stored in the file header.
    """
    audio_np = np.atleast_1d(audio_tensor.cpu().detach().numpy())

    # Pick the waveform to save *before* normalizing. Previously the peak was
    # taken over every row, and inputs with more than two axes (for example
    # (batch, 1, samples)) ended up as a (1, samples) array, which the WAV
    # writer stores as a single sample with many channels.
    audio_np = audio_np.reshape(-1, audio_np.shape[-1])[0]

    # Peak-normalize into [-1, 1]; the small offset keeps silent clips from
    # dividing by zero.
    audio_np = audio_np / np.max(np.abs(audio_np) + 1e-6)
    audio_np = np.clip(audio_np, -1, 1)
    audio_np = (audio_np * 32767).astype(np.int16)

    write(file_path, sample_rate, audio_np)

# Training Loop for Simple VAE
def train_vae(dataloader_A, num_epochs=5):
    """Train the module-level ``model`` and save its reconstructions per epoch.

    For every batch the model is updated with ``optimizer``, each item's
    reconstruction is written to the epoch's output directory under its
    original file name, and the loss is printed. ``evaluate_vae`` runs at the
    end of every epoch.

    Args:
        dataloader_A: Iterable yielding ``(waveform_batch, file_names)`` pairs.
        num_epochs: Number of passes over ``dataloader_A``.
    """
    filename_mapping_A = {}  # original file name -> reconstructed file name

    # Make sure the model is in training mode before any update.
    model.train()

    for epoch in range(num_epochs):
        # The output directory depends only on the epoch, so create it once
        # here rather than once per batch.
        epoch_dir = f"/content/drive/MyDrive/data/extracted_files-3/en/vae_converted_epoch_{epoch}"
        os.makedirs(epoch_dir, exist_ok=True)

        for batch_count, (real_A, file_A) in enumerate(dataloader_A, start=1):
            real_A = real_A.to(device)

            # Forward pass through VAE
            reconstructed_A, mu, logvar = model(real_A)

            # Compute Loss
            loss = model.loss_function(reconstructed_A, real_A, mu, logvar)

            # Update Model
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Save every item of the batch (previously only the first one was
            # written, silently dropping the rest for batch sizes above 1).
            # The model flattens its input, so one item may occupy several
            # consecutive rows; the first row of each item is saved.
            rows_per_item = max(reconstructed_A.shape[0] // len(file_A), 1)
            for item_idx, file_name in enumerate(file_A):
                first_row = item_idx * rows_per_item
                converted_A_path = f"{epoch_dir}/{file_name}"
                save_audio(
                    converted_A_path,
                    reconstructed_A[first_row:first_row + 1],
                    sample_rate=16000,
                )

                # Update Mappings for Evaluation
                filename_mapping_A[file_name] = file_name  # Map original to new

            print(f"Epoch [{epoch+1}/{num_epochs}], Batch [{batch_count}], Loss: {loss.item():.4f}")

        print(f"Epoch [{epoch+1}/{num_epochs}] completed.")
        evaluate_vae(epoch, filename_mapping_A)

    print("Training completed successfully!")

# Evaluation function for VAE
def evaluate_vae(epoch, mapping_A):
    """Announce and run the metric evaluation for one finished epoch.

    Args:
        epoch: Zero-based epoch index; it selects the output directory and is
            printed one-based.
        mapping_A: Dict of original file name -> converted file name, passed
            through to ``evaluate_metrics``.
    """
    print(f"Evaluating VAE performance after Epoch {epoch+1}")
    evaluate_metrics(
        original_dir="/content/drive/MyDrive/data/extracted_files-3/en/North_American_English_W/",
        converted_dir=f"/content/drive/MyDrive/data/extracted_files-3/en/vae_converted_epoch_{epoch}",
        mapping=mapping_A,
    )

# Function to Compute MCD (Placeholder)
def compute_mcd(orig_audio, conv_audio):
    """Placeholder for Mel-Cepstral Distortion.

    WARNING: both arguments are ignored. The returned value is a random float
    drawn from NumPy's global random state, in [0, 1), so any score derived
    from it carries no information about the audio.
    """
    placeholder_score = np.random.random_sample()
    return placeholder_score

# Evaluation function for Mean Mel-Cepstral Distortion (MCD)
def evaluate_metrics(original_dir, converted_dir, mapping):
    """Print the mean ``compute_mcd`` score over all loadable file pairs.

    Args:
        original_dir: Directory holding the original files.
        converted_dir: Directory holding the converted files.
        mapping: Dict of original file name -> converted file name.

    A pair is skipped, with a printed message, when either file is missing or
    loading raises ``RuntimeError``. If no pair could be scored, a notice is
    printed instead of a mean.
    """
    collected = []

    for source_name, target_name in mapping.items():
        orig_path = os.path.join(original_dir, source_name)
        conv_path = os.path.join(converted_dir, target_name)

        # Report the first missing file of the pair, original first.
        problem = None
        if not os.path.exists(orig_path):
            problem = f"Original file not found: {orig_path}"
        elif not os.path.exists(conv_path):
            problem = f"Converted file not found: {conv_path}"
        if problem is not None:
            print(problem)
            continue

        try:
            # Only the waveform is needed; the sample rate is discarded.
            orig_audio = torchaudio.load(orig_path)[0]
            conv_audio = torchaudio.load(conv_path)[0]
        except RuntimeError as e:
            print(f"Error loading audio files. Original: {orig_path}, Converted: {conv_path}, Error: {e}")
            continue

        # Compute MCD
        collected.append(compute_mcd(orig_audio, conv_audio))

    if not collected:
        print("No valid scores computed due to missing or corrupt files.")
        return

    mean_mcd = np.mean(collected)
    print(f"Mean MCD: {mean_mcd:.4f}")

# Start Training for VAE: run train_vae on dataloader_A with its default
# number of epochs (num_epochs=5). Per-batch losses are printed, and the
# evaluation runs after every epoch.
train_vae(dataloader_A)


Epoch [1/5], Batch [1], Loss: 3.8443
Epoch [1/5], Batch [2], Loss: 4.3802
Epoch [1/5], Batch [3], Loss: 3.7984
Epoch [1/5], Batch [4], Loss: 5.8977
Epoch [1/5], Batch [5], Loss: 5.2885
Epoch [1/5], Batch [6], Loss: 3.9073
Epoch [1/5], Batch [7], Loss: 4.6379
Epoch [1/5], Batch [8], Loss: 4.4580
Epoch [1/5], Batch [9], Loss: 6.3473
Epoch [1/5], Batch [10], Loss: 5.7203
Epoch [1/5], Batch [11], Loss: 7.3422
Epoch [1/5], Batch [12], Loss: 11.7899
Epoch [1/5], Batch [13], Loss: 7.2433
Epoch [1/5], Batch [14], Loss: 15.6629
Epoch [1/5], Batch [15], Loss: 6.5123
Epoch [1/5], Batch [16], Loss: 10.6417
Epoch [1/5], Batch [17], Loss: 15.6176
Epoch [1/5], Batch [18], Loss: 13.7358
Epoch [1/5], Batch [19], Loss: 4.5846
Epoch [1/5], Batch [20], Loss: 10.1587
Epoch [1/5], Batch [21], Loss: 47.5850
Epoch [1/5], Batch [22], Loss: 7.5190
Epoch [1/5], Batch [23], Loss: 7.6221
Epoch [1/5], Batch [24], Loss: 6.9065
Epoch [1/5], Batch [25], Loss: 9.3862
Epoch [1/5], Batch [26], Loss: 20.9936
Epoch [1/5], 