In [5]:
import os
import sys

import functools
import itertools

from typing import Optional, Tuple, Union, TypeVar, List
from torch import Tensor
import numpy.typing as npt

import matplotlib.pyplot as plt
import seaborn as sns

from IPython import display
from IPython.display import Audio

import math
import random
import numpy as np
import pandas as pd

from sklearn.cluster import MiniBatchKMeans

import torch
from torch import nn
# import torch.nn.functional as F

from torch import Tensor
import torch.optim as optim
from torch.optim.optimizer import Optimizer
from torch.utils.data import DataLoader
# from loss import hubert_loss

import torchaudio
import torchaudio.functional as F
import torchaudio.transforms as T
from torchaudio.utils import download_asset

import librosa

In [2]:
Batch = Tuple[Tensor, Tensor, Tensor]
Batch_FineTune = Tuple[Tensor, Tensor, Tensor, Tensor]
T1 = TypeVar("T1", bound=npt.NBitBase) # numpy.int64

In [3]:
torch.random.manual_seed(0)

<torch._C.Generator at 0x195a01f2450>

In [4]:
# Get cpu, gpu or mps device for training.
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

Using cuda device


### Neural Network Model

In [12]:
cv_test_1 = "cv-corpus-15.0-delta-2023-09-08-es\cv-corpus-15.0-delta-2023-09-08\es\clips\common_voice_es_38401302.mp3"
y, sr = librosa.load(cv_test_1)
Audio(data=y, rate=sr)

In [13]:
def calculate_length(length):
    """Calculate output length of tensors after feature extractor
    with kernel sizes equal to [10,3,3,3,3,2,2] and strides equal to [5,2,2,2,2,2,2]"""
    
    length = torch.div(length - 10, 5, rounding_mode="floor") + 1
    length = torch.max(torch.zeros_like(length), length) # When input length is 0, the resulting length can be negative. So fix it here.
                                                         # Same for every line equivalent to this

    length = torch.div(length - 3, 2, rounding_mode="floor") + 1
    length = torch.max(torch.zeros_like(length), length) 

    length = torch.div(length - 3, 2, rounding_mode="floor") + 1
    length = torch.max(torch.zeros_like(length), length) 

    length = torch.div(length - 3, 2, rounding_mode="floor") + 1
    length = torch.max(torch.zeros_like(length), length) 

    length = torch.div(length - 3, 2, rounding_mode="floor") + 1
    length = torch.max(torch.zeros_like(length), length) 

    length = torch.div(length - 2, 2, rounding_mode="floor") + 1
    length = torch.max(torch.zeros_like(length), length) 

    length = torch.div(length - 2, 2, rounding_mode="floor") + 1
    length = torch.max(torch.zeros_like(length), length) 

    return length

In [45]:
class cnnFeatureExtractor(nn.Module):
    """Convolution unit of FeatureExtractor
    Note: Adapted from -> https://github.com/pytorch/audio/blob/main/src/torchaudio/models/wav2vec2/components.py
                       -> https://github.com/pytorch/audio/blob/main/src/torchaudio/models/wav2vec2/model.py"""
    
    def __init__(self):
        super().__init__()
        self.normalization = nn.GroupNorm(num_groups=512,num_channels=512,affine=True)
        self.encoderCNN_1 = torch.nn.Conv1d(in_channels=1, out_channels=512, kernel_size=10, stride=5,bias=False)
        self.encoderCNN_2 = torch.nn.Conv1d(in_channels=512, out_channels=512, kernel_size=3, stride=2,bias=False)
        self.encoderCNN_3 = torch.nn.Conv1d(in_channels=512, out_channels=512, kernel_size=3, stride=2,bias=False)
        self.encoderCNN_4 = torch.nn.Conv1d(in_channels=512, out_channels=512, kernel_size=3, stride=2,bias=False)
        self.encoderCNN_5 = torch.nn.Conv1d(in_channels=512, out_channels=512, kernel_size=3, stride=2,bias=False)
        self.encoderCNN_6 = torch.nn.Conv1d(in_channels=512, out_channels=512, kernel_size=3, stride=2,bias=False)
        self.encoderCNN_7 = torch.nn.Conv1d(in_channels=512, out_channels=512, kernel_size=3, stride=2,bias=False)

    def calculate_length(length):
        """Calculate output length of tensors after feature extractor
        with kernel sizes equal to [10,3,3,3,3,2,2] and strides equal to [5,2,2,2,2,2,2]"""
        
        length = torch.div(length - 10, 5, rounding_mode="floor") + 1
        length = torch.max(torch.zeros_like(length), length) # When input length is 0, the resulting length can be negative. So fix it here.
                                                             # Same for every line equivalent to this
    
        length = torch.div(length - 3, 2, rounding_mode="floor") + 1
        length = torch.max(torch.zeros_like(length), length) 
    
        length = torch.div(length - 3, 2, rounding_mode="floor") + 1
        length = torch.max(torch.zeros_like(length), length) 
    
        length = torch.div(length - 3, 2, rounding_mode="floor") + 1
        length = torch.max(torch.zeros_like(length), length) 
    
        length = torch.div(length - 3, 2, rounding_mode="floor") + 1
        length = torch.max(torch.zeros_like(length), length) 
    
        length = torch.div(length - 2, 2, rounding_mode="floor") + 1
        length = torch.max(torch.zeros_like(length), length) 
    
        length = torch.div(length - 2, 2, rounding_mode="floor") + 1
        length = torch.max(torch.zeros_like(length), length) 
    
        return length
    
    def forward(
        self, 
        waveform: Tensor,
        length: Optional[Tensor],
    ) -> Tuple[Tensor, Optional[Tensor]]:
        """
        Input Arguments:
            x (Tensor): Shape: ``[batch, in_channels, in_frame]``.
            length (Tensor or None, optional): Shape ``[batch, ]``.
        Returns:
            Tensor: Shape ``[batch, out_channels, out_frames]``.
            Optional[Tensor]: Shape ``[batch, ]``.
        """
        
        #if length is not None:
        out_length = calculate_length(length)
        
        waveform = waveform.unsqueeze(1)      # (batch, channel==1, frame) <- From FeatureExtractor()
        x1 = self.encoderCNN_1(waveform)      # <- From ConvLayerBlock() from here
        x_norm = self.normalization(x1)       # Each layer should organized as (batch, feature, frame)
        x_norm = nn.functional.gelu(x_norm) 
        x2 = self.encoderCNN_2(x_norm)
        x2 = nn.functional.gelu(x2)
        x3 = self.encoderCNN_3(x2)
        x3 = nn.functional.gelu(x3)
        x4 = self.encoderCNN_4(x3)
        x4 = nn.functional.gelu(x4)
        x5 = self.encoderCNN_5(x4)
        x5 = nn.functional.gelu(x5)
        x6 = self.encoderCNN_6(x5)
        x6 = nn.functional.gelu(x6)
        x7 = self.encoderCNN_7(x6)
        x_encoder = nn.functional.gelu(x7)     # <- From ConvLayerBlock() up to here
        x_encoder = x_encoder.transpose(1, 2)  # (batch, frame, feature) <- From FeatureExtractor()
        
        return x_encoder, out_length

In [51]:
a = cnnFeatureExtractor()
y_tensor = torch.unsqueeze(torch.tensor(y),0)
print(len(y), len(y_tensor[0]))
b = a(y_tensor,len(y_tensor[0]))

115101 115101


In [64]:
a

cnnFeatureExtractor(
  (normalization): GroupNorm(512, 512, eps=1e-05, affine=True)
  (encoderCNN_1): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
  (encoderCNN_2): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
  (encoderCNN_3): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
  (encoderCNN_4): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
  (encoderCNN_5): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
  (encoderCNN_6): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
  (encoderCNN_7): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
)

In [53]:
b[0].shape

torch.Size([1, 358, 512])

In [54]:
b[1]

tensor(359)

In [62]:
len(b[0][0])

358

In [63]:
b

(tensor([[[-1.2927e-07, -1.7803e-07,  5.5459e-08,  ..., -1.0631e-07,
            2.8823e-08,  2.5017e-07],
          [ 2.3088e-09, -7.7342e-07,  8.4819e-08,  ..., -1.0716e-06,
            3.0153e-07,  2.9325e-07],
          [-1.3598e-05, -5.2320e-06, -3.7996e-06,  ...,  1.6382e-06,
           -3.9878e-06,  9.5204e-06],
          ...,
          [-5.6238e-06,  3.2845e-05,  3.7571e-05,  ..., -3.6034e-05,
           -3.2288e-05, -8.3210e-06],
          [-6.2744e-05, -5.4111e-05,  2.0057e-05,  ...,  2.5081e-05,
            9.8886e-07,  3.2840e-05],
          [-7.9304e-06,  6.1175e-06, -2.4123e-05,  ..., -2.0152e-05,
           -2.8994e-05,  2.5170e-05]]], grad_fn=<TransposeBackward0>),
 tensor(359))