In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt

import librosa

from transformers import Wav2Vec2Model, AutoProcessor, HubertForCTC
from datasets import load_dataset

from huggingface_hub import login
login()

# other imports we might need
import pandas as pd
from pathlib import Path
from tqdm import tqdm

import torchaudio
from sklearn.model_selection import train_test_split

import os
import sys

import torchaudio
import librosa
import IPython.display as ipd
import numpy as np

from transformers import AutoConfig, Wav2Vec2Processor

In [None]:
# LibriSpeech demo split, sorted by its "id" column
dataset = load_dataset(
    "hf-internal-testing/librispeech_asr_demo",
    "clean",
    split="validation",
    trust_remote_code=True,
).sort("id")
sampling_rate = dataset.features["audio"].sampling_rate

# GLOBE V2 accented dataset (no split requested here)
ds = load_dataset("MushanW/GLOBE_V2")

# Read a local audio file and extract 20 MFCC coefficients from it
y, sr = librosa.load("audio.wav")
mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20)


In [None]:
# Load dataset
# GLOBE splits are: test (5.46k), val (4.11k), train (572k)
# ds = load_dataset("MushanW/GLOBE_V2") # GLOBE V2 accented dataset
from datasets import Audio

split = 'val'
sampling_rate = 24000 #24kHz

# `load_dataset` has no `sampling_rate` argument: unknown keyword arguments are
# forwarded to the dataset builder config and rejected there. The decoding rate
# is set by casting the audio column instead.
ds = load_dataset("MushanW/GLOBE", split=split) # GLOBE accented dataset
# NOTE(review): assumes the audio column of GLOBE is named "audio" - confirm.
ds = ds.cast_column("audio", Audio(sampling_rate=sampling_rate))

In [None]:
# use torchaudio to load and librosa to resample
# librosa is best for reading .wav but torchaudio is best for reading .mp3
# resampling is done with librosa regardless
# NOTE(review): `path` is not defined in any visible cell; it must point to an
# audio file before this cell is run.
target_sr = 16_000
speech, sr = torchaudio.load(path)
speech = speech[0].numpy()  # keep the first channel only
# orig_sr / target_sr are keyword-only in librosa >= 0.10; passing them
# positionally raises TypeError there.
speech = librosa.resample(speech, orig_sr=sr, target_sr=target_sr)
ipd.Audio(data=speech, autoplay=True, rate=target_sr)

In [None]:
# Get Mel spectrogram
# Imported explicitly so `librosa.display` resolves on librosa versions that do
# not load sub-modules lazily.
import librosa.display

# NOTE(review): `array` is not defined in any visible cell; it must hold the
# waveform sampled at `sampling_rate` before this cell is run.
S = librosa.feature.melspectrogram(y=array, sr=sampling_rate, n_mels=128, fmax=8000)
S_dB = librosa.power_to_db(S, ref=np.max)

fig, ax = plt.subplots()
fig.set_figwidth(12)
img = librosa.display.specshow(S_dB, x_axis="time", y_axis="mel", sr=sampling_rate, fmax=8000, ax=ax)
fig.colorbar(img, ax=ax)
ax.set(title="Mel spectrogram (dB)")

# MFCCs need an input: calling mfcc() without `y` or `S` cannot compute
# anything. Reuse the log-power mel spectrogram computed above.
C = librosa.feature.mfcc(S=S_dB)

In [None]:
# Estimate the fundamental frequency (f0) of librosa's bundled 'trumpet'
# example with pYIN, searching between the notes C2 and C7.
y, sr = librosa.load(librosa.ex('trumpet'))
f0, voiced_flag, voiced_probs = librosa.pyin(
    y,
    sr=sr,
    fmin=librosa.note_to_hz('C2'),
    fmax=librosa.note_to_hz('C7'),
)
# Time axis matching the f0 array
times = librosa.times_like(f0, sr=sr)

In [None]:
# Model config
# `num_labels` and `label_list` are expected to be defined by another cell.

model_name_or_path = "lighteternal/wav2vec2-large-xlsr-53-greek"
pooling_mode = "mean"

config = AutoConfig.from_pretrained(
    model_name_or_path,
    num_labels=num_labels,
    # label -> index and index -> label lookups, both in label_list order
    label2id={name: idx for idx, name in enumerate(label_list)},
    id2label=dict(enumerate(label_list)),
    finetuning_task="wav2vec2_clf",
)
# Attach the pooling strategy to the config as an extra attribute
config.pooling_mode = pooling_mode

In [None]:
# Set processor and model
# `from_pretrained` expects a Hub model id or a directory in Transformers
# format; the raw fairseq checkpoint file 'hubert_large_ll60k_finetune_ls960.pt'
# cannot be loaded this way, so the Hub checkpoint is used instead.
hubert_checkpoint = "facebook/hubert-large-ls960-ft"
processor = AutoProcessor.from_pretrained(hubert_checkpoint)
model = HubertForCTC.from_pretrained(hubert_checkpoint)

# Read the rate from the dataset itself: the notebook-level `sampling_rate`
# is rebound to 24000 by the GLOBE loading cell.
audio_sampling_rate = dataset.features["audio"].sampling_rate
sample = dataset[0]

# audio file is decoded on the fly
inputs = processor(sample["audio"]["array"], sampling_rate=audio_sampling_rate, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits
predicted_ids = torch.argmax(logits, dim=-1)

# transcribe speech
transcription = processor.batch_decode(predicted_ids)
# A bare expression in the middle of a cell is not displayed, so print it.
print(transcription[0])

inputs["labels"] = processor(text=sample["text"], return_tensors="pt").input_ids

# compute loss
loss = model(**inputs).loss
round(loss.item(), 2)

In [None]:
from transformers import Wav2Vec2Processor, HubertForCTC
from datasets import load_dataset
import torch

dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
sampling_rate = dataset.features["audio"].sampling_rate

processor = Wav2Vec2Processor.from_pretrained('facebook/hubert-large-ls960-ft')
model = HubertForCTC.from_pretrained('facebook/hubert-large-ls960-ft')

# audio file is decoded on the fly
inputs = processor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")
# Inference only: no autograd graph is needed for the predictions.
with torch.no_grad():
    logits = model(**inputs).logits
predicted_ids = torch.argmax(logits, dim=-1)

# one-hot predictions (last dimension of the logits = number of classes)
num_classes = logits.shape[-1]
one_hot = F.one_hot(predicted_ids, num_classes)

# transcribe speech
transcription = processor.batch_decode(predicted_ids)


# compute loss
# `text=` replaces the deprecated `as_target_processor()` context manager.
inputs["labels"] = processor(text=dataset[0]["text"], return_tensors="pt").input_ids

# No trailing comma here: it would wrap the loss tensor in a 1-tuple.
loss = model(**inputs).loss

In [None]:
# Pronunciation encoder
class PronunciationEncoder(nn.Module):
    """Pronunciation encoder made of a projection layer, a Transformer and dropout.

    Only the sub-modules are created here. No ``forward`` is defined, so
    calling an instance raises ``NotImplementedError`` until one is added.
    TODO: implement ``forward`` once it is decided how the one-hot
    predictions and the accent embedding are combined.

    Args:
        one_hots: Accepted for interface compatibility; not used by the
            constructor.
        accent_emb: Accepted for interface compatibility; not used by the
            constructor.
        *args: Ignored.
        in_features: Size of the last dimension of the projection input.
            When ``None`` (default) the projection is an ``nn.LazyLinear``
            whose input size is inferred on its first forward pass.
        d_model: Output size of the projection and feature size of the
            Transformer. Defaults to 512, the ``nn.Transformer`` default.
        dropout: Probability used by the ``nn.Dropout`` layer. Defaults to 0.3.
        **kwargs: Ignored.
    """

    def __init__(self, one_hots, accent_emb, *args, in_features=None, d_model=512, dropout=0.3, **kwargs) -> None:
        super().__init__()
        # nn.Linear() cannot be built without sizes, so either use the given
        # input size or defer it to the first forward pass.
        if in_features is None:
            self.projectionLayer = nn.LazyLinear(d_model)
        else:
            self.projectionLayer = nn.Linear(in_features, d_model)
        # The projection output feeds the Transformer, so both share d_model.
        self.transformer = nn.Transformer(d_model=d_model)
        self.dropout = nn.Dropout(p=dropout)

In [None]:
# from transformers import Wav2Vec2Model

# Wav2Vec 2.0 Large (LV-60 + CV + SWBD + FSH)
# w2v_large_lv_fsh_swbd_cv_ftls960_updated.pt
# model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-large-960h-lv60-self", torch_dtype=torch.float16, attn_implementation="flash_attention_2").to(device)