In [None]:
import torch
import librosa
import numpy as np
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer

# Both the CTC acoustic model and its tokenizer come from the same checkpoint,
# so the identifier is spelled once and shared.
CHECKPOINT = "facebook/wav2vec2-large-960h-lv60-self"
tokenizer = Wav2Vec2Tokenizer.from_pretrained(CHECKPOINT)
model = Wav2Vec2ForCTC.from_pretrained(CHECKPOINT)



In [None]:
# Load WAV file and extract features
audio_file = "harvard.wav"
signal, sample_rate = librosa.load(audio_file, sr=16000)
inputs = tokenizer(signal, return_tensors="pt", padding=True)

# Pass features through the model to get per-frame logits.
# The attention mask is fetched with .get() so a tokenizer configuration that
# does not return one falls back to None instead of raising.
with torch.no_grad():
    outputs = model(inputs.input_values, attention_mask=inputs.get("attention_mask"))

# Greedy CTC decoding: for every output frame take the most probable token id
# together with its softmax probability. Index [0] selects the single
# utterance in the batch, leaving two 1-D tensors of length n_frames.
frame_conf, frame_ids = outputs.logits.softmax(dim=-1)[0].max(dim=-1)
predicted_ids = frame_ids.unsqueeze(0)
transcription = tokenizer.batch_decode(predicted_ids)[0]

# Identify timestamps of 'm' with confidence above a certain threshold.
# The search runs over the per-frame tokens, not over `transcription`: the
# decoded string has repeats and blanks collapsed, so a character position in
# it does not correspond to a frame index.
# NOTE(review): this checkpoint emits letters, not phonemes, so the letter is
# used as a proxy for the sound -- confirm that is acceptable.
TARGET_TOKEN = "m"
CONF_THRESHOLD = 0.9
FRAME_SHIFT = 0.02  # assuming frame shift of 0.02 seconds

frame_tokens = tokenizer.convert_ids_to_tokens(frame_ids.tolist())
m_indices = [
    i
    for i, (token, conf) in enumerate(zip(frame_tokens, frame_conf.tolist()))
    # Case-insensitive match: the vocabulary may store letters in upper case.
    if token.lower() == TARGET_TOKEN and conf > CONF_THRESHOLD
]

# Merge runs of consecutive matching frames into [first_frame, last_frame]
# spans so that one emission yields one (start, end) pair.
frame_spans = []
for idx in m_indices:
    if frame_spans and idx == frame_spans[-1][1] + 1:
        frame_spans[-1][1] = idx
    else:
        frame_spans.append([idx, idx])

m_timestamps = [
    (round(first * FRAME_SHIFT, 2), round((last + 1) * FRAME_SHIFT, 2))
    for first, last in frame_spans
]

print(m_timestamps)

In [None]:
import torch
import torchaudio
import librosa
import numpy as np
from sklearn.cluster import AgglomerativeClustering

# Load the fine-tuned Facebook Wav2Vec2 model
# NOTE(review): confirm that this torch.hub entry point and checkpoint name
# exist in the fairseq hub configuration; they could not be verified here.
model = torch.hub.load('pytorch/fairseq', 'wav2vec2_large', 'facebook/wav2vec2-large-960h-lv60-self')

# Load the test audio file
test_file = "harvard.wav"
waveform, sample_rate = torchaudio.load(test_file)

# Preprocess the audio file.
# Downmix to a single channel first so a stereo file is not handed to the
# feature extractor as several independent rows.
waveform = waveform.mean(dim=0, keepdim=True)
resampler = torchaudio.transforms.Resample(sample_rate, 16000)
waveform = resampler(waveform)
feature_extractor = model.feature_extractor
with torch.no_grad():
    features = feature_extractor(waveform)
# assumes the extractor returns (1, channels, frames) -- TODO confirm.
# Transposed to (frames, channels) so that every *frame* is one clustering
# sample; the loop below interprets cluster labels as a sequence over time.
features = features.squeeze(0).T.cpu().numpy()

# Get the phonemes and their timestamps
TARGET_PHONEMES = {'m', 'b', 'p'}
FRAME_SHIFT_S = 0.02  # 20 ms per frame, matching the frame shift used for this model above
phonemes = []
start_times = []
end_times = []

clustering = AgglomerativeClustering(n_clusters=None, distance_threshold=15, linkage='complete')
cluster_labels = clustering.fit_predict(features)

# Split the label sequence into runs of identical labels. `boundaries` holds
# the index of the first frame of every run except the first one.
boundaries = np.flatnonzero(np.diff(cluster_labels)) + 1
segment_starts = np.concatenate(([0], boundaries))
segment_ends = np.concatenate((boundaries, [len(cluster_labels)]))

for seg_start, seg_end in zip(segment_starts, segment_ends):
    label = int(cluster_labels[seg_start])
    # NOTE(review): cluster labels are arbitrary ids assigned by the clustering;
    # treating label + 4 as a dictionary index presumes a correspondence that
    # nothing in this cell establishes -- verify before trusting the output.
    phoneme = model.task.target_dictionary.string([label + 4]) # +4 because the first 4 symbols are reserved for special tokens
    phoneme = phoneme.strip().lower()  # the dictionary may store symbols in upper case
    if phoneme in TARGET_PHONEMES:
        # Start and end are appended together so the three lists stay aligned.
        phonemes.append(phoneme)
        start_times.append(int(seg_start) * FRAME_SHIFT_S) # start time in seconds
        end_times.append(int(seg_end) * FRAME_SHIFT_S) # end time in seconds

# Combine the phonemes and their timestamps
phoneme_times = [f"{phoneme}-{start_time:.3f}" for phoneme, start_time in zip(phonemes, start_times)]

# Print the phonemes and their timestamps
print(phoneme_times)

In [None]:
import librosa
import subprocess

# Load WAV file and extract features
audio_file = "example.wav"
hop_length = 256
signal, sample_rate = librosa.load(audio_file, sr=16000)
mfccs = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=13, hop_length=hop_length)
# Seconds per feature frame, derived from the hop actually used above
# (256 / 16000 = 0.016 s) rather than a hard-coded constant.
frame_shift = hop_length / sample_rate

# Reshape to (frames, coefficients) and write the features to disk as a
# text-mode Kaldi matrix archive: "<key>  [" followed by one row per line and
# a closing " ]". The file name doubles as the utterance key.
features = mfccs.transpose()
with open("example.ark", "w") as f:
    f.write(f"{audio_file}  [\n")
    f.write("\n".join(
        "  " + " ".join(f"{value:.6f}" for value in row)
        for row in features.tolist()
    ))
    f.write(" ]\n")

# Use Kaldi to perform speech recognition and identify 'm' phoneme
# NOTE(review): the positional arguments below (config, model, graph, words,
# feature archive, lattice) are kept as written; confirm them against the
# decoder's usage message and the layout of the experiment directory.
cmd = ["online2-wav-nnet3-latgen-faster", "--rt-max=5", "--frames-per-chunk=20", "--acoustic-scale=1.0",
       "--beam=12.0", "--lattice-beam=6.0", "--max-active=7000", "--min-active=200", "--do-endpointing=false",
       "--config=exp/tdnn_lstm_asr_baseline_sp/config.yaml", "exp/tdnn_lstm_asr_baseline_sp/final.mdl",
       "exp/tdnn_lstm_asr_baseline_sp/graph/HCLG.fst", "exp/tdnn_lstm_asr_baseline_sp/graph/words.txt",
       "example.ark", "example.lat"]
# check=True stops the cell when decoding fails instead of going on to read a
# missing or stale lattice file.
subprocess.run(cmd, check=True)

# Read the output lattice and identify 'm' phoneme
# NOTE(review): assumes whitespace-separated lines with the start and end
# frame in columns 3 and 4 -- confirm against the actual decoder output.
with open("example.lat", "r") as f:
    for line in f:
        fields = line.split()
        # Require "m" as a whole field; a substring test would also fire on
        # any line that merely contains the letter (e.g. the utterance key).
        if len(fields) < 4 or "m" not in fields:
            continue
        try:
            start_frame = int(fields[2])
            end_frame = int(fields[3])
        except ValueError:
            # Not a line carrying frame indices; skip it.
            continue
        start_time = start_frame * frame_shift
        end_time = end_frame * frame_shift
        print(f"m phoneme found at {start_time}-{end_time} seconds")
