In [None]:
#import the necessary packages
import torch
import torchaudio
from transformers import Speech2TextProcessor, Speech2TextForConditionalGeneration, Speech2TextModel
from datasets import load_dataset
import soundfile as sf

In [None]:
#confirm that PyTorch can see a CUDA device before the model is moved to the GPU
torch.cuda.is_available()

True

In [None]:
#downloading the pretrained model and processor from huggingface
#use the GPU when one is available and fall back to the CPU otherwise,
#instead of failing on an unconditional .cuda() call
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-medium-librispeech-asr").to(device)
processor = Speech2TextProcessor.from_pretrained("facebook/s2t-medium-librispeech-asr")

Playing an audio sample

In [None]:
import IPython.display as ipd
import librosa

In [None]:
#load one sample clip at a 16 kHz sampling rate; it is played and transcribed below
x, sr = librosa.load(
    "./dataset/audio/61-70968-0000.flac",
    sr=16000,
)

In [None]:
#render an inline audio player for the clip loaded above
ipd.Audio(x, rate=sr)

Getting the names of the audio files

In [None]:
import os

In [None]:
#peek at the first ten entries of the audio directory
os.listdir("./dataset/audio/")[:10]

['1089-134686-0000.flac',
 '1089-134686-0001.flac',
 '1089-134686-0002.flac',
 '1089-134686-0003.flac',
 '1089-134686-0004.flac',
 '1089-134686-0005.flac',
 '1089-134686-0006.flac',
 '1089-134686-0007.flac',
 '1089-134686-0008.flac',
 '1089-134686-0009.flac']

In [None]:
#collect the audio file names; os.listdir returns entries in arbitrary order,
#so sort them to keep the processing order (and transcription_list) reproducible
files = sorted(os.listdir("./dataset/audio/"))

Applying the pretrained model to the audio

In [None]:
#turn the raw waveform of the sample clip into the feature tensor the model consumes
encoded = processor(x, sampling_rate=16_000, return_tensors="pt")
input_features = encoded.input_features  # Batch size 1

  tensor = as_tensor(value)


In [None]:
#inspect the feature tensor produced by the processor
input_features

tensor([[[-0.5333, -0.7201, -1.0232,  ..., -1.1419, -0.9544, -0.9229],
         [-0.0068, -0.2457, -0.7009,  ..., -1.2417, -0.9190, -0.9552],
         [-0.9343, -0.6430, -0.9317,  ..., -1.2209, -1.1045, -1.1680],
         ...,
         [-0.1723, -0.4007, -0.9895,  ..., -1.2141, -0.9508, -0.8373],
         [-0.6004, -0.5391, -0.7518,  ..., -1.2158, -1.0004, -1.1116],
         [-0.3249, -0.5308, -1.0946,  ..., -1.1424, -1.0163, -0.9887]]])

In [None]:
#generate token ids for the features
#the model was moved to an accelerator when it was loaded, while the processor
#returns CPU tensors, so move the inputs to the model's device before generating
generated_ids = model.generate(input_features.to(model.device))

  input_lengths = (input_lengths - 1) // 2 + 1


In [None]:
#decode the generated token ids back into text (the model itself already ran in generate)
#skip_special_tokens=True keeps any special tokens out of the decoded transcription
transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)

In [None]:
#show the predicted transcription for the sample clip
transcription

['he began a confused complaint against the wizard who had vanished behind the curtain on the left']

In [None]:
#Storing the real transcription lines from the txt files one by one
lines = []
for name in os.listdir("./dataset/text"):
    #the context manager closes the file even if reading raises part-way through
    with open("./dataset/text/"+name) as transcript_file:
        #skip blank lines so they do not turn into empty rows later on
        lines.extend(line for line in transcript_file if line.strip())

In [None]:
#Number of transcriptions available (should be equal to the number of speech files)
len(lines)

2620

In [None]:
import pandas as pd

In [None]:
#Split every line into the audio file id (first token) and its transcription (the rest)
aud_file = []
trans_data = []
for entry in lines:
    #partition on the first space: the id is before it, the text is after it
    file_id, sep, text = entry.partition(" ")
    aud_file.append(file_id)
    trans_data.append(text.strip())

In [None]:
#build a lookup table of reference transcriptions keyed by audio file name
trans_df = pd.DataFrame(
    {'File_name':aud_file,'Transcription':trans_data}
).set_index("File_name")

In [None]:
#preview the first rows of the transcription lookup table
trans_df.head()

Unnamed: 0_level_0,Transcription
File_name,Unnamed: 1_level_1
1089-134686-0000,HE HOPED THERE WOULD BE STEW FOR DINNER TURNIP...
1089-134686-0001,STUFF IT INTO YOU HIS BELLY COUNSELLED HIM
1089-134686-0002,AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD L...
1089-134686-0003,HELLO BERTIE ANY GOOD IN YOUR MIND
1089-134686-0004,NUMBER TEN FRESH NELLY IS WAITING ON YOU GOOD ...


In [None]:
#reference transcription of the sample clip, lower-cased for comparison with the model output
trans_df.loc['61-70968-0000', 'Transcription'].lower()

'he began a confused complaint against the wizard who had vanished behind the curtain on the left'

In [None]:
#load the word error rate (WER) metric through the datasets library
#NOTE(review): load_metric appears to be deprecated in recent `datasets` releases
#in favour of the separate `evaluate` package - confirm against the installed version
from datasets import load_metric

metric = load_metric("wer")

In [None]:
#Running the model for the entire dataset. Repeating the above steps inside the for loop
%%time
transcription_list = []
scores=0
for speech in files:
    x , sr = librosa.load("./dataset/audio/"+speech, sr = 16000)
    input_features = processor(x,sampling_rate=16_000,return_tensors="pt").input_features
    generated_ids = model.generate(input_ids=input_features.cuda())
    transcription = processor.batch_decode(generated_ids)
    transcription_list.append(transcription)
    real_trans = trans_df.loc[speech.split('.')[0]]['Transcription'].lower()
    score=metric.compute(predictions=transcription, references=[real_trans])
    scores = scores + score

  input_lengths = (input_lengths - 1) // 2 + 1


Wall time: 11min 3s


In [None]:
#mean of the per-utterance WER values over the whole dataset, as a percentage
mean_wer_percent = scores/len(files)*100
print("WER for Librispeech Test_clean Dataset :",mean_wer_percent)

WER for Librispeech Test_clean Dataset : 3.7410072875023426


## WER for the LibriSpeech test-clean dataset using fairseq S2T is 3.74% (unweighted mean of per-utterance WER)

In [None]:
#output transcription (preview only - the full list holds one entry per audio file)
transcription_list[:10]

[['he hoped there would be stew for dinner turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick peppered flour fat and sauce'],
 ['stuffed into you his belly counselled him'],
 ['after early nightfall the yellow lamps would light up here and there the squalid quarter of the brothels'],
 ['hello bertie any good in your mind'],
 ['number ten fresh nellie is waiting on you good night husband'],
 ["the music came nearer and he recalled the words the words of shelley's fragment upon the moon wandering companionless pale for weariness"],
 ['the dull light fell more faintly upon the page whereon another equation began to unfold itself slowly and to spread abroad its widening tail'],
 ['a cold lucid indifference reigned in his soul'],
 ['the chaos in which his ardor extinguished itself was a cold indifferent knowledge of himself'],
 ['at most by an alms given to a beggar whose blessing he fled from he might hope wearily to win for himself some measure of act