In [None]:
%%capture
# Install SpeechBrain into the running kernel's environment; the %%capture
# cell magic above keeps pip's console output out of the notebook.
!pip install speechbrain

In [None]:
# Download + Unpacking test-clean of librispeech
import shutil
from speechbrain.utils.data_utils import download_file

# NOTE: despite the constant's name, this URL is the LibriSpeech "test-clean" archive.
MINILIBRI_TEST_URL = "https://www.openslr.org/resources/12/test-clean.tar.gz"

# Fetch the archive next to the notebook, then extract it into the current directory.
archive_name = 'test-clean.tar.gz'
download_file(MINILIBRI_TEST_URL, archive_name)
shutil.unpack_archive(archive_name, '.')

Downloading https://www.openslr.org/resources/12/test-clean.tar.gz to test-clean.tar.gz


test-clean.tar.gz: 347MB [00:14, 23.5MB/s]                           


In [None]:
import torch
from speechbrain.pretrained import EncoderDecoderASR

audio_1 = "/content/LibriSpeech/test-clean/1089/134686/1089-134686-0030.flac"

# Use the GPU when the runtime has one and fall back to the CPU otherwise, so
# the notebook does not fail on CPU-only runtimes.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the pre-trained recogniser named in `source`; its files are stored under
# `savedir`. To try another pre-trained model, change `source` (and `savedir`).
asr_model = EncoderDecoderASR.from_hparams(
    source="speechbrain/asr-crdnn-rnnlm-librispeech",
    savedir="pretrained_models/asr-crdnn-rnnlm-librispeech",
    run_opts={"device": device},
)

# Transcribe a single audio file straight from disk.
asr_model.transcribe_file(audio_1)

Downloading:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/480M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/212M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/253k [00:00<?, ?B/s]

'BEWARE OF MAKING THAT MISTAKE'

In [None]:
import torch
import torchaudio

# Read the first utterance from disk (the second returned value is kept in `fs`).
snt_1, fs = torchaudio.load(audio_1)

# One signal in the batch, flagged with a relative length of 1.0.
wav_lens = torch.ones(1)
asr_model.transcribe_batch(snt_1, wav_lens)

(['BEWARE OF MAKING THAT MISTAKE'],
 [[22, 59, 67, 11, 8, 147, 68, 12, 20, 224, 336, 11]])

In [None]:
audio_2 = "/content/LibriSpeech/test-clean/1089/134686/1089-134686-0007.flac"

# Same single-utterance flow as before, applied to a second recording.
snt_2, fs = torchaudio.load(audio_2)

# One signal in the batch, flagged with a relative length of 1.0.
wav_lens = torch.ones(1)
asr_model.transcribe_batch(snt_2, wav_lens)

(['A COLD LUCID INDIFFERENCE REIGNED IN HIS SOUL'],
 [[5,
   335,
   10,
   451,
   23,
   124,
   13,
   10,
   148,
   42,
   25,
   218,
   73,
   454,
   4,
   13,
   29,
   56,
   149]])

In [None]:
# Padding: put both utterances in one batch by zero-padding the shorter one.
from torch.nn.utils.rnn import pad_sequence

waveforms = [snt_1.squeeze(), snt_2.squeeze()]
batch = pad_sequence(waveforms, batch_first=True, padding_value=0.0)

# Each entry is the utterance's sample count divided by the padded length.
wav_lens = torch.tensor([sig.shape[1] / batch.shape[1] for sig in (snt_1, snt_2)])
asr_model.transcribe_batch(batch, wav_lens)


(['BEWARE OF MAKING THAT MISTAKE',
  'A COLD LUCID INDIFFERENCE REIGNED IN HIS SOUL'],
 [[22, 59, 67, 11, 8, 147, 68, 12, 20, 224, 336, 11],
  [5,
   335,
   10,
   451,
   23,
   124,
   13,
   10,
   148,
   42,
   25,
   218,
   73,
   454,
   4,
   13,
   29,
   56,
   149]])

In [None]:
# Utterances that are transcribed together as a single padded batch.
audio_files = [
    '/content/LibriSpeech/test-clean/1089/134686/1089-134686-0030.flac',
    '/content/LibriSpeech/test-clean/1089/134686/1089-134686-0014.flac',
    '/content/LibriSpeech/test-clean/1089/134686/1089-134686-0007.flac',
    '/content/LibriSpeech/test-clean/1089/134691/1089-134691-0000.flac',
    '/content/LibriSpeech/test-clean/1089/134691/1089-134691-0003.flac',
    '/content/LibriSpeech/test-clean/1188/133604/1188-133604-0030.flac',
    '/content/LibriSpeech/test-clean/1089/134691/1089-134691-0019.flac',
    '/content/LibriSpeech/test-clean/1188/133604/1188-133604-0006.flac',
]

# Load each waveform and remember its length (size of dimension 1) before padding.
sigs, lens = [], []
for audio_file in audio_files:
    snt, fs = torchaudio.load(audio_file)
    sigs.append(snt.squeeze())
    lens.append(snt.shape[1])

# Zero-pad to the longest signal, then express every length relative to the padded size.
batch = pad_sequence(sigs, batch_first=True, padding_value=0.0)
lens = torch.Tensor(lens) / batch.shape[1]

# Keep the result in `x` and display it as the cell output.
x = asr_model.transcribe_batch(batch, lens)
x

(['BEWARE OF MAKING THAT MISTAKE',
  'HE TRIED TO THINK HOW IT COULD BE',
  'A COLD LUCID INDIFFERENCE REIGNED IN HIS SOUL',
  'HE COULD WAIT NO LONGER',
  'THE UNIVERSITY',
  'HE KNOWS THEM BOTH',
  'A VOICE FROM BEYOND THE WORLD WAS CALLING',
  'THEN HE COMES TO THE BEAK OF IT'],
 [[22, 59, 67, 11, 8, 147, 68, 12, 20, 224, 336, 11],
  [16, 392, 4, 9, 292, 195, 24, 141, 22],
  [5,
   335,
   10,
   451,
   23,
   124,
   13,
   10,
   148,
   42,
   25,
   218,
   73,
   454,
   4,
   13,
   29,
   56,
   149],
  [16, 141, 625, 84, 257, 25],
  [2, 146, 245, 21, 1, 161],
  [16, 179, 1, 129, 557],
  [5, 506, 86, 22, 795, 2, 472, 19, 339, 12],
  [190, 16, 261, 1, 9, 2, 22, 336, 8, 24]])

In [None]:
# Second, smaller set of utterances, batched the same way as `audio_files`.
audio_files1 = [
    '/content/LibriSpeech/test-clean/1089/134686/1089-134686-0031.flac',
    '/content/LibriSpeech/test-clean/1089/134686/1089-134686-0011.flac',
]

# Load each waveform and remember its length (size of dimension 1) before padding.
sigs1, lens1 = [], []
for audio_file in audio_files1:
    snt1, fs1 = torchaudio.load(audio_file)
    sigs1.append(snt1.squeeze())
    lens1.append(snt1.shape[1])

# Zero-pad to the longest signal, then express every length relative to the padded size.
batch1 = pad_sequence(sigs1, batch_first=True, padding_value=0.0)
lens1 = torch.Tensor(lens1) / batch1.shape[1]

# Result is stored in `y` without being displayed.
y = asr_model.transcribe_batch(batch1, lens1)

**LSTM**

In [None]:
# Numerical, tabular and plotting libraries for the text-model part of the notebook.
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
# Apply matplotlib's 'dark_background' style to the figures drawn after this point.
plt.style.use('dark_background')
# Text preprocessing, data splitting and the Keras building blocks for the LSTM model.
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, GlobalMaxPooling1D, SpatialDropout1D

1. Data Preparation
Data manifest files

In [None]:
import pandas as pd

In [None]:
# loading train and test data
# NOTE(review): `audio_files` and `audio_files1` are plain Python lists of audio
# file paths, but later cells index `train['Phrase']` and `train['Sentence']`,
# which a list does not support. Presumably a table with those columns was
# intended here - TODO confirm and build it before the text-cleaning cells run.
train = audio_files # get the train split

test = audio_files1 # get the test split

In [None]:
# Display the training split.
train

['/content/LibriSpeech/test-clean/1089/134686/1089-134686-0030.flac',
 '/content/LibriSpeech/test-clean/1089/134686/1089-134686-0014.flac',
 '/content/LibriSpeech/test-clean/1089/134686/1089-134686-0007.flac',
 '/content/LibriSpeech/test-clean/1089/134691/1089-134691-0000.flac',
 '/content/LibriSpeech/test-clean/1089/134691/1089-134691-0003.flac',
 '/content/LibriSpeech/test-clean/1188/133604/1188-133604-0030.flac',
 '/content/LibriSpeech/test-clean/1089/134691/1089-134691-0019.flac',
 '/content/LibriSpeech/test-clean/1188/133604/1188-133604-0006.flac']

In [None]:
# Display the test split.
test

['/content/LibriSpeech/test-clean/1089/134686/1089-134686-0031.flac',
 '/content/LibriSpeech/test-clean/1089/134686/1089-134686-0011.flac']

In [None]:
# Literal substring -> replacement pairs, applied IN ORDER by clean_text using
# plain str.replace (these are not regular expressions).
# Specific contractions must come before the generic suffix rules they contain
# ("let's" and "'scuse" before "'s", "can't" and "shan't" before "n't");
# otherwise the generic rule rewrites the text first and the specific rule can
# never match. Keys use the ASCII apostrophe only: clean_text converts the
# typographic apostrophe before these rules run.
replace_list = {"i'm": 'i am',
                "let's": 'let us',
                "can't": 'can not',
                "cannot": 'can not',
                "shan't": 'shall not',
                "'scuse": 'excuse',
                "n't": ' not',
                "'re": ' are',
                "'s":  ' is',
                "'ve": ' have',
                "'d": ' would',
                "'ll": ' will',
                ',': ' ,',
                '.': ' .',
                '!': ' !',
                '?': ' ?'}


def clean_text(text):
    """Lower-case `text`, expand contractions and space out punctuation.

    Steps, in order:
      1. lower-case the text and turn the typographic apostrophe (U+2019)
         into the ASCII one, so both spellings hit the same rules;
      2. apply every `replace_list` entry as a literal substring replacement,
         in the dictionary's insertion order;
      3. collapse all runs of whitespace into single spaces and strip the ends.

    Args:
        text: the input string.

    Returns:
        The cleaned string, e.g. "I'm sure, aren't you?" becomes
        "i am sure , are not you ?".
    """
    text = text.lower().replace('\u2019', "'")
    for s in replace_list:
        text = text.replace(s, replace_list[s])
    # split() with no argument splits on any whitespace run, so this is the
    # whitespace normalisation step (no separate '\s+' rule is needed).
    text = ' '.join(text.split())
    return text

In [None]:
# Run clean_text over every entry of the 'Phrase' column.
# NOTE(review): `train` is assigned from `audio_files`, a list of file paths, so
# `train['Phrase']` cannot work until `train` is a table with a 'Phrase' column -
# TODO confirm the intended data source.
X_train = train['Phrase'].apply(clean_text)

In [None]:
def _count_tokens(phrase):
    """Return how many pieces `phrase` yields when split on single spaces."""
    return len(phrase.split(' '))


# Per-phrase length; the maximum is used later as the padded sequence length.
phrase_len = X_train.apply(_count_tokens)
max_phrase_len = phrase_len.max()
print(f'max phrase len: {max_phrase_len}')

# Normalised histogram of the phrase lengths.
plt.figure(figsize=(10, 8))
plt.hist(phrase_len, density=True, alpha=0.2)
plt.xlabel('phrase len')
plt.ylabel('probability')
plt.grid(alpha=0.25)

In [None]:
# Training targets, taken from the 'Sentence' column of `train`.
# NOTE(review): `train` is assigned from `audio_files` (a list of paths), which
# has no 'Sentence' column. The model below ends in Dense(5, softmax) and is
# compiled with categorical_crossentropy, so it presumably expects 5-way one-hot
# labels here - TODO confirm where the labels come from.
y_train = train['Sentence']

In [None]:
# Upper bound on the vocabulary size; also used as the Embedding input_dim.
max_words = 8192
tokenizer = Tokenizer(
    num_words = max_words,
    # Characters stripped by the tokenizer. ',', '.', '!' and '?' are not in
    # this set, so they are kept. The backslash is written as an explicit
    # "\\": a bare "\]" is an invalid escape sequence that newer Python
    # versions warn about. The resulting string value is unchanged.
    filters = '"#$%&()*+-/:;<=>@[\\]^_`{|}~'
)
# Build the vocabulary, turn each phrase into a list of word indices, then pad
# every sequence to the longest phrase length measured earlier.
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
X_train = pad_sequences(X_train, maxlen = max_phrase_len)

# Training hyper-parameters.
batch_size = 512
epochs = 8

In [None]:
# Embedding -> LSTM -> dense classifier, ending in a 5-way softmax.
model_lstm = Sequential([
    Embedding(input_dim = max_words, output_dim = 256, input_length = max_phrase_len),
    SpatialDropout1D(0.3),
    LSTM(256, dropout = 0.3, recurrent_dropout = 0.3),
    Dense(256, activation = 'relu'),
    Dropout(0.3),
    Dense(5, activation = 'softmax'),
])

# Categorical cross-entropy loss with the Adam optimizer; accuracy is reported during training.
model_lstm.compile(
    optimizer='Adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

In [None]:
# Train the LSTM, holding out 10% of the training data for validation.
# `epochs` and `batch_size` come from the hyper-parameter variables defined
# with the tokenizer setup (8 and 512), so they are tuned in one place instead
# of being repeated here as literals.
history = model_lstm.fit(
    X_train,
    y_train,
    validation_split = 0.1,
    epochs = epochs,
    batch_size = batch_size
)