# Libraries

Import the libraries needed to load audio files and convert speech to text.

In [3]:
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import torch
import os
import torchaudio
import numpy as np
import pandas as pd

### Wav2Vec2 Automatic Speech Recognition Model

Rather than training a model from scratch, I use transfer learning by reusing pretrained Wav2Vec2 models from Hugging Face. I have used two models, small and large (based on the training size of the original models by Facebook). <br>
I have not created a deep-learning neural network pipeline of my own for this case, because our sample of inputs (9 audio files) is far too small to train a neural network.

Wav2Vec2 is a speech model that accepts a float array corresponding to the raw waveform of the speech signal. 

In [4]:
# Hugging Face checkpoint identifiers of the two Wav2Vec2 models compared below.
# Small model (360MB)
model_name_small = "facebook/wav2vec2-base-960h"
# Large model (1.18GB)
model_name_large = "facebook/wav2vec2-large-960h-lv60-self"

In [5]:
def _load_processor_and_model(checkpoint):
    """Return the (processor, CTC model) pair for one pretrained checkpoint name."""
    processor = Wav2Vec2Processor.from_pretrained(checkpoint)
    model = Wav2Vec2ForCTC.from_pretrained(checkpoint)
    return processor, model


# Load the small checkpoint first, then the large one.
processor_small, model_small = _load_processor_and_model(model_name_small)
processor_large, model_large = _load_processor_and_model(model_name_large)

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h-lv60-self and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# os.listdir() returns entries in arbitrary order, so sort them before slicing:
# otherwise which entry gets dropped, and how the files line up with the
# hand-written labels below, can change between runs and platforms.
# NOTE(review): the last entry of the directory is dropped, as in the original
# code; confirm that it is the one entry that is not a labelled recording.
files = sorted(os.listdir('./3_audio_data/'))[:-1]
print('Number of audio files : ',len(files))
print(files)

# Ground-truth spoken word for each entry of `files`, in the same order.
actual_words = ['one','four','four','one','three','one','five','three','five']

# Fail early with a clear message instead of building misaligned columns later.
if len(files) != len(actual_words):
    raise ValueError(
        'Found {} audio files but {} labels; files and actual_words must align one-to-one.'.format(
            len(files), len(actual_words)
        )
    )

Number of audio files :  9
['102.wav', '112.wav', '134.wav', '137.wav', '138.wav', '147.wav', '154.wav', '323.wav', '423.wav']


In [7]:
# Speech to text using small Wav2Vec model

def get_transcription_small(audio_path):
    """Transcribe one audio file with the small Wav2Vec2 model.

    Parameters
    ----------
    audio_path : str
        Path of the audio file, passed to ``torchaudio.load``.

    Returns
    -------
    str
        The lower-cased transcription obtained by taking the argmax of the
        model logits at every frame and decoding the resulting token ids.
    """
    # load our wav file (torchaudio returns a (channels, frames) tensor by default)
    speech, sr = torchaudio.load(audio_path)
    # Down-mix to a single channel. For mono files this equals squeeze(); for
    # multi-channel files squeeze() would keep the (channels, frames) shape and
    # the processor would treat the channels as a batch of separate clips.
    speech = speech.mean(dim=0)
    # or using librosa
    # speech, sr = librosa.load(audio_file, sr=16000)
    # resample from whatever the audio sampling rate is to 16000
    resampler = torchaudio.transforms.Resample(sr, 16000)
    speech = resampler(speech)
    # tokenize our wav; hand over a NumPy array, which is a documented input
    # type of the feature extractor (a torch tensor is not)
    input_values = processor_small(speech.numpy(), return_tensors="pt", sampling_rate=16000)["input_values"]
    # perform inference; gradients are not needed, so do not track them
    with torch.no_grad():
        logits = model_small(input_values)["logits"]
    # use argmax to get the predicted IDs
    predicted_ids = torch.argmax(logits, dim=-1)
    # decode the IDs to text
    transcription = processor_small.decode(predicted_ids[0])
    return transcription.lower()

In [8]:
# Speech to text using Large Wav2Vec model

def get_transcription_large(audio_path):
    """Transcribe one audio file with the large Wav2Vec2 model.

    Parameters
    ----------
    audio_path : str
        Path of the audio file, passed to ``torchaudio.load``.

    Returns
    -------
    str
        The lower-cased transcription obtained by taking the argmax of the
        model logits at every frame and decoding the resulting token ids.
    """
    # load our wav file (torchaudio returns a (channels, frames) tensor by default)
    speech, sr = torchaudio.load(audio_path)
    # Down-mix to a single channel. For mono files this equals squeeze(); for
    # multi-channel files squeeze() would keep the (channels, frames) shape and
    # the processor would treat the channels as a batch of separate clips.
    speech = speech.mean(dim=0)
    # or using librosa
    # speech, sr = librosa.load(audio_file, sr=16000)
    # resample from whatever the audio sampling rate is to 16000
    resampler = torchaudio.transforms.Resample(sr, 16000)
    speech = resampler(speech)
    # tokenize our wav; hand over a NumPy array, which is a documented input
    # type of the feature extractor (a torch tensor is not)
    input_values = processor_large(speech.numpy(), return_tensors="pt", sampling_rate=16000)["input_values"]
    # perform inference; gradients are not needed, so do not track them
    with torch.no_grad():
        logits = model_large(input_values)["logits"]
    # use argmax to get the predicted IDs
    predicted_ids = torch.argmax(logits, dim=-1)
    # decode the IDs to text
    transcription = processor_large.decode(predicted_ids[0])
    return transcription.lower()

In [9]:
# Extracting text from every audio file using the small model

transcripts_small = [
    get_transcription_small(os.path.join('./3_audio_data/', audio_file))
    for audio_file in files
]

# Show the raw predictions next to the ground truth
print('Predicted words on small model: ',transcripts_small)
print('Actual words : ', actual_words)

  tensor = as_tensor(value)


Predicted words on small model:  ['wai', 'faw', 'for', 'an', 'three', 'while yo', 'five', 'three', 'five']
Actual words :  ['one', 'four', 'four', 'one', 'three', 'one', 'five', 'three', 'five']


In [10]:
# Extracting text from every audio file using the large model

transcripts_large = [
    get_transcription_large(os.path.join('./3_audio_data/', audio_file))
    for audio_file in files
]

# Show the raw predictions next to the ground truth
print('Predicted words on big model: ',transcripts_large)
print('Actual words : ', actual_words)

Predicted words on big model:  ['well', 'for', 'for', 'a', 'three', 'wa', 'fim', 'tree', 'five']
Actual words :  ['one', 'four', 'four', 'one', 'three', 'one', 'five', 'three', 'five']


In [11]:
# One row per audio file: the small-model transcript, the large-model
# transcript and the word that was actually spoken, side by side.

dataset = pd.DataFrame({
    'Predicted_small': transcripts_small,
    'Predicted_large': transcripts_large,
    'Actual': actual_words,
})

dataset

Unnamed: 0,Predicted_small,Predicted_large,Actual
0,wai,well,one
1,faw,for,four
2,for,for,four
3,an,a,one
4,three,three,three
5,while yo,wa,one
6,five,fim,five
7,three,tree,three
8,five,five,five


In [12]:
# Jaro similarity between the actual classes and the words predicted by the large and small models

import jellyfish  # phonetic encodings and string distance/similarity functions
# For more info refer here - https://github.com/jamesturk/jellyfish

# Distinct ground-truth words; these are the candidate classes a prediction is matched against
actual_classes = dataset['Actual'].unique()

def jaro_similarity_calculator(colname):
    """Match every value of a ``dataset`` column to its most similar actual class.

    Reads the module-level ``dataset`` and ``actual_classes``.

    Parameters
    ----------
    colname : str
        Name of the ``dataset`` column holding the predicted strings.

    Returns
    -------
    list
        ``[jaro_similarity, values]``: for each row, the highest Jaro
        similarity against any entry of ``actual_classes`` and the class that
        achieved it. On ties the first class in ``actual_classes`` wins.
    """
    jaro_similarity = []
    values = []
    for predicted in dataset[colname]:
        # Score the prediction against every candidate class first ...
        scores = [jellyfish.jaro_similarity(predicted, label) for label in actual_classes]
        # ... then pick the winner once (np.argmax returns the first maximum).
        best = int(np.argmax(scores))
        jaro_similarity.append(scores[best])
        values.append(actual_classes[best])
    return [jaro_similarity, values]

Jaro similarity is a measure of how similar two strings are. Its value ranges from 0 to 1, where 1 means the strings are equal and 0 means there is no similarity between the two strings.

In [13]:
# Adding columns for large and small model after computing Jaro similarity
# Also, adding the predicted classes obtained from the Jaro similarity

# Run the matcher once per model; each call returns [similarities, matched classes]
large_similarities, large_classes = jaro_similarity_calculator('Predicted_large')
small_similarities, small_classes = jaro_similarity_calculator('Predicted_small')

dataset['Large_JaroSim'] = large_similarities
dataset['Small_JaroSim'] = small_similarities
dataset['Predicted_large_Jaro'] = large_classes
dataset['Predicted_small_Jaro'] = small_classes

dataset

Unnamed: 0,Predicted_small,Predicted_large,Actual,Large_JaroSim,Small_JaroSim,Predicted_large_Jaro,Predicted_small_Jaro
0,wai,well,one,0.527778,0.527778,one,five
1,faw,for,four,0.916667,0.527778,four,four
2,for,for,four,0.916667,0.916667,four,four
3,an,a,one,0.0,0.611111,one,one
4,three,three,three,1.0,1.0,three,three
5,while yo,wa,one,0.0,0.583333,one,five
6,five,fim,five,0.722222,1.0,five,five
7,three,tree,three,0.933333,1.0,three,three
8,five,five,five,1.0,1.0,five,five


In [14]:
# Fraction of rows whose Jaro-matched large-model class equals the actual word
n_correct_large = int((dataset['Predicted_large_Jaro'] == dataset['Actual']).sum())
accuracy = n_correct_large / len(dataset)
print('Accuracy of model with Predicted_large_jaro -', accuracy*100,'%')

Accuracy of model with Predicted_large_jaro - 100.0 %


In [15]:
# Fraction of rows whose Jaro-matched small-model class equals the actual word
n_correct_small = int((dataset['Predicted_small_Jaro'] == dataset['Actual']).sum())
accuracy = n_correct_small / len(dataset)
print('Accuracy of model with Predicted_small_jaro -', accuracy*100,'%')

Accuracy of model with Predicted_small_jaro - 77.77777777777779 %


I have used the Jaro similarity to measure how close each predicted word is to the actual word. <br>
Other string metrics such as Levenshtein distance, Jaro distance, Damerau-Levenshtein distance, etc. could be used as well to compare two strings.

In [16]:
# Exploring the process of finding the values obtained in Predicted_large_Jaro and Predicted_small_Jaro
# (shown here step by step for the small model's predictions)

import jellyfish

actual_classes = dataset.Actual.unique()
jaro_similarity = []
values = []

for predicted in dataset.Predicted_small:
    jar_sim = []
    for label in actual_classes:
        jar_sim.append(jellyfish.jaro_similarity(predicted, label))
        print('Jaro similarity between ',predicted,' and ',label,' is ',jar_sim[-1])
    # Pick the best class once all candidates are scored
    # (np.argmax returns the first maximum, so ties go to the earlier class)
    index = int(np.argmax(jar_sim))
    max_sim = jar_sim[index]
    vals = actual_classes[index]
    print('Maximum similarity is',max_sim,' with ',vals,'\n')
    jaro_similarity.append(max_sim)
    values.append(vals)


Jaro similarity between  wai  and  one  is  0.0
Jaro similarity between  wai  and  four  is  0.0
Jaro similarity between  wai  and  three  is  0.0
Jaro similarity between  wai  and  five  is  0.5277777777777778
Maximum similarity is 0.5277777777777778  with  five 

Jaro similarity between  faw  and  one  is  0.0
Jaro similarity between  faw  and  four  is  0.5277777777777778
Jaro similarity between  faw  and  three  is  0.0
Jaro similarity between  faw  and  five  is  0.5277777777777778
Maximum similarity is 0.5277777777777778  with  four 

Jaro similarity between  for  and  one  is  0.0
Jaro similarity between  for  and  four  is  0.9166666666666666
Jaro similarity between  for  and  three  is  0.5111111111111111
Jaro similarity between  for  and  five  is  0.5277777777777778
Maximum similarity is 0.9166666666666666  with  four 

Jaro similarity between  an  and  one  is  0.611111111111111
Jaro similarity between  an  and  four  is  0.0
Jaro similarity between  an  and  three  is  0.0

In the case of 'faw', we see equal similarity with 'four' and 'five', which is surprising. I expected it to lean more towards 'four'.

## Using Phonetic encoding to check for similarity of sounds

In [17]:
from jellyfish import soundex, metaphone, nysiis, match_rating_codex,\
    levenshtein_distance, damerau_levenshtein_distance, hamming_distance,\
    jaro_similarity

# Interleave the words: each large-model prediction is followed by its actual word
dataList = []
for predicted_word, actual_word in zip(transcripts_large, actual_words):
    dataList.extend((predicted_word, actual_word))

sounds_encoding_methods = [soundex, metaphone, nysiis, match_rating_codex]

# One row per word, one extra column per phonetic encoding (named after the encoder)
report = pd.DataFrame(dataList, columns=['word'])
for encoder in sounds_encoding_methods:
    report[encoder.__name__] = report['word'].apply(encoder)
print(report)

     word soundex metaphone nysiis match_rating_codex
0    well    W400        WL    WAL                 WL
1     one    O500        ON     ON                 ON
2     for    F600        FR    FAR                 FR
3    four    F600        FR    FAR                 FR
4     for    F600        FR    FAR                 FR
5    four    F600        FR    FAR                 FR
6       a    A000         A      A                  A
7     one    O500        ON     ON                 ON
8   three    T600        0R    TRY                THR
9   three    T600        0R    TRY                THR
10     wa    W000         W      W                  W
11    one    O500        ON     ON                 ON
12    fim    F500        FM    FAN                 FM
13   five    F100        FF    FAV                 FV
14   tree    T600        TR    TRY                 TR
15  three    T600        0R    TRY                THR
16   five    F100        FF    FAV                 FV
17   five    F100        FF 

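Considering a prediction such as 'wai' (or 'well' / 'wa' in the table above) and the actual word 'one', I expected them to have similar phonetic encodings, as they all start with a 'w' sound. <br>
But this is not the case (for example, Soundex gives W400 for 'well' and O500 for 'one'). Hence, I have not used phonetics to classify.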

In [18]:
# Some examples of jaro_similarity: compare 'Jellyfish' against a few variants

for candidate in ('jelly', 'fish', 'jellyFish', 'jelfish'):
    print(jellyfish.jaro_similarity('Jellyfish', candidate))

0.7481481481481481
0.0
0.8518518518518517
0.8412698412698413
