#This is a Quick Hands on for Speech -> Text -> NER 
# using 🤗Hugging Face Transformers, Wav2Vec-2.0 and spaCy.

In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.15.0-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 5.5 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 41.8 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 4.2 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 49.0 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 47.5 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  A

In [2]:
import librosa
import soundfile as sf
import torch
import warnings

from transformers import Wav2Vec2ForMaskedLM, Wav2Vec2Tokenizer

warnings.filterwarnings("ignore")

In [3]:

#load wav2vec2 tokenizer and model
tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-large-960h-lv60-self")

#You should probably TRAIN this model on a down-stream task to be able to use it for better predictions and inference.
model = Wav2Vec2ForMaskedLM.from_pretrained("facebook/wav2vec2-large-960h-lv60-self")


# define speech-to-text function
def asr_transcript(audio_file):
    transcript = ""

    # Stream over 10 seconds chunks
    stream = librosa.stream(
        audio_file, block_length=10, frame_length=16000, hop_length=16000
    )

    for speech in stream:
        if len(speech.shape) > 1:
            speech = speech[:, 0] + speech[:, 1]

        input_values = tokenizer(speech, return_tensors="pt").input_values
        logits = model(input_values).logits

        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = tokenizer.batch_decode(predicted_ids)[0]
        transcript += transcription.lower() + " "

    return transcript

Downloading:   0%|          | 0.00/291 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/162 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'Wav2Vec2CTCTokenizer'. 
The class this function is called from is 'Wav2Vec2Tokenizer'.


Downloading:   0%|          | 0.00/1.18G [00:00<?, ?B/s]

Some weights of Wav2Vec2ForMaskedLM were not initialized from the model checkpoint at facebook/wav2vec2-large-960h-lv60-self and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
import IPython.display as ipd

In [14]:
audio_path = '/content/man1_wb.wav'

y,sr = librosa.load(audio_path,sr=8000)

print('Audio Intially')
ipd.Audio(y, rate=sr)

Audio Intially


In [15]:
#just some audio file, replace this with your own file
text_output = asr_transcript("/content/man1_wb.wav")

In [16]:
text_output

'in the course of a december tour in yorkshire i rode for a long distance in one of the public coaches on the day preceding christmas '

In [None]:
'''
For the Named Entity Recognition Task.
'''

In [17]:
import spacy

In [18]:
nlp=spacy.load('en_core_web_sm')
nlp.pipe_names

['tagger', 'parser', 'ner']

In [19]:
from spacy import displacy

doc = nlp(text_output)
displacy.render(nlp(doc.text),style='ent', jupyter=True)

In [None]:
# Short and Quick Code for (Speech ==> Text ==> NER) by Roja Achary