In [8]:
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, BlenderbotTokenizer, BlenderbotForConditionalGeneration
import numpy as np
import torch
from ipywebrtc import AudioRecorder, CameraStream
import torchaudio
from IPython.display import Audio
import moviepy.editor as moviepy
import pydub
import os
from gtts import gTTS
import playsound

In [2]:
# --- Speech-to-text: Wav2Vec2 checkpoint (PyTorch weights) and its matching processor ---
processor_stt = Wav2Vec2Processor.from_pretrained(r'yongjian/wav2vec2-large-a')
model_stt = Wav2Vec2ForCTC.from_pretrained(r'yongjian/wav2vec2-large-a')

# --- Conversation: Blenderbot tokenizer and generator share one checkpoint name ---
mname = "facebook/blenderbot-400M-distill"
tokenizer = BlenderbotTokenizer.from_pretrained(mname)
model_conv = BlenderbotForConditionalGeneration.from_pretrained(mname)

In [3]:
def stt(audio_file, model_stt = model_stt, processor_stt = processor_stt):
    """
    Transcribe a WAV file with a Wav2Vec2 CTC model.

    audio_file: path to the WAV file to transcribe
    model_stt: CTC model used to produce logits (defaults to the notebook's model)
    processor_stt: matching processor, used for feature extraction and decoding
    returns: list of transcriptions as produced by processor_stt.batch_decode;
             [""] when the file contains no samples
    """
    # Resample to whatever rate the processor declares instead of a hard-coded
    # value, so resampling and feature extraction can never disagree.
    sample_rate = processor_stt.feature_extractor.sampling_rate

    sound = pydub.AudioSegment.from_wav(audio_file)
    # Down-mix to mono first: get_array_of_samples() serializes all channels
    # into one flat array, which would otherwise be fed to the model as if it
    # were a single (garbled) mono signal.
    sound = sound.set_channels(1).set_frame_rate(sample_rate)

    np_wav = np.array(sound.get_array_of_samples()).astype(float)
    if np_wav.size == 0:
        # Nothing to transcribe; keep the list-of-strings return shape.
        return [""]

    # Scale into [-1, 1] by the peak *absolute* amplitude. Skip scaling for
    # pure silence to avoid dividing by zero and filling the input with NaNs.
    peak = np.abs(np_wav).max()
    if peak > 0:
        np_wav /= peak

    with torch.no_grad():
        model_inputs = processor_stt(np_wav, sampling_rate=sample_rate, return_tensors="pt", padding=True)
        # Some processors are configured not to emit an attention mask; .get()
        # passes None in that case instead of raising on the missing key.
        logits = model_stt(
            model_inputs.input_values,
            attention_mask=model_inputs.get("attention_mask"),
        ).logits # use .cuda() for GPU acceleration
        pred_ids = torch.argmax(logits, dim=-1).cpu()
        pred_text = processor_stt.batch_decode(pred_ids)
    return pred_text

In [4]:
def get_answer(text, model_conv = model_conv, tokenizer = tokenizer):
    """
    Generate a conversational reply to a piece of text.

    text: String of natural language to answer
    model_conv: generation model used to produce the reply (defaults to the notebook's model)
    tokenizer: tokenizer matching model_conv, used for encoding and decoding
    returns: String of the answer, with special tokens removed and
             surrounding whitespace stripped
    """
    # truncation=True asks the tokenizer to cut over-long prompts down to its
    # configured maximum length rather than passing them through unbounded.
    inputs = tokenizer([text], return_tensors="pt", truncation=True)
    reply_ids = model_conv.generate(**inputs)
    # skip_special_tokens drops markers such as <s> / </s> so callers receive
    # plain text and do not have to strip them by hand.
    return tokenizer.batch_decode(reply_ids, skip_special_tokens=True)[0].strip()

In [5]:
def tts(text, filename="answer.mp3", lang="en"):
    """
    Synthesize speech for `text` with gTTS and save it as an audio file.

    text: String of natural language
    filename: where to save the audio; the default "answer.mp3" is relative,
              so the file lands in the current directory
    lang: language code handed to gTTS (default "en")
    returns: the path the audio was saved to (same value as `filename`)
    """
    answer = gTTS(text, lang=lang, slow=False)
    answer.save(filename)
    # Returning the path lets callers play the file without repeating the name.
    return filename

In [6]:
#Record your audio
# Media stream restricted to audio (video is disabled in the constraints dict).
camera = CameraStream(constraints={'audio': True,'video':False})
# Recorder widget bound to that stream; a later cell reads the captured bytes
# from recorder.audio.value.
recorder = AudioRecorder(stream=camera)
# Bare expression on the last line so the notebook displays the recorder widget.
recorder

AudioRecorder(audio=Audio(value=b'', format='webm'), stream=CameraStream(constraints={'audio': True, 'video': …

In [25]:
#Convert webm to wav
audio_bytes = recorder.audio.value
if not audio_bytes:
    # Fail early with a readable message instead of letting ffmpeg choke on an
    # empty file.
    raise RuntimeError("No audio recorded yet - record something with the recorder widget first.")
with open('recording.webm', 'wb') as f:
    f.write(audio_bytes)
moviepy.ffmpeg_tools.ffmpeg_extract_audio("recording.webm", "wav_file_name.wav")

#Transcription
transcript = stt("wav_file_name.wav")[0]
print(f"You: {transcript}")

#play answer
# The replace() calls strip sequence markers in case the decoded reply still contains them.
answer_text = get_answer(transcript).replace("<s>","").replace("</s>","")
tts(answer_text)
# tts() saves answer.mp3 into the current directory, so resolve the path from
# there rather than from a machine-specific absolute location.
playsound.playsound(os.path.abspath("answer.mp3"))

Moviepy - Running:
>>> "+ " ".join(cmd)
Moviepy - Command successful
You: WHAT ARE YOU DOING FOR FUN
