# AI Voice Assistant (AVA)
### Powered by Open Source models
AVA uses the following pipeline:
- Record the user's spoken query with PyAudio and transcribe it with OpenAI Whisper (Query Ingestion)
- Generate a text response with Qwen1.5-4B-Chat (Text Generation)
- Synthesize a spoken response with Bark TTS (Audio Response)

In [27]:
#Imports
import os

# Runtime configuration has to be in the environment BEFORE the libraries that
# consume it are imported:
#   * Bark evaluates its SUNO_* switches once, when `bark.generation` is first
#     imported, so setting them afterwards is silently ignored.
#   * CUDA_VISIBLE_DEVICES only takes effect if it is set before CUDA is
#     initialised in this process.
# NOTE: in a notebook, restart the kernel after changing any of these values;
# re-running the cell does not re-import modules that are already loaded.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"    # expose only the GPU with index 1
os.environ["SUNO_USE_SMALL_MODELS"] = "0"   # keep the full-size Bark models
os.environ["SUNO_OFFLOAD_CPU"] = "1"        # let Bark park idle sub-models on the CPU

import sys
import gc
import torch

from transformers import WhisperProcessor, WhisperForConditionalGeneration
import wave
import pyaudio
from datasets import Dataset, Audio

from transformers import AutoModelForCausalLM, AutoTokenizer
import nltk
import numpy as np
from bark.generation import (
    generate_text_semantic,
    preload_models,
)
from bark.api import semantic_to_waveform
from bark import generate_audio, SAMPLE_RATE
from scipy.io import wavfile
from IPython import display

# Download (if needed) and load the Bark models once, up front.
preload_models()

#### Query Ingestion

In [12]:
def record_query(query_id: str) -> str:
    """
    Records a user query from the default input device and stores it as a .wav file.

    The recording is 16-bit PCM at a sampling rate of 16000 Hz and lasts exactly
    10 seconds. One channel is used on macOS and two channels elsewhere.

    Args:
    query_id: A unique id for user query; the file is named "<query_id>_Q.wav"

    Returns:
    Name of the recorded audio file

    Raises:
    Whatever PyAudio / wave raise when the device cannot be opened or read;
    the audio device is always released before the error propagates.
    """
    filename = query_id + "_Q.wav"

    CHUNK = 1024
    FORMAT = pyaudio.paInt16
    CHANNELS = 1 if sys.platform == 'darwin' else 2
    RATE = 16000
    RECORD_SECONDS = 10

    p = pyaudio.PyAudio()
    stream = None
    try:
        # Open the device first so that a failure here does not leave an
        # empty .wav file behind.
        stream = p.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True)

        with wave.open(filename, 'wb') as wf:
            wf.setnchannels(CHANNELS)
            wf.setsampwidth(p.get_sample_size(FORMAT))
            wf.setframerate(RATE)

            print('Recording...')
            # Count frames instead of whole chunks: RATE * RECORD_SECONDS is not
            # a multiple of CHUNK, so a chunk-count loop would stop short.
            frames_left = RATE * RECORD_SECONDS
            while frames_left > 0:
                frames_now = min(CHUNK, frames_left)
                wf.writeframes(stream.read(frames_now))
                frames_left -= frames_now
            print('Done')
    finally:
        # Release the audio device even if recording or writing failed.
        if stream is not None:
            stream.close()
        p.terminate()

    return filename

In [7]:
def read_query(filename: str, preloaded: tuple|None=None) -> str|None:
    """
    Reads and transcribes an audio file
    Args:
    filename: Path to the audio recording (must be a .wav file)
    preloaded: defines whether the model has been preloaded or not.
    For faster responses, preload the model in the main pipeline. In this case, pass a
    tuple of the loaded model as (processor, model).
    To minimize memory usage, pass preloaded=None. This will load the model inside this function
    and release it from the memory once transcription is generated. This is slower but memory efficient.

    Returns:
    Transcription of the audio file
    """

    if not filename.endswith('.wav'):
        print("Only .wav files are supported currently.")
        return None
        
    audio_dataset = Dataset.from_dict({"audio": [filename]}).cast_column("audio", Audio())
    audio_sample = audio_dataset[0]["audio"]
    waveform = audio_sample["array"]
    sampling_rate = audio_sample["sampling_rate"]

    if not preloaded:
        #Load OpenAI Whisper for transcription
        processor = WhisperProcessor.from_pretrained("openai/whisper-medium")
        model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-medium")
        model.config.forced_decoder_ids = None
    else:
        processor, model = preloaded

    input_features = processor(waveform, sampling_rate=sampling_rate, return_tensors="pt").input_features
    predicted_ids = model.generate(input_features)
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)

    if not preloaded:
        del processor
        del model
        gc.collect()
        torch.cuda.empty_cache()

    return transcription[0]

#### Text Generation

In [14]:
def generate_response(query: str, preloaded: tuple|None=None) -> str:
    """
    Generates a text response for user query using Qwen-chat model.
    Args:
    query: User query
    preloaded: defines whether the model has been preloaded or not.
    For faster responses, preload the model in the main pipeline. In this case, pass a
    tuple of the loaded model as (model, tokenizer).
    To minimize memory usage, pass preloaded=None. This will load the model inside this function
    and release it from the memory once transcription is generated. This is slower but memory efficient.
    
    Returns:
    LLM generated response
    """

    if not preloaded:
        model = AutoModelForCausalLM.from_pretrained(
            "Qwen/Qwen1.5-4B-Chat",
            torch_dtype="auto",
            device_map="auto"
        )
        tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-4B-Chat")
    else:
        model, tokenizer = preloaded

    messages = [
            {"role": "system", "content": "You are a helpful assistant designated to respond to query to the best of your knowledge."},
            {"role": "user", "content": query}
        ]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model_inputs = tokenizer([text], return_tensors="pt").to(device)
    
    generated_ids = model.generate(
        model_inputs.input_ids,
        max_new_tokens=512
    )
    generated_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]
    
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

    if not preloaded:
        del tokenizer
        del model
        gc.collect()
        torch.cuda.empty_cache()
    
    return response

#### Audio Response

In [28]:
def audio_response(query_id:str, text: str) -> str:
    """
    Converts input text to speech with Bark and saves it to a .wav file.

    The text is split into sentences, each sentence is synthesized separately
    and the pieces are joined with a quarter second of silence after each one.

    Args:
    query_id: Unique ID for user query; the file is named "<query_id>_A.wav"
    text: text to be converted to speech

    Returns:
    Name of the file with the generated audio

    Raises:
    ValueError: if text is empty or contains no sentence to synthesize
    """
    # Fail early with a clear message instead of an obscure concatenate error
    # after the models have already been exercised.
    if not text or not text.strip():
        raise ValueError("Cannot synthesize speech from empty text.")

    # The CUDA bookkeeping calls are only meaningful when a GPU is present.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()

    GEN_TEMP = 0.6
    SPEAKER = "v2/en_speaker_9"
    silence = np.zeros(int(0.25 * SAMPLE_RATE))  # quarter second of silence

    sentences = nltk.sent_tokenize(text)
    if not sentences:
        raise ValueError("Cannot synthesize speech from empty text.")

    pieces = []
    for sentence in sentences:
        semantic_tokens = generate_text_semantic(
            sentence,
            history_prompt=SPEAKER,
            temp=GEN_TEMP,
            min_eos_p=0.05,  # this controls how likely the generation is to end
        )

        audio_array = semantic_to_waveform(semantic_tokens, history_prompt=SPEAKER,)
        pieces += [audio_array, silence.copy()]

    waveform = np.concatenate(pieces)

    # display.Audio is used here only as a WAV encoder: its `data` attribute
    # holds the encoded bytes, which are written to disk unchanged.
    audio = display.Audio(waveform, rate=SAMPLE_RATE)
    filename = query_id + "_A.wav"
    with open(filename, "wb") as file:
        file.write(audio.data)
    return filename

## Pipeline

In [29]:
class VoiceAssistant:
    def __init__(self, preload:list|None=None):
        self.preload = preload
        if not self.preload:
            self.preloaded_stt = None
            self.preloaded_tgen = None
        else:
            self.preload = [mode.lower() for mode in self.preload]
            if 'transcribe' in self.preload():
                self.processor = WhisperProcessor.from_pretrained("openai/whisper-medium")
                self.stt_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-medium")
                self.stt_model.config.forced_decoder_ids = None
                self.preloaded_stt = (self.processor, self.stt_model)
            else:
                self.preloaded_stt = None

            if 'response' in self.preload():
                self.response_model = AutoModelForCausalLM.from_pretrained(
                    "Qwen/Qwen1.5-4B-Chat",
                    torch_dtype="auto",
                    device_map="auto"
                )
                self.tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-4B-Chat")
                self.preloaded_tgen = (self.response_model, self.tokenizer)
            else:
                self.preloaded_tgen = None

    def run(self, query_id):
        self.recorded_file = record_query(query_id)
        print("Transcribing the audio...")
        self.transcription = read_query(self.recorded_file, preloaded=self.preloaded_stt)
        print("Generating response...")
        self.response = generate_response(self.transcription, preloaded=self.preloaded_tgen)
        print("Storing results...")
        self.target_file = audio_response(query_id, self.response)
        print("Done. The output is stored in", self.target_file)
        return #self.target_file

In [30]:
# Run one complete interaction with no models preloaded; the audio files
# produced are named after this query id.
query_id = "test01"
AVA = VoiceAssistant()
AVA.run(query_id)

Recording...
Done
Transcribing the audio...


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Generating response...


Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  4.90it/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Storing results...


100%|██████████████████████████████████████████████████████████████████████████████████| 75/75 [00:02<00:00, 32.14it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:06<00:00,  1.68s/it]
100%|████████████████████████████████████████████████████████████████████████████████| 605/605 [00:20<00:00, 29.26it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 31/31 [00:53<00:00,  1.73s/it]
100%|████████████████████████████████████████████████████████████████████████████████| 632/632 [00:18<00:00, 33.95it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 32/32 [3:42:11<00:00, 416.62s/it]
100%|████████████████████████████████████████████████████████████████████████████████| 636/636 [00:37<00:00, 17.13it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 32/32 [00:44<00:00,  1.38s/it]


Done. The output is stored in test01_A.wav
