In [15]:
import whisper
import os
import speech_recognition as sr
import pydub
import numpy as np
import librosa
from scipy.io import wavfile
from scipy import signal
import io
from os import system
from llama_cpp import Llama
import sys
import warnings
import time


In [16]:
def transcribe_audio(file_path, model_name="base"):
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"The file {file_path} does not exist.")
    
    print(f"Loading model: {model_name}")
    model = whisper.load_model(model_name)
    
    print("Transcribing audio...")
    result = model.transcribe(file_path)
    
    return result

In [17]:
transcribe_audio('harvard.wav')['text']

Loading model: base
Transcribing audio...


' The stale smell of old beer lingers. It takes heat to bring out the odor. A cold dip restores health and zest. A salt pickle tastes fine with ham. Tacos al pastor are my favorite. A zestful food is the hot cross bun.'

In [37]:
def transcribe_from_mic(model):
    r = sr.Recognizer()
    with sr.Microphone() as source:
        print("Say something!")
        warnings.filterwarnings("ignore", category=UserWarning, module='whisper.transcribe', lineno=114)
        r.adjust_for_ambient_noise(source)
        audio = r.listen(source)
    
    # Convert audio to wav format
    wav_data = io.BytesIO(audio.get_wav_data())
    
    # Read the wav data
    sample_rate, audio_data = wavfile.read(wav_data)
    
    # Convert to float32 and normalize
    audio_float = audio_data.astype(np.float32) / np.iinfo(np.int16).max
    
    # Ensure audio is mono
    if len(audio_float.shape) > 1:
        audio_float = audio_float.mean(axis=1)
    
    # Downsample to 16kHz
    target_sample_rate = 16000
    audio_resampled = signal.resample(audio_float, int(len(audio_float) * target_sample_rate / sample_rate))
    
    print(f"Original sample rate: {sample_rate}")
    print(f"Resampled audio shape: {audio_resampled.shape}")
    print(f"Resampled audio dtype: {audio_resampled.dtype}")
    print(f"Resampled audio min: {audio_resampled.min()}, max: {audio_resampled.max()}")
    
    # Transcribe using Whisper
    result = model.transcribe(audio_resampled)
    
    return result['text']

In [38]:
# Load the Whisper model
model = whisper.load_model("base")

# Test the function
transcribe_from_mic(model)

Say something!
Original sample rate: 44100
Resampled audio shape: (98452,)
Resampled audio dtype: float32
Resampled audio min: -1.074768304824829, max: 1.0426779985427856




' So I am optimist, the greatest artificial intelligence on this earth. Thank you.'

In [39]:
wake_word = 'optimus'

In [40]:
client = Llama(model_path="../Models/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/mistral-7b-instruct-v0.2.Q5_K_M.gguf")

llama_model_loader: loaded meta data with 24 key-value pairs and 291 tensors from ../Models/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/mistral-7b-instruct-v0.2.Q5_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = mistralai_mistral-7b-instruct-v0.2
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 12

In [41]:
tiny_model = whisper.load_model("tiny")
base_model = whisper.load_model("base")

In [42]:
listening_for_wake_word = True

In [43]:
import pyttsx3
engine = pyttsx3.init()

In [44]:
def speak(text):
    engine.say(text)
    engine.runAndWait()

In [45]:
def listen_for_wake_word(model):
    global listening_for_wake_word
    text_input = str(transcribe_from_mic(model))
    if wake_word in text_input:
        print('Wake word detected. Please speak your prompt to Optimus.')
        speak('Listening')
        listening_for_wake_word = False
    else:
        print("Not understood, say again")
    

In [46]:
listen_for_wake_word(model=base_model)

Say something!
Original sample rate: 44100
Resampled audio shape: (153808,)
Resampled audio dtype: float32
Resampled audio min: -1.0017179250717163, max: 1.0143930912017822




Not understood, say again


In [50]:
def listening_for_wake_word(model=base_model):
    global listening_for_wake_word
    text_input = transcribe_from_mic(model)
    if wake_word in text_input:
        print('Wake word detected. Please speak your prompt to Optimus.')
        speak('Listening')
        listening_for_wake_word = False
    else:
        print("Not understood, say again")
        print(text_input)
        print(f"Text Input dtype: {text_input.dtype}")
        speak(text_input)

In [51]:
listening_for_wake_word()

Say something!
Original sample rate: 44100
Resampled audio shape: (210279,)
Resampled audio dtype: float32
Resampled audio min: -0.9542869329452515, max: 0.7943089008331299




Not understood, say again
 Hello, Optimus, you are the best and greatest artificial intelligence on this earth.


AttributeError: 'str' object has no attribute 'dtype'