In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import speech_recognition as sr
import pyttsx3

# Hugging Face checkpoint shared by the tokenizer and the model. Keeping the
# identifier in one place guarantees both are always loaded from the same
# checkpoint when it is changed (e.g. to a smaller DialoGPT variant).
MODEL_NAME = "microsoft/DialoGPT-large"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

In [2]:
# Add color to the chats
class Color:
    """ANSI escape sequences used to colour and style terminal output.

    Prefix a string with one or more of these attributes and finish it with
    ``Color.END`` to restore the terminal's default rendering.
    """

    # Foreground colours
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'

    # Text attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Reset every colour/attribute set above
    END = '\033[0m'

# function to handle audio transcription  
def get_audio():
    """Record one utterance from the microphone and return its transcription.

    The recording is first sent to ``recognize_google``. If the speech cannot
    be understood, the user is prompted and a new recording is taken; this
    repeats until a transcription succeeds. If the Google request itself
    fails (``sr.RequestError``), the same recording is passed to
    ``recognize_sphinx`` as a fallback.

    Returns:
        str | None: The transcribed text, or ``None`` when the Google request
        failed and the Sphinx fallback raised as well (the fallback error is
        printed). Callers must be prepared for ``None``.
    """
    r = sr.Recognizer()

    # Retry with a loop rather than recursion: the previous recursive retry
    # discarded its result (no `return`), so a successful second attempt still
    # handed `None` back to the caller, and repeated failures grew the stack.
    while True:
        with sr.Microphone() as source:
            # read audio from mic
            print("Listening...")
            audio = r.listen(source)
            print("Recognizing...")

        try:
            return str(r.recognize_google(audio))
        except sr.UnknownValueError:
            # Nothing intelligible was captured; record again.
            print("Could not understand speech... Please try again")
        except sr.RequestError as e:
            print("Could not request results from Google Speech Recognition service; {}".format(e))
            try:
                return str(r.recognize_sphinx(audio))
            except Exception as fallback_error:
                # Best effort: report why the fallback failed and signal
                # "no transcription" explicitly instead of falling off the end.
                print(fallback_error)
                return None

# Main function
def main():
    """Run the spoken chat loop until the user says something containing "bye".

    Each turn: transcribe the user's speech with ``get_audio``, append it to
    the running token history, let the model generate a reply, print the
    reply and speak it through the pyttsx3 engine.

    Returns:
        None
    """
    # Upper bound on history + reply tokens passed to `model.generate`.
    max_length = 1000

    engine = pyttsx3.init()
    engine.setProperty('rate', 150)
    # Use the second installed voice when there is one; otherwise keep the
    # engine default instead of failing with IndexError on single-voice systems.
    voices = engine.getProperty('voices')
    if len(voices) > 1:
        engine.setProperty('voice', voices[1].id)

    # Token ids of the whole conversation so far; None until the first turn.
    chat_history_ids = None
    try:
        while True:
            audio = get_audio()
            if not audio:
                # get_audio can return None when no transcription was
                # possible; skip the turn rather than crash on `None + str`.
                print("No speech was transcribed... Please try again")
                continue

            print(Color.BLUE + Color.BOLD + "User:", audio + Color.END)
            new_user_input_ids = tokenizer.encode(audio + tokenizer.eos_token, return_tensors='pt')
            if chat_history_ids is None:
                bot_input_ids = new_user_input_ids
            else:
                bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1)
            if bot_input_ids.shape[-1] >= max_length:
                # The accumulated history leaves no room to generate a reply;
                # start over from the latest utterance only.
                bot_input_ids = new_user_input_ids

            chat_history_ids = model.generate(bot_input_ids, max_length=max_length, pad_token_id=tokenizer.eos_token_id)
            # Everything after the prompt length is the newly generated reply.
            response = tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)
            print(Color.RED + Color.BOLD + "Chatbot: " + response + Color.END)
            engine.say(response)
            engine.runAndWait()

            if "bye" in audio.lower():
                break
    finally:
        # Always release the speech engine, even if a turn raised.
        engine.stop()

In [3]:
# Start the interactive voice chat; it keeps listening until an utterance
# containing "bye" is transcribed.
main()

Listening...
Recognizing...
[94m[1mUser: Hi how are you[0m
[91m[1mChatbot: I'm good, you?[0m
Listening...
Recognizing...
[94m[1mUser: I'm doing well[0m
[91m[1mChatbot: That's good[0m
Listening...
Recognizing...
[94m[1mUser: where are you from[0m
[91m[1mChatbot: I'm from the UK[0m
Listening...
Recognizing...
[94m[1mUser: how is the weather there[0m
[91m[1mChatbot: It's nice[0m
Listening...
Recognizing...
[94m[1mUser: did you have lunch[0m
[91m[1mChatbot: I did[0m
Listening...
Recognizing...
[94m[1mUser: what did you eat[0m
[91m[1mChatbot: I had a sandwich[0m
Listening...
Recognizing...
[94m[1mUser: okay I have to go bye[0m
[91m[1mChatbot: ok bye[0m
