# 1. Import libraries

In [1]:
import speech_recognition as sr
import cv2
from tkinter import NW, Tk, Canvas, PhotoImage, Button, Frame,Label
import time
from PIL import Image, ImageTk

# 2. Defining recognizer & microphone instance

In [2]:
# Shared speech-recognition objects. The recognize_* functions in this notebook
# read these module-level names directly instead of taking them as parameters.
recognizer = sr.Recognizer()
# No device index is passed here; presumably this selects the system's default
# input device -- confirm against the SpeechRecognition documentation.
microphone = sr.Microphone()

# 3. Creating text in canvas

In [3]:
def set_text_canvas(message):
    """Replace everything drawn on the global ``text_canvas`` with ``message``.

    The canvas is wiped first, so repeated calls never stack text on top of
    each other. The text is drawn in black at canvas coordinates (320, 20).

    Args:
        message: The text to display.
    """
    anchor_point = (320, 20)
    appearance = {"fill": "#000", "font": 'Helvetica 15 bold'}

    # Clear first: every item on the canvas is removed, not just earlier text.
    text_canvas.delete("all")
    text_canvas.create_text(*anchor_point, text=message, **appearance)

In [4]:
def set_panel_canvas(message):
    """Show ``message`` as the banner text on the global ``panel_canvas``.

    The banner is drawn in black at canvas coordinates (320, 50). Any banner
    drawn by an earlier call is removed first so that repeated calls replace
    the text instead of painting several messages on top of each other.
    Only the item created by this function is deleted; anything else drawn
    on the canvas is left untouched.

    Args:
        message: The text to display.
    """
    banner_tag = "panel_message"

    # Delete by tag rather than "all" so unrelated canvas items survive.
    panel_canvas.delete(banner_tag)
    panel_canvas.create_text(
        320, 50,
        text=message,
        fill="#000",
        font=('Helvetica 15 bold'),
        tags=banner_tag,
    )

# 4. Convert image to photoImage

In [5]:
def get_photo_image(image):
    """Convert an OpenCV image into a Tk-compatible ``PhotoImage``.

    The image is converted from BGR to RGB channel order, wrapped in a PIL
    image, resized to 170x250 pixels and handed to ``ImageTk.PhotoImage``.

    Args:
        image: Image array as returned by ``cv2.imread`` (assumed to be in
            BGR channel order, since it is converted with COLOR_BGR2RGB).

    Returns:
        The ``ImageTk.PhotoImage`` built from the resized image. The caller
        must keep a reference to it for as long as it is displayed.

    Raises:
        ValueError: If ``image`` is None, which is what ``cv2.imread`` returns
            when a file cannot be read.
    """
    if image is None:
        raise ValueError("image is None (cv2.imread could not read the file?)")

    cv2image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    img = Image.fromarray(cv2image)
    # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter
    # under its current name and is also available in older Pillow releases.
    resized_image = img.resize((170, 250), Image.LANCZOS)
    imgtk = ImageTk.PhotoImage(image=resized_image)
    return imgtk

# 5. Fetch gesture from dataset to preview

In [6]:
def get_gesture_image(endpoint):
    """Show the gesture image named ``endpoint`` in an OpenCV window.

    The image ``../data/DataForSpeech/<endpoint>.jpg`` is loaded, resized to
    300x300, labelled with ``endpoint`` and displayed in a window titled
    "Gesture". The call blocks until the user presses ``q`` or closes the
    window, then destroys all OpenCV windows.

    Args:
        endpoint: File name (without extension) of the gesture image; also
            used as the caption drawn on the image.

    Raises:
        FileNotFoundError: If no image can be read from the computed path.
    """
    # Forward slashes are accepted on Windows as well as POSIX systems.
    path = "../data/DataForSpeech/" + endpoint + ".jpg"
    font = cv2.FONT_HERSHEY_SIMPLEX
    org = (50, 50)
    fontScale = 1
    color = (255, 255, 255)
    thickness = 2

    # cv2.imread signals failure by returning None instead of raising.
    img = cv2.imread(path)
    if img is None:
        raise FileNotFoundError("No gesture image could be read from " + path)

    # The picture never changes while it is displayed, so prepare it once
    # instead of re-reading the file on every loop iteration.
    image = cv2.resize(img, (300, 300))
    cv2.putText(image, endpoint, org, font, fontScale, color, thickness, cv2.LINE_AA)

    try:
        cv2.imshow('Gesture', image)
        while True:
            # Poll with a short timeout: an indefinite wait would never reach
            # the visibility check once the window is closed with the mouse.
            code = cv2.waitKey(50)
            if (code & 0xFF) == ord('q'):
                break

            try:
                visible = cv2.getWindowProperty("Gesture", cv2.WND_PROP_VISIBLE)
            except cv2.error:
                # Querying a window that no longer exists can raise.
                break
            if visible < 1:
                break
    finally:
        cv2.destroyAllWindows()

# 6.1. Recognize speech from microphone: method 1

In [7]:
def recognize_speech_from_microphone():
    """Record one phrase from the global microphone and transcribe it.

    The global ``recognizer`` and ``microphone`` are type-checked, the
    recognizer is calibrated against ambient noise, one phrase is recorded
    and then sent to ``recognizer.recognize_google``.

    Returns:
        dict with three keys:
            "success": False only when the recognition request itself failed
                (``sr.RequestError``); True otherwise.
            "error": None on success, otherwise a short description.
            "transcription": the recognized text, or None if there is none.

    Raises:
        TypeError: If ``recognizer`` or ``microphone`` is not an instance of
            the expected ``speech_recognition`` class.
    """
    if not isinstance(recognizer, sr.Recognizer):
        raise TypeError("`recognizer` must be `Recognizer` instance")
    if not isinstance(microphone, sr.Microphone):
        raise TypeError("`microphone` must be `Microphone` instance")

    with microphone as mic_input:
        recognizer.adjust_for_ambient_noise(mic_input)
        captured = recognizer.listen(mic_input)

    try:
        transcript = recognizer.recognize_google(captured)
    except sr.RequestError:
        # The request to the recognition service failed.
        return {"success": False, "error": "API unavailable", "transcription": None}
    except sr.UnknownValueError:
        # The request went through but the audio was unintelligible, so
        # "success" stays True and only "error" is filled in.
        return {"success": True, "error": "Unable to recognize speech", "transcription": None}

    return {"success": True, "error": None, "transcription": transcript}

# 6.2. Recognize speech from microphone: method 2

In [8]:
def recognize_speech():
    """Record one phrase from the global microphone and transcribe it.

    The global ``recognizer`` is calibrated against ambient noise, one phrase
    is recorded from the global ``microphone`` and the audio is sent to
    ``recognizer.recognize_google``. Recognition failures never propagate;
    they are reported through the returned dict.

    Returns:
        dict with three keys:
            "success": True if a transcription was obtained, else False.
            "error": None on success, otherwise a short description of what
                went wrong.
            "transcription": the recognized text, or None on failure.
    """
    with microphone as source:
        recognizer.adjust_for_ambient_noise(source)
        audio = recognizer.listen(source)

    response = {
        "success": True,
        "error": None,
        "transcription": None
    }

    try:
        response["transcription"] = recognizer.recognize_google(audio)
    except sr.RequestError:
        # The recognition request failed; this is not a problem with what
        # the user said, so do not report it as unintelligible speech.
        response["success"] = False
        response["error"] = "API unavailable"
    except sr.UnknownValueError:
        response["success"] = False
        response["error"] = "Unable to recognize speech"
    except Exception as exc:
        # Keep the best-effort contract (never raise from recognition), but
        # say what actually happened instead of hiding it.
        response["success"] = False
        response["error"] = "Unexpected error: {}: {}".format(type(exc).__name__, exc)

    return response

# 7. Speech Recognition

In [9]:
# --- Window and shared resources ---------------------------------------------
root = Tk()
root.title("Speech Recognition")
root.geometry("640x500")

# NOTE(review): nothing in this cell reads frames from this capture device.
# It is still opened (and released at the end) to keep the cell's original
# side effects; consider removing it if the camera is not needed.
cam = cv2.VideoCapture(0)

# Forward slashes are accepted on Windows as well as POSIX systems.
micImagePath = '../assets/microphone.jpg'


# --- Button callbacks --------------------------------------------------------
def close():
    """Destroy the main window, which also ends ``root.mainloop()``."""
    root.destroy()


def start():
    """Run one listen -> transcribe -> show-gesture cycle (START callback).

    The status line is updated before each blocking step. On a successful
    transcription the matching gesture image is shown; otherwise the error
    reported by ``recognize_speech`` (or a generic prompt) is displayed.
    """
    set_text_canvas("PLEASE SAY SOMETHING!")
    # recognize_speech() blocks the Tk event loop, so force a redraw now;
    # otherwise the prompt would only be painted after listening finished.
    root.update_idletasks()

    response = recognize_speech()
    transcription = response["transcription"]

    if transcription:
        set_text_canvas("YOU SAID: {}".format(transcription).upper())
        # get_gesture_image() blocks as well, so paint the text first.
        root.update_idletasks()
        try:
            get_gesture_image(transcription.upper())
        except (FileNotFoundError, cv2.error):
            # There is no picture for this phrase (or it could not be shown).
            set_text_canvas("NO GESTURE FOUND FOR: {}".format(transcription).upper())
        return

    if response["error"]:
        set_text_canvas("ERROR: {}".format(response["error"]))
    else:
        set_text_canvas("I DIDN'T CATCH THAT. WHAT DID YOU SAY?")


# --- Layout ------------------------------------------------------------------
panelFrame = Frame(root)
panelFrame.pack(side="top")
panelFrame.configure(bg="white")

panel_canvas = Canvas(panelFrame, width=640, height=100)
panel_canvas.pack(side="top")

text_canvas = Canvas(panelFrame, width=640, height=100)
text_canvas.pack(side="bottom", pady=10)

panel = Label(panelFrame, width = 640, height = 200)
panel.pack(side="bottom", padx=10)

set_panel_canvas("USE YOUR MICROPHONE TO TELL SOMETHING!")

microPhoneImage = cv2.imread(micImagePath)
if microPhoneImage is None:
    # cv2.imread returns None when the file cannot be read. Fall back to a
    # text label; width/height are reset to 0 so the label sizes itself to
    # the text instead of using the dimensions chosen for the image.
    panel.configure(text="MICROPHONE IMAGE NOT FOUND", width=0, height=0)
else:
    imgtk = get_photo_image(microPhoneImage)
    # Keep a reference on the widget so the image is not garbage collected.
    panel.imgtk = imgtk
    panel.configure(image=imgtk)

set_text_canvas("CLICK TO START. LET'S BEGIN !")

frame = Frame(root)
frame.pack(side="bottom")

cancel = Button(frame, text="QUIT", command=close, bg = "#C0392B", relief = "groove", fg = "#fff", bd = 0, width = 20, font=('Helvetica 10 bold'))
cancel.pack(side="left",fill="both", expand="no", padx="10", pady="10")

# Named start_button so the widget does not rebind the name of the start()
# callback defined above.
start_button = Button(frame, text="START", command=start, bg = "#76D7C4", relief = "groove", fg = "#fff", bd = 0, width = 80, font=('Helvetica 10 bold'))
start_button.pack(side="right", fill="both", expand="no", padx="10", pady="10")

# --- Main loop ---------------------------------------------------------------
try:
    root.mainloop()
finally:
    # Release the camera and close OpenCV windows even if the loop raised.
    cam.release()
    cv2.destroyAllWindows()