In [None]:
# !pip3 install gTTS
# ! pip3 install pyaudio
# !pip3 install SpeechRecognition pydub
# !pip3 install wavio

In [1]:
from gtts import gTTS
import speech_recognition as sr

import subprocess
import utilities
import os
import re

import pyaudio
import math
import struct
import wave
import time
# import wavio

from tqdm.notebook import tqdm

In [2]:
# Root of the local checkout that holds the skills data.
# Set the VOICE_ASSISTANT_DATA_DIR environment variable to run the notebook on
# another machine; when it is unset the original author's path is used.
DATA_DIR = os.environ.get('VOICE_ASSISTANT_DATA_DIR',
                          '/Users/umariqbal/Documents/voice-assistant-central/')
# JSON file with the skills; it is indexed by persona name after loading.
SKILLS_ADDR = os.path.join(DATA_DIR, 'data/subgrouped_skills.json')

In [3]:
# Key used to select one persona's skills out of the loaded skills JSON.
PERSONA = 'Dating'

In [4]:
# Sound-activated recording, adapted from:
# https://stackoverflow.com/questions/18406570/python-record-audio-on-detected-sound

# Loudness cut-off compared against Recorder.rms() output
# (RMS of the normalised samples, multiplied by 1000).
Threshold = 10

# Scale factor applied to each signed 16-bit sample before squaring in rms().
SHORT_NORMALIZE = (1.0/32768.0)
# Number of frames requested from the stream per read (also frames_per_buffer).
chunk = 1024
# Sample format handed to PyAudio when opening the stream and sizing the WAV samples.
FORMAT = pyaudio.paInt16
# Channel count used for both the input stream and the WAV header.
CHANNELS = 1
# Sample rate in Hz used for both the input stream and the WAV header.
RATE = 16000
# Bytes per sample; rms() divides the frame length by this to count samples.
swidth = 2

# Seconds of below-threshold input after which Recorder.record() stops.
TIMEOUT_LENGTH = 5

class Recorder:
    """Sound-activated microphone recorder built on a PyAudio stream.

    ``listen`` polls the microphone; once a chunk's level reaches ``Threshold``
    it hands over to ``record``, which captures audio until the input has been
    quiet for ``TIMEOUT_LENGTH`` seconds (or ``MAX_RECORD_SECONDS`` have
    elapsed) and then saves the capture as a WAV file.

    The instance owns an open audio stream. Call ``close()`` (or use the
    instance in a ``with`` block) when finished; the stream is also released
    when the instance is deleted or garbage-collected.
    """

    # Hard cap, in seconds, on the length of a single recording.
    MAX_RECORD_SECONDS = 30
    # ``listen`` gives up once more than this many consecutive chunks are quiet.
    MAX_QUIET_CHUNKS = 20

    @staticmethod
    def rms(frame):
        """Return the RMS level of a frame of signed 16-bit samples, times 1000.

        Args:
            frame: Raw bytes as returned by ``stream.read``; its length must be
                a multiple of ``swidth``.

        Returns:
            float: ``sqrt(mean(sample_normalised ** 2)) * 1000``, or ``0.0``
            for an empty frame.
        """
        count = len(frame) // swidth
        if count == 0:
            # No samples means no energy; also avoids dividing by zero below.
            return 0.0

        shorts = struct.unpack("%dh" % count, frame)

        sum_squares = 0.0
        for sample in shorts:
            n = sample * SHORT_NORMALIZE
            sum_squares += n * n
        rms = math.pow(sum_squares / count, 0.5)

        return rms * 1000

    def __init__(self):
        """Initialise PyAudio and open the stream used for all reads."""
        # Defined up front so close()/__del__ work even if opening fails.
        self.p = None
        self.stream = None

        self.p = pyaudio.PyAudio()
        try:
            # NOTE(review): output=True is kept from the original although the
            # stream is only ever read from in this class — confirm it is needed.
            self.stream = self.p.open(format=FORMAT,
                                      channels=CHANNELS,
                                      rate=RATE,
                                      input=True,
                                      output=True,
                                      frames_per_buffer=chunk)
        except Exception:
            # Do not leave PyAudio initialised when the stream cannot be opened.
            self.close()
            raise

    def close(self):
        """Stop and close the stream and terminate PyAudio.

        Safe to call more than once. After closing, ``record``, ``listen`` and
        ``write`` can no longer be used on this instance.
        """
        stream = getattr(self, 'stream', None)
        self.stream = None
        if stream is not None:
            try:
                stream.stop_stream()
            finally:
                stream.close()

        audio = getattr(self, 'p', None)
        self.p = None
        if audio is not None:
            audio.terminate()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    def __del__(self):
        # Makes ``del recorder`` release the audio device. Errors are ignored
        # here because raising from a finaliser only produces warning noise.
        try:
            self.close()
        except Exception:
            pass

    def record(self, last_recording):
        """Capture audio until the input goes quiet, then save it.

        Recording continues while chunks keep reaching ``Threshold``; it stops
        once ``TIMEOUT_LENGTH`` seconds pass without such a chunk, or as soon
        as ``MAX_RECORD_SECONDS`` have elapsed. The capture is always written
        to ``last_recording`` before returning.

        Args:
            last_recording: Path of the WAV file to (over)write.

        Returns:
            bool: True if the recording was cut off by the hard time cap.
        """
        print('Noise detected, recording now')
        frames = []
        now = time.time()
        quiet_deadline = now + TIMEOUT_LENGTH
        hard_deadline = now + self.MAX_RECORD_SECONDS
        force_stop = False

        while now <= quiet_deadline:
            data = self.stream.read(chunk)

            # Any loud chunk pushes the quiet deadline out again.
            if self.rms(data) >= Threshold:
                quiet_deadline = time.time() + TIMEOUT_LENGTH

            now = time.time()
            frames.append(data)

            if now > hard_deadline:
                force_stop = True
                break

        self.write(b''.join(frames), last_recording)

        return force_stop

    def write(self, recording, last_recording):
        """Write raw frames to ``last_recording`` as a WAV file.

        Args:
            recording: Concatenated raw frame bytes.
            last_recording: Destination path; an existing file is replaced.
        """
        # The context manager closes the file even if a setter or write fails.
        with wave.open(last_recording, 'wb') as wf:
            wf.setnchannels(CHANNELS)
            wf.setsampwidth(self.p.get_sample_size(FORMAT))
            wf.setframerate(RATE)
            wf.writeframes(recording)
        print('Written to file: {}'.format(last_recording))
        print('Listening again')

    def listen(self, last_recording):
        """Wait for sound, record it, and return when the input stays quiet.

        Each chunk at or above ``Threshold`` triggers ``record``. The loop ends
        when a recording hits the hard time cap, or when more than
        ``MAX_QUIET_CHUNKS`` consecutive chunks are below ``Threshold``. If no
        chunk ever reaches the threshold, nothing is written.

        Args:
            last_recording: Path passed through to ``record``.

        Returns:
            bool: True if the last recording was cut off by the time cap,
            False if listening ended because of silence.
        """
        quiet_chunks = 0
        force_stop = False

        while True:
            data = self.stream.read(chunk)

            if self.rms(data) >= Threshold:
                quiet_chunks = 0
                force_stop = self.record(last_recording)

                if force_stop:
                    break
            else:
                quiet_chunks += 1
                if quiet_chunks > self.MAX_QUIET_CHUNKS:
                    break

        return force_stop

In [5]:
def get_utterances(skills_json, no_skills_to_interact = 50):
    """Build the list of phrases to speak for each skill.

    For every skill the non-blank sample invocation utterances are taken first
    (curly double quotes and periods removed). Up to five further phrases are
    then pulled from double-quoted passages in the skill description, provided
    they start with "alexa", contain more than one space, carry no
    placeholder-style characters, and are not already in the list.

    Args:
        skills_json: Mapping of skill id to a dict with the keys
            'Sample_Invocation_Utterances' and 'Skill_description'.
        no_skills_to_interact: Processing stops once this many skills have
            been handled.

    Returns:
        dict: skill id -> list of utterance strings, in processing order.
    """
    wake_word = 'alexa'
    placeholder_chars = ('*', '#', '(', ')', '[', ']', '{', '}', '<', '>')
    description_quota = 5

    collected = {}

    for handled, skill in enumerate(skills_json, start=1):
        entry = skills_json[skill]
        phrases = []
        collected[skill] = phrases

        # Sample utterances: drop blanks, strip curly quotes and periods.
        for raw in entry['Sample_Invocation_Utterances']:
            if raw.strip() != '':
                cleaned = raw
                for unwanted in ('”', '“', '.'):
                    cleaned = cleaned.replace(unwanted, '')
                phrases.append(cleaned.strip())

        # Normalise typographic quotes so quoted passages can be matched.
        description = entry['Skill_description']
        for curly, plain in (('”', '"'), ('“', '"'), ('‘', ''), ('’', '')):
            description = description.replace(curly, plain)

        taken = 0
        for quoted in re.findall(r'"([^"]*)"', description):
            trimmed = quoted.strip()

            if not quoted.lower().startswith(wake_word):
                continue
            if len(trimmed) <= len(wake_word):
                continue
            if any(symbol in quoted for symbol in placeholder_chars):
                continue

            candidate = quoted.replace('.', '').strip()
            if trimmed.count(' ') <= 1 or candidate in phrases:
                continue

            phrases.append(candidate)

            # At most five phrases are taken from the description.
            taken += 1
            if taken >= description_quota:
                break

        if handled >= no_skills_to_interact:
            break

    return collected


def get_last_response(last_recording):
    """Transcribe a recorded audio file with the Google Web Speech recogniser.

    This is deliberately best-effort: a missing or unreadable file,
    unintelligible audio, or a failed request all yield an empty string
    rather than an exception.

    Args:
        last_recording: Path of the audio file to transcribe.

    Returns:
        str: The stripped transcript, or '' when nothing could be transcribed.
    """
    speech_to_text = sr.Recognizer()

    try:
        with sr.AudioFile(last_recording) as source:
            # Load the whole file into memory.
            audio_data = speech_to_text.record(source)
            # Convert speech to text.
            result = speech_to_text.recognize_google(audio_data, language = "en")
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that interrupting the
        # kernel (KeyboardInterrupt) is not silently swallowed.
        return ''

    # By default recognize_google returns the transcript as a plain string.
    # The original code indexed that string like a dict, which always raised
    # and was masked by the bare except. A dict payload is still unpacked in
    # case the raw response is returned.
    if isinstance(result, dict):
        try:
            result = result['alternative'][0]['transcript']
        except (KeyError, IndexError, TypeError):
            return ''

    if not isinstance(result, str):
        return ''

    return result.strip()
    

def play_utterances(skills_utterances):
    """Speak every utterance aloud and record what is heard in response.

    For each utterance the text is synthesised with gTTS, played with
    ``afplay``, and the microphone is then monitored with a fresh ``Recorder``.
    "alexa-stop.wav" is played when the recording hit its time cap, or when
    the new transcript equals the (non-empty) transcript of the recording
    file as it was before listening.

    Args:
        skills_utterances: Mapping of skill id to a list of utterance strings.

    Returns:
        None. Side effects: writes "current-utterance.wav", may overwrite
        "last-recording.wav", plays audio, prints transcripts, sleeps two
        seconds per utterance.
    """
    utterance_audio = "current-utterance.wav"
    response_audio = "last-recording.wav"
    stop_audio = "alexa-stop.wav"

    total_utterances = sum(len(utterances) for utterances in skills_utterances.values())

    # The context manager closes the progress bar even when playback or
    # recording raises, or the run is interrupted.
    with tqdm(total=total_utterances, position=0, leave=True) as pbar:
        for utterances in skills_utterances.values():
            for utterance in utterances:
                if not utterance.strip():
                    # Nothing to synthesise; still count it so the bar completes.
                    pbar.update(1)
                    continue

                # NOTE(review): gTTS output is saved under a .wav name; the
                # actual container format is whatever gTTS writes — confirm.
                utterance_wav = gTTS(text=utterance, lang='en', slow=False)
                utterance_wav.save(utterance_audio)
                subprocess.call(["afplay", utterance_audio])

                # Transcript of the previous recording, read before it is overwritten.
                last_response = get_last_response(response_audio)

                # Listen for the reply, then release the recorder.
                record_response = Recorder()
                force_stop = record_response.listen(response_audio)
                del record_response

                if force_stop:
                    subprocess.call(["afplay", stop_audio])
                else:
                    current_response = get_last_response(response_audio)
                    print('LAST:', last_response)
                    print('CURRENT:', current_response)

                    # Identical consecutive transcripts are answered with a stop command.
                    if last_response != '' and last_response == current_response:
                        subprocess.call(["afplay", stop_audio])

                time.sleep(2)
                pbar.update(1)

In [6]:
# Load the skills file and keep only the entry for the selected persona.
all_skills = utilities.read_json(SKILLS_ADDR)[PERSONA]

In [15]:
# Collect utterances from at most 10 skills, then speak each one and record
# the reply. This drives audio playback via `afplay` and reads the microphone.
skills_utterances = get_utterances(all_skills, no_skills_to_interact=10) 
play_utterances(skills_utterances)

  0%|          | 0/27 [00:00<?, ?it/s]

who's your date night idea that is under $5 find a random way to serve people smile at strangers pick up trash find an elderly neighbor who needs help with yard work and with their permission do it Etc
Noise detected, recording now
Written to file: last-recording.wav
Listening again
you make me want to settle down
LAST: who's your date night idea that is under $5 find a random way to serve people smile at strangers pick up trash find an elderly neighbor who needs help with yard work and with their permission do it Etc
CURRENT: you make me want to settle down
you make me want to settle down
Noise detected, recording now
Written to file: last-recording.wav
Listening again
is that an eggplant in your pocket or are you just happy to see me
LAST: you make me want to settle down
CURRENT: is that an eggplant in your pocket or are you just happy to see me
is that an eggplant in your pocket or are you just happy to see me
Noise detected, recording now
Written to file: last-recording.wav
Listeni

Noise detected, recording now
Written to file: last-recording.wav
Listening again
today's tip hey cutie yeah I'm talking to you get your head out of your phone and look around starting today and through the rest of this month challenge yourself to be more open and available to communicate with anyone anytime anywhere once you put down your phone the world becomes a singles bar
LAST: here we go hey cutie yeah I'm talking to you get your head out of your phone and look around starting today and through the rest of this month challenge yourself to be more open and available to communicate with anyone anytime anywhere once you put down your phone the world becomes a singles bar
CURRENT: today's tip hey cutie yeah I'm talking to you get your head out of your phone and look around starting today and through the rest of this month challenge yourself to be more open and available to communicate with anyone anytime anywhere once you put down your phone the world becomes a singles bar


In [14]:
# count = 0
# for skill in skills_utterances:
#     for utterance in skills_utterances[skill]:
#         print(utterance)
#         count += 1
#     print('\n')
# print(count)

In [None]:
# count = 0
# for skill in all_skills:
#     skill_desc = all_skills[skill]['Skill_description'].replace('”', '"').replace('“', '"').replace('*', '')
#     more_commands = re.findall(r'"([^"]*)"', skill_desc)

#     print('\n\n')  
#     for command in all_skills[skill]['Sample_Invocation_Utterances']:
#         print(command.replace('”', '').replace('“', ''))
        
#     print('\n\n')    
    
#     for command in more_commands:
#         if 'alexa' in command.lower():
#             print(command)
    
#     print('\n\n\n\n')
# #     for attr in all_skills[skill]:
# #         print(attr)
        
# #     break

#     count += 1
    
#     if count > 0:
#         break

In [None]:
# count = 0
# for skill in all_skills:
#     print(all_skills[skill]['Name'], all_skills[skill]['Skill_link'])
#     count += 1
    
#     if count > 50:
#         break

In [None]:
# utterance_mp3 = gTTS(text='Alexa, stop', lang='en', slow=False)
# utterance_mp3.save("alexa-stop.wav")
# subprocess.call(["afplay", "alexa-stop.wav"])

In [None]:
# subprocess.call(["afplay", "last-recording.wav"])