In [None]:
from gtts import gTTS
import speech_recognition as sr

import subprocess
import utilities
import os
import re

import pyaudio
import math
import struct
import wave
import time

from tqdm.notebook import tqdm

In [None]:
# Directory that holds the input data for this notebook.
DATA_DIR = "DATA_DIR"

# Path of the skills JSON file that is loaded further down.
SKILLS_ADDR = os.path.join(DATA_DIR, "subgrouped_skills.json")

In [None]:
# Key used to select one entry from the loaded skills JSON.
PERSONA = "PERSONA_NAME"

### Listen and record Alexa responses

In [None]:
# Loudness (on the scale returned by Recorder.rms) at or above which a chunk
# counts as sound.
Threshold = 15

# Scale factor applied to each signed 16-bit sample before squaring.
SHORT_NORMALIZE = 1.0 / 32768.0

chunk = 1024              # frames requested from the stream per read
FORMAT = pyaudio.paInt16  # sample format handed to PyAudio and the WAV writer
CHANNELS = 1              # number of audio channels
RATE = 16000              # sampling rate passed to PyAudio and the WAV writer
swidth = 2                # bytes per sample, used when unpacking frames

# Seconds without a loud chunk after which a recording ends.
TIMEOUT_LENGTH = 5

class Recorder:
    """Listen on the default audio input and save sound bursts as WAV files.

    A PyAudio stream is opened on construction. `listen` waits for a chunk
    whose loudness reaches `Threshold`, then `record` captures audio until the
    input has been quiet for `TIMEOUT_LENGTH` seconds or the hard cap is hit.
    Each capture is written to `<file_name>-1.wav` or `<file_name>-2.wav`.

    Call `close()` (or use the instance in a `with` block) when done so the
    stream and the PyAudio instance are released.
    """

    # Hard cap, in seconds, on a single recording.
    MAX_RECORD_SECONDS = 30
    # listen() gives up once more than this many consecutive chunks are quiet.
    MAX_SILENT_CHUNKS = 30

    @staticmethod
    def rms(frame):
        """Return the root-mean-square level of a 16-bit PCM frame, times 1000.

        An empty frame yields 0.0 instead of dividing by zero.
        """
        count = len(frame) // swidth
        if count == 0:
            return 0.0

        shorts = struct.unpack("%dh" % count, frame)

        sum_squares = 0.0
        for sample in shorts:
            n = sample * SHORT_NORMALIZE
            sum_squares += n * n

        return math.pow(sum_squares / count, 0.5) * 1000

    @staticmethod
    def _mtime(path):
        """Return the modification time of `path`, or 0.0 if it cannot be read."""
        try:
            return os.path.getmtime(path)
        except OSError:
            return 0.0

    def __init__(self):
        self.p = pyaudio.PyAudio()
        # NOTE(review): output=True is kept from the original although this
        # class only reads from the stream -- confirm whether it is needed.
        self.stream = self.p.open(format=FORMAT,
                                  channels=CHANNELS,
                                  rate=RATE,
                                  input=True,
                                  output=True,
                                  frames_per_buffer=chunk)

    def close(self):
        """Stop and close the stream and terminate PyAudio; safe to call twice."""
        stream = getattr(self, 'stream', None)
        audio = getattr(self, 'p', None)
        # Clear the references first so a second call is a no-op.
        self.stream = None
        self.p = None
        try:
            if stream is not None:
                stream.stop_stream()
                stream.close()
        finally:
            if audio is not None:
                audio.terminate()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    def __del__(self):
        # Fallback so that `del recorder` also releases the audio device.
        try:
            self.close()
        except Exception:
            # Nothing useful can be done about a failure inside a finalizer.
            pass

    def record(self, file_name):
        """Capture audio until the input goes quiet, then write it to disk.

        Recording continues while chunks keep reaching `Threshold`; it stops
        after `TIMEOUT_LENGTH` seconds without such a chunk, or once
        `MAX_RECORD_SECONDS` have elapsed.

        Returns True when the recording was cut off by the hard cap.
        """
        print('Noise detected, recording now')
        frames = []
        now = time.time()
        quiet_deadline = now + TIMEOUT_LENGTH
        hard_deadline = now + self.MAX_RECORD_SECONDS
        force_stop = False

        while now <= quiet_deadline:
            data = self.stream.read(chunk)

            # Every loud chunk pushes the quiet deadline further out.
            if self.rms(data) >= Threshold:
                quiet_deadline = time.time() + TIMEOUT_LENGTH

            now = time.time()
            frames.append(data)

            if now > hard_deadline:
                force_stop = True
                break

        self.write(b''.join(frames), file_name)

        return force_stop

    def write(self, recording, file_name):
        """Write `recording` to whichever of the two slot files is older.

        The slots are `<file_name>-1.wav` and `<file_name>-2.wav`; a slot that
        does not exist is treated as the oldest possible.
        """
        slot_1 = file_name + '-1.wav'
        slot_2 = file_name + '-2.wav'

        if self._mtime(slot_1) > self._mtime(slot_2):
            target = slot_2
        else:
            target = slot_1

        with wave.open(target, 'wb') as wf:
            wf.setnchannels(CHANNELS)
            wf.setsampwidth(self.p.get_sample_size(FORMAT))
            wf.setframerate(RATE)
            wf.writeframes(recording)

        print('Written to file: {}'.format(target))
        print('Listening again')

    def listen(self, file_name):
        """Wait for sound and record it, until the input stays quiet.

        Each chunk that reaches `Threshold` starts a recording. The loop ends
        when a recording hits the hard cap, or after more than
        `MAX_SILENT_CHUNKS` consecutive quiet chunks.

        Returns True when the last recording was cut off by the hard cap.
        """
        silent_chunks = 0
        force_stop = False

        while True:
            data = self.stream.read(chunk)

            if self.rms(data) >= Threshold:
                silent_chunks = 0

                force_stop = self.record(file_name)
                if force_stop:
                    break
            else:
                silent_chunks += 1
                if silent_chunks > self.MAX_SILENT_CHUNKS:
                    break

        return force_stop

### Helper functions 

In [None]:
def get_responses(file_name):
    """Transcribe both recording slots of `file_name`; return the non-empty texts."""
    transcripts = [
        get_last_response(file_name + suffix)
        for suffix in ('-1.wav', '-2.wav')
    ]
    return [transcript for transcript in transcripts if transcript]
    

def get_last_response(last_recording):
    """Transcribe an audio file with Google speech recognition.

    Args:
        last_recording: path of the audio file to transcribe.

    Returns:
        The stripped transcript, or '' when the file cannot be read or
        nothing could be recognised.
    """
    speech_to_text = sr.Recognizer()

    try:
        with sr.AudioFile(last_recording) as source:
            # load the whole file into memory
            audio_data = speech_to_text.record(source)
        # convert speech to text
        result = speech_to_text.recognize_google(audio_data, language = "en")
    except Exception:
        # Best effort: unreadable files and failed recognition both mean
        # "no response". Unlike a bare `except`, this lets KeyboardInterrupt
        # through so a running notebook can still be interrupted.
        return ''

    # recognize_google is documented to return the transcript as a string
    # unless show_all is set; a raw response dict is accepted as well.
    if isinstance(result, str):
        text = result
    elif isinstance(result, dict):
        try:
            text = result['alternative'][0]['transcript']
        except (KeyError, IndexError, TypeError):
            return ''
        if not isinstance(text, str):
            return ''
    else:
        return ''

    return text.strip()


def file_clean_up(LAST_RECORDING, CURRENT_RECORDING):
    """Rotate the recording files after an utterance has been handled.

    The two `LAST_RECORDING` slot files are deleted, the `CURRENT_RECORDING`
    slot files are renamed into their place, and fresh placeholder audio
    (the spoken word 'None') is saved as the new `CURRENT_RECORDING` slots.
    """
    suffixes = ('-1.wav', '-2.wav')

    for suffix in suffixes:
        os.remove(LAST_RECORDING + suffix)

    for suffix in suffixes:
        os.rename(CURRENT_RECORDING + suffix, LAST_RECORDING + suffix)

    placeholder = gTTS(text='None', lang='en', slow=False)
    for suffix in suffixes:
        placeholder.save(CURRENT_RECORDING + suffix)

### Interaction logic

In [None]:
def get_utterances(skills_json, no_skills_to_interact = 50):
    """Collect the utterances to speak for each skill.

    For every skill, the entries of 'Sample_Invocation_Utterances' are
    cleaned (curly quotes and periods removed) and prefixed with 'Alexa, '
    when they do not already start with 'alexa'. Up to five further commands
    are then taken from double-quoted phrases in 'Skill_description' that
    start with 'alexa', contain more than one space, contain none of the
    characters * # ( ) [ ] { } < >, and are not already in the list.

    Args:
        skills_json: mapping of skill id to a dict with the two keys above.
            A missing or empty key is treated as having no content.
        no_skills_to_interact: maximum number of skills to process; None
            means no limit, and a value <= 0 yields an empty result.

    Returns:
        dict mapping each processed skill id to its list of utterances.
    """
    skills_utterances = {}

    limited = no_skills_to_interact is not None
    if limited and no_skills_to_interact <= 0:
        return skills_utterances

    forbidden_chars = ('*', '#', '(', ')', '[', ']', '{', '}', '<', '>')
    max_from_description = 5

    for total_extracted, skill in enumerate(skills_json, start=1):
        details = skills_json[skill]
        utterances = skills_utterances[skill] = []

        for command in details.get('Sample_Invocation_Utterances') or []:
            command = command.replace('”', '').replace('“', '').replace('.', '').strip()
            # Skip blanks, including samples that were only punctuation.
            if command == '':
                continue

            if not command.lower().startswith('alexa'):
                command = 'Alexa, ' + command

            utterances.append(command)

        # Normalise curly double quotes so the regex below finds them, and
        # drop curly single quotes.
        skill_desc = (details.get('Skill_description') or '').\
                        replace('”', '"').replace('“', '"').replace('‘','').replace('’','')
        more_commands = re.findall(r'"([^"]*)"', skill_desc)

        more_commands_counter = 0
        for command in more_commands:
            if not command.lower().startswith('alexa'):
                continue
            if len(command.strip()) <= len('alexa'):
                continue
            if any(x in command for x in forbidden_chars):
                continue

            cleaned = command.replace('.', '').strip()
            if command.strip().count(' ') > 1 and cleaned not in utterances:
                utterances.append(cleaned)

                # Only recording 5 occurrences at max from description.
                more_commands_counter += 1
                if more_commands_counter >= max_from_description:
                    break

        if limited and total_extracted >= no_skills_to_interact:
            break

    return skills_utterances
    

def _release_recorder(recorder):
    """Release the audio stream and PyAudio instance held by `recorder`."""
    close = getattr(recorder, 'close', None)
    if callable(close):
        close()
        return

    # Recorder without a close() method: shut its stream down directly.
    try:
        recorder.stream.stop_stream()
        recorder.stream.close()
    finally:
        recorder.p.terminate()


def play_utterances(skills_utterances, CURRENT_UTTERANCE, LAST_RECORDING, CURRENT_RECORDING, ALEXA_STOP):
    """Speak every utterance, record the reply, and stop skills that keep talking.

    For each utterance: synthesise it with gTTS, play it with `afplay`,
    listen for a response, and play the `ALEXA_STOP` audio when either the
    recording hit its hard time cap or one of the new transcripts equals one
    from the previous round. The recording files are then rotated with
    `file_clean_up`.

    Args:
        skills_utterances: mapping of skill id to a list of utterance strings.
        CURRENT_UTTERANCE: base name (without '.wav') for the synthesised utterance.
        LAST_RECORDING: base name of the previous round's recording slots.
        CURRENT_RECORDING: base name of this round's recording slots.
        ALEXA_STOP: base name (without '.wav') of the stop-command audio.
    """
    total_utterances = sum(len(utterances) for utterances in skills_utterances.values())

    # The context manager closes the progress bar even if an utterance fails.
    with tqdm(total=total_utterances, position=0, leave=True) as pbar:
        for utterances in skills_utterances.values():
            for utterance in utterances:

                utterance_wav = gTTS(text=utterance, lang='en', slow=False)
                utterance_wav.save(CURRENT_UTTERANCE + '.wav')
                subprocess.call(['afplay', CURRENT_UTTERANCE + '.wav'])

                last_responses = get_responses(LAST_RECORDING)

                # A new Recorder is opened per utterance; release it explicitly,
                # since dropping the reference does not close the stream.
                record_response = Recorder()
                try:
                    force_stop = record_response.listen(CURRENT_RECORDING)
                finally:
                    _release_recorder(record_response)
                    del record_response

                if force_stop:
                    subprocess.call(['afplay', ALEXA_STOP + '.wav'])

                else:
                    current_responses = get_responses(CURRENT_RECORDING)
                    print('LAST:', last_responses)
                    print('CURRENT:', current_responses)

                    # A repeated response is treated as the skill still running.
                    if any(x in current_responses for x in last_responses):
                        subprocess.call(['afplay', ALEXA_STOP + '.wav'])

                file_clean_up(LAST_RECORDING, CURRENT_RECORDING)
                time.sleep(2)
                pbar.update(1)

### Initialize default file names

In [None]:
# Base names (no extension) of the audio files used during the interaction.
LAST_RECORDING = "last-recording"        # previous round's recording slots
CURRENT_RECORDING = "current-recording"  # this round's recording slots
CURRENT_UTTERANCE = "current-utterance"  # synthesised utterance being played
ALEXA_STOP = "alexa-stop"                # synthesised stop command

### Write helper audio files

In [None]:
# Stop command that is played back when a skill has to be interrupted.
utterance_wav = gTTS(text='Alexa, stop', lang='en', slow=False)
utterance_wav.save(ALEXA_STOP + '.wav')

# Placeholder audio so that both slots of each recording exist up front.
utterance_wav = gTTS(text='None', lang='en', slow=False)
for placeholder_path in (
    LAST_RECORDING + '-1.wav',
    LAST_RECORDING + '-2.wav',
    CURRENT_RECORDING + '-1.wav',
    CURRENT_RECORDING + '-2.wav',
):
    utterance_wav.save(placeholder_path)

### Extract utterances from sample utterances and skill description

In [None]:
# Load the skills file and keep only the entry for the selected persona.
all_skills = utilities.read_json(SKILLS_ADDR)[PERSONA]

### Play utterances

In [None]:
# Build the utterance list for the first 50 skills, then play each one.
skills_utterances = get_utterances(all_skills, no_skills_to_interact=50)
play_utterances(
    skills_utterances,
    CURRENT_UTTERANCE,
    LAST_RECORDING,
    CURRENT_RECORDING,
    ALEXA_STOP,
)