In [12]:
import os

import IPython
import matplotlib
import matplotlib.pyplot as plt
import requests
import torch
import torchaudio

import pyaudio
from six.moves import queue
import time

# Fix the PyTorch RNG seed so repeated runs draw the same random numbers
torch.random.manual_seed(0)
# Run on the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [13]:
# Load the speech recognition model from the WAV2VEC2_ASR_BASE_960H pipeline bundle
wav2vec = torchaudio.pipelines.WAV2VEC2_ASR_BASE_960H
# Instantiate the bundle's model and move it to the selected device
model_wav2vec = wav2vec.get_model().to(device)

In [14]:
#speech recognition decoder
class GreedyCTCDecoder(torch.nn.Module):
    """Greedy (best-path) CTC decoder that turns an emission matrix into text."""

    def __init__(self, labels, blank=0):
        super().__init__()
        # Sequence mapping a label index to its output symbol
        self.labels = labels
        # Index of the CTC blank label, dropped from the output
        self.blank = blank

    def forward(self, emission: torch.Tensor) -> str:
        """Decode the most likely label at every step into a transcript.

        Args:
          emission (Tensor): Logit tensor of shape `[num_seq, num_label]`.

        Returns:
          str: Transcript after merging consecutive repeats and removing blanks.
        """
        # Most likely label per time step -> [num_seq,]
        best_path = emission.argmax(dim=-1)
        # Merge runs of the same label before blanks are removed
        collapsed = torch.unique_consecutive(best_path, dim=-1)
        pieces = []
        for label_idx in collapsed.tolist():
            if label_idx == self.blank:
                continue
            pieces.append(self.labels[label_idx])
        return "".join(pieces)

# Decoder instance built from the label set of the wav2vec pipeline bundle
decoder = GreedyCTCDecoder(labels=wav2vec.get_labels())

In [15]:
#TTS vocoders
def text_to_sequence(text):
    """Convert text into a list of integer symbol ids.

    The text is lower-cased first; characters that are not in the symbol
    table are silently dropped.

    Args:
        text (str): Input text.

    Returns:
        list[int]: One id per kept character, in input order.
    """
    text = text.lower()
    return [look_up[s] for s in text if s in look_up]

# Ordered symbol table; a symbol's id is its position in this string
_symbol_table = '_-!\'(),.:;? abcdefghijklmnopqrstuvwxyz'
# Character -> id mapping read by text_to_sequence; defined here so the
# function does not depend on a name created in some other cell
look_up = {s: i for i, s in enumerate(_symbol_table)}
# Set of valid symbols, kept for code that tests membership directly
symbols = set(_symbol_table)

# Text-to-speech pipeline bundle that supplies the text processor, Tacotron2 and vocoder below
tacotron_pipeline = torchaudio.pipelines.TACOTRON2_WAVERNN_PHONE_LJSPEECH

# Components taken from the bundle; the two models are moved to `device`
processor = tacotron_pipeline.get_text_processor()
tacotron2 = tacotron_pipeline.get_tacotron2().to(device)
vocoder = tacotron_pipeline.get_vocoder().to(device)

# WaveGlow model fetched through torch.hub from NVIDIA/DeepLearningExamples (requested with model_math='fp32')
waveglow = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_waveglow', model_math='fp32')
# Keep the model returned by remove_weightnorm, then move it to `device`
waveglow = waveglow.remove_weightnorm(waveglow)
waveglow = waveglow.to(device)
# Switch to evaluation mode; as the last expression of the cell this also prints the module repr
waveglow.eval()

Using cache found in C:\Users\RT/.cache\torch\hub\NVIDIA_DeepLearningExamples_torchhub


WaveGlow(
  (upsample): ConvTranspose1d(80, 80, kernel_size=(1024,), stride=(256,))
  (WN): ModuleList(
    (0): WN(
      (in_layers): ModuleList(
        (0): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
        (1): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
        (2): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(4,), dilation=(4,))
        (3): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(8,), dilation=(8,))
        (4): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(16,), dilation=(16,))
        (5): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(32,), dilation=(32,))
        (6): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(64,), dilation=(64,))
        (7): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(128,), dilation=(128,))
      )
      (res_skip_layers): ModuleList(
        (0): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (1): Conv1d(51

In [16]:
# Settings for capturing input from the microphone
RATE = 44000 # sample rate handed to MicrophoneStream, in Hz
# NOTE(review): the earlier comment said the buffer is 1600 frames, but RATE / 10 is 4400 here — confirm the intended rate
CHUNK = int(RATE / 10)  # 100ms

class MicrophoneStream(object):
    """Context manager that opens the microphone and yields the captured audio bytes."""

    def __init__(self, rate, chunk):
        self._rate = rate
        self._chunk = chunk

        # Buffer that the stream callback fills and generator() drains
        self._buff = queue.Queue()
        self.closed = True

    def __enter__(self):
        """Create the PyAudio interface and open a 16-bit mono input stream."""
        self._audio_interface = pyaudio.PyAudio()
        # _fill_buffer is registered as the stream callback, so captured
        # audio is delivered to it instead of being read explicitly
        self._audio_stream = self._audio_interface.open(
            format=pyaudio.paInt16,
            channels=1,
            rate=self._rate,
            input=True,
            frames_per_buffer=self._chunk,
            stream_callback=self._fill_buffer,
        )
        self.closed = False
        return self

    def __exit__(self, type, value, traceback):
        """Stop and close the stream, then release the PyAudio interface."""
        self._audio_stream.stop_stream()
        self._audio_stream.close()

        self.closed = True
        # A None sentinel wakes up a generator blocked on the queue so that
        # a consumer does not keep the process from terminating
        self._buff.put(None)
        self._audio_interface.terminate()

    def _fill_buffer(self, in_data, frame_count, time_info, status_flags):
        """Stream callback: queue the captured bytes and keep the stream running."""
        self._buff.put(in_data)
        return None, pyaudio.paContinue

    def generator(self):
        """Yield captured audio, merging everything currently queued into one bytes object.

        The generator ends when the stream is closed or when the None
        sentinel is read from the queue.
        """
        while not self.closed:
            # Block until at least one chunk is available
            first = self._buff.get()
            if first is None:
                return

            pending = [first]
            try:
                # Collect whatever else is already queued, without blocking
                while True:
                    extra = self._buff.get(block=False)
                    if extra is None:
                        return
                    pending.append(extra)
            except queue.Empty:
                # Queue drained; hand over what was collected
                pass

            yield b''.join(pending)
# [END audio_stream]

In [17]:
def listen_print_loop(responses):
    """Print transcripts from a stream of recognition responses.

    Interim results are written on one line that is overwritten in place
    (terminated with a carriage return); a final result is printed with a
    newline. Responses without results or without alternatives are skipped.

    Args:
        responses: Iterable of objects exposing ``results``; the first result
            must expose ``alternatives`` (each with ``transcript``) and
            ``is_final``.
    """
    # Imported locally because the notebook's import cell does not bring in sys
    import sys

    num_chars_printed = 0
    for response in responses:
        if not response.results:
            continue
        result = response.results[0]
        if not result.alternatives:
            continue

        transcript = result.alternatives[0].transcript

        # Pad with spaces so a shorter transcript fully overwrites the
        # previous, longer interim line (a negative count yields '')
        overwrite_chars = ' ' * (num_chars_printed - len(transcript))

        if result.is_final:
            print(transcript + overwrite_chars)
            num_chars_printed = 0
        else:
            sys.stdout.write(transcript + overwrite_chars + '\r')
            sys.stdout.flush()
            num_chars_printed = len(transcript)

In [18]:
def mic_on(num_batches=2):
    """Open the microphone and capture a bounded number of audio batches.

    Looping directly over the stream generator never finishes while the
    stream is open, because the generator only ends once the stream is
    closed. The capture is therefore bounded explicitly.

    Args:
        num_batches (int): How many batches to pull from the generator.
            Defaults to 2, the loop count that was previously hard-coded.

    Returns:
        list[bytes]: The captured batches, in arrival order.
    """
    captured = []
    with MicrophoneStream(RATE, CHUNK) as stream:
        audio_generator = stream.generator()
        # range() comes first in zip() so that no extra batch is pulled from
        # the (blocking) generator once the requested count is reached
        for _, batch in zip(range(num_batches), audio_generator):
            captured.append(batch)
    return captured

In [19]:
import queue, os, threading
import sounddevice as sd
import soundfile as sf
from scipy.io.wavfile import write

# Handle of the background recording thread; stays False until start() creates one
recorder = False
# Flag polled by complicated_record(); start() sets it to True and stop() sets it to False
recording = False
# Queue carrying audio blocks from the input-stream callback to the recording thread
q = queue.Queue()
# Only referenced from a commented-out line in the visible code
new_wav = []
    
def complicated_record():
    """Write microphone audio to ``output.wav`` until the global ``recording`` flag is cleared.

    Audio blocks arrive on the global queue ``q`` from the input-stream
    callback ``complicated_save`` and are appended to the file as 16-bit
    mono PCM.
    """
    sample_rate = 22000
    with sf.SoundFile("output.wav", mode='w', samplerate=sample_rate, subtype='PCM_16', channels=1) as file:
        with sd.InputStream(samplerate=sample_rate, dtype='int16', channels=1, callback=complicated_save):
            while recording:
                try:
                    # Wake up periodically so a stalled input device cannot
                    # keep this thread (and a stop() waiting on it) blocked
                    block = q.get(timeout=0.5)
                except queue.Empty:
                    continue
                file.write(block)
                #new_wav.append(q.get())
        # The input stream is closed here. Flush blocks that were queued but
        # not yet written so they do not end up at the start of the next recording
        while True:
            try:
                file.write(q.get_nowait())
            except queue.Empty:
                break
        
def complicated_save(indata, frames, time, status):
    """Input-stream callback: queue a copy of the captured block on ``q``.

    Only ``indata`` is used; the other parameters are ignored.
    """
    # Copy before queueing so the queued block is independent of `indata`
    snapshot = indata.copy()
    q.put(snapshot)
    
def start():
    """Start recording in a background thread.

    Sets the global ``recording`` flag and launches ``complicated_record``
    in a new thread stored in the global ``recorder``. Does nothing when a
    recorder thread is already running.
    """
    global recorder
    global recording
    # A second recorder would write to the same output file and read from the
    # same queue, so repeated calls are ignored while one is alive
    if recorder and recorder.is_alive():
        return
    recording = True
    recorder = threading.Thread(target=complicated_record)
    recorder.start()
    
def stop():
    """Stop recording and wait for the recorder thread to finish.

    Clears the global ``recording`` flag, then joins the thread stored in
    the global ``recorder``. Safe to call when no recording was started.
    """
    global recorder
    global recording
    recording = False
    # `recorder` is still False when start() has never been called
    if recorder:
        recorder.join()

The returned features are a list of tensors. Each tensor is the output of
a transformer layer.




In [20]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch


# Tokenizer and causal language model for the chatbot, both loaded from the microsoft/DialoGPT-large checkpoint
tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-large")
model_chatbot = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-large")

In [26]:
#파일을 로드하고, 들어볼 수 있게합니다
def _record_and_transcribe(record_seconds):
    """Record from the microphone into output.wav and return the wav2vec transcript."""
    start()
    time.sleep(record_seconds)
    stop()
    print('Chatbot : yes I got it')
    waveform, sample_rate = torchaudio.load("output.wav")
    waveform = waveform.to(device)

    # Resample when the recording rate differs from the rate of the wav2vec bundle
    if sample_rate != wav2vec.sample_rate:
        waveform = torchaudio.functional.resample(waveform, sample_rate, wav2vec.sample_rate)

    with torch.inference_mode():
        emission, _ = model_wav2vec(waveform)

    # The decoder output uses '|' between words; turn those into spaces
    return decoder(emission[0]).replace('|', ' ')


def _generate_reply(transcript):
    """Return the chatbot's text reply to a single user utterance."""
    input_ids = tokenizer.encode(transcript + tokenizer.eos_token, return_tensors='pt')
    # NOTE: every turn is answered on its own; no chat history is carried
    # over between calls. Total length is capped at 1000 tokens.
    output_ids = model_chatbot.generate(input_ids, max_length=1000, pad_token_id=tokenizer.eos_token_id)
    # Decode only the newly generated tokens, not the echoed prompt
    return tokenizer.decode(output_ids[:, input_ids.shape[-1]:][0], skip_special_tokens=True)


def _speak(text):
    """Synthesize `text` to chatbot_answer.wav and start playing it in the notebook."""
    with torch.inference_mode():
        processed, lengths = processor(text)
        processed = processed.to(device)
        lengths = lengths.to(device)
        spec, _, _ = tacotron2.infer(processed, lengths)

    with torch.no_grad():
        waveforms = waveglow.infer(spec)

    torchaudio.save("chatbot_answer.wav", waveforms[0:1].cpu(), sample_rate=vocoder.sample_rate)
    IPython.display.display(IPython.display.Audio("chatbot_answer.wav", autoplay=True))


def request(step, record_seconds=6, playback_seconds=4):
    """Run one voice-chat turn: record, transcribe, generate a reply and speak it.

    Args:
        step (int): Number of turns completed so far; the greeting is
            printed only when it is 0.
        record_seconds (float): How long to record from the microphone.
        playback_seconds (float): How long to wait after playback starts
            before the turn ends.

    Returns:
        int: ``step + 1`` after a normal turn, or 10000 when the whole
        utterance was the word "NO".
    """
    if step == 0:
        print('Chatbot : hello~ I ready to listen')

    transcript = _record_and_transcribe(record_seconds)

    # Word separators are already spaces at this point, so the stop word is
    # compared against the stripped transcript
    if transcript.strip() == "NO":
        return 10000
    print(f'Chatbot : you said : {transcript}, I think about answer...')

    text = _generate_reply(transcript)
    print(f'Chatbot : {text}')

    # Nothing to synthesize when the model returned an empty reply
    if text.strip():
        _speak(text)

    time.sleep(playback_seconds)
    step += 1
    print(f'Chatbot : you can chat with me after {10-step}times, more answer?')
    return step

In [28]:
# Run chat turns until request() returns a step count of 10 or more
step = 0
while step<10:
    step = request(step)

Chatbot : hello~ I ready to listen
Chatbot : yes I got it
Chatbot : you said : DO YOU LIKE EPPER , I think about answer...
Chatbot : I like EPPER


Chatbot : you can chat with me after 9times, more answer?
Chatbot : yes I got it
Chatbot : you said : I LIKE A BATHWAR , I think about answer...
Chatbot : I LIKE A BATHWAR


Chatbot : you can chat with me after 8times, more answer?
Chatbot : yes I got it
Chatbot : you said : DO YOU KNOW ABOUT MET , I think about answer...
Chatbot : I do not know about met.


KeyboardInterrupt: 