In [1]:
import tensorflow as tf
import numpy as np
import pyaudio
from tensorflow.keras import models
import turtle
import webrtcvad
import collections


In [2]:
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 16000
CHUNK_DURATION_MS = 30  # Размер фрейма в миллисекундах
PADDING_DURATION_MS = 300  # Дополнительная длительность окна в миллисекундах
FRAME_SIZE = int(RATE * CHUNK_DURATION_MS / 1000)  # Размер фрейма в сэмплах
NUM_PADDING_FRAMES = int(RATE * PADDING_DURATION_MS / 1000 / FRAME_SIZE)  # Количество дополнительных фреймов

In [3]:
class Frame(object):
    """Represents a "frame" of audio data."""
    def __init__(self, bytes, timestamp, duration):
        self.bytes = bytes
        self.timestamp = timestamp
        self.duration = duration

In [4]:
def vad_collector(sample_rate, frame_duration_ms, padding_duration_ms, vad, frames):
    """Фильтрует аудиофреймы, оставляя только голосовые.

    Принимает экземпляр webrtcvad.Vad и поток аудиофреймов.

    Возвращает сегменты голоса, разделенные молчаниями.
    """
    num_padding_frames = int(padding_duration_ms / frame_duration_ms)
    ring_buffer = collections.deque(maxlen=num_padding_frames)
    triggered = False

    voiced_frames = []
    for frame in frames:
        is_speech = vad.is_speech(frame.bytes, sample_rate)
        if not triggered:
            ring_buffer.append((frame, is_speech))
            num_voiced = len([f for f, speech in ring_buffer if speech])
            if num_voiced > 0.9 * ring_buffer.maxlen:
                triggered = True
                for f, s in ring_buffer:
                    voiced_frames.append(f)
                ring_buffer.clear()
        else:
            voiced_frames.append(frame)
            ring_buffer.append((frame, is_speech))
            num_unvoiced = len([f for f, speech in ring_buffer if not speech])
            if num_unvoiced > 0.9 * ring_buffer.maxlen:
                triggered = False
                yield b''.join([f.bytes for f in voiced_frames])
                ring_buffer.clear()
                voiced_frames = []
    if triggered:
        yield b''.join([f.bytes for f in voiced_frames])

In [5]:
def record_audio():
    p = pyaudio.PyAudio()
    stream = p.open(format=FORMAT,
                    channels=CHANNELS,
                    rate=RATE,
                    input=True,
                    frames_per_buffer=FRAME_SIZE)
    print("Listening...")

    frames = []
    try:
        while True:
            data = stream.read(FRAME_SIZE)
            frames.append(data)
    except KeyboardInterrupt:
        pass

    print("Recording stopped.")

    stream.stop_stream()
    stream.close()
    p.terminate()

    return frames

In [6]:
def get_command_from_microphone():
    vad = webrtcvad.Vad(2)  # Уровень агрессивности VAD (1-3)
    frames = record_audio()
    frames = [np.frombuffer(frame, dtype=np.int16) for frame in frames]
    frames = [Frame(frame, None, None) for frame in frames]
    segments = vad_collector(RATE, CHUNK_DURATION_MS, PADDING_DURATION_MS, vad, frames)
    waveform = b''.join(segments)
    # Здесь вы можете передать сегмент аудио на предсказание команды черепашке
    
    
    spec = preprocess_audiobuffer(audio)
    prediction = model(spec)
    print(prediction)
    confidence = np.max(tf.nn.softmax(prediction))
    print('Confidence: ', confidence)
    if confidence < 0.8:
        print("Недостаточно уверенное предсказание. Пропускаем.")
        label_pred = np.argmax(prediction, axis=1)
        print(label_pred)
        command = commands[label_pred[0]]
        print('Predicted label: ', command)
        return None
    label_pred = np.argmax(prediction, axis=1)
    print(label_pred)
    command = commands[label_pred[0]]
    print('Predicted label: ', command)
    return command
    # return waveform

In [7]:
def preprocess_audiobuffer(waveform):
    waveform = waveform / 32768
    waveform = tf.convert_to_tensor(waveform, dtype=tf.float32)
    spectrogram = get_spectrogram(waveform)
    spectrogram=tf.expand_dims(spectrogram, 0)
    return spectrogram

In [8]:
def get_spectrogram(waveform):
    input_len = 16000
    waveform = waveform[:input_len]
    zero_padding = tf.zeros(
        [input_len] - tf.shape(waveform),
        dtype=tf.float32)
    waveform = tf.cast(waveform, dtype=tf.float32)
    equal_length = tf.concat([waveform, zero_padding], 0)
    spectrogram = tf.signal.stft(
        equal_length, frame_length=255, frame_step=128)
    spectrogram = tf.abs(spectrogram)
    spectrogram = spectrogram[..., tf.newaxis]
    return spectrogram

In [9]:
commands = ['down', 'go', 'left', 'right', 'stop', 'up']
model = models.load_model('model.h5')



In [10]:
s = turtle.getscreen()

t = turtle.Turtle() # starts at right:

size = t.turtlesize()
increase = (2 * num for num in size)
t.turtlesize(*increase)

t.pensize(5)
t.shapesize()
t.pencolor("blue")

def go_right():
    # target = 0
    current = t.heading()
    if current == 0:
        pass
    elif current == 90:
        t.right(90)
    elif current == 180:
        t.right(180)
    elif current == 270:
        t.left(90)
    else:
        raise ValueError('not a right angle!')

def go_up():
    # target = 90
    current = t.heading()
    if current == 0:
        t.left(90)
    elif current == 90:
        pass
    elif current == 180:
        t.right(90)
    elif current == 270:
        t.left(180)
    else:
        raise ValueError('not a right angle!')
    
def go_left():
    # target = 180
    current = t.heading()
    if current == 0:
        t.left(180)
    elif current == 90:
        t.left(90)
    elif current == 180:
        pass
    elif current == 270:
        t.right(90)
    else:
        raise ValueError('not a right angle!')
    
def go_down():
    # target = 270
    current = t.heading()
    if current == 0:
        t.right(90)
    elif current == 90:
        t.right(180)
    elif current == 180:
        t.left(90)
    elif current == 270:
        pass
    else:
        raise ValueError('not a right angle!')


def move_turtle(command):
    if command == 'up':
        go_up()
    elif command == 'down':
        go_down()
    elif command == 'left':
        go_left()
    elif command == 'right':
        go_right()
    elif command == 'go':
        t.forward(100)
    elif command == 'stop':
        s.bye()
        print('Stopping the turtle')

while True:
    command = get_command_from_microphone()
    if command is None:
        continue
    move_turtle(command)
    if command == "stop":
        terminate()
        break

Listening...
Recording stopped.


Error: Error while processing frame