In [1]:
import pyaudio
import keyboard
import numpy as np
import matplotlib.pyplot as plt
import os
import librosa
import librosa.display
import pickle

#import pynput.keyboard

import warnings
warnings.filterwarnings('ignore')

# IPython magic: select matplotlib's Tk backend for this kernel.
%matplotlib tk
plt.style.use('dark_background')

# Single PyAudio instance; the cells below use it to query devices and open the stream.
p = pyaudio.PyAudio()

In [3]:
# List every device PyAudio reports as "<index>: <device name> : <host API name>".
print('AVAILABLE DEVICES:\n')
for device_index in range(p.get_device_count()):
    info = p.get_device_info_by_index(device_index)
    host_api_name = p.get_host_api_info_by_index(info['hostApi'])['name']
    print('{}: {} : {}'.format(info['index'], info['name'], host_api_name))

# Hard-coded index of the device to capture from. In the listing recorded with this
# notebook, index 4 is "Speakers (High Definition Audio Device)" on Windows WASAPI.
# NOTE(review): check the printed listing before relying on this value.
device_id = 4
device_info = p.get_device_info_by_index(device_id)

# Use whichever direction (input or output) exposes more channels on the device.
channels = max(device_info['maxInputChannels'], device_info['maxOutputChannels'])

print('\nSELECTED:', device_info['name'])

AVAILABLE DEVICES:

0: Microsoft Sound Mapper - Input : MME
1: Microphone (High Definition Aud : MME
2: Microsoft Sound Mapper - Output : MME
3: Speakers (High Definition Audio : MME
4: Speakers (High Definition Audio Device) : Windows WASAPI
5: Microphone (High Definition Audio Device) : Windows WASAPI
6: Output () : Windows WDM-KS
7: Microphone (HD Audio Mixed capture) : Windows WDM-KS
8: Speakers (HD Audio Speaker) : Windows WDM-KS
9: Headset (@System32\drivers\bthhfenum.sys,#2;%1 Hands-Free AG Audio%0
;(Synergy)) : Windows WDM-KS
10: Headset (@System32\drivers\bthhfenum.sys,#2;%1 Hands-Free AG Audio%0
;(Synergy)) : Windows WDM-KS
11: Input (Game Capture HD60 Pro Audio) : Windows WDM-KS
12: Headphones () : Windows WDM-KS
13: Headset (@System32\drivers\bthhfenum.sys,#2;%1 Hands-Free AG Audio%0
;(TOZO-T9)) : Windows WDM-KS
14: Headset (@System32\drivers\bthhfenum.sys,#2;%1 Hands-Free AG Audio%0
;(TOZO-T9)) : Windows WDM-KS
15: Headphones () : Windows WDM-KS

SELECTED: Speakers (High D

In [4]:
# Display the full info record PyAudio returns for the selected device.
p.get_device_info_by_index(device_id)

{'index': 4,
 'structVersion': 2,
 'name': 'Speakers (High Definition Audio Device)',
 'hostApi': 1,
 'maxInputChannels': 0,
 'maxOutputChannels': 2,
 'defaultLowInputLatency': 0.0,
 'defaultLowOutputLatency': 0.0026666999999999997,
 'defaultHighInputLatency': 0.0,
 'defaultHighOutputLatency': 0.01,
 'defaultSampleRate': 48000.0}

In [5]:
# Capture settings: `chunk` frames are requested per stream.read() call, sampled at
# the selected device's default rate.
rate = int(device_info['defaultSampleRate'])
chunk = 2048

# Open the selected device as an input stream of 16-bit integer samples.
# NOTE(review): `as_loopback` is presumably provided by a WASAPI-loopback build of
# PyAudio rather than the upstream package - confirm the installed version supports it.
stream = p.open(
    input_device_index=device_id,
    input=True,
    as_loopback=True,
    format=pyaudio.paInt16,
    channels=channels,
    rate=rate,
    frames_per_buffer=chunk,
)

Chunk size (C) is equal to the hop length. This means there is one window per chunk, and an FFT is performed on each chunk of audio data; that is, each FFT covers 1024/48000 s ≈ 21.3 ms of audio. (Note: these figures assume a 1024-sample chunk; the stream cell above currently sets `chunk = 2048` while `hop_length` is 1024.) Then, `n_frames` (n) is the number of chunks of size C that fit in a set duration. The audio buffer has dimensions n × C. Operations then convert the column information (chunked audio data) to frequencies on the mel scale, fitted to the number of bins specified by `n_mels` (m). The mel spectrogram has dimensions m × n, where m is the number of mel-frequency bins and n is the number of frames in the set duration.

In [6]:
# Mel-spectrogram settings used by the spectrogram cells below.
y_axis = 'mel'     # frequency-axis scale handed to specshow
n_mels = 128       # number of mel bands
n_fft = 1024       # FFT window length, in samples
hop_length = 1024  # samples between successive FFT windows
duration = 1       # seconds

# Number of rows in the rolling audio buffer: hops needed to span `duration` seconds.
# NOTE(review): the buffers below are allocated as (n_frames, chunk), so each row holds
# `chunk` samples rather than `hop_length`; with chunk != hop_length the buffer spans
# more than `duration` seconds - confirm this is intended.
n_frames = int(np.ceil(rate * duration / hop_length))


In [None]:
# Live mel spectrograms for BOTH audio channels (column 0 and column 1 of each read).
# Runs until Ctrl+C is detected through the `keyboard` module.

# One rolling buffer per channel; each row holds one chunk of samples.
audio_buffer = [np.zeros((n_frames, chunk)), np.zeros((n_frames, chunk))]
print(audio_buffer[0].shape)

# Per-channel placeholders, overwritten on the first loop iteration.
S, S_db = [], []
for i in range(2):
    S.append(np.zeros((n_mels, n_frames)))
    S_db.append(np.zeros((n_mels, n_frames)))

fig, ax = plt.subplots(nrows=1, ncols=2, sharex=True, sharey=True, figsize=(10,5))

while True:

    # Read one chunk of int16 samples and reshape to one column per channel.
    # Samples are used as raw int16 magnitudes; normalization is currently disabled.
    data = stream.read(chunk)
    data = np.frombuffer(data, np.int16).reshape(chunk, 2) # (chunk, 2)
    #data = data.astype(np.float32) / np.iinfo(np.int16).max # normalize audio data

    for idx, side_audio_data in enumerate([data[:,0], data[:,1]]):

        # Drop the oldest row and append the newest chunk as the last row.
        audio_buffer[idx] = np.roll(audio_buffer[idx], shift=-1, axis=0)
        audio_buffer[idx][-1, :] = side_audio_data

        # Pass the signal by keyword: recent librosa releases make `y` keyword-only,
        # so the positional form raises TypeError there. `y=` works on older ones too.
        S[idx] = librosa.feature.melspectrogram(y=audio_buffer[idx].flatten(),
                                                sr=rate,
                                                n_fft=n_fft,
                                                n_mels=n_mels,
                                                hop_length=hop_length,
                                                fmax=rate/2)

        S_db[idx] = librosa.power_to_db(S[idx], ref=1)

        # Redraw this channel's panel from scratch.
        ax[idx].clear()

        librosa.display.specshow(S_db[idx],
                                 ax=ax[idx],
                                 sr=rate,
                                 n_fft=n_fft,
                                 x_axis='time',
                                 y_axis=y_axis,
                                 fmax=rate/2,
                                 hop_length=hop_length)

    # Push the updated panels to the window and let the GUI process events.
    fig.canvas.draw()
    fig.canvas.flush_events()

    # Stop on Ctrl+C and close the figure window.
    if keyboard.is_pressed('ctrl') and keyboard.is_pressed('c'):
        plt.close(fig)
        break

In [None]:
# Live mel spectrogram for the LEFT audio channel only (column 0 of each read).
# Runs until Ctrl+C is detected through the `keyboard` module.

# Rolling buffer; each row holds one chunk of samples.
audio_buffer = np.zeros((n_frames, chunk))

fig, ax = plt.subplots()

while True:

    # Read one chunk of int16 samples and reshape to one column per channel.
    # Samples are used as raw int16 magnitudes; normalization is currently disabled.
    data = stream.read(chunk)
    data = np.frombuffer(data, np.int16).reshape(chunk, 2) # (chunk, 2)
    #data = data.astype(np.float32) / np.iinfo(np.int16).max # normalize audio data
    audio_data = data[:,0] # JUST PLOTTING LEFT AUDIO CHANNEL RIGHT NOW

    # Drop the oldest row and append the newest chunk as the last row.
    audio_buffer = np.roll(audio_buffer, -1, axis=0)
    audio_buffer[-1, :] = audio_data

    # Pass the signal by keyword: recent librosa releases make `y` keyword-only,
    # so the positional form raises TypeError there. `y=` works on older ones too.
    S = librosa.feature.melspectrogram(y=audio_buffer.flatten(),
                                       sr=rate,
                                       n_fft=n_fft,
                                       n_mels=n_mels,
                                       hop_length=hop_length,
                                       fmax=rate/2)

    S_db = librosa.power_to_db(S, ref=1) # (n_mels, number of spectrogram frames)

    # Redraw the panel from scratch.
    ax.clear()
    librosa.display.specshow(S_db,
                             ax=ax,
                             sr=rate,
                             n_fft=n_fft,
                             x_axis='time',
                             y_axis=y_axis,
                             fmax=rate/2,
                             hop_length=hop_length)

    # Push the updated panel to the window and let the GUI process events.
    fig.canvas.draw()
    fig.canvas.flush_events()

    # Stop on Ctrl+C and close the figure window.
    if keyboard.is_pressed('ctrl') and keyboard.is_pressed('c'):
        plt.close(fig)
        break

# To do: add type in name for footsteps, gunshots, maybe even breathing **

In [7]:
# Hotkey configuration for labelling and saving audio samples.

# Maps a tuple of direction keys (joined with '+' in a hotkey string) to the
# direction label used at the start of each saved file name.
# NOTE: the insertion order of this dict determines the order of the generated
# hotkey list, so do not reorder entries casually.
directions = {
    ('left',): 'hardLeft',
    ('left', 'up'): 'softLeftFront',
    ('left', 'down'): 'softLeftBack',
    ('up',): 'front',
    ('down',): 'back',
    ('right', 'up'): 'softRightFront',
    ('right', 'down'): 'softRightBack',
    ('right',): 'hardRight',
    ('control',): 'center'
}

# Maps the last key of a hotkey string to the sound-class suffix used in file names.
sound_class = {'-': '_footstep',
               '*': '_gunshot',
               '/': '_voice',
               '.': '_breathing', # refers to wounded breathing, need to rename
               'Ins': '_healing',
               '1': '_exhausted'} # refers to exhausted breathing

# Source tag embedded in every saved file name, between direction and sound class.
source = 'EFT'



def save_audio(hotkey, audio_buffer, S_db, count):
    """Pickle the current audio buffer and dB spectrogram under a labelled file name.

    The file name is ``<direction>_<source><sound>_<n>``, where the direction and
    sound labels are looked up from the hotkey string and ``n`` is the running
    counter for that label.

    Parameters
    ----------
    hotkey : str
        '+'-separated key names. Every key except the last selects the entry in
        ``directions``; the last key selects the entry in ``sound_class``.
    audio_buffer : object
        Stored as-is under the 'audio_buffer' key of the pickled dict.
    S_db : object
        Stored as-is under the 'S_db' key of the pickled dict.
    count : dict
        Maps each ``<direction>_<source><sound>`` label to the next index to use.
        The entry for this hotkey's label is incremented in place.

    Raises
    ------
    KeyError
        If the hotkey's direction keys, sound key, or resulting label are unknown.

    Notes
    -----
    The file is opened in 'wb' mode, so an existing file with the same title is
    overwritten. This happens whenever the counters are re-initialised to zero.
    """
    *direction_keys, sound_key = hotkey.split('+')
    direction = directions[tuple(direction_keys)]
    sound = sound_class[sound_key]

    # Build the counter key once and reuse it for both the title and the increment.
    label = direction + '_' + source + sound
    title = label + '_' + str(count[label])
    count[label] += 1

    parameters = {
        'audio_buffer': audio_buffer,
        'S_db': S_db
    }

    print('Saving', title)

    # Create the output directory on first use; otherwise open() raises
    # FileNotFoundError when 'AudioData' does not exist yet.
    os.makedirs('AudioData', exist_ok=True)
    with open('AudioData/' + title, 'wb') as file:
        pickle.dump(parameters, file)


# Build every hotkey string: the direction keys joined with '+', followed by
# '+' and the sound-class key. Sound classes vary slowest, directions fastest.
hotkeys = [
    '+'.join(direction_keys + (sound_key,))
    for sound_key in sound_class
    for direction_keys in directions
]

print(hotkeys)



def init_count(hotkeys):
    """Return a dict of zeroed counters, one per label derived from `hotkeys`.

    Each hotkey string is split on '+': every key except the last is looked up
    in ``directions`` and the last key in ``sound_class``. The counter key is
    ``<direction>_<source><sound>``.
    """
    count = {}

    for combo in hotkeys:
        *direction_keys, sound_key = combo.split('+')
        direction = directions[tuple(direction_keys)]
        sound = sound_class[sound_key]
        count[direction + '_' + source + sound] = 0

    return count

def create_hotkeys(hotkeys):
    """Register `save_audio` as the callback for every hotkey string in `hotkeys`.

    The callback arguments are taken from the notebook globals ``audio_buffer``,
    ``S_db`` and ``count`` at the time this function is called, so those names
    must already be defined. The same objects are handed to every callback.
    """
    for combo in hotkeys:
        callback_args = [combo, audio_buffer, S_db, count]
        keyboard.add_hotkey(combo, save_audio, args=callback_args, timeout=1)


['left+-', 'left+up+-', 'left+down+-', 'up+-', 'down+-', 'right+up+-', 'right+down+-', 'right+-', 'control+-', 'left+*', 'left+up+*', 'left+down+*', 'up+*', 'down+*', 'right+up+*', 'right+down+*', 'right+*', 'control+*', 'left+/', 'left+up+/', 'left+down+/', 'up+/', 'down+/', 'right+up+/', 'right+down+/', 'right+/', 'control+/', 'left+.', 'left+up+.', 'left+down+.', 'up+.', 'down+.', 'right+up+.', 'right+down+.', 'right+.', 'control+.', 'left+Ins', 'left+up+Ins', 'left+down+Ins', 'up+Ins', 'down+Ins', 'right+up+Ins', 'right+down+Ins', 'right+Ins', 'control+Ins', 'left+1', 'left+up+1', 'left+down+1', 'up+1', 'down+1', 'right+up+1', 'right+down+1', 'right+1', 'control+1']


Open issue: the code block above sometimes needs to be run several times before it works. This may be related to how variables are initialized, or it may simply require waiting about one minute.

In [9]:
# Audio array collection (BOTH AUDIO CHANNELS)
# Keeps the rolling buffers and dB spectrograms up to date while the registered
# hotkeys save snapshots of them. Runs until Ctrl+C is detected.

duration = 1 # seconds
hop_length = 1024
n_frames = int(np.ceil(rate * duration / hop_length))
n_fft = 1024
#window_size = n_fft
n_mels=128
y_axis = 'mel'

# Initialize data collectors

# One rolling buffer per channel; each row holds one chunk of samples.
audio_buffer = [np.zeros((n_frames, chunk)), np.zeros((n_frames, chunk))]

# Per-channel placeholders, overwritten on the first loop iteration.
S, S_db = [], []
for i in range(2):
    S.append(np.zeros((n_mels, n_frames)))
    S_db.append(np.zeros((n_mels, n_frames)))

# Set up count and hotkeys

# The hotkey callbacks receive these exact `audio_buffer` and `S_db` list objects.
# The loop must therefore update their elements in place and never rebind the names.
count = init_count(hotkeys)
create_hotkeys(hotkeys)

try:
    while True:

        # Read one chunk of int16 samples and reshape to one column per channel.
        data = stream.read(chunk)
        data = np.frombuffer(data, np.int16).reshape(chunk, 2) # (chunk, 2)

        for idx, side_audio_data in enumerate([data[:,0], data[:,1]]):

            # Drop the oldest row and append the newest chunk as the last row.
            audio_buffer[idx] = np.roll(audio_buffer[idx], shift=-1, axis=0)
            audio_buffer[idx][-1, :] = side_audio_data

            # Pass the signal by keyword: recent librosa releases make `y`
            # keyword-only, so the positional form raises TypeError there.
            S[idx] = librosa.feature.melspectrogram(y=audio_buffer[idx].flatten(),
                                                    sr=rate,
                                                    n_fft=n_fft,
                                                    n_mels=n_mels,
                                                    hop_length=hop_length,
                                                    fmax=rate/2)

            S_db[idx] = librosa.power_to_db(S[idx], ref=1)

        # Stop collecting on Ctrl+C.
        if keyboard.is_pressed('ctrl') and keyboard.is_pressed('c'):
            break
finally:
    # Always remove the hotkeys, even if the loop raised. Otherwise they stay
    # registered and a re-run of this cell stacks duplicate handlers.
    keyboard.unhook_all_hotkeys()

Saving hardLeft_EFT_breathing_0
Saving hardLeft_EFT_breathing_1
Saving hardLeft_EFT_breathing_2
Saving front_EFT_breathing_0
Saving front_EFT_breathing_1
Saving front_EFT_breathing_2
Saving front_EFT_breathing_3
Saving front_EFT_breathing_4
Saving front_EFT_breathing_5
Saving hardRight_EFT_breathing_0
Saving hardRight_EFT_breathing_1
Saving hardRight_EFT_breathing_2
Saving front_EFT_breathing_6
Saving front_EFT_breathing_7
Saving front_EFT_breathing_8
Saving center_EFT_footstep_0
Saving center_EFT_footstep_1
Saving center_EFT_footstep_2
Saving center_EFT_footstep_3
Saving center_EFT_footstep_4
Saving center_EFT_footstep_5
Saving hardLeft_EFT_footstep_0
Saving hardLeft_EFT_footstep_1
Saving hardLeft_EFT_footstep_2
Saving front_EFT_footstep_0
Saving front_EFT_footstep_1
Saving front_EFT_footstep_2
Saving front_EFT_footstep_3
Saving hardLeft_EFT_breathing_3
Saving hardLeft_EFT_breathing_4
Saving hardLeft_EFT_breathing_5
Saving hardLeft_EFT_breathing_6
Saving hardLeft_EFT_breathing_7
Savin