In [2]:
import pyaudio
import os
os.chdir('../')
import struct
import numpy as np
import matplotlib.pyplot as plt
import time
from tkinter import TclError

# use this backend to display in separate Tk window
%matplotlib tk

# constants shared by the audio stream and the plot below
CHUNK = 1024 * 2             # samples read from the stream per plotted frame
FORMAT = pyaudio.paInt16     # 16-bit signed integer samples, i.e. 2 bytes per sample
CHANNELS = 1                 # single channel for microphone
RATE = 44100                 # samples per second

In [4]:
# Live microphone waveform viewer.
# Reads CHUNK samples at a time from the default audio device and redraws a
# single matplotlib line until the Tk window is closed, then reports the
# average redraw rate.

# create matplotlib figure and axes
fig, ax = plt.subplots(1, figsize=(15, 7))

# pyaudio class instance
p = pyaudio.PyAudio()

# stream object to get data from microphone
# (output=True is kept from the original; nothing in this cell writes to the stream)
stream = p.open(
    format=FORMAT,
    channels=CHANNELS,
    rate=RATE,
    input=True,
    output=True,
    frames_per_buffer=CHUNK
)

# x positions for the plot: one point per sample, spaced 2 apart so the
# axis spans the 2 * CHUNK bytes delivered by each read
x = np.arange(0, 2 * CHUNK, 2)

# create a line object with random placeholder data; it is overwritten on the first frame
line, = ax.plot(x, np.random.rand(CHUNK), '-', lw=2)

# basic formatting for the axes
ax.set_title('AUDIO WAVEFORM')
ax.set_xlabel('samples')
ax.set_ylabel('volume')
ax.set_ylim(0, 255)
ax.set_xlim(0, 2 * CHUNK)
plt.setp(ax, xticks=[0, CHUNK, 2 * CHUNK], yticks=[0, 128, 255])

# show the plot without blocking so the loop below can drive the redraws
plt.show(block=False)

print('stream started')

# for measuring frame rate
frame_count = 0
start_time = time.time()

try:
    while True:

        # raw bytes for CHUNK 16-bit samples (2 * CHUNK bytes)
        data = stream.read(CHUNK)

        # Reinterpret the buffer as signed bytes, keep every other byte (one per
        # sample) and shift by 128 so the values land in the 0..255 plot range.
        # Widening to int16 *before* adding avoids int8 overflow, which newer
        # NumPy versions reject instead of silently promoting.
        # NOTE(review): on little-endian hosts [::2] is the low-order byte of each
        # sample - confirm that the high byte ([1::2]) was not intended.
        data_np = np.frombuffer(data, dtype=np.int8)[::2].astype(np.int16) + 128

        line.set_ydata(data_np)

        # update figure canvas
        try:
            fig.canvas.draw()
            fig.canvas.flush_events()
            frame_count += 1

        except TclError:
            # raised once the Tk window has been closed: report and stop

            # calculate average frame rate
            frame_rate = frame_count / (time.time() - start_time)

            print('stream stopped')
            print('average frame rate = {:.0f} FPS'.format(frame_rate))
            break
finally:
    # always release the audio device, including when read() or drawing fails
    stream.stop_stream()
    stream.close()
    p.terminate()

stream started
stream stopped
average frame rate = 21 FPS


##  Training Dataset Preparation

Input to model: an 8-second audio clip with my sound dubbed in somewhere in between.

Inference: the model listens for 8 seconds, processes that clip, waits for a while, then processes the next one, and so on.

Ideally this is multithreaded, so that while one clip is being processed to generate a prediction, the next waveform is already being prepared.

The computation control (pacing) is done with `time.sleep`.

In [1]:
import numpy as np
from pydub import AudioSegment
import random
import sys
import io
import os
import glob
import IPython
%matplotlib inline

In [36]:
import logging

In [2]:
# IPython.display.Audio("data/external/sample_internet.wav")

In [3]:
from pydub import AudioSegment
# Background recording; later cells pass it as the `background` to sample clips from.
# NOTE(review): machine-specific absolute path - make this configurable before sharing.
song = AudioSegment.from_wav('/home/fm-pc-lt-151/snap/audacity/748/training_data.wav')

In [4]:
type(song)

pydub.audio_segment.AudioSegment

In [5]:
## load training examples

In [6]:
# Directory of the external "activates" raw data.
# NOTE(review): not referenced by any of the visible cells, and a machine-specific absolute path.
train_path = '/home/fm-pc-lt-151/podcast_research/Podcast-Audio-Processing/data/external/Deep-Learning-Coursera-master/Sequence Models/Week3/Trigger word detection/raw_data/activates/'

In [7]:
# Collect every WAV in the recording folder whose file name starts with a digit
# and report its loudness (dBFS) as it is loaded.
root_ = '/home/fm-pc-lt-151/snap/audacity/748/'
clips = []
for i in os.listdir(root_):
    # entries that do not start with a digit are not treated as clips
    if not i[0].isdigit():
        continue
    clip_path = os.path.join(root_, i)
    clip = AudioSegment.from_wav(clip_path)
    print('Traning Clip Vol : ' , clip.dBFS )
    clips.append(clip)

Traning Clip Vol :  -28.489737167279724
Traning Clip Vol :  -25.508003256941404
Traning Clip Vol :  -29.95164431992424
Traning Clip Vol :  -33.16234877056899
Traning Clip Vol :  -25.37410450471753
Traning Clip Vol :  -29.664684633234724
Traning Clip Vol :  -24.5798693042547
Traning Clip Vol :  -29.201432071694352
Traning Clip Vol :  -30.17112454023635
Traning Clip Vol :  -29.301403573965196
Traning Clip Vol :  -27.812395710917173
Traning Clip Vol :  -24.28405692647013
Traning Clip Vol :  -34.77497501942614
Traning Clip Vol :  -34.363647882580025
Traning Clip Vol :  -35.28341661952751


In [8]:
# duration of the background recording in seconds; reused when sampling background clips
total_song_length = song.duration_seconds

In [10]:
from pydub.playback import play
import random

In [11]:
# Listen to roughly seconds 24-29 of the recording as a sanity check.
# n scales seconds to the millisecond positions used when slicing the segment.
n = 1000
play( song[23.9999999 * n : 29 * n] ) 

In [30]:
def sample_from_activate( clip_list ):
    """Return one element of ``clip_list`` chosen uniformly at random.

    Draws from the stdlib ``random`` module, so the pick follows ``random.seed``.
    """
    picked = random.choice(clip_list)
    return picked

## experimental 
def adjust_level(sound, deviation=None , default = True):
    """Optionally apply a randomly drawn gain to ``sound`` (experimental).

    Parameters
    ----------
    sound
        Object exposing ``dBFS`` and ``apply_gain(gain)``.
    deviation
        Standard deviation handed to ``np.random.normal``; ``None`` disables
        the adjustment.
    default
        When true (the default) the sound is returned untouched, regardless
        of ``deviation``.

    Returns
    -------
    ``sound`` itself when no adjustment is requested, otherwise the result of
    ``sound.apply_gain(...)``.
    """
    if default or deviation is None:
        return sound

    # Use the level of the sound that was passed in. The previous version read
    # a notebook-global `clip`, i.e. whichever clip an earlier cell left behind.
    level = sound.dBFS

    # NOTE(review): the gain is drawn around the sound's own dBFS rather than
    # around 0 dB, so a quiet clip is attenuated by roughly its level - confirm intent.
    difference = np.random.normal(level, deviation)
    print(f'Original : {level}')
    print(f'Deviated by : {difference}')
    return sound.apply_gain(difference)

def sample_from_background( background , total_duration_sec , clip_size = 8 ,  multiplier = 1000 ):
    """Cut a random ``clip_size``-second window out of ``background``.

    Parameters
    ----------
    background
        Sliceable audio object; it is indexed with ``start:end`` positions
        expressed in ``seconds * multiplier``.
    total_duration_sec
        Length of ``background`` in seconds.
    clip_size
        Length of the window to cut, in seconds.
    multiplier
        Factor converting seconds to the units ``background`` is sliced in
        (1000 by default).

    Returns
    -------
    The slice ``background[start:end]``.
    """
    # Latest second at which a full window still fits. Clamped at 0 so that a
    # background shorter than clip_size yields a window starting at the
    # beginning instead of a negative (end-relative) start offset.
    latest_start = max(total_duration_sec - clip_size, 0)

    selection_start = np.random.uniform(0, latest_start)
    selection_end = (selection_start + clip_size) * multiplier

    return background[selection_start * multiplier : selection_end]

def overlay_clip_to_bg( background , clips ):
    """Dub one randomly chosen clip onto ``background``.

    The overlay position is drawn uniformly between 7 seconds and
    ``background.duration_seconds`` and scaled by 1000 before being passed
    as ``position`` to ``background.overlay``.

    Returns a tuple ``(overlaid_audio, overlay_position)``.
    """
    start_sec = np.random.uniform(7, background.duration_seconds)
    overlay_point = start_sec * 1000

    chosen = random.choice(clips)
    dubbed = background.overlay(chosen, position=overlay_point)
    return dubbed, overlay_point

In [39]:
%%time
## benchmark: time 30,000 random background-clip draws
for i in range(30000):
    _ = sample_from_background(song, total_song_length)

CPU times: user 1.6 s, sys: 0 ns, total: 1.6 s
Wall time: 1.6 s


In [42]:
def generate_single_example( background , voice_clips ):
    """Build one training example from a background recording.

    A random window is cut from ``background``; with probability 0.5 one of
    ``voice_clips`` is dubbed onto it (positive example), otherwise the window
    is returned unchanged (negative example).

    Parameters
    ----------
    background
        Full background recording exposing ``duration_seconds`` and slicing.
    voice_clips
        Collection of clips to pick the dubbed sound from.

    Returns
    -------
    ``(audio, overlay_position)`` for a positive example, where
    ``overlay_position`` is the value returned by ``overlay_clip_to_bg``;
    ``(audio, None)`` for a negative example.
    """
    # Sample over the whole recording. The previous version passed the literal 1
    # here, which made the sampling range negative.
    total_song_length = background.duration_seconds
    background_clip = sample_from_background( background , total_song_length )

    ## positive labels here
    if np.random.uniform(0,1) > 0.5:
        # draw from the clips passed in, not from a notebook-global list
        clip_to_dub = sample_from_activate(voice_clips)

        # overlay_clip_to_bg picks from a collection with random.choice; hand it
        # a one-element list so the whole chosen clip is dubbed rather than a
        # random element obtained by indexing into the clip itself.
        overlayed_clip, time_stamp = overlay_clip_to_bg( background_clip, [clip_to_dub] )
        return overlayed_clip , time_stamp

    return background_clip , None

In [43]:
# Generate one example as a smoke test; overlay_point is None when no clip was dubbed.
test_overlay , overlay_point = generate_single_example( song , clips )

In [44]:
test_overlay

In [45]:
type(test_overlay)

pydub.audio_segment.AudioSegment