In [1]:
import os
import librosa
import librosa.display ## To deal with module `display` not found error

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline 
## This is necessary to find `src` folder
import sys

# Step up to the parent directory only while `src` is not visible from the
# current working directory. The guard makes the cell safe to re-run: an
# unconditional chdir('../') climbs one more level on every execution.
# NOTE(review): assumes the notebook lives one level below the project root
# that contains `src` — confirm against the repository layout.
if not os.path.isdir('src'):
    os.chdir('../')

# Register the project root derived from the working directory rather than a
# machine-specific absolute path, so the notebook runs on any checkout.
PROJECT_ROOT = os.getcwd()
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

from src.model import Net
from src.settings import MODEL_DATA_LOC
import torch

# Target device name; used when restoring the checkpoint.
device = "cpu"

In [2]:
from src.settings import MODEL_DATA_LOC
MODEL_DATA_LOC.mkdir(parents=True, exist_ok=True)

# The "latest" checkpoint is the last filename in lexicographic order.
checkpoint_names = sorted(os.listdir(MODEL_DATA_LOC))
if not checkpoint_names:
    # The directory may have just been created above; fail with a clear
    # message instead of an IndexError from `[-1]` on an empty list.
    raise FileNotFoundError(f'No model checkpoints found in {MODEL_DATA_LOC}')
model_save_name = checkpoint_names[-1]

# map_location remaps stored tensors onto `device`, so a checkpoint saved on a
# GPU machine still loads on a CPU-only one.
# NOTE: torch.load unpickles the file — only load checkpoints you trust.
state = torch.load(MODEL_DATA_LOC / model_save_name, map_location=device)

In [3]:
# Show the validation accuracy stored in the loaded checkpoint dict.
state['validation_accuracy']

0.8561507936507936

In [10]:
# Rebuild the network and restore the weights saved in the checkpoint.
checkpoint_weights = state['state_dict']
model = Net()
model.load_state_dict(checkpoint_weights)
# Switch to evaluation mode; as the last expression of the cell, the call
# also displays the module structure.
model.eval()

Net(
  (pool1): MaxPool2d(kernel_size=2, stride=1, padding=0, dilation=1, ceil_mode=False)
  (pool2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (pool3): MaxPool2d(kernel_size=2, stride=3, padding=0, dilation=1, ceil_mode=False)
  (conv1): Conv2d(1, 24, kernel_size=(5, 5), stride=(1, 1))
  (conv1_bn): BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv2): Conv2d(24, 32, kernel_size=(7, 7), stride=(1, 1))
  (conv2_bn): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv3): Conv2d(32, 64, kernel_size=(9, 9), stride=(2, 2))
  (conv3_bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv4): Conv2d(64, 64, kernel_size=(11, 11), stride=(2, 2))
  (conv4_bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc1): Linear(in_features=576, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=3, bias=Tr

In [12]:
## load the latest chunk of audio input :
from torch import Tensor
from torchvision import transforms


def _to_mel_spectrogram(x):
    """Return the 128-band mel power spectrogram of waveform ``x``.

    The audio is passed as ``y=`` because recent librosa releases accept it
    by keyword only; the keyword form also works on older releases.
    ``SAMPLE_RATE`` is a notebook-level constant looked up at call time, so
    it must be defined before the pipeline is first applied.
    """
    return librosa.feature.melspectrogram(
        y=x, sr=SAMPLE_RATE, n_fft=2048, hop_length=512, n_mels=128, fmin=20, fmax=8300
    )


def _to_decibels(x):
    """Convert a power spectrogram to decibels, clipped to an 80 dB range."""
    return librosa.power_to_db(x, top_db=80)


def _add_channel_axis(x):
    """Reshape a 2-D array of shape (a, b) to (1, a, b)."""
    return x.reshape(1, x.shape[0], x.shape[1])


# Waveform -> log-mel spectrogram with a leading channel axis.
# (This is a mel spectrogram, not MFCC: no DCT step is applied.)
audio_transformation = transforms.Compose(
    [
        _to_mel_spectrogram,
        _to_decibels,
        _add_channel_axis,
    ]
)

In [13]:
from src.real_time_inference import RecordThread
## Sample rate shared by the recorder, `librosa.load` and the mel-spectrogram transform.
## SAME SAMPLE RATE AS HOW WE LOAD AUDIO FOR TRAINING ###
SAMPLE_RATE = 16000

In [14]:
# Maps the predicted class index to the name of the action it stands for.
label_mapper = dict(enumerate(('no_action', 'start_audio', 'pause_audio')))

In [15]:
# THIS PUTS LATEST AUDIO COPY IN THE BACKGROUND
# Poll the most recent audio file, classify it and report the detected action.
# The loop runs until an error occurs or the kernel is interrupted; in both
# cases the `finally` clause stops the background recorder.
import time

record = RecordThread("sample_record.wav", 4, SAMPLE_RATE)
record.start()

try:
    while True:
        try:
            # NOTE(review): "inference_0.wav" is presumably written by the
            # recorder thread — confirm against src.real_time_inference.
            data, sr = librosa.load("inference_0.wav", sr=SAMPLE_RATE)

            data = audio_transformation(data)
            # 128 mel bands x 126 frames per clip, with batch and channel axes.
            data = Tensor(data.reshape(-1, 1, 128, 126))

            # Inference only: skip autograd bookkeeping.
            with torch.no_grad():
                out = model(data)
            # Normalise over the class axis explicitly; calling softmax
            # without `dim` is deprecated.
            out = torch.nn.functional.softmax(out, dim=-1)
            out_ind = torch.argmax(out).item()
            out_val = torch.max(out).item()

            label_out = label_mapper[out_ind]
            if label_out != 'no_action':
                print(f'Output Label - {label_out} || Prob - {out_val}' )

            # Only act on confident predictions.
            if out_ind == 1 and out_val > 0.9:
                print("###### - Started Video - #####")
                time.sleep(0.1)

            if out_ind == 2 and out_val > 0.9:
                print("###### - Paused Video - ######")
                time.sleep(0.1)

            print(out_ind, out_val)
            time.sleep(2)

        except Exception as e:
            print(f'GOT ERROR - {e}')
            break
finally:
    # Runs on KeyboardInterrupt too (which `except Exception` does not
    # catch), so the recorder is not left running after the cell stops.
    record.stoprecord()

None




Output Label - pause_audio || Prob - 0.8005961179733276
2 0.8005961179733276
0 0.6293120980262756
0 0.5940048098564148
0 0.5715474486351013
Output Label - pause_audio || Prob - 0.6553778648376465
2 0.6553778648376465
0 0.8386073112487793
0 0.6938309669494629
0 0.5055956840515137
0 0.806901216506958
Output Label - pause_audio || Prob - 0.8730143308639526
###### - Paused Video - ######
2 0.8730143308639526
0 0.7022837996482849
0 0.4363968074321747
Output Label - pause_audio || Prob - 0.638204038143158
2 0.638204038143158
Output Label - pause_audio || Prob - 0.9366726279258728
###### - Paused Video - ######
2 0.9366726279258728
0 0.7641756534576416
0 0.8512588739395142
0 0.7208192348480225
0 0.929861307144165
0 0.8878988027572632
0 0.8993860483169556
0 0.8657199144363403
Output Label - start_audio || Prob - 0.5646669268608093
1 0.5646669268608093
0 0.8316587209701538
Output Label - start_audio || Prob - 0.9247081875801086
###### - Started Video - #####
1 0.9247081875801086
0 0.99849772453



Output Label - start_audio || Prob - 0.9533557891845703
###### - Started Video - #####
1 0.9533557891845703
Output Label - start_audio || Prob - 0.6161341667175293
1 0.6161341667175293
Output Label - start_audio || Prob - 0.8101665377616882
1 0.8101665377616882
Output Label - start_audio || Prob - 0.9960092306137085
###### - Started Video - #####
1 0.9960092306137085
0 0.5727951526641846
0 0.5446871519088745
0 0.5641118884086609
Output Label - start_audio || Prob - 0.8006323575973511
1 0.8006323575973511
0 0.7655408382415771
Output Label - start_audio || Prob - 0.9399402737617493
###### - Started Video - #####
1 0.9399402737617493
Output Label - start_audio || Prob - 0.9949347376823425
###### - Started Video - #####
1 0.9949347376823425
Output Label - pause_audio || Prob - 0.9791972041130066
###### - Paused Video - ######
2 0.9791972041130066
Output Label - pause_audio || Prob - 0.9902979731559753
###### - Paused Video - ######
2 0.9902979731559753
0 0.6661310791969299
Output Label - p

KeyboardInterrupt: 

In [9]:
# One-off check of the preprocessing pipeline on a single audio file.
# NOTE(review): relies on SAMPLE_RATE and audio_transformation from earlier
# cells, and rebinds the notebook-level names `data` and `sr`.
data, sr = librosa.load("inference_0.wav", sr=SAMPLE_RATE)
# (A commented-out "skip unchanged chunk" check used `continue`, which is only
# valid inside a loop, so it could not run at cell level and was dropped.)

data = audio_transformation(data)

Finished recording an interval 
Saving the interval
Finished recording an interval 
Saving the interval
Finished recording an interval 
Saving the interval
