#### This notebook generates d-vectors with the model '/net/vol/dheerajpr/models/SID/2021-10-12-23-24-11', which is trained with f_banks (cascaded class), and extracts 128-dimensional d-vectors

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from numpy import asarray
from numpy import save
from numpy import load

import scipy

import seaborn as sns
import itertools
from tqdm import tqdm
from collections import OrderedDict 

import torch
import torch.nn as nn

import pydub
from pydub import AudioSegment

import padertorch as pt
import padercontrib as pc
import paderbox as pb
from padertorch import Model
from paderbox.array import interval
from padercontrib.database.fearless import Fearless
from padertorch import Model
from paderbox.transform import mfcc
from paderbox.transform import stft,fbank

In [2]:
# Storage directory and checkpoint file of the trained speaker-ID model.
exp_dir = '/net/vol/dheerajpr/models/SID/2021-10-12-23-24-11'
ckpt_name = 'ckpt_best_loss.pth'
# Target passed to every `.to(...)` call in this notebook
# (presumably CUDA device index 0 -- confirm on the machine used).
device = 0

model_SID = Model.from_storage_dir(exp_dir, checkpoint_name=ckpt_name, consider_mpi=True)
model_SID.to(device)
# Switch to evaluation mode; being the last expression of the cell, this
# also displays the model summary.
model_SID.eval()

ResNet_SID(
  size=ModelParameterSize(total_count=5388634, trainable_count=5388634, total_bytes=21554536, trainable_bytes=21554536)
  (conv1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): LeakyReLU(negative_slope=0.01)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (avgpool2d): AvgPool2d(kernel_size=3, stride=2, padding=1)
  (layer1): Sequential(
    (0): Block(
      (conv1): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (bn2): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): LeakyReLU(negative_slope=0.01)
    )
    (1): Block(
      (conv1): Conv2d(32, 32, kernel_size=(3, 3), stride=(

In [3]:
class NewModel(nn.Module):
    """Wrap a pretrained network and capture outputs of selected child layers.

    Forward hooks are registered on the direct children of the wrapped
    network whose position (in ``_modules`` order) is listed in
    ``output_layers``. Every forward pass stores the hooked outputs in
    ``selected_out`` under the child's name.

    The hooks only hold a weak reference to this wrapper and are removed when
    the wrapper is garbage collected (or when ``remove_hooks`` is called), so
    creating many wrappers around the same network does not pile up hooks on
    that network.

    Args:
        output_layers: Collection of integer positions of the children to hook.
        *args: Forwarded unchanged to ``nn.Module.__init__``.
        pretrained: Network to wrap. Defaults to the notebook-level
            ``model_SID``.
    """

    def __init__(self, output_layers, *args, pretrained=None):
        super().__init__(*args)
        self.output_layers = output_layers
        self.selected_out = OrderedDict()
        # PRETRAINED MODEL (falls back to the notebook global when not given)
        self.pretrained = model_SID if pretrained is None else pretrained
        self.fhooks = []

        for i, l in enumerate(list(self.pretrained._modules.keys())):
            if i in self.output_layers:
                self.fhooks.append(
                    getattr(self.pretrained, l).register_forward_hook(self.forward_hook(l))
                )

    def forward_hook(self, layer_name):
        """Return a forward hook that stores the layer output under ``layer_name``."""
        import weakref

        # A strong reference to `self` inside the hook would keep every
        # wrapper alive for as long as the wrapped network exists.
        owner_ref = weakref.ref(self)

        def hook(module, input, output):
            owner = owner_ref()
            if owner is not None:
                owner.selected_out[layer_name] = output
        return hook

    def remove_hooks(self):
        """Detach all forward hooks this wrapper registered on the wrapped network."""
        for handle in self.__dict__.get('fhooks', ()):
            handle.remove()
        self.fhooks = []

    def __del__(self):
        # `fhooks` may be missing if __init__ failed early, hence the __dict__ lookup.
        for handle in self.__dict__.get('fhooks', ()):
            handle.remove()

    def forward(self, x):
        """Run the wrapped network; return ``(network_output, selected_out)``."""
        out = self.pretrained(x)
        return out, self.selected_out

## Dev_segment

In [8]:
from padercontrib.database.fearless import Fearless

In [4]:
# Collect the Dev_segment examples of the Fearless database into a DataFrame.
FearlessData = Fearless.data
devList = list(FearlessData['datasets']['Dev_segment'].items())
devSegLst = [example for _, example in devList]
dfS = pd.DataFrame(devSegLst)

# Speaker ids to keep, read from a file in the working directory.
intersection_set = load('intersection_set.npy')

dfS['vector_path'] = np.nan
# Vectorised membership test instead of writing booleans row by row into a
# NaN-initialised float column (slow, and forces a dtype upcast).
dfS['intersection_ids'] = dfS['speaker_id'].isin(np.ravel(intersection_set))
# Assign the 0.0 placeholder directly; np.nan_to_num(..., copy=False) on a
# Series depends on mutating the frame through a NumPy view.
dfS['d_vector'] = 0.0

# Keep only speakers in the intersection set ...
dfS_filtered_2 = dfS[dfS['intersection_ids']]
# ... and only segments longer than 4000 samples.
dfS_filtered = dfS_filtered_2[dfS_filtered_2['num_samples'] > 4000]
# Sort by speaker and renumber rows 0..n-1 so later cells can index by position.
dfS_filtered_RI_sort_D = dfS_filtered.sort_values(by=['speaker_id'], ignore_index=True)
dfS_filtered_RI_sort_D.head()

Unnamed: 0,audio_path,num_samples,speaker_id,transcription,vector_path,intersection_ids,d_vector
0,{'observation': '/net/db/fearless/Audio/Segmen...,70000,AFD1,NO WAIT UNTIL AFTER LAUNCH AND WE GET TRANSMIT...,,True,0.0
1,{'observation': '/net/db/fearless/Audio/Segmen...,18880,AFD1,[unk] A F D ON YOUR LOOP,,True,0.0
2,{'observation': '/net/db/fearless/Audio/Segmen...,10000,AFD1,GO NETWORK,,True,0.0
3,{'observation': '/net/db/fearless/Audio/Segmen...,27040,AFD1,PROCEDURES A F D ON A F D CONFERENCE,,True,0.0
4,{'observation': '/net/db/fearless/Audio/Segmen...,19440,AFD1,MOCR SYSTEMS ONE AND THREE,,True,0.0


In [24]:
# Dev-segment
# Every segment is brought to a fixed length before feature extraction
# (32000 samples = 4 s at the 8000 Hz sample rate passed to fbank below).
SEGMENT_SAMPLES = 32000

# Build the hook wrapper once: each NewModel instance registers a forward hook
# on the shared `model_SID`, so creating one per segment stacks up hooks.
model = NewModel(output_layers=[10]).to(device)

# Collect per-segment d-vectors in a list and concatenate once at the end
# instead of growing a tensor with torch.cat inside the loop.
dev_seg_vectors = []

with torch.no_grad():
    for audio_path in tqdm(dfS_filtered_RI_sort_D['audio_path']):
        # Obtain the audio segment from the dataset.
        audio = pb.io.load_audio(audio_path['observation'], dtype=np.int16)

        # Segments shorter than 4 s are zero-padded (silence) at the end,
        # longer ones are cut to their first 4 s.
        if len(audio) < SEGMENT_SAMPLES:
            padded = np.concatenate((audio, np.zeros(SEGMENT_SAMPLES - len(audio))))
        else:
            padded = audio[:SEGMENT_SAMPLES]

        # Compute the 64-dimensional filter banks for the fixed-length segment.
        # Named `fbank_feats` so the `fbank` function imported at the top of
        # the notebook is not overwritten.
        # NOTE(review): stft_shift is 160 here but 180 in the Dev-stream cell --
        # confirm which value matches the features the model was trained on.
        fbank_feats = pb.transform.fbank(padded, sample_rate=8000, window_length=400, stft_shift=160,
                                         number_of_filters=64, stft_size=512, lowest_frequency=0,
                                         highest_frequency=None, preemphasis_factor=0.97,
                                         window=scipy.signal.windows.hamming)

        # Add two leading singleton axes: (frames, filters) -> (1, 1, frames, filters).
        float_fbank = np.asarray(fbank_feats, dtype=np.float32)[np.newaxis, np.newaxis]
        float_fbank = torch.from_numpy(float_fbank).to(device)

        # The model takes a dict with a 'features' entry; preds[1] holds the
        # hooked layer outputs keyed by layer name.
        preds = model({'features': float_fbank})
        dev_seg_vectors.append(preds[1]['fc1'])

if dev_seg_vectors:
    d_vec_dev_seg = torch.cat(dev_seg_vectors).type(torch.DoubleTensor)
else:
    # No segments selected: keep the (0, 128) result the seeded version produced.
    d_vec_dev_seg = torch.empty(0, 128).type(torch.DoubleTensor)

100%|██████████| 8479/8479 [03:21<00:00, 42.18it/s]


In [25]:
# Convert the Dev-segment d-vectors to a NumPy array and write them to disk.
data_dev_seg = np.asarray(d_vec_dev_seg)
np.save('d_vector_dev_seg.npy', data_dev_seg)

## Dev-stream

In [9]:
# Load the Dev_stream dataset of the Fearless database into a DataFrame
# (one row per stream recording) and preview the first rows.
db = Fearless()
dataset_Dev_str = db.get_dataset('Dev_stream')
df_Dev_str = pd.DataFrame(data=dataset_Dev_str)
df_Dev_str.head(n=5)

Unnamed: 0,audio_path,end,num_samples,num_speakers,speaker_id,start,transcription,example_id,dataset
0,{'observation': '/net/db/fearless/Audio/Stream...,"[1716960, 1724240, 1738480, 1742720, 1967600, ...",14816000,8,"[PROCEDURES1, FD1, PROCEDURES1, FD1, CONTROL1,...","[1710480, 1716960, 1724720, 1739440, 1955360, ...","[FLIGHT PROCEDURES., Go PROCEDURES., Upper clo...",FS02_dev_001,Dev_stream
1,{'observation': '/net/db/fearless/Audio/Stream...,"[24800, 128560, 143040, 164080, 172320, 206240...",14400000,30,"[EECOM3, TRACK0, MADRID, TRACK0, EECOM3, MADRI...","[16800, 123360, 140320, 148560, 164080, 176000...","[SPAN EECOM conference., MADRID TRACK., MADRID...",FS02_dev_002,Dev_stream
2,{'observation': '/net/db/fearless/Audio/Stream...,"[98400, 158960, 169680, 262480, 526720, 767200...",14400000,26,"[CAPCOM1, EMU, FD1, BUZZ, BUZZ, BUZZ, NEIL, BU...","[36320, 98400, 162720, 169680, 285840, 544480,...","[Uh roger BUZZ, and break break COLUMBIA this ...",FS02_dev_003,Dev_stream
3,{'observation': '/net/db/fearless/Audio/Stream...,"[748080, 760480, 770000, 802720, 826080, 88120...",14955294,11,"[RETRO1, FIDO1, RETRO1, UNK, FIDO1, RETRO1, FI...","[739600, 757520, 761920, 797040, 815840, 84056...","[FIDO this is RETRO., Go ahead., Did you get t...",FS02_dev_004,Dev_stream
4,{'observation': '/net/db/fearless/Audio/Stream...,"[28240, 102000, 146320, 182160, 258720, 376480...",14400000,14,"[CAPCOM1, BUZZ, CAPCOM1, BUZZ, CAPCOM1, BUZZ, ...","[0, 61920, 103840, 172560, 190080, 337840, 390...",[Is HOUSTON uh radio check and verify T.V. cir...,FS02_dev_005,Dev_stream


In [30]:
# Dev-stream
# Build the hook wrapper once: each NewModel instance registers a forward hook
# on the shared `model_SID`, so creating one per segment stacks up hooks.
model = NewModel(output_layers=[10]).to(device)

# Row 0 is a placeholder that the following cells drop via `[1:]`; it is kept
# for compatibility with them, but zero-filled instead of uninitialised memory.
dev_str_vectors = [torch.zeros(1, 128).to(device)]

with torch.no_grad():
    stream_rows = zip(df_Dev_str['audio_path'], df_Dev_str['start'], df_Dev_str['end'])
    for stream_path, starts, ends in stream_rows:
        # One progress bar per stream, one step per annotated segment.
        for seg_start, seg_stop in tqdm(zip(starts, ends), total=len(starts)):
            audio = pb.io.load_audio(stream_path['observation'],
                                     start=seg_start, stop=seg_stop, dtype=np.int16)

            # Segments are used at their annotated length (no padding or
            # truncation, unlike the Dev-segment cell).
            # NOTE(review): stft_shift is 180 here but 160 in the Dev-segment
            # cell -- confirm which value matches the model's training features.
            f_banks = pb.transform.fbank(audio, sample_rate=8000, window_length=400, stft_shift=180,
                                         number_of_filters=64, stft_size=512, lowest_frequency=0,
                                         highest_frequency=None, preemphasis_factor=0.97,
                                         window=scipy.signal.windows.hamming, denoise=False)

            # Add two leading singleton axes: (frames, filters) -> (1, 1, frames, filters).
            float_fbank = np.asarray(f_banks, dtype=np.float32)[np.newaxis, np.newaxis]
            float_fbank = torch.from_numpy(float_fbank).to(device)

            # The model takes a dict with a 'features' entry; element 1 of the
            # result holds the hooked layer outputs keyed by layer name.
            _, hooked_outputs = model({'features': float_fbank})
            dev_str_vectors.append(hooked_outputs['fc1'])

# Single concatenation instead of growing the tensor inside the loop.
d_vector_dev_str = torch.cat(dev_str_vectors)

100%|██████████| 62/62 [00:02<00:00, 30.61it/s]
100%|██████████| 412/412 [00:11<00:00, 36.38it/s]
100%|██████████| 239/239 [00:06<00:00, 35.04it/s]
100%|██████████| 107/107 [00:03<00:00, 34.26it/s]
100%|██████████| 259/259 [00:07<00:00, 34.85it/s]
100%|██████████| 183/183 [00:05<00:00, 36.22it/s]
100%|██████████| 270/270 [00:07<00:00, 34.89it/s]
100%|██████████| 387/387 [00:10<00:00, 35.28it/s]
100%|██████████| 574/574 [00:16<00:00, 35.72it/s]
100%|██████████| 554/554 [00:15<00:00, 35.29it/s]
100%|██████████| 385/385 [00:11<00:00, 34.62it/s]
100%|██████████| 307/307 [00:09<00:00, 33.52it/s]
100%|██████████| 570/570 [00:16<00:00, 34.41it/s]
100%|██████████| 383/383 [00:11<00:00, 33.53it/s]
100%|██████████| 446/446 [00:13<00:00, 33.03it/s]
100%|██████████| 464/464 [00:14<00:00, 33.05it/s]
100%|██████████| 178/178 [00:06<00:00, 29.51it/s]
100%|██████████| 132/132 [00:04<00:00, 31.79it/s]
100%|██████████| 98/98 [00:03<00:00, 31.37it/s]
100%|██████████| 277/277 [00:08<00:00, 31.59it/s]
100%

In [33]:
# Shape of the Dev-stream d-vectors without the placeholder first row.
d_vector_dev_str[1:].size()

torch.Size([9203, 128])

In [32]:
# Drop the placeholder first row, move the d-vectors to the CPU and write
# them to disk as a NumPy array.
data_dev_str = np.asarray(d_vector_dev_str[1:].cpu())
np.save('d_vector_dev_str.npy', data_dev_str)