# LFCC-GMM trainning baseline
   #### number of training = 25,380
    genuine 2,580 spoof 22,800
   
   #### number of development = 24,986
    genuine 2,548 spoof 22,296

In [1]:
# GMMs(Gaussian Mixture Models) front-end are LFCCs and CQCCs.
# My library
from lfcc import *

# Library for dataloader
import os.path
import glob

# Library for LFCC-GMM
import numpy as np
import pandas as pd
#from sklearn.externals import joblib
from sklearn.mixture import GaussianMixture

# Library for reading flac audio file
import soundfile as sf
#from scipy.io.wavfile import read

# Library for pytorch
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import torchvision
from torchvision import models, transforms

In [2]:
float_formatter = "{:.4f}".format

np.set_printoptions(formatter={'float_kind': float_formatter})

# Preproess, Dataset, Dataloader definition

In [3]:
class Preprocess(object):
    """
    Preprocessing class for audio data
    
    Attributes:
    
    """
    def __init__(self):
        """
        Parameters
        ----------
        
        """
        self.extractor = None
        self.features = None
        
    def __call__(self, y, sr, feature, dynamic=True):
        """
        Extract fetures with lfcc, mfcc, cqcc and other method
        
        Parameters
        ----------
        
        """
        if feature == 'LFCC':
            self.extractor = LFCC(y, sr)
            
        elif feature == 'MFCC':
            self.extractor = MFCC(y, sr)
        
        elif feature == 'CQCC':
            self.extractor = CQCC(y, sr)
        
        self.features = self.extractor.extract_feature(delta=True)
        
        return self.features

In [4]:
# Preprocess test
process = Preprocess()

In [5]:
!ls

cqcc.py		   lfcc_genuine.csv  lfcc_spoofed.csv  s1260057_report1
GMMs_train.ipynb   lfcc_gmm.py	     mfcc.py	       utterance0.wav
LA_T_1028533.flac  lfcc.py	     __pycache__       utterance3.wav


In [11]:
y, sr = sf.read('LA_T_1028533.flac')

print(sr*32//1000)

ext = LFCC(y, sr)
lfcc = ext.extract_feature(delta=True).T

print(lfcc)
print(lfcc.shape)

512
[[-41.1161 -0.4636 0.3527 ... -0.1933 -0.1557 0.0113]
 [-41.1636 -0.4733 0.1317 ... 0.0647 -0.0181 0.2531]
 [-41.2046 -0.3002 0.2902 ... 0.4425 0.2905 0.4339]
 ...
 [-41.5126 -0.8424 0.3637 ... 0.0375 -0.3388 0.2125]
 [-41.0282 -0.4408 -0.0175 ... -0.0367 0.1962 -0.2170]
 [-40.4114 0.2499 0.3737 ... -0.0225 0.1719 -0.1833]]
(113, 60)


In [7]:
def make_datapath_list(phase='train'):
    """
    make a list containing a path to data
    
    Parameters
    ----------
    phase: 'train' or 'dev' or 'eval'
        specify whether data is for train or development or evaluation
    
    Returns
    ----------
    path_list : list
        return a list containing a path to data
    """
    
    root_path = "/DB/Audio/English/ASVspoof2019/LA/"
    target_path = os.path.join(root_path+'ASVspoof2019_LA_'+phase+'/flac/*.flac')
    print(target_path)
    
    path_list = []
    
    # Get a filepath to subdir by using glob module
    for path in glob.glob(target_path):
        path_list.append(path)
    
    return path_list

# test
train_list = make_datapath_list(phase='train')
dev_list = make_datapath_list(phase='dev')

#print(train_list)

#print(dev_list)

#print(len(train_list), len(dev_list))
    

/DB/Audio/English/ASVspoof2019/LA/ASVspoof2019_LA_train/flac/*.flac
/DB/Audio/English/ASVspoof2019/LA/ASVspoof2019_LA_dev/flac/*.flac


In [29]:
# Make dataloader
class ASVspoofDataSet(data.Dataset):
    """
    Dataset class for ASVspoof2019, which derived from torch.utils.data.Dataset class
    
    Attributes:
    --------------
    file_list: list
        list containing a path to data
        
    transform: object
        instance of PreProcessor
    
    phase: str
        'train' or 'dev' or 'eval'
    """
    
    def __init__(self, file_list, label_list=None, preprocess=None, phase='train'):
        """
        Parameters
        ----------
        file_list: list
            list of audio files to read
        
        label_list: list
            list of labels('bonafide' or 'spoof'), which is changed to 0, 1
        
        transform: class PreProcess
            instance of PreProcess to be used for pre-process to audio data
        
        phase: str
            specify whether data is for training or development or evaluation('train' or 'dev' or 'eval')
            
        """
        
        self.phase = phase
        self.preprocess = preprocess
        self.root_path = '/DB/Audio/English/ASVspoof2019/LA/'
        #self.file_path = None
        self.file_list = file_list
        self.label_path = None
        self.label_list = label_list
        
        if self.phase == 'train':
            self.label_path = os.path.join(self.root_path+'ASVspoof2019_LA_cm_protocols/')
            self.label_list = []
            with open(self.label_path+'ASVspoof2019.LA.cm.train.trn.txt', mode='r') as protocols:
                for line in protocols:
                    line = line.split() # read line by line
                    filename, label = line[1], line[-1] # get filename and label from protocols file
                    self.label_list.append((filename, label))
                    
        elif self.phase == 'dev':
            self.label_path = None
        else:
            self.label_path = None
        
    def __len__(self): # this is needed to be overrided
        return len(self.file_list)
    
    def __getitem__(self, index): # this is also needed to be overrided
        """
        Get data and its label that was pre-processed
        """
        
        # load audio
        speech_path = self.file_list[index]
        speech, sr = sf.read(speech_path)
        
        # preprocessing and extract features
        features = self.preprocess(y=speech, sr=sr, feature='LFCC') # preprocess to speech, not implemented yet
        
        label = None
        speech_name = speech_path.split('/')[-1].rstrip('.flac')
        
        for fname, la in self.label_list:
            #print(fname)
            if fname == speech_name: # compare to speech_name with '==' annotation, check if they have same value.
                label = la
                #print("filename: {}, label: {}".format(fname, label))
        
        #print("sp name:", speech_name)
        return features, label
    
# test

file_list = train_list

asvspoof_train = ASVspoofDataSet(file_list=file_list, preprocess=process, phase='train')

# get 10 files and its label
iterations = 10

for itr in range(iterations):
    feature, label = asvspoof_train.__getitem__(itr)
    print("12-dimentional vectors", feature.T.shape)
    print("audiofile label: ", label)
    print()


12-dimentional vectors (149, 60)
audiofile label:  spoof

12-dimentional vectors (155, 60)
audiofile label:  spoof

12-dimentional vectors (152, 60)
audiofile label:  spoof

12-dimentional vectors (139, 60)
audiofile label:  spoof

12-dimentional vectors (196, 60)
audiofile label:  spoof

12-dimentional vectors (345, 60)
audiofile label:  spoof

12-dimentional vectors (215, 60)
audiofile label:  spoof

12-dimentional vectors (143, 60)
audiofile label:  spoof

12-dimentional vectors (149, 60)
audiofile label:  spoof

12-dimentional vectors (201, 60)
audiofile label:  bonafide



# GMMs training section

Hyper parameters for training

LFCCs:

    window_len = 20ms
    nfft = 512
    # of filters = 20
    0th-ceps = removed
    dynamic features = delta, delta-delta included

GMMs:

    n_components = 512
    

In [30]:
batch_size = 32

# instanciate DataLoader
train_dataloader = data.DataLoader(asvspoof_train, batch_size=batch_size, shuffle=True)

val_dataloader = None #data.DataLoader()

dataloader_dict = {
    "train": train_dataloader,
    "val": val_dataloader
}

batch_iterator = iter(dataloader_dict["train"])
inputs, labels = next(batch_iterator) # get first element


RuntimeError: invalid argument 0: Sizes of tensors must match except in dimension 0. Got 193 and 144 in dimension 2 at /opt/conda/conda-bld/pytorch_1579022030672/work/aten/src/TH/generic/THTensor.cpp:612

In [31]:

train_path = '/DB/Audio/English/ASVspoof2019/LA/ASVspoof2019_LA_train/flac/*'
dev_path = '/DB/Audio/English/ASVspoof2019/LA/ASVspoof2019_LA_dev/flac/*'

speech_count = 0
total = len(glob.glob(train_path))
print("total_speech:", total)

genuine_df = pd.DataFrame()
spoofed_df = pd.DataFrame()

progress=np.arange(10.0,110.0,10)

for itr in range(total):
    
    prog = speech_count/total*100
    if prog in progress:
        print('%.2f % complete' % prog)
    
    feature, label = asvspoof_train.__getitem__(itr)
    #print("12-dimentional vectors", feature.T.shape)
    #print("audiofile label: ", label)
    #print()
    feature_df = pd.DataFrame(feature.T)
    
    #print(feature_df.shape)
    if label == 'bonafide':
        genuine_df = genuine_df.append(feature_df, ignore_index=True)
    else:
        spoofed_df = spoofed_df.append(feature_df, ignore_index=True)
    
    speech_count += 1
    

total_speech: 25380
10.00 complete
20.00 complete
30.00 complete
40.00 complete
50.00 complete
60.00 complete
70.00 complete
80.00 complete
90.00 complete


In [32]:
genuine_df.dropna(inplace=True)
genuine_df.reset_index(drop=True).to_csv('./lfcc_genuine.csv', index=False)

spoofed_df.dropna(inplace=True)
spoofed_df.reset_index(drop=True).to_csv('./lfcc_spoofed.csv', index=False)

In [33]:
Xg = pd.read_csv('./lfcc_genuine.csv')
Xs = pd.read_csv('./lfcc_spoofed.csv')

In [34]:
# check count
print(len(Xg), len(Xs))
# drop nan or inf
Xg.dropna(inplace=True)
Xs.dropna(inplace=True)

# check count again
print(len(Xg), len(Xs))

542574 4853674
542574 4853674


In [35]:
# GMMs training

# speaker embedding by using GMMs, where n_components = 512
n_components = 512

genuine_gmms = GaussianMixture(n_components=n_components, covariance_type='diag', max_iter=10, random_state=None)

spoofed_gmms = GaussianMixture(n_components=n_components, covariance_type='diag', max_iter=10, random_state=None)

# Train the other parameters using the EM algorithm
genuine_gmms.fit(Xg)
spoofed_gmms.fit(Xs)

Xg.head()
#Xs.head()

KeyboardInterrupt: 

In [None]:

n_speech = len(glob.glob('the path to database of test data'))

for i, speech in enumerate(sorted(glob.glob('the path to the database'))):
    
    print(i/n_speech,'percent completed')
    
    lfcc = LFCC(wavfile=speech).get_lfcc()

    score = np.array([])
    for lfcc_frame in lfcc.T:
        loglh_genuine = genuine_gmms.score(lfcc_frame.reshape(1, -1))
        loglh_spoofed = spoofed_gmms.score(lfcc_frame.reshape(1, -1))
    
    # compute mean
    
    # compute log-likelihood ratio
    score = loglh_genuine - loglh_spoofed
    
    # store score to file
    