# LFCC-GMM trainning baseline
   #### number of training = 25,380
    genuine 2,580 spoof 22,800
   
   #### number of development = 24,986
    genuine 2,548 spoof 22,296

In [2]:
# GMMs(Gaussian Mixture Models) front-end are LFCCs and CQCCs.
# My library
from lfcc import *

# Library for dataloader
import os.path
import glob

# Library for LFCC-GMM
import numpy as np
import pandas as pd
#from sklearn.externals import joblib
from sklearn.mixture import GaussianMixture

# Library for reading flac audio file
import soundfile as sf
#from scipy.io.wavfile import read

# Library for pytorch
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import torchvision
from torchvision import models, transforms

In [3]:
float_formatter = "{:.4f}".format

np.set_printoptions(formatter={'float_kind': float_formatter})

# Preproess, Dataset, Dataloader definition

In [4]:
class Preprocess(object):
    """
    Preprocessing class for audio data
    
    Attributes:
    
    """
    def __init__(self):
        """
        Parameters
        ----------
        
        """
        self.extractor = None
        self.features = None
        
    def __call__(self, y, sr, feature, dynamic=True):
        """
        Extract fetures with lfcc, mfcc, cqcc and other method
        
        Parameters
        ----------
        
        """
        if feature == 'LFCC':
            self.extractor = LFCC(y, sr)
            
        elif feature == 'MFCC':
            self.extractor = MFCC(y, sr)
        
        elif feature == 'CQCC':
            self.extractor = CQCC(y, sr)
        
        self.features = self.extractor.extract_feature()
        
        return self.features

In [5]:
# Preprocess test
process = Preprocess()

In [6]:
!ls

cqcc.py		   lfcc_genuine.csv  lfcc_spoofed.csv  s1260057_report1
GMMs_train.ipynb   lfcc_gmm.py	     mfcc.py	       utterance0.wav
LA_T_1028533.flac  lfcc.py	     __pycache__       utterance3.wav


In [7]:
y, sr = sf.read('LA_T_1028533.flac')

print(sr*32//1000)

ext = LFCC(y, sr)
lfcc = ext.extract_feature().T

print(lfcc[0])
print()
print(lfcc[1])

512
[-41.1161 -0.4636 0.3527 0.3396 -0.0897 -0.0397 0.2201 -0.0644 -0.0815
 0.4321 0.0985 0.1912]

[-41.1636 -0.4733 0.1317 0.1415 -0.0173 0.4239 0.0381 0.2045 0.2349
 -0.0618 -0.0771 -0.0280]


In [8]:
def make_datapath_list(phase='train'):
    """
    make a list containing a path to data
    
    Parameters
    ----------
    phase: 'train' or 'dev' or 'eval'
        specify whether data is for train or development or evaluation
    
    Returns
    ----------
    path_list : list
        return a list containing a path to data
    """
    
    root_path = "/DB/Audio/English/ASVspoof2019/LA/"
    target_path = os.path.join(root_path+'ASVspoof2019_LA_'+phase+'/flac/*.flac')
    print(target_path)
    
    path_list = []
    
    # Get a filepath to subdir by using glob module
    for path in glob.glob(target_path):
        path_list.append(path)
    
    return path_list

# test
train_list = make_datapath_list(phase='train')
dev_list = make_datapath_list(phase='dev')

#print(train_list)

#print(dev_list)

#print(len(train_list), len(dev_list))
    

/DB/Audio/English/ASVspoof2019/LA/ASVspoof2019_LA_train/flac/*.flac
/DB/Audio/English/ASVspoof2019/LA/ASVspoof2019_LA_dev/flac/*.flac


In [11]:
# Make dataloader
class ASVspoofDataSet(data.Dataset):
    """
    Dataset class for ASVspoof2019, which derived from torch.utils.data.Dataset class
    
    Attributes:
    --------------
    file_list: list
        list containing a path to data
        
    transform: object
        instance of PreProcessor
    
    phase: str
        'train' or 'dev' or 'eval'
    """
    
    def __init__(self, file_list, label_list=None, preprocess=None, phase='train'):
        """
        Parameters
        ----------
        file_list: list
            list of audio files to read
        
        label_list: list
            list of labels('bonafide' or 'spoof'), which is changed to 0, 1
        
        transform: class PreProcess
            instance of PreProcess to be used for pre-process to audio data
        
        phase: str
            specify whether data is for training or development or evaluation('train' or 'dev' or 'eval')
            
        """
        
        self.phase = phase
        self.preprocess = preprocess
        self.root_path = '/DB/Audio/English/ASVspoof2019/LA/'
        #self.file_path = None
        self.file_list = file_list
        self.label_path = None
        self.label_list = label_list
        
        if self.phase == 'train':
            self.label_path = os.path.join(self.root_path+'ASVspoof2019_LA_cm_protocols/')
            self.label_list = []
            with open(self.label_path+'ASVspoof2019.LA.cm.train.trn.txt', mode='r') as protocols:
                for line in protocols:
                    line = line.split() # read line by line
                    filename, label = line[1], line[-1] # get filename and label from protocols file
                    self.label_list.append((filename, label))
                    
        elif self.phase == 'dev':
            self.label_path = None
        else:
            self.label_path = None
        
    def __len__(self): # this is needed to be overrided
        return len(self.file_list)
    
    def __getitem__(self, index): # this is also needed to be overrided
        """
        Get data and its label that was pre-processed
        """
        
        # load audio
        speech_path = self.file_list[index]
        speech, sr = sf.read(speech_path)
        
        # preprocessing and extract features
        features = self.preprocess(y=speech, sr=sr, feature='LFCC') # preprocess to speech, not implemented yet
        
        label = None
        speech_name = speech_path.split('/')[-1].rstrip('.flac')
        
        for fname, la in self.label_list:
            #print(fname)
            if fname == speech_name: # compare to speech_name with '==' annotation, check if they have same value.
                label = la
                print("filename: {}, label: {}".format(fname, label))
        
        #print("sp name:", speech_name)
        return features, label
    
# test

file_list = train_list

asvspoof_train = ASVspoofDataSet(file_list=file_list, preprocess=process, phase='train')

# get 10 files and its label
iterations = 10

for itr in range(iterations):
    feature, label = asvspoof_train.__getitem__(itr)
    print("12-dimentional vectors", feature.T.shape)
    print("audiofile label: ", label)
    print()


filename: LA_T_4693111, label: spoof
12-dimentional vectors (149, 12)
audiofile label:  spoof

filename: LA_T_9729458, label: spoof
12-dimentional vectors (155, 12)
audiofile label:  spoof

filename: LA_T_6008224, label: spoof
12-dimentional vectors (152, 12)
audiofile label:  spoof

filename: LA_T_7196968, label: spoof
12-dimentional vectors (139, 12)
audiofile label:  spoof

filename: LA_T_6911273, label: spoof
12-dimentional vectors (196, 12)
audiofile label:  spoof

filename: LA_T_4450164, label: spoof
12-dimentional vectors (345, 12)
audiofile label:  spoof

filename: LA_T_3662604, label: spoof
12-dimentional vectors (215, 12)
audiofile label:  spoof

filename: LA_T_6713123, label: spoof
12-dimentional vectors (143, 12)
audiofile label:  spoof

filename: LA_T_4504002, label: spoof
12-dimentional vectors (149, 12)
audiofile label:  spoof

filename: LA_T_1771704, label: bonafide
12-dimentional vectors (201, 12)
audiofile label:  bonafide



# GMMs training section

Hyper parameters for training

LFCCs:

    window_len = 20ms
    nfft = 512
    # of filters = 20
    0th-ceps = removed
    dynamic features = delta, delta-delta included

GMMs:

    n_components = 512
    

In [12]:
batch_size = 32

# instanciate DataLoader
train_dataloader = data.DataLoader(asvspoof_train, batch_size=batch_size, shuffle=True)

val_dataloader = None #data.DataLoader()

dataloader_dict = {
    "train": train_dataloader,
    "val": val_dataloader
}

batch_iterator = iter(dataloader_dict["train"])
inputs, labels = next(batch_iterator) # get first element


filename: LA_T_6956794, label: spoof
filename: LA_T_4814564, label: spoof
filename: LA_T_5266968, label: spoof
filename: LA_T_7393963, label: spoof
filename: LA_T_2987671, label: bonafide
filename: LA_T_9052152, label: spoof
filename: LA_T_3617820, label: spoof
filename: LA_T_2012289, label: bonafide
filename: LA_T_6900974, label: spoof
filename: LA_T_4080381, label: spoof
filename: LA_T_7743702, label: spoof
filename: LA_T_4974655, label: spoof
filename: LA_T_2444482, label: spoof
filename: LA_T_2403525, label: spoof
filename: LA_T_8681938, label: spoof
filename: LA_T_9183771, label: spoof
filename: LA_T_9364074, label: spoof
filename: LA_T_3510282, label: spoof
filename: LA_T_8791775, label: spoof
filename: LA_T_8101792, label: spoof
filename: LA_T_2323946, label: spoof
filename: LA_T_9089994, label: spoof
filename: LA_T_1237039, label: spoof
filename: LA_T_6564983, label: spoof
filename: LA_T_8523979, label: spoof
filename: LA_T_9279859, label: spoof
filename: LA_T_4092741, label: s

RuntimeError: invalid argument 0: Sizes of tensors must match except in dimension 0. Got 169 and 212 in dimension 2 at /opt/conda/conda-bld/pytorch_1579022030672/work/aten/src/TH/generic/THTensor.cpp:612

In [25]:

train_path = '/DB/Audio/English/ASVspoof2019/LA/ASVspoof2019_LA_train/flac/*'
dev_path = '/DB/Audio/English/ASVspoof2019/LA/ASVspoof2019_LA_dev/flac/*'

speech_count = 0
total = len(glob.glob(train_path))
print("total_speech:", total)

genuine_df = pd.DataFrame()
spoofed_df = pd.DataFrame()

for itr in range(10):
    
    print('{} % complete'.format(speech_count/total))
    
    feature, label = asvspoof_train.__getitem__(itr)
    #print("12-dimentional vectors", feature.T.shape)
    #print("audiofile label: ", label)
    #print()
    feature_df = pd.DataFrame(feature.T)
    
    print(feature_df.shape)
    if label == 'bonafide':
        genuine_df = genuine_df.append(feature_df, ignore_index=True)
    else:
        spoofed_df = spoofed_df.append(feature_df, ignore_index=True)
    
    speech_count += 1
    

total_speech: 25380
0.0 % complete
filename: LA_T_4693111, label: spoof
(149, 12)
3.940110323089047e-05 % complete
filename: LA_T_9729458, label: spoof
(155, 12)
7.880220646178094e-05 % complete
filename: LA_T_6008224, label: spoof
(152, 12)
0.00011820330969267139 % complete
filename: LA_T_7196968, label: spoof
(139, 12)
0.00015760441292356187 % complete
filename: LA_T_6911273, label: spoof
(196, 12)
0.00019700551615445234 % complete
filename: LA_T_4450164, label: spoof
(345, 12)
0.00023640661938534278 % complete
filename: LA_T_3662604, label: spoof
(215, 12)
0.0002758077226162333 % complete
filename: LA_T_6713123, label: spoof
(143, 12)
0.00031520882584712374 % complete
filename: LA_T_4504002, label: spoof
(149, 12)
0.0003546099290780142 % complete
filename: LA_T_1771704, label: bonafide
(201, 12)


In [26]:
# check count
print(len(genuine_df), len(spoofed_df))


201 1643


In [32]:
genuine_df.reset_index(drop=True).to_csv('./lfcc_genuine.csv', index=False)
spoofed_df.reset_index(drop=True).to_csv('./lfcc_spoofed.csv', index=False)

In [37]:
# GMMs training

# speaker embedding by using GMMs, where n_components = 512
n_components = 512

genuine_gmms = GaussianMixture(n_components=n_components, covariance_type='diag', max_iter=100, random_state=None)

spoofed_gmms = GaussianMixture(n_components=n_components, covariance_type='diag', max_iter=100, random_state=None)

Xg = pd.read_csv('./lfcc_genuine.csv')
Xs = pd.read_csv('./lfcc_spoofed.csv')

# Train the other parameters using the EM algorithm
genuine_gmms.fit(Xg)
spoofed_gmms.fit(Xs)

Xg.head()
Xs.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,-43.232835,0.773243,0.032616,0.320308,0.319756,0.349676,0.112323,0.140922,0.274338,0.078688,0.411345,-0.015514
1,-43.927803,0.460822,0.203643,0.238726,0.201849,0.41829,0.098958,0.19332,0.209587,0.291883,0.041371,-0.081493
2,-43.867943,0.522619,0.005994,0.257253,0.374527,0.40709,-0.013139,0.165098,0.280819,0.19364,0.145726,0.052421
3,-40.993793,2.217705,0.834241,-0.266068,0.601328,1.159979,0.612538,0.113617,0.434787,0.406434,0.053986,0.023301
4,-40.459587,3.130816,0.93709,-0.785774,0.258141,1.041662,0.683956,0.321447,0.759091,0.424619,-0.017425,0.123753


In [None]:

n_speech = len(glob.glob('the path to database of test data'))

for i, speech in enumerate(sorted(glob.glob('the path to the database'))):
    
    print(i/n_speech,'percent completed')
    
    lfcc = LFCC(wavfile=speech).get_lfcc()

    score = np.array([])
    for lfcc_frame in lfcc.T:
        loglh_genuine = genuine_gmms.score(lfcc_frame.reshape(1, -1))
        loglh_spoofed = spoofed_gmms.score(lfcc_frame.reshape(1, -1))
    
    # compute mean
    
    # compute log-likelihood ratio
    score = loglh_genuine - loglh_spoofed
    
    # store score to file
    