# LFCC-GMM training baseline
   #### number of training = 25,380
    genuine 2,580 spoof 22,800
   
   #### number of development = 24,986
    genuine 2,548 spoof 22,296

In [1]:
# GMMs(Gaussian Mixture Models) front-end are LFCCs and CQCCs.
# My library
from lfcc import *

# Library for dataloader
import os.path
import glob

# Library for LFCC-GMM
import numpy as np
import pandas as pd
#from sklearn.externals import joblib
from sklearn.mixture import GaussianMixture

# Library for reading flac audio file
import soundfile as sf
#from scipy.io.wavfile import read

# Library for pytorch
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import torchvision
from torchvision import models, transforms

In [2]:
def float_formatter(value):
    """Render a float with exactly four digits after the decimal point."""
    return "{:.4f}".format(value)


# Apply the fixed-point formatter to every float NumPy prints in this notebook.
np.set_printoptions(formatter=dict(float_kind=float_formatter))

# Preprocess, Dataset, DataLoader definition

In [3]:
class Preprocess(object):
    """
    Callable feature extractor for audio data.

    Attributes
    ----------
    extractor : object or None
        Extractor instance built by the most recent successful call;
        None before the first call.
    features : object or None
        Features returned by the most recent successful call;
        None before the first call.
    """
    def __init__(self):
        """Create a preprocessor with no extractor and no cached features."""
        self.extractor = None
        self.features = None

    def __call__(self, y, sr, feature, dynamic=True):
        """
        Extract features from one signal with the requested front-end.

        Parameters
        ----------
        y :
            Audio samples, handed unchanged to the extractor constructor.
        sr :
            Sampling rate, handed unchanged to the extractor constructor.
        feature : str
            Front-end to use: 'LFCC', 'MFCC' or 'CQCC'.
        dynamic : bool
            Forwarded as ``delta`` to ``extract_feature``. Defaults to True,
            which matches the previous hard-coded behaviour.

        Returns
        -------
        Whatever ``extract_feature`` of the selected extractor returns.

        Raises
        ------
        ValueError
            If ``feature`` is not one of the supported names.
        """
        # The extractor classes are looked up lazily so that only the
        # requested front-end has to be importable.
        if feature == 'LFCC':
            extractor = LFCC(y, sr)
        elif feature == 'MFCC':
            extractor = MFCC(y, sr)
        elif feature == 'CQCC':
            extractor = CQCC(y, sr)
        else:
            # Previously an unknown name fell through and reused whatever
            # extractor a prior call had left behind (or crashed on None).
            raise ValueError(
                "Unknown feature type {!r}; expected 'LFCC', 'MFCC' or 'CQCC'".format(feature)
            )

        self.extractor = extractor
        self.features = self.extractor.extract_feature(delta=dynamic)

        return self.features

In [4]:
# Shared Preprocess instance; later cells pass it as `preprocess` to the
# ASVspoofDataSet objects for both the train and the dev phase.
process = Preprocess()

In [5]:
!ls

cqcc.py		    lfcc_genuine.csv  mfcc.py		utterance3.wav
genuine_gmms.model  lfcc_gmm.py       __pycache__
GMMs_train.ipynb    lfcc.py	      s1260057_report1
LA_T_1028533.flac   lfcc_spoofed.csv  utterance0.wav


In [8]:
# Read one training utterance: `y` holds the samples, `sr` the sampling rate.
y, sr = sf.read('LA_T_1028533.flac')

# Number of samples covered by a 32 ms window at this sampling rate.
print(sr*32//1000)

# Extract LFCCs with delta=True and transpose the result.
# NOTE(review): after the transpose the layout is presumably
# (frames, coefficients) -- the recorded output shape is (113, 60); confirm in lfcc.py.
ext = LFCC(y, sr)
lfcc = ext.extract_feature(delta=True).T

# Show one row of the transposed matrix and the overall shape.
print(lfcc[3])
print(lfcc.shape)

512
[-16.5522 -0.4726 0.1725 0.3888 0.1829 0.3511 -0.0702 0.1470 0.1563 0.3022
 0.0253 0.2734 -0.0234 0.1721 -0.0298 0.0777 0.0121 0.1056 0.0020 -0.0753
 0.0384 -0.2158 -0.3082 -0.2376 -0.1667 0.2446 0.1779 -0.0823 -0.0529
 -0.2009 0.1487 0.1664 -0.1119 -0.0733 0.2524 -0.2201 0.1062 0.2401 0.2084
 0.2276 -0.7961 -0.2876 -0.3175 -0.4325 -0.4009 0.0602 -0.0161 0.0760
 -0.2515 -0.2867 -0.2888 -0.3918 0.1669 0.0195 -0.1686 0.0731 -0.1411
 -0.1029 -0.0962 0.0587]
(113, 60)


In [None]:
def make_datapath_list(phase='train', root_path="/DB/Audio/English/ASVspoof2019/LA/"):
    """
    Collect the paths of all .flac files belonging to one dataset phase.

    Parameters
    ----------
    phase : str
        'train', 'dev' or 'eval'; selects the ``ASVspoof2019_LA_<phase>``
        sub-directory.
    root_path : str
        Directory that contains the ``ASVspoof2019_LA_<phase>`` folders.
        Defaults to the location used throughout this notebook.

    Returns
    ----------
    path_list : list
        Paths matching ``<root>/ASVspoof2019_LA_<phase>/flac/*.flac``, sorted
        so that the order is the same on every run (glob itself returns
        results in arbitrary order). Empty when nothing matches.
    """
    # Let os.path.join insert the separators instead of relying on the
    # root path ending with a slash.
    target_path = os.path.join(root_path, 'ASVspoof2019_LA_' + phase, 'flac', '*.flac')
    print(target_path)

    return sorted(glob.glob(target_path))

# Build the file lists for both phases. They are reused by later cells:
# `train_list` feeds the training dataset and `dev_list` the development dataset.
train_list = make_datapath_list(phase='train')
dev_list = make_datapath_list(phase='dev')

#print(train_list)

#print(dev_list)

#print(len(train_list), len(dev_list))
    

In [None]:
# Dataset definition (wrapped by a DataLoader in a later cell)
class ASVspoofDataSet(data.Dataset):
    """
    Dataset for ASVspoof2019 LA, derived from torch.utils.data.Dataset.

    Attributes:
    --------------
    file_list: list
        paths of the audio files served by this dataset

    label_list: list or None
        (filename, label) pairs; read from the protocol file for the
        'train' and 'dev' phases, otherwise the value passed by the caller

    label_path: str or None
        protocol directory for 'train'/'dev', None for any other phase

    preprocess: callable
        called as preprocess(y=..., sr=..., feature='LFCC') on every item

    phase: str
        'train' or 'dev' or 'eval'

    root_path: str
        directory that contains the ASVspoof2019 LA folders
    """

    # Protocol file that lists (filename, label) for each supported phase.
    _PROTOCOL_FILES = {
        'train': 'ASVspoof2019.LA.cm.train.trn.txt',
        'dev': 'ASVspoof2019.LA.cm.dev.trl.txt',
    }

    def __init__(self, file_list, label_list=None, preprocess=None, phase='train',
                 root_path='/DB/Audio/English/ASVspoof2019/LA/'):
        """
        Parameters
        ----------
        file_list: list
            list of audio files to read

        label_list: list
            (filename, label) pairs. Only used when phase is neither 'train'
            nor 'dev'; for those two phases the protocol file is authoritative
            and replaces this argument. Labels are kept as the strings found
            in the protocol file (e.g. 'bonafide' / 'spoof').

        preprocess: callable
            feature extractor applied to every item (e.g. a Preprocess instance)

        phase: str
            'train' or 'dev' or 'eval'

        root_path: str
            directory that contains 'ASVspoof2019_LA_cm_protocols'
        """

        self.phase = phase
        self.preprocess = preprocess
        self.root_path = root_path
        self.file_list = file_list
        self.label_path = None
        self.label_list = label_list

        protocol_name = self._PROTOCOL_FILES.get(self.phase)
        if protocol_name is None:
            print("You must pass either phase='train' or phase='dev'")
        else:
            # Trailing '' keeps the directory separator at the end of label_path.
            self.label_path = os.path.join(self.root_path, 'ASVspoof2019_LA_cm_protocols', '')
            self.label_list = self._read_protocol(os.path.join(self.label_path, protocol_name))

        # One dictionary lookup per item instead of scanning the whole protocol.
        # For duplicated filenames the last entry wins, as in the former scan.
        self._label_by_name = dict(self.label_list or [])

    @staticmethod
    def _read_protocol(protocol_path):
        """Return the (filename, label) pairs listed in one protocol file."""
        entries = []
        with open(protocol_path, mode='r') as protocols:
            for line in protocols:
                fields = line.split()
                if len(fields) < 2:
                    # Blank or malformed line: nothing to index.
                    continue
                # Second column is the filename, last column is the label.
                entries.append((fields[1], fields[-1]))
        return entries

    def _label_for(self, speech_path):
        """Return the label of the file at ``speech_path``, or None if unlisted."""
        # splitext removes only the extension; str.rstrip('.flac') would also
        # eat trailing 'f', 'l', 'a', 'c' characters of the name itself.
        speech_name = os.path.splitext(os.path.basename(speech_path))[0]
        return self._label_by_name.get(speech_name)

    def __len__(self):  # required by torch.utils.data.Dataset
        return len(self.file_list)

    def __getitem__(self, index):  # required by torch.utils.data.Dataset
        """
        Load one audio file and return (features, label).

        ``label`` is None when the file is not listed in the label data.

        Raises
        ------
        TypeError
            If the dataset was created without a ``preprocess`` callable.
        """
        if self.preprocess is None:
            raise TypeError("ASVspoofDataSet needs a `preprocess` callable to extract features")

        # load audio
        speech_path = self.file_list[index]
        speech, sr = sf.read(speech_path)

        # extract features with the configured front-end
        features = self.preprocess(y=speech, sr=sr, feature='LFCC')

        return features, self._label_for(speech_path)
    
# Smoke test: build the training dataset and look at its first few items.
file_list = train_list
asvspoof_train = ASVspoofDataSet(phase='train', preprocess=process, file_list=file_list)

# Number of leading items to inspect.
iterations = 10

for itr in range(iterations):
    sample = asvspoof_train[itr]
    feature, label = sample
    print("60 vectors", feature.T.shape)
    print("audiofile label: ", label)
    print()


# GMMs training section

Hyper parameters for training

LFCCs:

    window_len = 20ms
    nfft = 512
    # of filters = 20
    dynamic features = delta, delta-delta included

GMMs:

    n_components = 512
    

In [None]:
batch_size = 32


def collate_utterances(batch):
    """
    Group a batch of (features, label) items without stacking them.

    The default collate function stacks the samples of a batch into a single
    tensor, which requires every sample to have the same shape. Feature
    matrices of utterances with different durations do not, so the batch is
    returned as two plain lists instead.

    Parameters
    ----------
    batch : list of (features, label) tuples, as produced by the dataset

    Returns
    -------
    (list, list)
        The features and the labels, in batch order.
    """
    features, labels = zip(*batch)
    return list(features), list(labels)


# instantiate DataLoader
train_dataloader = data.DataLoader(
    asvspoof_train,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_utterances,
)

val_dataloader = None #data.DataLoader()

dataloader_dict = {
    "train": train_dataloader,
    "val": val_dataloader
}

batch_iterator = iter(dataloader_dict["train"])
inputs, labels = next(batch_iterator) # get first element


In [None]:

train_path = '/DB/Audio/English/ASVspoof2019/LA/ASVspoof2019_LA_train/flac/*'
dev_path = '/DB/Audio/English/ASVspoof2019/LA/ASVspoof2019_LA_dev/flac/*'

speech_count = 0
# Iterate over exactly the files the dataset serves; counting a separate glob
# could disagree with the dataset's own file list and index out of range.
total = len(asvspoof_train)
print("total_speech:", total)

# Percentage marks at which progress is reported.
progress = np.arange(10.0, 110.0, 10)
next_mark = 0

# Collect the per-utterance tables in lists and concatenate once at the end:
# growing a DataFrame inside the loop copies all rows on every step, and
# DataFrame.append is no longer available in pandas 2.x.
genuine_frames = []
spoofed_frames = []

for itr in range(total):

    feature, label = asvspoof_train[itr]
    feature_df = pd.DataFrame(feature.T)

    if label == 'bonafide':
        genuine_frames.append(feature_df)
    else:
        # NOTE(review): items whose label is None (file missing from the
        # protocol) also land here, as before -- confirm that is intended.
        spoofed_frames.append(feature_df)

    speech_count += 1

    # Report when a mark is reached or passed; comparing floats for exact
    # equality would skip most marks. '%%' prints a literal percent sign.
    percent_done = speech_count / total * 100
    if next_mark < len(progress) and percent_done >= progress[next_mark]:
        print('%.2f %% complete' % percent_done)
        while next_mark < len(progress) and percent_done >= progress[next_mark]:
            next_mark += 1

# pd.concat rejects an empty list, so fall back to an empty table.
genuine_df = pd.concat(genuine_frames, ignore_index=True) if genuine_frames else pd.DataFrame()
spoofed_df = pd.concat(spoofed_frames, ignore_index=True) if spoofed_frames else pd.DataFrame()
    

In [None]:
# Row counts of the bona fide and the spoofed feature tables.
print(len(genuine_df), len(spoofed_df))

In [None]:
# For each class: drop rows containing NaN, then write the re-indexed table to CSV.
for frame, csv_path in ((genuine_df, './lfcc_genuine.csv'),
                        (spoofed_df, './lfcc_spoofed.csv')):
    frame.dropna(inplace=True)
    frame.reset_index(drop=True).to_csv(csv_path, index=False)

In [None]:
# Reload the cached per-class feature tables from disk.
genuine_csv, spoofed_csv = './lfcc_genuine.csv', './lfcc_spoofed.csv'
Xg = pd.read_csv(genuine_csv)
Xs = pd.read_csv(spoofed_csv)

In [None]:
# check count
print(len(Xg), len(Xs))

# Drop rows containing NaN or +/-inf from both tables.
# replace() returns a new frame, so the result has to be assigned back;
# chaining dropna(inplace=True) onto it only modified a temporary copy and
# left the infinities in place.
Xg = Xg.replace([np.inf, -np.inf], np.nan).dropna()
Xs = Xs.replace([np.inf, -np.inf], np.nan).dropna()

# check count again
print(len(Xg), len(Xs))

In [None]:
# GMM training: fit the bona fide (genuine) model on the LFCC table,
# with n_components = 512 diagonal-covariance Gaussians.
n_components = 512

# A fixed random_state makes the initialisation, and therefore the fitted
# model, the same on every run.
genuine_gmms = GaussianMixture(n_components=n_components, covariance_type='diag', max_iter=10, random_state=0)

# Estimate the parameters with the EM algorithm. A plain array is passed so
# the model does not record DataFrame column names, which arrays used for
# scoring later do not carry.
genuine_gmms.fit(Xg.to_numpy())

Xg.head()

In [None]:
Xs = pd.read_csv('./lfcc_spoofed.csv')
# check count
print(len(Xs))

# Drop rows containing NaN or +/-inf.
# replace() returns a new frame, so the result has to be assigned back;
# chaining dropna(inplace=True) onto it only modified a temporary copy.
Xs = Xs.replace([np.inf, -np.inf], np.nan).dropna()

# check count again
print(len(Xs))

In [None]:
from sklearn.mixture import GaussianMixture

# Fit the spoofed model with the same configuration as the genuine one.
n_components = 512

# A fixed random_state makes the initialisation, and therefore the fitted
# model, the same on every run.
spoofed_gmms = GaussianMixture(n_components=n_components, covariance_type='diag', max_iter=10, random_state=0)

# Estimate the parameters with the EM algorithm. A plain array is passed so
# the model does not record DataFrame column names, which arrays used for
# scoring later do not carry.
spoofed_gmms.fit(Xs.to_numpy())

Xs.head()

In [None]:
import joblib

# Persist both fitted mixtures to the working directory, one file per model.
joblib.dump(genuine_gmms, 'genuine_gmms.model')
joblib.dump(spoofed_gmms, 'spoofed_gmms.model')

In [None]:

# Development dataset, built with the same shared preprocessor.
asvspoof_dev = ASVspoofDataSet(phase='dev', preprocess=process, file_list=dev_list)

# Number of leading items to inspect.
iterations = 10

for itr in range(iterations):
    sample = asvspoof_dev[itr]
    feature, label = sample
    print("60 vectors", feature.T.shape)
    print("audiofile label: ", label)
    print()

In [None]:

dev_path = '/DB/Audio/English/ASVspoof2019/LA/ASVspoof2019_LA_dev/flac/*'

# Set to a small integer for a quick smoke run; None scores every dev item.
max_utterances = None

speech_count = 0
# Score exactly the files the dataset serves.
total = len(asvspoof_dev) if max_utterances is None else min(max_utterances, len(asvspoof_dev))
print("total_speech:", total)

# Percentage marks at which progress is reported.
progress = np.arange(10.0, 110.0, 10)
next_mark = 0

# One log-likelihood ratio per utterance, in dataset order.
scores = []

for itr in range(total):

    feature, label = asvspoof_dev[itr]

    # Transpose as in the training cells before scoring.
    # NOTE(review): assumes `feature` is 2-D so that rows of `frames` are
    # feature vectors -- confirm against the extractor.
    frames = np.asarray(feature, dtype=float).T
    # The training tables had rows with NaN/inf removed; do the same here,
    # since the models cannot score non-finite values.
    frames = frames[np.isfinite(frames).all(axis=1)]

    if len(frames) == 0:
        # Nothing left to score for this utterance.
        scores.append(np.nan)
    else:
        # score() already returns the average per-sample log-likelihood as a
        # scalar, so no further averaging is needed.
        loglh_genuine = genuine_gmms.score(frames)
        loglh_spoofed = spoofed_gmms.score(frames)

        # compute log-likelihood ratio
        scores.append(loglh_genuine - loglh_spoofed)

    speech_count += 1

    # Report when a mark is reached or passed; '%%' prints a literal percent sign.
    percent_done = speech_count / total * 100
    if next_mark < len(progress) and percent_done >= progress[next_mark]:
        print('%.2f %% complete' % percent_done)
        while next_mark < len(progress) and percent_done >= progress[next_mark]:
            next_mark += 1

score = np.array(scores, dtype=float)

# Store the scores as plain text, one per line. np.save would write a binary
# file and append '.npy' to this name.
np.savetxt('scores_cm_LA_LFCC.txt', score)

In [None]:
with open('scores_cm_LA_LFCC.txt', mode='w') as score_file:
    