# LFCC-GMM training baseline
   #### number of training = 25,380
    genuine 2,580 spoof 22,800
   
   #### number of development = 24,844
    genuine 2,548 spoof 22,296

In [1]:
# GMMs(Gaussian Mixture Models) front-end are LFCCs and CQCCs
# My library
from lfcc import *

# Library for dataloader
import os.path
import glob

# Library for LFCC-GMM
import numpy as np
import pandas as pd
import joblib
from sklearn.mixture import GaussianMixture

# Library for reading flac audio file
import soundfile as sf
#from scipy.io.wavfile import read

# Library for pytorch
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import torchvision
from torchvision import models, transforms

Matplotlib created a temporary config/cache directory at /tmp/matplotlib-cpf811uq because the default path (/home/.cache/matplotlib) is not a writable directory; it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.


In [2]:
# Print every float inside NumPy arrays with exactly four decimal places.
float_formatter = "{:.4f}".format

np.set_printoptions(formatter=dict(float_kind=float_formatter))

# Preprocess, Dataset, Dataloader definition

In [4]:
class Preprocess(object):
    """
    Callable front-end that turns a waveform into frame-level features.

    Attributes
    ----------
    extractor : object or None
        Extractor instance built by the most recent successful call
        (None until the first call).
    features : object or None
        Value returned by the most recent ``extract_feature`` call.
    """
    def __init__(self):
        """Create a front-end with no extractor and no cached features."""
        self.extractor = None
        self.features = None

    def __call__(self, y, sr, feature, dynamic=True):
        """
        Extract features with LFCC, MFCC or CQCC.

        Parameters
        ----------
        y :
            Waveform samples, handed unchanged to the extractor.
        sr :
            Sampling rate, handed unchanged to the extractor.
        feature : str
            Which extractor to build: 'LFCC', 'MFCC' or 'CQCC'.
        dynamic : bool
            Forwarded as ``delta=`` to ``extract_feature``. The default (True)
            matches the previous behaviour, where ``delta=True`` was hard-coded
            and this argument was ignored.

        Returns
        ----------
        features :
            Whatever ``extract_feature`` returns for the selected extractor.

        Raises
        ----------
        ValueError
            If ``feature`` is not one of the supported names. Previously an
            unknown name reused the extractor of the preceding call (stale
            audio) or failed with an AttributeError on None.
        """
        # The extractor classes are looked up only in the branch that needs them,
        # so a front-end that is not imported does not break the others.
        if feature == 'LFCC':
            extractor = LFCC(y, sr)
        elif feature == 'MFCC':
            extractor = MFCC(y, sr)
        elif feature == 'CQCC':
            extractor = CQCC(y, sr)
        else:
            raise ValueError(
                "Unknown feature {!r}; expected 'LFCC', 'MFCC' or 'CQCC'".format(feature)
            )

        self.extractor = extractor
        self.features = extractor.extract_feature(delta=dynamic)

        return self.features

In [5]:
# List the working directory (shell escape) to confirm the sample audio file and helper modules are present.
!ls

baseline_evaluation.ipynb  eval_metrics.py		libCB_v01.py
cm_LA_LFCC_frame.score	   evaluate_tDCF_asvspoof19.py	mfcc.py
cqcc.py			   GMM_sequential.ipynb		models
dask-worker-space	   GMMs_train.ipynb		__pycache__
datasets		   gmm_train.py			pytorch_gmm.ipynb
em_algorithm.py		   LA_T_1028533.flac		scores
em_algorithm_v2.py	   lfcc.py


In [6]:
import librosa

# Load one training utterance; sf.read returns (samples, sampling rate).
y, sr = sf.read('LA_T_1028533.flac')

B = 96 # number of bins per octave
fmax = sr//2
fmin = fmax/(2**9) # 9 being number of octave
fbas = B * int(np.log2(fmax/fmin)) # total number of CQT bins = bins per octave * octaves
print(fbas)

# `sr` is passed by keyword: newer librosa releases accept only `y` positionally,
# while the keyword form works on old and new versions alike.
x_fea = librosa.cqt(y, sr=sr, hop_length=(2**7)*1, fmin=fmin, n_bins=fbas, bins_per_octave=B,
                    tuning=None, norm=len(y), sparsity=0)
x_fea = np.abs(x_fea)
# NOTE(review): the author expected (863, 214); the recorded output of this cell is (864, 229).
x_fea.shape

864




(864, 229)

In [3]:
# Sanity-check the LFCC extractor on a single training utterance.
y, sr = sf.read('LA_T_1028533.flac')

# Number of samples in a 20 ms window at this sampling rate.
print(sr * 20 // 1000)

extractor = LFCC(y, sr)
# Transposed so that indexing the first axis yields one row per printed line below.
lfcc = extractor.extract_feature(delta=True).T

# Show the first five coefficients of the first five rows.
for row_idx in range(5):
    print(lfcc[row_idx][:5])

print(lfcc.shape)

320
step_length: 160
wave_length: 29219, frame_length: 320, nshift: 181
[-17.7583 -0.5317 0.4484 0.3518 -0.2962]
[-18.0314 -0.4648 0.3014 0.3144 0.1563]
[-17.8703 -0.4279 -0.0296 0.1293 -0.0894]
[-17.9633 -0.3921 0.5168 0.5181 0.2447]
[-17.9169 -0.2804 0.0427 0.5191 0.1362]
(181, 60)


In [None]:
def make_datapath_list(phase='train', root_path="/DB/Audio/English/ASVspoof2019/LA/"):
    """
    Make a sorted list of paths to the .flac files of one data split.

    Parameters
    ----------
    phase: 'train' or 'dev' or 'eval'
        specify whether data is for train or development or evaluation
    root_path: str
        directory that contains the ``ASVspoof2019_LA_<phase>`` folders;
        defaults to the location previously hard-coded in this function

    Returns
    ----------
    path_list : list
        sorted list of paths matching
        ``<root_path>/ASVspoof2019_LA_<phase>/flac/*.flac``
        (empty when nothing matches)
    """
    # Join the components separately so the result is correct whether or not
    # root_path ends with a path separator.
    target_path = os.path.join(root_path, 'ASVspoof2019_LA_' + phase, 'flac', '*.flac')
    print(target_path)

    # glob returns matches in arbitrary order; sorting makes the list reproducible.
    return sorted(glob.glob(target_path))

# Smoke test: collect the training and development file lists
# (each call also prints the glob pattern it searched).
train_list, dev_list = [make_datapath_list(phase=split) for split in ('train', 'dev')]

#print(train_list)

#print(dev_list)

#print(len(train_list), len(dev_list))
    

In [None]:
# Make dataloader
class ASVspoofDataSet(data.Dataset):
    """
    Dataset class for ASVspoof2019, derived from torch.utils.data.Dataset.

    Attributes:
    --------------
    file_list: list
        list containing a path to data

    preprocess: object
        callable invoked as ``preprocess(y=..., sr=..., feature=...)``

    phase: str
        'train' or 'dev' or 'eval'

    label_list: list
        ``(filename, label)`` pairs; read from the protocol file for
        'train' and 'dev', otherwise the value passed to the constructor

    feature: str
        feature name forwarded to ``preprocess``
    """

    # Protocol file read for each phase that has one.
    _PROTOCOL_FILES = {
        'train': 'ASVspoof2019.LA.cm.train.trn.txt',
        'dev': 'ASVspoof2019.LA.cm.dev.trl.txt',
    }

    def __init__(self, file_list, label_list=None, preprocess=None, phase='train',
                 root_path='/DB/Audio/English/ASVspoof2019/LA/', feature='LFCC'):
        """
        Parameters
        ----------
        file_list: list
            list of audio files to read

        label_list: list
            ``(filename, label)`` pairs; only used when phase is neither
            'train' nor 'dev' (for those two it is replaced by the protocol file)

        preprocess: callable
            pre-processing object applied to every loaded waveform

        phase: str
            specify whether data is for training or development or evaluation('train' or 'dev' or 'eval')

        root_path: str
            directory containing ``ASVspoof2019_LA_cm_protocols``; defaults to
            the location previously hard-coded here

        feature: str
            feature name forwarded to ``preprocess`` (previously fixed to 'LFCC')
        """

        self.phase = phase
        self.preprocess = preprocess
        self.root_path = root_path
        self.feature = feature
        self.file_list = file_list
        self.label_path = None
        self.label_list = label_list

        if self.phase in self._PROTOCOL_FILES:
            self.label_path = os.path.join(self.root_path, 'ASVspoof2019_LA_cm_protocols/')
            self.label_list = self._read_protocol(
                os.path.join(self.label_path, self._PROTOCOL_FILES[self.phase]),
                self.phase,
            )
        else:
            print("You must pass either phase='train' or phase='dev'")

        # filename -> label, so __getitem__ does one dictionary lookup instead
        # of scanning the whole protocol for every utterance. As with the old
        # scan, the last entry wins if a filename appears twice.
        self._label_map = dict(self.label_list or [])

    @staticmethod
    def _read_protocol(protocol_path, phase):
        """Parse one protocol file into a list of ``(filename, label)`` pairs."""
        entries = []
        with open(protocol_path, mode='r') as protocols:
            for line in protocols:
                fields = line.split()
                if not fields:
                    # blank line (e.g. at end of file): nothing to parse
                    continue
                if phase == 'train':
                    label = fields[-1]
                else:
                    # first, fourth and last columns; presumably speaker id,
                    # attack id and bonafide/spoof key — confirm against the
                    # protocol README
                    label = (fields[0], fields[3], fields[-1])
                entries.append((fields[1], label))
        return entries

    @staticmethod
    def _speech_name(speech_path):
        """Return the file name of ``speech_path`` without a trailing '.flac'."""
        name = os.path.basename(speech_path)
        # Remove the suffix itself. str.rstrip('.flac') would strip any trailing
        # run of the characters '.', 'f', 'l', 'a', 'c' and can eat into the name.
        if name.endswith('.flac'):
            name = name[:-len('.flac')]
        return name

    def __len__(self): # required override
        return len(self.file_list)

    def __getitem__(self, index): # required override
        """
        Get the pre-processed features of one utterance and its label.

        Returns
        ----------
        (features, label)
            ``label`` is None when the file name has no entry in ``label_list``.
        """

        # load audio
        speech_path = self.file_list[index]
        speech, sr = sf.read(speech_path)

        # preprocessing and extract features
        features = self.preprocess(y=speech, sr=sr, feature=self.feature)

        label = self._label_map.get(self._speech_name(speech_path))

        return features, label
    
# test

# `process` is used by this and later cells, but no earlier cell defines it;
# create the pre-processing front-end here so the notebook runs on a fresh kernel.
process = Preprocess()

asvspoof_train = ASVspoofDataSet(file_list=train_list, preprocess=process, phase='train')

# get 10 files and its label
iterations = 10

# Never index past the end of the dataset when fewer than `iterations` files exist.
for itr in range(min(iterations, len(asvspoof_train))):
    feature, label = asvspoof_train[itr]
    print("60 vectors", feature.T.shape)
    print("audiofile label: ", label)
    print()

In [None]:
# undersampling number of bonafide 'file' from spoofed speech 'file'
# n_bonafide file = 2,580
# n_spoof file = 22,800
# NOTE: the code below is disabled — it sits inside a bare string literal and is never executed.
# It would split asvspoof_train.label_list into bonafide and spoof file names and randomly keep
# n_bonafide_file spoof names (sampling without replacement).
"""
n_bonafide_file = 2580

genuine_list = []
spoof_list = []

for fname, la in asvspoof_train.label_list:
    if la == 'bonafide':
        genuine_list.append(fname)
    else:
        spoof_list.append(fname)

print(len(genuine_list),len(spoof_list))

# randomly sample n_bonafide data from spoof list
downsampled = np.random.choice(spoof_list, size=n_bonafide_file, replace=False)

spoof_list = downsampled
print(len(spoof_list))
print(genuine_list[:4])
print(spoof_list[:4])
"""


In [None]:
##########################################
# Making csv undersampled dataset for GMMs
##########################################
#! This method doesnt use asvdataset.__getitem__
# NOTE: the code below is disabled — it sits inside a bare string literal and is never executed.
# NOTE(review): if it is ever re-enabled, the loop assigns the extracted features to `features`
# but builds the DataFrame from `feature.T`, i.e. from whatever `feature` another cell left behind.
# It also relies on `genuine_list`, which is only created inside another disabled cell, and on
# DataFrame.append, which pandas 2.0 removed.
"""
train_path = '/DB/Audio/English/ASVspoof2019/LA/ASVspoof2019_LA_train/flac/'

genuine_df = pd.DataFrame()

total = len(genuine_list)
count = 0
print('total file', total)

progress=np.linspace(10,100,10)

for fname in genuine_list:
    # spoof_list is undersampled
    prog = count/total*100
    if prog in progress:
        print('{}% complete'.format(prog))

    # load audio
    speech, sr = sf.read(train_path+fname+'.flac')
    # preprocessing and extract features
    features = process(y=speech, sr=sr, feature='LFCC') # preprocess to speech, not implemented yet

    feature_df = pd.DataFrame(feature.T)

    genuine_df = genuine_df.append(feature_df, ignore_index=True)
    
    count += 1
    
print('end')
"""


In [None]:
"""
##########################################
# Making csv undersampled dataset for GMMs
##########################################
#! This method doesnt use asvdataset.__getitem__

train_path = '/DB/Audio/English/ASVspoof2019/LA/ASVspoof2019_LA_train/flac/'

spoofed_df = pd.DataFrame()

total = len(spoof_list)
count = 0
print('total file', total)

progress=np.linspace(10,100,10)

for file in spoof_list:
    # spoof_list is undersampled
    prog = count/total*100
    if prog in progress:
        print('{}% complete'.format(prog))

    # load audio
    speech, sr = sf.read(train_path+file+'.flac')
    # preprocessing and extract features
    features = process(y=speech, sr=sr, feature='LFCC') # preprocess to speech, not implemented yet

    feature_df = pd.DataFrame(feature.T)
    
    spoofed_df = spoofed_df.append(feature_df, ignore_index=True)
    
    count += 1

print('end')
"""


# GMMs training section

Hyper parameters for training

LFCCs:

    window_len = 20ms
    nfft = 512
    # of filters = 20
    dynamic features = delta, delta-delta included

GMMs:

    n_components = 512
    

In [None]:
# Number of utterances per mini-batch.
batch_size = 32

# Instantiate the DataLoader over the training dataset, reshuffled on every pass.
train_dataloader = data.DataLoader(dataset=asvspoof_train, batch_size=batch_size, shuffle=True)

# No validation loader is built yet.
val_dataloader = None #data.DataLoader()

dataloader_dict = dict(train=train_dataloader, val=val_dataloader)

# Fetch the first batch as a smoke test.
# NOTE(review): this uses the default collate function; presumably it cannot batch
# utterances whose feature matrices differ in frame count — confirm.
batch_iterator = iter(dataloader_dict["train"])
inputs, labels = next(batch_iterator) # get first element


In [None]:

# Retained so the notebook-level name stays defined; the loop below walks the dataset itself.
train_path = '/DB/Audio/English/ASVspoof2019/LA/ASVspoof2019_LA_train/flac/*'

# Iterate over exactly the utterances the dataset exposes. Counting a separate
# glob could disagree with the dataset's own file list and index past its end.
speech_count = 0
total = len(asvspoof_train)
print("total_speech:", total)

# Collect one frame-level DataFrame per utterance and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0, and growing a frame inside the loop
# copies all previous rows on every iteration.
genuine_frames = []
spoofed_frames = []

next_report = 10 # next percentage at which progress is reported

for itr in range(total):

    feature, label = asvspoof_train[itr]
    feature_df = pd.DataFrame(feature.T)

    # Anything that is not labelled 'bonafide' (including a missing label)
    # goes to the spoofed set, as before.
    if label == 'bonafide':
        genuine_frames.append(feature_df)
    else:
        spoofed_frames.append(feature_df)

    speech_count = itr + 1

    # Integer percentage: the old float-equality test against np.linspace values
    # only fired when the division happened to be exact.
    percent_done = speech_count * 100 // total
    if percent_done >= next_report:
        print('{}% complete'.format(percent_done))
        next_report = (percent_done // 10 + 1) * 10

# pd.concat raises on an empty list, so fall back to an empty frame.
genuine_df = pd.concat(genuine_frames, ignore_index=True) if genuine_frames else pd.DataFrame()
spoofed_df = pd.concat(spoofed_frames, ignore_index=True) if spoofed_frames else pd.DataFrame()

print('end')

In [None]:
# Fetch the first training utterance (features and label) through the indexing protocol.
feature, label = asvspoof_train[0]

# Sentence-based-scaling npy file save

In [None]:
# Make sentence based scaling dataset

import numpy as np
from sklearn import preprocessing

train_path = '/DB/Audio/English/ASVspoof2019/LA/ASVspoof2019_LA_train/flac/*'

# np.save does not create missing directories, so make sure both targets exist.
for out_dir in ('./datasets/original/train/lfcc', './datasets/scaled/train/lfcc'):
    os.makedirs(out_dir, exist_ok=True)

# Gather the per-utterance matrices in lists and stack once at the end; calling
# np.vstack inside the loop re-copies every previously stacked frame each time.
# Each list starts with an empty (0, 60) array so the final stack is valid (and
# 60 columns wide) even if a class has no utterances.
bona_chunks = [np.empty((0, 60))]
spoof_chunks = [np.empty((0, 60))]

for itr in range(len(asvspoof_train)):

    feature, label = asvspoof_train[itr]

    # unscaled features, one file per utterance (np.save appends '.npy')
    np.save('./datasets/original/train/lfcc/{}_{}'.format(label, itr), feature.T)

    # standardise each column using this utterance's own frames
    feature_scaled = preprocessing.scale(feature.T)

    np.save('./datasets/scaled/train/lfcc/{}_{}'.format(label, itr), feature_scaled)

    # Anything not labelled 'bonafide' is stacked with the spoofed data, as before.
    if label == 'bonafide':
        bona_chunks.append(feature_scaled)
    else:
        spoof_chunks.append(feature_scaled)

lfcc_scaled_bona = np.vstack(bona_chunks)
lfcc_scaled_spoof = np.vstack(spoof_chunks)

print('end')
print(lfcc_scaled_bona.shape)# should be 542574
print(lfcc_scaled_spoof.shape)# should be 4853674

In [None]:
# Persist the stacked, sentence-scaled LFCC matrix of each class.
np.save(file='./datasets/scaled/train/lfcc/all_bonafide.npy', arr=lfcc_scaled_bona)
np.save(file='./datasets/scaled/train/lfcc/all_spoof.npy', arr=lfcc_scaled_spoof)

In [None]:
# NOTE(review): no cell in this notebook writes all_data_lfcc.npy; presumably it was
# produced elsewhere — confirm before relying on this cell.
d = np.load(file='./datasets/scaled/train/all_data_lfcc.npy')

In [None]:

# Development-set counterpart of the per-utterance feature dump.
asvspoof_dev = ASVspoofDataSet(file_list=dev_list, preprocess=process, phase='dev')

for itr in range(len(asvspoof_dev)):

    feature, label = asvspoof_dev[itr]

    # Utterances without a protocol entry come back with label None and are skipped.
    if label is not None:
        # The last element of the dev label tuple is used in the file name.
        np.save('./datasets/original/dev/lfcc/{}_{}.npy'.format(label[-1], itr), feature.T)

        # standardise each column using this utterance's own frames
        feature_scaled = preprocessing.scale(feature.T)
        np.save('./datasets/scaled/dev/lfcc/{}_{}.npy'.format(label[-1], itr), feature_scaled)

print('end')

In [None]:
# Number of frames collected for each class (genuine first, then spoofed).
print(*(len(frame_table) for frame_table in (genuine_df, spoofed_df)))

In [None]:
# remove inf and nan from genuine_df and spoofed_df
#
# replace() returns a new DataFrame, so the former
# `df.replace(...).dropna(inplace=True)` only cleaned a temporary copy and the
# CSV was written with the inf/nan rows still present. Rebind the cleaned result.
genuine_df = genuine_df.replace([np.inf, -np.inf], np.nan).dropna().reset_index(drop=True)
genuine_df.to_csv('./datasets/lfcc_genuine_downsampled.csv', index=False)

spoofed_df = spoofed_df.replace([np.inf, -np.inf], np.nan).dropna().reset_index(drop=True)
spoofed_df.to_csv('./datasets/lfcc_spoofed_downsampled.csv', index=False)

In [None]:
# Frame-level genuine features, one row per frame.
# NOTE(review): the cleaning cell writes 'lfcc_genuine_downsampled.csv', while this
# reads 'lfcc_genuine.csv' — confirm which file is intended.
Xg = pd.read_csv(filepath_or_buffer='./datasets/lfcc_genuine.csv')

n_genuine = len(Xg)

In [None]:
# Only n_genuine exists at this point; n_spoofed is defined by a later cell
# (which prints both counts), so referencing it here raised NameError on a fresh kernel.
print('n_genuine:{}'.format(n_genuine))

In [None]:
# GMMs training

#Xg = pd.read_csv('./lfcc_genuine.csv')

# speaker embedding by using GMMs, where n_components = 512
# Number of mixture components of the genuine-speech model.
n_components = 512

# Diagonal-covariance GMM with random initialisation and 10 EM iterations.
# fit() returns the estimator itself, so construction and training are chained.
# TODO(review): random_state=None makes every run differ; consider fixing a seed.
genuine_gmms = GaussianMixture(
    n_components=n_components,
    covariance_type='diag',
    init_params='random',
    max_iter=10,
    random_state=None,
).fit(Xg)

Xg.head()

In [None]:
# Frame-level spoofed features, one row per frame.
Xs = pd.read_csv(filepath_or_buffer='./datasets/lfcc_spoofed.csv')

n_spoofed = len(Xs)

In [None]:
# Frame counts per class, followed by their total.
print('n_genuine:{}, n_spoofed:{}'.format(n_genuine, n_spoofed), sum((n_genuine, n_spoofed)))

In [None]:
# Randomly keep as many spoofed frames as there are genuine frames.
# Sampling without replacement cannot draw more items than exist, so cap the
# sample size at n_spoofed instead of raising when n_genuine is larger.
n_keep = min(n_genuine, n_spoofed)
Xs_downsampled_idx = np.random.choice(np.arange(n_spoofed), size=n_keep, replace=False)

print(Xs_downsampled_idx.shape)

# The sampled values are row positions (0 .. n_spoofed-1), so select by position.
# .loc selects by index label and only coincides with this on a default RangeIndex.
Xs_new = Xs.iloc[Xs_downsampled_idx]

In [None]:
# Preview the first five rows of the downsampled spoofed frames.
Xs_new.head(n=5)

In [None]:
import pandas as pd
from sklearn.mixture import GaussianMixture

#Xs = pd.read_csv('./lfcc_spoofed.csv')

# Number of mixture components of the spoofed-speech model.
n_components = 512

#n_sample = Xs.shape[0]

# Same configuration as the genuine model, trained on the downsampled spoofed frames.
# fit() returns the estimator itself, so construction and training are chained.
# TODO(review): random_state=None makes every run differ; consider fixing a seed.
spoofed_gmms = GaussianMixture(
    n_components=n_components,
    covariance_type='diag',
    init_params='random',
    max_iter=10,
    random_state=None,
).fit(Xs_new)

In [None]:
import joblib

# Write the trained models where the "load models" cell reads them from
# ('./models/'). They were previously dumped into the working directory, so a
# fresh run would have loaded whatever older files sat in './models/'.
os.makedirs('./models', exist_ok=True)

joblib.dump(genuine_gmms, './models/genuine_gmms.model')
joblib.dump(spoofed_gmms, './models/spoofed_gmms.model')

In [None]:

# Rebuild the development dataset and peek at the labels of its first utterances.
asvspoof_dev = ASVspoofDataSet(file_list=dev_list, preprocess=process, phase='dev')

# get 10 files and its label
iterations = 10

for itr in range(iterations):
    feature, label = asvspoof_dev[itr]
    print("audiofile label:", label)

In [None]:
# Load the genuine and spoofed GMMs back from disk, in that order.
# NOTE(review): joblib.load unpickles the file; only load model files you created yourself.
genuine_gmms, spoofed_gmms = (
    joblib.load(model_path)
    for model_path in ('./models/genuine_gmms.model', './models/spoofed_gmms.model')
)

In [None]:

# Retained so the notebook-level name stays defined; the loop below walks the dataset itself.
dev_path = '/DB/Audio/English/ASVspoof2019/LA/ASVspoof2019_LA_dev/flac/*'

# Iterate over exactly the utterances the dataset exposes.
speech_count = 0
total = len(asvspoof_dev)
print("total_speech:", total)

next_report = 10.0 # next percentage at which progress is reported

cm_LA_LFCC = []

for itr in range(total):

    feature, label = asvspoof_dev[itr]

    # Utterances without a protocol entry are skipped, and skipped before
    # scoring so no likelihood is computed for a result that is thrown away.
    if label is not None:
        frames = feature.T

        # compute log-likelihood ratio (GaussianMixture.score is the average
        # per-sample log-likelihood of the given frames)
        score = genuine_gmms.score(frames) - spoofed_gmms.score(frames)

        cm_LA_LFCC.append((*label, score))

    # Previously speech_count was never advanced, so progress was never printed;
    # had it been, '%.2f % complete' would have raised TypeError because '% c'
    # is parsed as a second conversion. A literal percent sign is written '%%'.
    speech_count = itr + 1
    prog = speech_count / total * 100
    if prog >= next_report:
        print('%.2f %% complete' % prog)
        while next_report <= prog:
            next_report += 10.0

print('Done!')
print(len(cm_LA_LFCC))
#np.savetxt('scores_cm_LA_LFCC.txt', score, fmt='%.5f')

In [None]:
# Number of scored development utterances; utterances whose label was None were skipped by the scoring loop.
len(cm_LA_LFCC) # This should be 24844 (2,548 genuine + 22,296 spoof, per the counts in the notebook header)

In [None]:
# Write one space-separated line per scored utterance (the three label fields, then
# the score); lines are joined with newlines, without a trailing one.
with open('scores_cm_LA_LFCC.txt', mode='w') as f:
    score_lines = ('{} {} {} {}'.format(*entry[:4]) for entry in cm_LA_LFCC)
    f.write('\n'.join(score_lines))

In [None]:
# Echo the score file back; each line keeps its own newline, so print adds a blank line after it.
with open('scores_cm_LA_LFCC.txt', mode='r') as f:
    for line in f.readlines():
        print(line)

In [None]:
# Move the score file into ./scores/ (shell escape; the target directory must already exist).
!mv ./scores_cm_LA_LFCC.txt ./scores/

In [None]:
# Also keep the list of (label fields..., score) tuples as a joblib file in the working directory.
joblib.dump(value=cm_LA_LFCC, filename='cm_LA_LFCC.score')