In [8]:
%matplotlib inline
import numpy as np
import sklearn.model_selection
import sklearn.ensemble
import sklearn.svm
import sklearn.utils
import os
import librosa
from tqdm import tqdm
import pandas as pd
import scipy as sp
import scipy.io.wavfile
from skimage.util.shape import view_as_windows

import warnings
import matplotlib.pyplot as plt

## Best model parameters

In [9]:
# Settings of the best-performing model; the test cell further down reads them.
rms_th = 0  # not referenced by the visible cells -- presumably an RMS threshold used at training time; TODO confirm
norm = 'zscore'  # feature normalisation applied before predicting: 'zscore', 'minmax', any other value = none
alg = 'svm'  # not referenced by the visible cells -- presumably the algorithm of the pickled models; TODO confirm
win_min_max = True  # when True, rows are filtered on the length of their 'rms' sequence (n_win)
n_win_min = 50  # smallest accepted n_win (inclusive)
n_win_max = 250  # largest accepted n_win (inclusive)
filter_outliers = True  # when True, rows are filtered on per-SNR ranges of y_value

## Load the best model

In [3]:
import pickle

# Directory that holds the pre-trained, window-level models.
MODEL_DIR = '/nas/home/cborrelli/speech_forensics/models'


def _load_model(name):
    """Unpickle and return the model stored as ``<MODEL_DIR>/<name>.pkl``.

    SECURITY: ``pickle.load`` can execute arbitrary code embedded in the file,
    so only point MODEL_DIR at a trusted location.
    NOTE(review): pickled scikit-learn estimators are only guaranteed to load
    with the library version that wrote them -- confirm the environment matches.
    """
    with open(os.path.join(MODEL_DIR, name + '.pkl'), 'rb') as model_file:
        return pickle.load(model_file)


cl = _load_model('win_cl')    # its predictions are stored as 'y_pred_cl' by the test cell
mcl = _load_model('win_mcl')  # its predictions are stored as 'y_pred_mcl'
r = _load_model('win_r')      # its predictions are stored as 'y_pred_r'

## Test

In [5]:
## TODO: if time permits, also train on windowed features, at least for this configuration
def df_to_feat_idx_wind(feat_df, key, fun, window_length):
    """Summarise one feature column over half-overlapping sliding windows.

    Every entry of ``feat_df[key]`` is cut into windows of ``window_length``
    consecutive samples, advancing by ``int(window_length * 0.5)`` samples
    (at least 1). ``fun`` collapses each window and the per-row results are
    stacked on top of each other.

    Parameters
    ----------
    feat_df : pandas.DataFrame (or any mapping whose items expose ``.values``)
        Only the column ``key`` is read.
    key : str
        Column to process. Entries of ``'mfcc'`` must be 2-D after
        ``np.asarray(x).T.squeeze()``; windows slide along axis 0 and span all
        of axis 1. Entries of any other column must be 1-D after ``squeeze()``.
    fun : callable
        Receives the stacked windows -- shape ``(n_windows, window_length)`` or,
        for ``'mfcc'``, ``(n_windows, window_length, n_columns)`` -- and must
        reduce axis 1, i.e. return one row per window.
    window_length : int
        Number of samples per window.

    Returns
    -------
    XX : numpy.ndarray
        Window summaries of all rows, concatenated along axis 0; shape
        ``(total_windows, n_columns)`` for ``'mfcc'`` and ``(total_windows, 1)``
        otherwise.
    wind_number : numpy.ndarray
        Number of windows contributed by each row, in row order (suitable for
        ``np.repeat``-ing per-row labels to window level).

    Raises
    ------
    ValueError
        From ``np.concatenate`` when ``feat_df`` has no rows; ``view_as_windows``
        also rejects entries shorter than ``window_length``.
    """
    # 50 % overlap; clamp so that window_length == 1 does not produce a zero step.
    hop = max(1, int(window_length * 0.5))
    is_mfcc = key == 'mfcc'

    XX = []
    wind_number = []
    for x in np.asarray(feat_df[key].values):
        if is_mfcc:
            x = np.asarray(x).T.squeeze()
            n_columns = x.shape[1]
            windows = view_as_windows(x, window_shape=(window_length, n_columns),
                                      step=(hop, n_columns))
            # `windows` is (n_windows, 1, window_length, n_columns). Drop only the
            # singleton column-window axis: a blanket squeeze() would also drop the
            # window axis whenever a row yields exactly one window.
            xw = windows[:, 0]
            fun_xw = np.asarray(fun(xw)).reshape(xw.shape[0], -1)
        else:
            x = np.asarray(x).squeeze()
            xw = view_as_windows(x, window_shape=(window_length,), step=hop)
            fun_xw = np.asarray(fun(xw)).reshape(-1, 1)
        XX.append(fun_xw)
        wind_number.append(fun_xw.shape[0])

    XX = np.concatenate(XX, axis=0)
    return XX, np.array(wind_number)


In [6]:
# Window lengths swept by the test loop: every even value from 10 to 20 inclusive.
win_feature_length = list(range(10, 21, 2))

In [7]:
# Load the pre-computed features (only the test split is evaluated in this cell)
test_data_path = '/nas/home/cborrelli/speech_forensics/notebook/pickle/sphinx/features_test-clean.pkl'
feat_test_df = pd.read_pickle(test_data_path)
feat_test_df['dataset'] = 'test'
feat_df = feat_test_df

# Shuffle the dataset (fixed seed) and restart the index at 0
feat_df = sklearn.utils.shuffle(feat_df, random_state=0).reset_index(drop=True)

# Apply threshold on window number, using the bounds set in the parameter cell
# (they used to be overwritten here with hard-coded copies).
feat_df['n_win'] = feat_df['rms'].apply(len)
if win_min_max:
    n_win_ok = (feat_df['n_win'] >= n_win_min) & (feat_df['n_win'] <= n_win_max)
    # reset_index() without drop=True keeps the previous index as a column, exactly as before.
    feat_df = feat_df.loc[n_win_ok].reset_index()

# Filter out outliers: per SNR value, keep only rows whose y_value lies in [low, high)
if filter_outliers:
    y_value_range_by_snr = {
        0: (0, 0.35),
        2: (0, 0.4),
        5: (0.1, 0.5),
        7: (0.1, 0.7),
        10: (0.25, 0.7),
        12: (0.35, 1.1),
        15: (0.4, 1.1),
    }
    keep = np.zeros(len(feat_df), dtype=bool)
    for snr_value, (low, high) in y_value_range_by_snr.items():
        in_range = (feat_df['y_value'] >= low) & (feat_df['y_value'] < high)
        keep |= (in_range & (feat_df['snr'] == snr_value)).values
    feat_df = feat_df.loc[keep].reset_index()

# Compute feature matrix
key_list = ['mfcc', 'sfl', 'sc', 'sroff', 'zcr', 'rms']

# NOTE(review): `results` is created but never filled in this cell -- the metrics named in its
# columns are not computed here; only the raw predictions are saved below.
results = pd.DataFrame(columns=['window_length', 'cl_balanced_accuracy', 'cl_f1_score', 'mcl_balanced_accuracy','mcl_f1',
                            'r_R2', 'r_mae'])

# Per-window statistics; X holds their column blocks in this order.
stat_funs = [('mean', np.mean), ('std', np.std), ('max', np.max), ('min', np.min)]

res_dir = '/nas/home/cborrelli/speech_forensics/results_windowing'
os.makedirs(res_dir, exist_ok=True)  # make sure the output directory exists before saving

for w in win_feature_length:
    print('window '+ str(w))

    X_blocks = []
    for stat_name, stat_fun in stat_funs:
        print(stat_name)
        X_per_key = []
        for key in key_list:
            # f=stat_fun binds the current statistic now (avoids the late-binding closure pitfall)
            X_key, wind_number = df_to_feat_idx_wind(
                feat_df, key, lambda x, f=stat_fun: f(x, axis=1), window_length=w)
            X_per_key.append(X_key)
        X_blocks.append(np.concatenate(X_per_key, axis=1))
    X = np.concatenate(X_blocks, axis=1)

    # wind_number (windows per recording) comes from the last call; it must cover every row of X.
    assert wind_number.sum() == X.shape[0], 'window counts do not match the feature matrix'

    # Retrieve labels and expand them from one value per recording to one value per window.
    # Builtin float replaces np.float, which was removed in NumPy 1.24 (same dtype).
    y_mcl = np.repeat(np.array(feat_df['y_label'], dtype=float) - 1, wind_number)  # labels for classification
    y_rg = np.repeat(np.array(feat_df['y_value'], dtype=float), wind_number)
    y_cl = y_rg >= 0.5  # labels for classification

    # Retrieve properties, expanded to window level in the same way
    snrs = np.repeat(feat_df['snr'].values, wind_number)
    noises = np.repeat(feat_df['noise'].values, wind_number)

    # Normalize features
    # NOTE(review): the statistics are taken from the data being tested, not from the training
    # set -- confirm this matches how the pickled models were trained.
    with np.errstate(divide='ignore', invalid='ignore'):  # constant columns divide by zero; zeroed below
        if norm == 'zscore':
            X_norm = (X - X.mean(axis=0)) / X.std(axis=0)  # z-score
        elif norm == 'minmax':
            X_norm = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))  # [0, 1]
        else:
            X_norm = X

    # Remove nan and inf
    X_norm[~np.isfinite(X_norm)] = 0

    # Predict with the pre-trained models (nothing is trained in this cell)
    print('predict')
    y_pred_cl = cl.predict(X_norm)
    y_pred_mcl = mcl.predict(X_norm)
    y_pred_r = r.predict(X_norm)

    # Build the result table in one go; assigning columns through .loc on an empty frame is fragile.
    columns = ['noise', 'snr', 'y_binlabel', 'y_value', 'y_label',  'y_pred_cl','y_pred_mcl', 'y_pred_r']
    res_df = pd.DataFrame({
        'noise': noises,
        'snr': snrs,
        'y_binlabel': y_cl,
        'y_value': y_rg,
        'y_label': y_mcl,
        'y_pred_cl': y_pred_cl,
        'y_pred_mcl': y_pred_mcl,
        'y_pred_r': y_pred_r,
    }, columns=columns)

    # The misspelling 'windowlenght' is kept on purpose: readers of these files depend on the name.
    res_file_name ='results_windowlenght-'+str(w)
    res_df.to_pickle(os.path.join(res_dir, res_file_name))

    


window 10
mean




std
max
min
predict
window 12
mean




std
max
min
predict
window 14
mean




std
max
min
predict
window 16
mean




std
max
min
predict
window 18
mean




std
max
min
predict
window 20
mean




std
max
min
predict


In [None]:
# Shape of X as currently bound (the test loop leaves its last feature matrix in this name).
X.shape

In [None]:
# Number of windows per recording, as left behind by the last iteration of the test loop.
wind_number

In [11]:
# Window-level boolean targets (y_value >= 0.5) left behind by the last iteration of the test loop.
y_cl

array([False, False, False, ..., False, False, False])

In [12]:
# Show the estimator loaded from win_cl.pkl together with its hyper-parameters.
cl

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=0, shrinking=True, tol=0.001,
    verbose=False)

In [13]:
# Show the estimator loaded from win_mcl.pkl together with its hyper-parameters.
mcl

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=0, shrinking=True, tol=0.001,
    verbose=False)

In [18]:
# All-zero dummy batch: 10 rows x 108 feature columns.
# NOTE: this rebinds X, so the feature matrix left behind by the test loop is lost from here on.
X = np.zeros([10,108])

In [19]:
# Run the model loaded from win_cl.pkl on whatever X currently holds.
# NOTE(review): the recorded output is the label 2.0 for every row, whereas the test loop builds
# boolean targets (y_cl) for this model -- confirm that win_cl.pkl really holds the binary classifier.
cl.predict(X)

array([2., 2., 2., 2., 2., 2., 2., 2., 2., 2.])

In [22]:
# Load the dev-split features and derive their binary labels (True when y_value >= 0.5).
dev_data_path = '/nas/home/cborrelli/speech_forensics/notebook/pickle/sphinx/features_dev-clean.pkl'
feat_dev_df = pd.read_pickle(dev_data_path)
# Builtin float replaces np.float, which was removed in NumPy 1.24 (same dtype).
y = np.array(feat_dev_df['y_value'], dtype=float) >= 0.5  # labels for classification

0.0

In [29]:
# Preview only the first rows: every cell of the feature columns holds a whole array, so
# rendering the full frame is slow and bloats the notebook.
feat_df.head(10)

Unnamed: 0,level_0,index,path,noise,snr,mfcc,sfl,sc,sroff,zcr,rms,y_value,y_label,dataset,n_win
0,1,1,/nas/home/cborrelli/speech_forensics/dataset/t...,industrial,7,"[[[-372.8707438781263, -384.1968288221327, -38...","[0.007405185, 0.0076814815, 0.014162619, 0.008...","[1094.4627149676808, 1056.3904304514836, 1242....","[1820.0, 1860.0, 2220.0, 2000.0, 2040.0, 2140....","[0.0375, 0.08125, 0.08375, 0.085, 0.08125, 0.0...","[0.0154524045, 0.014622331, 0.01420999, 0.0148...",0.250000,2,test,132
1,4,4,/nas/home/cborrelli/speech_forensics/dataset/t...,speech2,0,"[[[-714.4790809640697, -708.5762698342506, -70...","[0.011840169, 0.023660272, 0.010634671, 0.0276...","[1642.7581786147812, 1658.1788100543781, 1374....","[3560.0, 3480.0, 2880.0, 3400.0, 2640.0, 2320....","[0.05375, 0.07875, 0.09375, 0.1375, 0.125, 0.0...","[0.00021192695, 0.00030955227, 0.0003126656, 0...",0.095238,1,test,168
2,5,5,/nas/home/cborrelli/speech_forensics/dataset/t...,restaurant,10,"[[[-412.68806061065504, -411.348978005117, -41...","[0.008928871, 0.0073570665, 0.012912545, 0.019...","[1313.320433676049, 1171.388987686987, 1311.91...","[2460.0, 2220.0, 3040.0, 3400.0, 3320.0, 2880....","[0.0475, 0.075, 0.0775, 0.09625, 0.09125, 0.08...","[0.011549563, 0.01254182, 0.011527487, 0.00912...",0.611111,4,test,245
3,6,6,/nas/home/cborrelli/speech_forensics/dataset/t...,speech2,10,"[[[-720.3983294436528, -727.192467812684, -720...","[0.0011262023, 0.0023709997, 0.0026563185, 0.0...","[1144.911748017688, 1271.540108659101, 1251.51...","[3280.0, 3600.0, 3440.0, 3280.0, 3360.0, 3980....","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0025, 0....","[0.001267001, 0.0011848143, 0.0010690412, 0.00...",0.578947,3,test,189
4,7,7,/nas/home/cborrelli/speech_forensics/dataset/t...,speech1,7,"[[[-533.2813782860953, -513.8764163466564, -50...","[0.18204966, 0.33735844, 0.31969595, 0.3222704...","[3683.5513442844604, 3654.5825560833327, 3547....","[5580.0, 5640.0, 5480.0, 5520.0, 5220.0, 5200....","[0.22625, 0.42875, 0.42, 0.43125, 0.405, 0.35,...","[0.001958215, 0.0020088207, 0.0020604057, 0.00...",0.571429,3,test,196
5,8,8,/nas/home/cborrelli/speech_forensics/dataset/t...,industrial,0,"[[[-302.9348008386861, -314.33923558564857, -3...","[0.007638894, 0.00755912, 0.013054266, 0.00865...","[1096.449937710313, 1062.1327423610728, 1241.5...","[1820.0, 1860.0, 2220.0, 1980.0, 2020.0, 2080....","[0.04, 0.09125, 0.09125, 0.08, 0.07625, 0.0737...","[0.031650975, 0.029662864, 0.029047921, 0.0307...",0.000000,1,test,117
6,9,10,/nas/home/cborrelli/speech_forensics/dataset/t...,industrial,7,"[[[-369.6913662074841, -369.17360602931666, -3...","[0.009675563, 0.019553812, 0.034464456, 0.0245...","[1196.6250906767236, 1321.6745069238095, 1558....","[2060.0, 2680.0, 3280.0, 2900.0, 2900.0, 2740....","[0.05, 0.10125, 0.09375, 0.0925, 0.09125, 0.08...","[0.014752289, 0.013965561, 0.013582396, 0.0142...",0.307692,2,test,142
7,10,12,/nas/home/cborrelli/speech_forensics/dataset/t...,speech2,2,"[[[-574.0514507062935, -568.5221122195068, -55...","[0.122258194, 0.21331753, 0.20657103, 0.164718...","[2722.8018496710138, 2951.1859973319224, 2768....","[5280.0, 5420.0, 5280.0, 5340.0, 5420.0, 5160....","[0.1225, 0.28, 0.31375, 0.285, 0.21875, 0.255,...","[0.00096838985, 0.0010414071, 0.0011840196, 0....",0.000000,1,test,88
8,11,13,/nas/home/cborrelli/speech_forensics/dataset/t...,speech1,2,"[[[-515.9902392868177, -509.12418082424216, -5...","[0.07438009, 0.15195464, 0.16475394, 0.1519738...","[2617.438621212832, 2756.697276644263, 2682.35...","[5280.0, 5380.0, 5340.0, 5240.0, 5240.0, 5140....","[0.1125, 0.2475, 0.25375, 0.2275, 0.1825, 0.16...","[0.002119266, 0.0020522967, 0.0019767631, 0.00...",0.263158,2,test,153
9,12,14,/nas/home/cborrelli/speech_forensics/dataset/t...,traffic,15,"[[[-476.42789356793116, -471.3189289916306, -4...","[0.025037955, 0.02253066, 0.02649249, 0.036797...","[1510.3055801864696, 1405.9177693493211, 1538....","[2920.0, 2620.0, 2660.0, 2940.0, 3000.0, 2800....","[0.06125, 0.12625, 0.135, 0.13375, 0.1425, 0.1...","[0.0038493383, 0.003482782, 0.0033321776, 0.00...",1.000000,5,test,123
