## Imports

In [1]:
import sys
sys.path.append("/home/ubuntu/MultiModalDeepFake")
import pandas as pd
import numpy as np
#import nemo.collections.asr as nemo_asr 

In [2]:
from packages.LJDataLoader import LJDataLoader
#from packages.AudioEmbeddingsManager import AudioEmbeddingsManager
from packages.ModelManager import ModelManager
from packages.CadenceModelManager import CadenceModelManager
import packages.AnalysisManager as am
from packages.SmileFeatureManager import SmileFeatureManager

In [3]:
from packages.SmileFeatureSelector import *

# Smile Code Review

## Functions from Pipeline

In [None]:
def generate_split(fake_cols, file_path):

    
    loader = LJDataLoader(data_path=file_path)
    
    #GK: comment out sample cos shuffle = True which causes sampling to be redundant
    
    loader.sample(0.1)
    loader.splitData()

    source_architectures = ['Full_Band_MelGan', 'HifiGan', 'MelGan', 'MelGanLarge', 'Multi_Band_MelGan', 'Parallel_WaveGan', 'Waveglow']
    new_col_name = 'RandWaveFake'
    loader.selectRandomArchitecture(target_col=new_col_name, source_cols=source_architectures)
    
    source_architectures = ['RandWaveFake', 'ElevenLabs', 'UberDuck']
    new_col_name = 'Fake'
    loader.selectRandomArchitecture(target_col=new_col_name, source_cols=source_architectures)

    #data_df = loader.generateFinalDataFrame(real_col='Real', fake_cols=['RandWaveFake', 'ElevenLabs', 'UberDuck'])
    data_df = loader.generateFinalDataFrame(real_col='Real', fake_cols=fake_cols)
    
    train_count = data_df[data_df['type'] == 'train'].shape[0]
    dev_count = data_df[data_df['type'] == 'dev'].shape[0]
    test_count = data_df[data_df['type'] == 'test'].shape[0]

    print(f'# of Train instances: {train_count}')
    print(f'# of Dev instances: {dev_count}')
    print(f'# of Test instances: {test_count}')
    
    return data_df

In [None]:
def generate_features(data_df, window_size, silence_threshold):
    speaker_model = nemo_asr.models.EncDecSpeakerLabelModel.from_pretrained(model_name='titanet_large')
    embedding_manager = AudioEmbeddingsManager(model=speaker_model, data=data_df)
    em_feature_df, em_feature_cols = embedding_manager.generateFeatureDf()
    
    """
    cadence_manager = CadenceModelManager(data_df)
    cad_feature_df, cad_feature_cols, scalar =  cadence_manager.run_cadence_feature_extraction_pipeline(window_size, silence_threshold) # Add param for load features or not
    """
    
    smile_manager = SmileFeatureManager(data_df)
    #change number of features (feature_count=10 default)
    os_binary_feature_df, os_binary_feature_cols = smile_manager.generateFeatureDf('random_forest', label_type='binary', feature_count=25)
    os_multiclass_feature_df, os_multiclass_feature_cols = smile_manager.generateFeatureDf('random_forest', label_type='multiclass', feature_count=25)
    
    feature_store = {}
    feature_store['titanet'] = (em_feature_df, em_feature_cols)
    feature_store['openSmile_binary'] = (os_binary_feature_df, os_binary_feature_cols)
    feature_store['openSmile_multiclass'] = (os_multiclass_feature_df, os_multiclass_feature_cols)
    #feature_store['cadence'] = (cad_feature_df, cad_feature_cols)
    
    return feature_store
    

In [None]:
def train_eval(feature_store, fake_cols):
    results_cols = ['model', 'fake_cols', 'label_type', 'acc', 'cls_acc', 'loss']
    results = pd.DataFrame(columns=results_cols)
    
    for label_type in ['label', 'multiclass_label']:
        for k, v in feature_store.items():
            model_manager = ModelManager('decision_tree', v[0], v[1], merge_train_dev=True)
            model_manager.trainPredict(label_col=label_type)
            print("\nClass Accuracy")
            print(model_manager.class_accuracy)
            print("\nAccuracy")
            print(model_manager.accuracy)
            results = results.append(pd.DataFrame({'model':[k], 'label_type':[label_type], 'fake_cols':[fake_cols], 'acc':[model_manager.accuracy], 'cls_acc':[model_manager.class_accuracy],  'loss':[model_manager.log_loss_value]}))
    
    return results  
    

In [None]:
def run(fake_cols, metadata_path, name, data_df=None, window_size=100, silence_threshold=0.1):
    if data_df is None:
        data_df = generate_split(fake_cols, metadata_path)
    feature_store = generate_features(data_df, window_size, silence_threshold)
    results = train_eval(feature_store, fake_cols)
    results.to_csv(f'/home/ubuntu/data/results/{name}.csv', index=False)
    

## Testing Code

### Data Loading

In [None]:
fcols = ['ElevenLabs', 'UberDuck', 'Full_Band_MelGan', 'HifiGan', 'MelGan', 'MelGanLarge', 'Multi_Band_MelGan', 'Parallel_WaveGan', 'Waveglow']

#filepaths
fp = '/home/ubuntu/data/wavefake_data/LJ_metadata_16000KHz.csv'
fp2 = '/home/ubuntu/data/wavefake_data/LJ_metadata_16KHz_Laundered.csv'

df = generate_split(fcols, fp)

In [None]:
smile_manager = SmileFeatureManager(df)

In [None]:
testing_df = smile_manager.feature_df.copy()

In [None]:
testing_df.columns

In [None]:
list(df.columns)

In [None]:
list(df.architecture.unique())[1:]

### BruteForce Testing

In [None]:
sm_fs_bf = smileFeatureSelectorBruteForce(testing_df, metadata=list(df.columns),
                                          real_col='Real',
                                          fake_cols=list(df.architecture.unique())[1:])

In [None]:
sm_fs_bf.generate_data()

In [None]:
sm_fs_bf.bffs_data.columns

In [None]:
sm_fs_bf.bffs_data[(sm_fs_bf.bffs_data.iloc[:,1:] > 0.9).any(axis=1)]

In [None]:
sm_fs_bf.bffs_data[(sm_fs_bf.bffs_data.iloc[:,1] > 0.8)]

In [None]:
mask = sm_fs_bf.bffs_data['features'].str.contains('zcr')
td = sm_fs_bf.bffs_data[mask]
td.sort_values(by='ElevenLabs', ascending=False)

In [None]:
sm_fs_bf.FS_from_col(sort_col='ElevenLabs')

### SelectFromModel Testing

In [None]:
selector = smileFeatureSelectFromModel(testing_df, metadata=list(df.columns),
                                          real_col='Real',
                                          fake_cols=list(df.architecture.unique())[1:])

In [None]:
df_fb = selector.select_features_binary(max_features=25, return_df=True)

In [None]:
df_fm = selector.select_features_multiclass(max_features=25, return_df=True)

### Model Testing

In [None]:
type(df_fb[0]), type(df_fb[1])

In [None]:
df_fb[0]

In [None]:
df_fb[0].label.value_counts()

In [None]:
model_manager1 = ModelManager('svm', df_fb[0], df_fb[1], merge_train_dev=True)
model_manager1.trainPredict(label_col='label')
print(model_manager1.class_accuracy)

In [None]:
model_manager1.data.label.value_counts()

In [None]:
model_manager1.data.multiclass_label.value_counts()

In [None]:
model_manager1.accuracy

In [None]:
model_manager2 = ModelManager('svm', df_fm[0], df_fm[1], merge_train_dev=True)
model_manager2.trainPredict(label_col='multiclass_label')
print(model_manager2.accuracy)

In [None]:
print(model_manager2.class_accuracy)

In [None]:
model_manager3 = ModelManager('decision_tree', df_fb[0], df_fb[1], merge_train_dev=True)
model_manager3.trainPredict(label_col='label')
print(model_manager3.accuracy)
print(model_manager3.class_accuracy)

In [None]:
model_manager4 = ModelManager('decision_tree', df_fm[0], df_fm[1], merge_train_dev=True)
model_manager4.trainPredict(label_col='multiclass_label')
print(model_manager4.accuracy)
print(model_manager4.class_accuracy)

### Testing ACC SCORE

In [None]:
metadata_path = '/home/ubuntu/data/wavefake_data/LJ_metadata_16000KHz.csv'
fake_cols = ['ElevenLabs', 'UberDuck', 'RandWaveFake']
data_df = generate_split(fake_cols, metadata_path)

In [None]:
feature_store = generate_features(data_df, window_size=100, silence_threshold=0.1)

In [None]:
data = feature_store['openSmile_multiclass'][0]
features = feature_store['openSmile_multiclass'][1]

In [None]:
#model_manager = ModelManager('logreg', v0, v1, merge_train_dev=True)

In [None]:
#model_manager.trainPredict(label_col='multiclass_label')

In [None]:
train = data[(data.type=='train') | (data.type=='dev')].dropna()
test = data[data.type=='test'].dropna()

In [None]:
from sklearn.linear_model import LogisticRegression

model_lr = LogisticRegression(max_iter=100000000)

X_train = train[features].reset_index(drop=True)
X_test = test[features].reset_index(drop=True)

y_train = train['multiclass_label'].reset_index(drop=True)
y_test = test['multiclass_label'].reset_index(drop=True)

model_lr.fit(X_train, y_train)

y_pred = model_lr.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score, log_loss, roc_curve

accuracy_score(y_test, y_pred)

In [None]:
set(y_test)

In [None]:
a = np.where(y_test==0)[0]

In [None]:
a

In [None]:
y_test.index

In [None]:
class_accuracy = {}
for cls in range(len(set(y_test))):
    cls_idx = np.where(y_test==cls)[0]
    cls_test = y_test[cls_idx]
    cls_pred = y_pred[cls_idx]
    class_accuracy[cls] = accuracy_score(cls_test, cls_pred)

In [None]:
class_accuracy

In [None]:
np.mean(list(class_accuracy.values()))

In [None]:
cls_idx = np.where(y_test==1)[0]
len(cls_idx)

In [None]:
def find_same_values(array1, array2, number):
    result = []
    for i in range(len(array1)):
        if array1[i] == array2[i] == number:
            result.append(i)
    return result

In [None]:
len(find_same_values(y_test, y_pred, 1))/len(np.where(y_test==1)[0])

In [None]:
for cls in range(len(set(y_test))):
    a = len(find_same_values(y_test, y_pred, cls))/len(np.where(y_test==cls)[0])
    print(a)

# Cadence Code Review

## Data Loading

In [4]:
def generate_split(fake_cols, file_path):

    
    loader = LJDataLoader(data_path=file_path)
    loader.splitData()

    source_architectures = ['Full_Band_MelGan', 'HifiGan', 'MelGan', 'MelGanLarge', 'Multi_Band_MelGan', 'Parallel_WaveGan', 'Waveglow']
    new_col_name = 'RandWaveFake'
    loader.selectRandomArchitecture(target_col=new_col_name, source_cols=source_architectures)
    
    source_architectures = ['ElevenLabs', 'UberDuck']
    new_col_name = 'EL_UD_Fake'
    loader.selectRandomArchitecture(target_col=new_col_name, source_cols=source_architectures)
    
    source_architectures = ['RandWaveFake', 'ElevenLabs', 'UberDuck']
    new_col_name = 'Fake'
    loader.selectRandomArchitecture(target_col=new_col_name, source_cols=source_architectures)

    #data_df = loader.generateFinalDataFrame(real_col='Real', fake_cols=['RandWaveFake', 'ElevenLabs', 'UberDuck'])
    data_df = loader.generateFinalDataFrame(real_col='Real', fake_cols=fake_cols)
    
    train_count = data_df[data_df['type'] == 'train'].shape[0]
    dev_count = data_df[data_df['type'] == 'dev'].shape[0]
    test_count = data_df[data_df['type'] == 'test'].shape[0]

    print(f'# of Train instances: {train_count}')
    print(f'# of Dev instances: {dev_count}')
    print(f'# of Test instances: {test_count}')
    
    return data_df

In [5]:
fcols = ['ElevenLabs', 'UberDuck', 'Full_Band_MelGan', 'HifiGan', 'MelGan', 'MelGanLarge', 'Multi_Band_MelGan', 'Parallel_WaveGan', 'Waveglow']

#filepaths
fp = '/home/ubuntu/data/wavefake_data/LJ_metadata_16000KHz.csv'
fp2 = '/home/ubuntu/data/wavefake_data/LJ_metadata_16KHz_Laundered.csv'

df = generate_split(fcols, fp)

# of Train instances: 78330
# of Dev instances: 26110
# of Test instances: 26110


In [6]:
df.head()

Unnamed: 0,type,id,architecture,path,label,multiclass_label
0,train,LJ049-0064,Real,/home/ubuntu/data/wavefake_data/LJSpeech_1.1/w...,0,0
1,train,LJ027-0024,Real,/home/ubuntu/data/wavefake_data/LJSpeech_1.1/w...,0,0
2,train,LJ016-0258,Real,/home/ubuntu/data/wavefake_data/LJSpeech_1.1/w...,0,0
3,train,LJ008-0284,Real,/home/ubuntu/data/wavefake_data/LJSpeech_1.1/w...,0,0
4,train,LJ037-0141,Real,/home/ubuntu/data/wavefake_data/LJSpeech_1.1/w...,0,0


In [7]:
df.architecture.value_counts()

Real                 13055
ElevenLabs           13055
UberDuck             13055
Full_Band_MelGan     13055
HifiGan              13055
MelGan               13055
MelGanLarge          13055
Multi_Band_MelGan    13055
Parallel_WaveGan     13055
Waveglow             13055
Name: architecture, dtype: int64

In [8]:
df.columns

Index(['type', 'id', 'architecture', 'path', 'label', 'multiclass_label'], dtype='object')

## Testing

In [9]:
cadence_manager = CadenceModelManager(df)

In [10]:
cad_feature_df, cad_feature_cols, scalar =  cadence_manager.run_cadence_feature_extraction_pipeline()

100%|██████████| 10/10 [00:00<00:00, 16.61it/s]


In [11]:
cad_feature_df.head()

Unnamed: 0,type,id,architecture,path,label,multiclass_label,Mix_pause_ratio,Mix_pause_mean,Mix_pause_std,Mix_n_pauses,Mix_amp_deriv,Mix_amp_mean
0,train,LJ049-0064,Real,/home/ubuntu/data/wavefake_data/LJSpeech_1.1/w...,0,0,0.11036,0.026212,0.063718,0.112593,0.000851,0.198723
1,train,LJ027-0024,Real,/home/ubuntu/data/wavefake_data/LJSpeech_1.1/w...,0,0,0.111218,0.006088,0.020646,0.345185,0.017901,0.165211
2,train,LJ016-0258,Real,/home/ubuntu/data/wavefake_data/LJSpeech_1.1/w...,0,0,0.131858,0.016751,0.063415,0.140741,0.023905,0.157353
3,train,LJ008-0284,Real,/home/ubuntu/data/wavefake_data/LJSpeech_1.1/w...,0,0,0.138596,0.008182,0.018847,0.305185,0.031064,0.166307
4,train,LJ037-0141,Real,/home/ubuntu/data/wavefake_data/LJSpeech_1.1/w...,0,0,0.200678,0.007185,0.03123,0.358519,0.00633,0.08802


In [12]:
cad_feature_cols

['Mix_pause_ratio',
 'Mix_amp_mean',
 'Mix_amp_deriv',
 'Mix_n_pauses',
 'Mix_pause_mean',
 'Mix_pause_std']

In [13]:
cadence_manager.data.columns

Index(['type', 'id', 'architecture', 'path', 'label', 'multiclass_label'], dtype='object')

## Temp

In [20]:
import sys, os
sys.path.insert(0, '.')
import pandas as pd
import json
from packages.CadenceModelManager import CadenceModelManager


params_path = '/home/ubuntu/data/wavefake_data/Cadence_features/16khz_Laundered/params.json'
metadata_path = '/home/ubuntu/data/wavefake_data/LJ_metadata_16KHz_Laundered.csv'
output_dir = '/home/ubuntu/data/wavefake_data/Cadence_features/16khz_Laundered'

with open(params_path) as file:
    params = json.load(file)
metadata = pd.read_csv(metadata_path)

archs = ['Full_Band_MelGan', 'HifiGan', 'MelGan', 'MelGanLarge', 'Multi_Band_MelGan', 'Parallel_WaveGan', 'Waveglow', 'ElevenLabs', 'UberDuck', 'Real']

arch = 'Real'
file_paths = metadata[[arch]].dropna()
file_paths.columns = ['path']

In [21]:
file_paths

Unnamed: 0,path
0,/home/ubuntu/data/wavefake_data/LJSpeech_1.1/w...
1,/home/ubuntu/data/wavefake_data/LJSpeech_1.1/w...
2,/home/ubuntu/data/wavefake_data/LJSpeech_1.1/w...
3,/home/ubuntu/data/wavefake_data/LJSpeech_1.1/w...
4,/home/ubuntu/data/wavefake_data/LJSpeech_1.1/w...
...,...
13095,/home/ubuntu/data/wavefake_data/LJSpeech_1.1/w...
13096,/home/ubuntu/data/wavefake_data/LJSpeech_1.1/w...
13097,/home/ubuntu/data/wavefake_data/LJSpeech_1.1/w...
13098,/home/ubuntu/data/wavefake_data/LJSpeech_1.1/w...


In [22]:
paths = list(file_paths['path'])

## blockwise testing

In [14]:
import librosa 
import numpy as np
from math import trunc
from scipy import signal
from numpy import diff

In [15]:
#window size = 395

test_file = df.path[130449]

In [16]:
test_file

'/home/ubuntu/data/wavefake_data/generated_audio/ljspeech_waveglow/16000KHz/LJ012-0043.wav'

### normalize audio

In [88]:
sample = librosa.load(test_file)

In [89]:
sample[1]

22050

In [90]:
sample = librosa.load(test_file)[0]
max_abs = np.max(np.abs(sample))
normalized_sample = sample/max_abs

In [26]:
normalized_sample

array([ 0.002754  ,  0.00350152,  0.00101652, ..., -0.00223037,
       -0.00147095,  0.        ], dtype=float32)

In [27]:
len(normalized_sample) == len(sample)

True

## truncate silences

In [35]:
params

{'Mix': {'window_size': 395.0, 'silence_threshold': 0.046224125994119375}}

In [32]:
params['Mix']['window_size']

395.0

In [38]:
window_size = int(params['Mix']['window_size'])
silence_threshold = params['Mix']['silence_threshold']
audio = normalized_sample

for j in range(len(audio)):
    roll_average = np.mean(np.abs(audio[j:j+window_size]))
    if roll_average > silence_threshold:
        truncation_id_start = j
        break
        
print(truncation_id_start, truncation_id_start/len(audio))

5094 0.0291337096580478


In [42]:
len(audio)

174849

In [43]:
for j in reversed(range(len(audio))):
    roll_average = np.mean(np.abs(audio[j-window_size:j]))
    if roll_average > silence_threshold:
        truncation_id_end = j-window_size
        break

print(truncation_id_end, truncation_id_end/len(audio))

170855 0.9771574329850328


## get_silence

In [44]:
thresh = max(abs(audio))*silence_threshold

In [46]:
max(abs(audio))

1.0

In [45]:
thresh, silence_threshold

(0.046224125994119375, 0.046224125994119375)

In [47]:
def moving_average(x, w):
    return np.convolve(x, np.ones(w), 'valid') / w

In [48]:
moving_avg = moving_average(abs(audio), window_size) # Window size = 100 

In [51]:
len(moving_avg), len(audio)

(174455, 174849)

In [53]:
np.max(moving_avg), np.min(moving_avg)

(0.4610534960073949, 0.0006986530545102049)

In [54]:
silent = np.where(abs(moving_avg) < thresh)
voiced = np.where(abs(moving_avg) >= thresh)

In [59]:
len(silent[0]), len(voiced[0])

(74802, 99653)

In [62]:
len(silent[0])+len(voiced[0]), len(silent[0])/len(moving_avg)

(174455, 0.42877532888137343)

## get silence spread

In [63]:
thresh = max(abs(audio))*silence_threshold

In [64]:
moving_avg = moving_average(abs(audio), window_size) # Window size = 100 

In [69]:
silent_windows = np.where(moving_avg < thresh)
silent = np.where(abs(moving_avg) < thresh)

In [71]:
np.all(silent_windows[0] == silent[0])

True

In [72]:
thresh2 = max(audio)*silence_threshold

In [74]:
thresh2 == thresh

True

## get_amplitude

In [112]:
def filter_signal(audio, sr, low_pass_filter_cutoff):
    
    t = np.arange(len(audio)) / sr
    w = low_pass_filter_cutoff / (sr / 2) 
    b, a = signal.butter(5, w, 'low')
    smoothed_signal = signal.filtfilt(b, a, audio)
    
    return smoothed_signal

In [109]:
abs_audio = abs(audio)

In [91]:
low_pass_filter_cutoff: int = 10
sr = librosa.load(test_file)[1]

In [81]:
sr

22050

In [92]:
t = np.arange(len(audio)) / sr

In [97]:
w = low_pass_filter_cutoff / (sr/2)
w

0.0009070294784580499

In [98]:
sr/2

11025.0

In [99]:
b, a = signal.butter(5, w, 'low')

In [100]:
b

array([5.84393663e-15, 2.92196832e-14, 5.84393663e-14, 5.84393663e-14,
       2.92196832e-14, 5.84393663e-15])

In [101]:
a

array([ 1.        , -4.99077877,  9.96315758, -9.9448    ,  4.96324234,
       -0.99082115])

In [113]:
smoothed_signal = signal.filtfilt(b, a, audio)

In [114]:
smoothed_signal = filter_signal(abs_audio, sr, low_pass_filter_cutoff)

In [116]:
abs_audio

array([0.002754  , 0.00350152, 0.00101652, ..., 0.00223037, 0.00147095,
       0.        ], dtype=float32)

In [115]:
smoothed_signal

array([0.00498179, 0.00498696, 0.00499213, ..., 0.0029359 , 0.0029359 ,
       0.0029359 ])

In [118]:
len(smoothed_signal), len(diff(smoothed_signal))

(174849, 174848)

In [119]:
np.mean(diff(smoothed_signal))

-1.1700963102304922e-08

# Gathering Data for Elevenlabs

In [121]:
path = '/home/ubuntu/launder_metadata.csv'
lddf = pd.read_csv(path)

In [122]:
lddf

Unnamed: 0.1,Unnamed: 0,Path,isTranscode,BitRate,isNoise,SNR
0,0,/home/ubuntu/data/wavefake_data/generated_audi...,1,196k,0,
1,1,/home/ubuntu/data/wavefake_data/generated_audi...,1,64k,1,62.0
2,2,/home/ubuntu/data/wavefake_data/generated_audi...,1,127k,0,
3,3,/home/ubuntu/data/wavefake_data/generated_audi...,0,,1,38.0
4,4,/home/ubuntu/data/wavefake_data/generated_audi...,0,,1,47.0
...,...,...,...,...,...,...
130963,130963,/home/ubuntu/data/wavefake_data/LJSpeech_1.1/w...,0,,0,
130964,130964,/home/ubuntu/data/wavefake_data/LJSpeech_1.1/w...,1,127k,1,60.0
130965,130965,/home/ubuntu/data/wavefake_data/LJSpeech_1.1/w...,0,,1,80.0
130966,130966,/home/ubuntu/data/wavefake_data/LJSpeech_1.1/w...,0,,0,


In [162]:
transcoded = lddf[(lddf['isTranscode']==1) & (lddf['isNoise']==0) & lddf['Path'].str.contains('generated_audio', case=False)]
noisy_transcoded = lddf[(lddf['isTranscode']==1) & (lddf['isNoise']==1) & lddf['Path'].str.contains('generated_audio', case=False)]
regular = lddf[(lddf['isTranscode']==0) & (lddf['isNoise']==0) & lddf['Path'].str.contains('generated_audio', case=False)]

In [163]:
l1=transcoded.sample(n=10)
l2=noisy_transcoded.sample(n=10)
l3=regular.sample(n=10)
laundered_test_eleven = pd.concat([l1, l2, l3])
laundered_set = set(laundered_test_eleven.Path.to_list())

In [164]:
laundered_test_eleven

Unnamed: 0.1,Unnamed: 0,Path,isTranscode,BitRate,isNoise,SNR
12697,12697,/home/ubuntu/data/wavefake_data/generated_audi...,1,196k,0,
51847,51847,/home/ubuntu/data/wavefake_data/generated_audi...,1,196k,0,
101359,101359,/home/ubuntu/data/wavefake_data/generated_audi...,1,127k,0,
69489,69489,/home/ubuntu/data/wavefake_data/generated_audi...,1,64k,0,
10064,10064,/home/ubuntu/data/wavefake_data/generated_audi...,1,127k,0,
51314,51314,/home/ubuntu/data/wavefake_data/generated_audi...,1,127k,0,
87255,87255,/home/ubuntu/data/wavefake_data/generated_audi...,1,127k,0,
1583,1583,/home/ubuntu/data/wavefake_data/generated_audi...,1,127k,0,
2072,2072,/home/ubuntu/data/wavefake_data/generated_audi...,1,64k,0,
115794,115794,/home/ubuntu/data/wavefake_data/generated_audi...,1,196k,0,


In [165]:
laundered_set

{'/home/ubuntu/data/wavefake_data/generated_audio/ljspeech_elevenlabs/16000KHz/LJ009-0256.wav',
 '/home/ubuntu/data/wavefake_data/generated_audio/ljspeech_elevenlabs/16000KHz/LJ022-0030.wav',
 '/home/ubuntu/data/wavefake_data/generated_audio/ljspeech_elevenlabs/16000KHz/LJ030-0039.wav',
 '/home/ubuntu/data/wavefake_data/generated_audio/ljspeech_elevenlabs/16000KHz/LJ038-0075.wav',
 '/home/ubuntu/data/wavefake_data/generated_audio/ljspeech_full_band_melgan/16000KHz/LJ009-0233_gen.wav',
 '/home/ubuntu/data/wavefake_data/generated_audio/ljspeech_hifiGAN/16000KHz/LJ005-0248_generated.wav',
 '/home/ubuntu/data/wavefake_data/generated_audio/ljspeech_hifiGAN/16000KHz/LJ021-0178_generated.wav',
 '/home/ubuntu/data/wavefake_data/generated_audio/ljspeech_hifiGAN/16000KHz/LJ027-0016_generated.wav',
 '/home/ubuntu/data/wavefake_data/generated_audio/ljspeech_hifiGAN/16000KHz/LJ031-0151_generated.wav',
 '/home/ubuntu/data/wavefake_data/generated_audio/ljspeech_melgan/16000KHz/LJ028-0308_gen.wav',
 '

In [166]:
len(laundered_set)

30

In [168]:
#!pip install paramiko

# Final Code Review

In [169]:
from packages.TIMITDataLoader import TIMITDataLoader

In [171]:
timit_data_loader = TIMITDataLoader('/home/ubuntu/data/TIMIT_and_ElevenLabs/TIMIT and ElevenLabs')

In [172]:
df = timit_data_loader.generate_split()

N real and fake phrases: 500, 500
492 491
/home/ubuntu/data/TIMIT_and_ElevenLabs/TIMIT and ElevenLabs/SI580/real/MPRT0_SI580.WAV
# of Train instances: 589
# of Dev instances: 196
# of Test instances: 198


In [174]:
df.shape

(983, 7)

In [175]:
timit_data_loader2 = TIMITDataLoader('/home/ubuntu/data/TIMIT_and_ElevenLabs/TIMIT and ElevenLabs')

In [176]:
df2 = timit_data_loader2.generate_split()

N real and fake phrases: 500, 500
492 491
/home/ubuntu/data/TIMIT_and_ElevenLabs/TIMIT and ElevenLabs/SI580/real/MPRT0_SI580.WAV
# of Train instances: 589
# of Dev instances: 196
# of Test instances: 198


In [178]:
np.all(df == df2)

True

In [202]:
path1 = '/home/ubuntu/data/results/TIMIT/testing/test1.csv'
path2 = '/home/ubuntu/data/results/TIMIT/testing/test2.csv'

In [203]:
rdf1 = pd.read_csv(path1).drop(['Unnamed: 0'], axis=1)
rdf2 = pd.read_csv(path2).drop(['Unnamed: 0'], axis=1)

In [204]:
rdf1

Unnamed: 0,feature_method,model,label_type,acc,class_acc_0,class_acc_1,eer_score
0,cadence,logreg,label,0.690909,0.814815,0.650602,0.228916
1,cadence,logreg,multiclass_label,0.690909,0.814815,0.650602,
2,cadence,random_forest,label,0.727273,0.814815,0.698795,0.216867
3,cadence,random_forest,multiclass_label,0.727273,0.814815,0.698795,
4,openSmile_binary,logreg,label,0.754545,0.0,1.0,0.096386
5,openSmile_binary,logreg,multiclass_label,0.754545,0.0,1.0,
6,openSmile_binary,random_forest,label,0.990909,0.962963,1.0,0.0
7,openSmile_binary,random_forest,multiclass_label,0.990909,0.962963,1.0,
8,openSmile_multiclass,logreg,label,0.754545,0.0,1.0,0.096386
9,openSmile_multiclass,logreg,multiclass_label,0.754545,0.0,1.0,


In [205]:
rdf2

Unnamed: 0,feature_method,model,label_type,acc,class_acc_0,class_acc_1,eer_score
0,cadence,logreg,label,0.787611,0.851852,0.767442,0.185185
1,cadence,logreg,multiclass_label,0.787611,0.851852,0.767442,
2,cadence,random_forest,label,0.778761,0.888889,0.744186,0.185185
3,cadence,random_forest,multiclass_label,0.778761,0.888889,0.744186,
4,openSmile_binary,logreg,label,1.0,1.0,1.0,0.0
5,openSmile_binary,logreg,multiclass_label,1.0,1.0,1.0,
6,openSmile_binary,random_forest,label,1.0,1.0,1.0,0.0
7,openSmile_binary,random_forest,multiclass_label,1.0,1.0,1.0,
8,openSmile_multiclass,logreg,label,1.0,1.0,1.0,0.0
9,openSmile_multiclass,logreg,multiclass_label,1.0,1.0,1.0,


In [200]:
path3='/home/ubuntu/data/results/TIMIT/archive_16KHz_ElevenLabs_TIMIT_k_fold_regenerated.csv'
rdf3 = pd.read_csv(path3).drop(['Unnamed: 0'], axis=1)

In [201]:
rdf3

Unnamed: 0,feature_method,model,label_type,acc,class_acc_0,class_acc_1
0,cadence,logreg,label,0.898438,0.846154,0.911765
1,cadence,logreg,multiclass_label,0.898438,0.846154,0.911765
2,cadence,random_forest,label,0.872396,0.890313,0.867829
3,cadence,random_forest,multiclass_label,0.873915,0.888177,0.87028
4,cadence,svm,label,0.898438,0.846154,0.911765
5,cadence,svm,multiclass_label,0.898438,0.846154,0.911765
6,openSmile_binary,logreg,label,0.947627,0.753205,0.997186
7,openSmile_binary,logreg,multiclass_label,0.947627,0.753205,0.997186
8,openSmile_binary,random_forest,label,1.0,1.0,1.0
9,openSmile_binary,random_forest,multiclass_label,1.0,1.0,1.0


In [206]:
path5 = '/home/ubuntu/data/results/TIMIT/testing/test5.csv'
rdf5 = pd.read_csv(path5).drop(['Unnamed: 0'], axis=1)

In [207]:
rdf5

Unnamed: 0,feature_method,model,label_type,acc,class_acc_0,class_acc_1,eer_score
0,cadence,logreg,label,0.872,0.724138,0.916667,0.172414
1,cadence,logreg,multiclass_label,0.872,0.724138,0.916667,
2,cadence,random_forest,label,0.888,0.758621,0.927083,0.172414
3,cadence,random_forest,multiclass_label,0.888,0.758621,0.927083,
4,openSmile_binary,logreg,label,0.768,0.0,1.0,0.114583
5,openSmile_binary,logreg,multiclass_label,0.768,0.0,1.0,
6,openSmile_binary,random_forest,label,0.992,1.0,0.989583,0.010417
7,openSmile_binary,random_forest,multiclass_label,0.992,1.0,0.989583,
8,openSmile_multiclass,logreg,label,0.768,0.0,1.0,0.114583
9,openSmile_multiclass,logreg,multiclass_label,0.768,0.0,1.0,


In [208]:
path6 = '/home/ubuntu/data/results/TIMIT/testing/test6.csv'
rdf6 = pd.read_csv(path5).drop(['Unnamed: 0'], axis=1)

In [209]:
rdf6

Unnamed: 0,feature_method,model,label_type,acc,class_acc_0,class_acc_1,eer_score
0,cadence,logreg,label,0.872,0.724138,0.916667,0.172414
1,cadence,logreg,multiclass_label,0.872,0.724138,0.916667,
2,cadence,random_forest,label,0.888,0.758621,0.927083,0.172414
3,cadence,random_forest,multiclass_label,0.888,0.758621,0.927083,
4,openSmile_binary,logreg,label,0.768,0.0,1.0,0.114583
5,openSmile_binary,logreg,multiclass_label,0.768,0.0,1.0,
6,openSmile_binary,random_forest,label,0.992,1.0,0.989583,0.010417
7,openSmile_binary,random_forest,multiclass_label,0.992,1.0,0.989583,
8,openSmile_multiclass,logreg,label,0.768,0.0,1.0,0.114583
9,openSmile_multiclass,logreg,multiclass_label,0.768,0.0,1.0,
