In [None]:
import sys
sys.path.append("/home/ubuntu/MultiModalDeepFake")
import nemo.collections.asr as nemo_asr 
import pandas as pd

In [None]:
from packages.LJDataLoader import LJDataLoader
from packages.AudioEmbeddingsManager import AudioEmbeddingsManager
from packages.ModelManager import ModelManager
from packages.CadenceModelManager import CadenceModelManager
import packages.AnalysisManager as am
from packages.SmileFeatureManager import SmileFeatureManager

In [None]:
def generate_split(fake_cols, file_path):
    """Build the train/dev/test DataFrame for one choice of fake columns.

    Loads the metadata at ``file_path`` through ``LJDataLoader``, splits it,
    derives three randomly-sampled columns ('RandWaveFake', 'EL_UD_Fake' and
    'Fake'), then asks the loader for the final frame using 'Real' as the
    real column and ``fake_cols`` as the fake columns.

    Args:
        fake_cols: column names passed to ``generateFinalDataFrame`` as the
            fake sources; may include the derived columns created here.
        file_path: path to the metadata CSV handed to ``LJDataLoader``.

    Returns:
        The DataFrame returned by ``LJDataLoader.generateFinalDataFrame``.
        It is expected to carry a 'type' column with 'train'/'dev'/'test'.
    """
    lj_loader = LJDataLoader(data_path=file_path, filter_cols=['ElevenLabsCloneClip'])
    lj_loader.splitData()

    # (derived column, columns it is sampled from). Order matters: 'Fake'
    # lists 'RandWaveFake' as a source, so that column must be created first.
    derived_columns = (
        ('RandWaveFake', ['Full_Band_MelGan', 'HifiGan', 'MelGan', 'MelGanLarge',
                          'Multi_Band_MelGan', 'Parallel_WaveGan', 'Waveglow']),
        ('EL_UD_Fake', ['ElevenLabs', 'UberDuck']),
        ('Fake', ['RandWaveFake', 'ElevenLabs', 'UberDuck']),
    )
    for derived_name, candidates in derived_columns:
        lj_loader.selectRandomArchitecture(target_col=derived_name, source_cols=candidates)

    split_df = lj_loader.generateFinalDataFrame(real_col='Real', fake_cols=fake_cols)

    # Row count per partition, reported for a quick sanity check.
    train_count, dev_count, test_count = (
        len(split_df[split_df['type'] == partition])
        for partition in ('train', 'dev', 'test')
    )
    print(f'# of Train instances: {train_count}')
    print(f'# of Dev instances: {dev_count}')
    print(f'# of Test instances: {test_count}')

    return split_df

In [None]:
# Exploratory split that uses every individual architecture column as a fake
# source, to inspect the resulting frame before running the experiments.
df = generate_split(
    fake_cols=[
        'ElevenLabs',
        'UberDuck',
        'Full_Band_MelGan',
        'HifiGan',
        'MelGan',
        'MelGanLarge',
        'Multi_Band_MelGan',
        'Parallel_WaveGan',
        'Waveglow',
    ],
    file_path='/home/ubuntu/data/wavefake_data/LJ_metadata_16000KHz.csv',
)
df

In [None]:
# Number of rows for each (architecture, label) combination in the split.
df.value_counts(subset=['architecture', 'label'])

In [None]:
def generate_features(data_df):
    """Extract every feature family for ``data_df`` and return them keyed by name.

    Runs, in order: TitaNet speaker embeddings (``AudioEmbeddingsManager``),
    the cadence pipeline (``CadenceModelManager``), and two openSMILE feature
    selections (``SmileFeatureManager``), one for binary and one for
    multiclass labels.

    Args:
        data_df: the split DataFrame handed unchanged to each feature manager.

    Returns:
        dict mapping 'titanet', 'openSmile_binary', 'openSmile_multiclass' and
        'cadence' (in that insertion order) to ``(feature_df, feature_cols)``
        tuples.
    """
    # Speaker embeddings from the pretrained 'titanet_large' checkpoint.
    titanet = nemo_asr.models.EncDecSpeakerLabelModel.from_pretrained(model_name='titanet_large')
    titanet_df, titanet_cols = AudioEmbeddingsManager(model=titanet, data=data_df).generateFeatureDf()

    # TODO: add a parameter controlling whether cached features are loaded.
    # The third value returned by the pipeline is not used here.
    cadence_df, cadence_cols, _unused = CadenceModelManager(
        data_df
    ).run_cadence_feature_extraction_pipeline(fill_na=-1)

    # openSMILE features selected with a random forest; feature_count is
    # raised to 20 here (the original note says the default is 10).
    smile = SmileFeatureManager(data_df)
    smile_features = {}
    for smile_label_type in ('binary', 'multiclass'):
        smile_df, smile_cols = smile.generateFeatureDf(
            'random_forest', label_type=smile_label_type, feature_count=20
        )
        smile_features[smile_label_type] = (smile_df, smile_cols)

    # Key order is preserved by the dict and drives downstream iteration order.
    return {
        'titanet': (titanet_df, titanet_cols),
        'openSmile_binary': smile_features['binary'],
        'openSmile_multiclass': smile_features['multiclass'],
        'cadence': (cadence_df, cadence_cols),
    }
    
    

In [None]:
def train_eval(feature_store, fake_cols):
    """Train and evaluate every model / label type / feature set combination.

    For each model in ('logreg', 'random_forest'), each label column in
    ('label', 'multiclass_label') and each entry of ``feature_store``, a
    ``ModelManager`` is built with ``merge_train_dev=True``, trained via
    ``trainPredict`` and its metrics are recorded as one result row.

    Args:
        feature_store: dict mapping a feature-method name to a sequence whose
            first two items are the feature DataFrame and the feature columns.
        fake_cols: the fake columns used for this experiment; stored verbatim
            in the 'fake_cols' column of every row.

    Returns:
        pandas.DataFrame with one row per combination and the columns
        'feature_method', 'model', 'fake_cols', 'label_type', 'acc',
        'cls_acc', 'loss', 'eer_score', 'eer_threshold'. Rows are ordered by
        model, then label type, then ``feature_store`` order. The frame is
        empty (columns only) when ``feature_store`` is empty.
    """
    results_cols = ['feature_method', 'model', 'fake_cols', 'label_type', 'acc', 'cls_acc', 'loss', 'eer_score', 'eer_threshold']
    models = ['logreg', 'random_forest']
    label_types = ['label', 'multiclass_label']

    # Collect plain records and build the frame once. DataFrame.append was
    # removed in pandas 2.0, and growing a frame row by row is quadratic.
    records = []
    for model in models:
        for label_type in label_types:
            for feature_method, features in feature_store.items():
                model_manager = ModelManager(model, features[0], features[1], merge_train_dev=True)
                model_manager.trainPredict(label_col=label_type)
                records.append({
                    'feature_method': feature_method,
                    'model': model,
                    'fake_cols': fake_cols,
                    'label_type': label_type,
                    'acc': model_manager.accuracy,
                    'cls_acc': model_manager.class_accuracy,
                    'loss': model_manager.log_loss_value,
                    'eer_score': model_manager.eer_score,
                    'eer_threshold': model_manager.eer_threshold,
                })

    # columns= pins the column order and keeps the header for an empty result.
    return pd.DataFrame(records, columns=results_cols)
    
    

In [None]:
def run(fake_cols, metadata_path, name, data_df=None, results_dir='/home/ubuntu/data/results/6-30-23_2'):
    """Run one experiment end to end and persist its results as CSV.

    Args:
        fake_cols: fake columns for the experiment; forwarded to
            ``generate_split`` (when a split is built) and ``train_eval``.
        metadata_path: metadata CSV path used only when ``data_df`` is None.
        name: file stem of the results CSV written to ``results_dir``.
        data_df: optional pre-built split; when given, ``generate_split`` is
            skipped.
        results_dir: directory receiving ``<name>.csv``. Defaults to the
            previously hard-coded location, so existing calls are unchanged.

    Returns:
        The results DataFrame produced by ``train_eval``.
    """
    import os

    # Create the output directory up front: feature extraction and training
    # are expensive, and a missing directory would otherwise only surface
    # when writing the CSV at the very end.
    os.makedirs(results_dir, exist_ok=True)

    if data_df is None:
        data_df = generate_split(fake_cols, metadata_path)
    feature_store = generate_features(data_df)
    results = train_eval(feature_store, fake_cols)
    results.to_csv(os.path.join(results_dir, f'{name}.csv'), index=False)
    return results
    
    
    

In [None]:
# Empty accumulator for the per-experiment results. It has the result columns
# plus 'laundered', which each experiment cell sets before adding its rows.
agg_df = pd.DataFrame(
    columns=[
        'feature_method',
        'model',
        'fake_cols',
        'label_type',
        'acc',
        'cls_acc',
        'loss',
        'laundered',
        'eer_score',
        'eer_threshold',
    ]
)

In [None]:
# Experiment: ElevenLabs fakes only, non-laundered metadata.
file_path = '/home/ubuntu/data/wavefake_data/LJ_metadata_16000KHz.csv'
results = run(['ElevenLabs'], file_path, '16KHz_ElevenLabs')
results['laundered'] = 0
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
agg_df = pd.concat([agg_df, results])

In [None]:
## SB_Comment - where are the final tuned cadence results? I can see the params used below but don't think they're being used.
## SB_Comment - should set the window size and threshold as global params, e.g. uberduck_window_size, elevenlabs_window_size, general_window_size, etc.

In [None]:
# Experiment: UberDuck fakes only, non-laundered metadata.
file_path = '/home/ubuntu/data/wavefake_data/LJ_metadata_16000KHz.csv'
results = run(['UberDuck'], file_path, '16KHz_UberDuck')
results['laundered'] = 0
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
agg_df = pd.concat([agg_df, results])


In [None]:
# Experiment: the derived 'RandWaveFake' column, non-laundered metadata.
file_path = '/home/ubuntu/data/wavefake_data/LJ_metadata_16000KHz.csv'
results = run(['RandWaveFake'], file_path, '16KHz_RandWaveFake_B')
results['laundered'] = 0
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
agg_df = pd.concat([agg_df, results])


In [None]:
# Experiment: UberDuck and ElevenLabs as separate fake columns, non-laundered metadata.
file_path = '/home/ubuntu/data/wavefake_data/LJ_metadata_16000KHz.csv'
results = run(['UberDuck', 'ElevenLabs'], file_path, '16KHz_ElevenLabs_and_UberDuck')
results['laundered'] = 0
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
agg_df = pd.concat([agg_df, results])


In [None]:
# Experiment: ElevenLabs, UberDuck and 'RandWaveFake' together, non-laundered metadata.
file_path = '/home/ubuntu/data/wavefake_data/LJ_metadata_16000KHz.csv'
results = run(['ElevenLabs', 'UberDuck', 'RandWaveFake'], file_path, '16KHz_Mix')
results['laundered'] = 0
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
agg_df = pd.concat([agg_df, results])


In [None]:
# Experiment: the derived 'EL_UD_Fake' column, non-laundered metadata.
file_path = '/home/ubuntu/data/wavefake_data/LJ_metadata_16000KHz.csv'
results = run(['EL_UD_Fake'], file_path, '16KHz_EL_UD_Fake_B')
results['laundered'] = 0
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
agg_df = pd.concat([agg_df, results])


In [None]:
# Experiment: the derived 'Fake' column, non-laundered metadata.
file_path = '/home/ubuntu/data/wavefake_data/LJ_metadata_16000KHz.csv'
results = run(['Fake'], file_path, '16KHz_Fake')
results['laundered'] = 0
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
agg_df = pd.concat([agg_df, results])


In [None]:
# Experiment: ElevenLabs fakes only, laundered metadata.
file_path = '/home/ubuntu/data/wavefake_data/LJ_metadata_16KHz_Laundered.csv'
results = run(['ElevenLabs'], file_path, '16KHz_ElevenLabs_Laundered')
results['laundered'] = 1
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
agg_df = pd.concat([agg_df, results])


In [None]:
# Experiment: UberDuck fakes only, laundered metadata.
file_path = '/home/ubuntu/data/wavefake_data/LJ_metadata_16KHz_Laundered.csv'
results = run(['UberDuck'], file_path, '16KHz_UberDuck_Laundered')
results['laundered'] = 1
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
agg_df = pd.concat([agg_df, results])


In [None]:
# Experiment: the derived 'RandWaveFake' column, laundered metadata.
file_path = '/home/ubuntu/data/wavefake_data/LJ_metadata_16KHz_Laundered.csv'
results = run(['RandWaveFake'], file_path, '16KHz_RandWaveFake_Laundered')
results['laundered'] = 1
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
agg_df = pd.concat([agg_df, results])


In [None]:
# Experiment: UberDuck and ElevenLabs as separate fake columns, laundered metadata.
file_path = '/home/ubuntu/data/wavefake_data/LJ_metadata_16KHz_Laundered.csv'
results = run(['UberDuck', 'ElevenLabs'], file_path, '16KHz_ElevenLabs_and_UberDuck_Laundered')
results['laundered'] = 1
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
agg_df = pd.concat([agg_df, results])


In [None]:
# Experiment: ElevenLabs, UberDuck and 'RandWaveFake' together, laundered metadata.
file_path = '/home/ubuntu/data/wavefake_data/LJ_metadata_16KHz_Laundered.csv'
results = run(['ElevenLabs', 'UberDuck', 'RandWaveFake'], file_path, '16KHz_Mix_Laundered')
results['laundered'] = 1
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
agg_df = pd.concat([agg_df, results])


In [None]:
# Experiment: the derived 'EL_UD_Fake' column, laundered metadata.
file_path = '/home/ubuntu/data/wavefake_data/LJ_metadata_16KHz_Laundered.csv'
results = run(['EL_UD_Fake'], file_path, '16KHz_EL_UD_Fake_Laundered_B')
results['laundered'] = 1
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
agg_df = pd.concat([agg_df, results])


In [None]:
# Experiment: the derived 'Fake' column, laundered metadata.
file_path = '/home/ubuntu/data/wavefake_data/LJ_metadata_16KHz_Laundered.csv'
results = run(['Fake'], file_path, '16KHz_Fake_Laundered_B')
results['laundered'] = 1
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
agg_df = pd.concat([agg_df, results])


In [None]:
# Persist the aggregated results of all experiments; the index is not written.
agg_df.to_csv(
    path_or_buf='/home/ubuntu/data/results/6-30-23_2/agg_results.csv',
    index=False,
)