In [13]:
import opensmile 
import pandas as pd
import numpy as np
import os
import random
from tqdm import tqdm
# Root directory under which the cells below look for the datasets and write
# their outputs. Defaults to the original machine's home directory; set the
# WAVEFAKE_BASE_PATH environment variable to run the notebook elsewhere.
# Joining with "" guarantees a trailing separator, which the plain
# `base_path + '...'` concatenations used later depend on.
base_path = os.path.join(os.environ.get("WAVEFAKE_BASE_PATH", "/home/ubuntu/"), "")

## Sampling and Mixing Data

In [2]:
# Shared prefixes of the WaveFake data tree (built by plain concatenation,
# so base_path must end with a path separator).
_wavefake_root = base_path + 'data/wavefake_data/'
_generated_root = _wavefake_root + 'generated_audio/'

# Directory used as the "real" audio source (first entry of data_paths).
ljs_real_path = _wavefake_root + 'LJSpeech_1.1/wavs/Original'

# One directory of generated audio per generator.
ljs_fbm_path = _generated_root + 'ljspeech_full_band_melgan/Original'
ljs_mg_path = _generated_root + 'ljspeech_melgan/Original'
ljs_mgl_path = _generated_root + 'ljspeech_melgan_large/Original'
ljs_mbmg_path = _generated_root + 'ljspeech_multi_band_melgan/Original'
ljs_pwg_path = _generated_root + 'ljspeech_parallel_wavegan/Original'
ljs_wg_path = _generated_root + 'ljspeech_waveglow/Original'
ljs_hfg_path = _generated_root + 'ljspeech_hifiGAN/Original'

In [3]:
# Order matters: generate_sample treats the first entry as the real-audio
# directory and every following entry as a generated-audio directory.
data_paths = [
    ljs_real_path,
    ljs_fbm_path,
    ljs_mg_path,
    ljs_mgl_path,
    ljs_mbmg_path,
    ljs_pwg_path,
    ljs_wg_path,
    ljs_hfg_path,
]

In [4]:
def generate_sample(paths, samples=700, seed=None):
    """Draw real audio files and collect their generated counterparts.

    ``int(samples / 2)`` files are drawn at random from the ``.wav`` files in
    ``paths[0]``. The drawn files are then cut into ``len(paths) - 1``
    contiguous slices, one per remaining directory, and for every real file in
    a slice each ``.wav`` file of that directory whose name contains the real
    file's stem is collected as a fake file.

    Parameters
    ----------
    paths : list of str
        ``paths[0]`` is the directory of real audio; every following entry is
        a directory of generated audio.
    samples : int, default 700
        Target total number of files (real + fake).
    seed : int or None, default None
        When given, sampling uses a private ``random.Random(seed)`` so the
        same files are drawn on every call. ``None`` keeps using the
        module-level ``random`` state.

    Returns
    -------
    dict
        ``{'real_files': [...], 'fake_files': [...]}``, each a list of paths
        joined onto the corresponding directory.

    Raises
    ------
    ValueError
        From ``random.sample`` when the real directory holds fewer ``.wav``
        files than requested.
    """
    rng = random if seed is None else random.Random(seed)

    # split for samples
    re_samples = int(samples / 2)
    fk_samples = samples - re_samples

    # Sorted listing: os.listdir order is arbitrary, and a stable order is
    # needed for a seed to select the same files every time.
    real_path = paths[0]
    real_names = sorted(f for f in os.listdir(real_path) if f.endswith(".wav"))
    real_files = rng.sample(real_names, re_samples)
    real_files_all = [os.path.join(real_path, f) for f in real_files]

    # iteratively sample and store fake files
    n_fake_dirs = len(paths) - 1
    fake_files_all = []

    for i, fake_path in enumerate(paths[1:]):
        fake_names = sorted(f for f in os.listdir(fake_path) if f.endswith(".wav"))

        # Integer arithmetic keeps the slice bounds exact (no float rounding).
        sr_low = i * fk_samples // n_fake_dirs
        sr_high = (i + 1) * fk_samples // n_fake_dirs

        for real_name in real_files[sr_low:sr_high]:
            # splitext drops only the final extension, so stems containing
            # dots are not truncated.
            stem = os.path.splitext(real_name)[0]
            # Substring match: a fake file belongs to a real one when its
            # name contains the real file's stem.
            fake_files_all.extend(
                os.path.join(fake_path, ff) for ff in fake_names if stem in ff
            )

    return {'real_files': real_files_all, 'fake_files': fake_files_all}

In [19]:
# Fix the module-level RNG state before sampling so a re-run draws the same
# files (as long as the directory listings come back in the same order).
random.seed(42)
data = generate_sample(data_paths, samples=11200)

In [20]:
# Sanity check: how many real and fake files ended up in the sample.
tuple(len(data[kind]) for kind in ('real_files', 'fake_files'))

(5600, 5600)

## Generate openSMILE Features

In [21]:
# Feature extractor configured with the ComParE_2016 feature set at the
# Functionals feature level; passed to generate_df below.
compare_2016_set = opensmile.FeatureSet.ComParE_2016
functionals_level = opensmile.FeatureLevel.Functionals
smile_functionals_ComParE = opensmile.Smile(
    feature_set=compare_2016_set,
    feature_level=functionals_level,
)

In [22]:
def _extract_labelled_features(file_paths, smile_object, label):
    """Run ``smile_object.process_file`` on every path and tag the rows with ``label``."""
    # Collect the per-file frames and concatenate once; growing a DataFrame
    # with pd.concat inside the loop copies all previous rows on every step.
    frames = [smile_object.process_file(path) for path in tqdm(file_paths)]
    # pd.concat raises on an empty list, so fall back to an empty frame.
    features = pd.concat(frames) if frames else pd.DataFrame()
    features['label'] = label
    return features


def generate_df(files_dict, smile_object, save=False, savefile=None):
    """Extract features for real and fake files into one labelled DataFrame.

    Parameters
    ----------
    files_dict : dict
        Must contain ``'real_files'`` and ``'fake_files'``, each a list of
        file paths (the structure returned by ``generate_sample``).
    smile_object : object
        Feature extractor exposing ``process_file(path)`` that returns a
        DataFrame for one file.
    save : bool, default False
        When truthy, write the result to ``savefile`` as CSV instead of
        returning it.
    savefile : str or None, default None
        Destination CSV path; required when ``save`` is truthy.

    Returns
    -------
    pandas.DataFrame or None
        The combined frame (real rows labelled 0, fake rows labelled 1, index
        reset into columns) when ``save`` is falsy; ``None`` after saving.

    Raises
    ------
    ValueError
        If ``save`` is truthy but no ``savefile`` was given.
    """
    # Validate before the (potentially very long) extraction, not after it.
    if save and savefile is None:
        raise ValueError("savefile must be provided when save=True")

    print("generating features for real files\n")
    real_df = _extract_labelled_features(files_dict['real_files'], smile_object, 0)

    print("\ngenerating features for fake files\n")
    fake_df = _extract_labelled_features(files_dict['fake_files'], smile_object, 1)

    # reset_index turns the index produced by process_file into ordinary
    # columns so it is kept when the CSV is written with index=False.
    df = pd.concat([real_df, fake_df]).reset_index()

    if save:
        df.to_csv(savefile, index=False)
        print("saved to {}".format(savefile))
        return None

    return df

In [23]:
savefile = base_path + 'testing-code/opensmile-feature-importance/smile_dfs/' + 'LJ_sample_11200.csv'
# Create the output folder up front: the CSV is only written after all
# features have been extracted, and a missing folder would fail the write
# at the very end of that long run.
os.makedirs(os.path.dirname(savefile), exist_ok=True)
generate_df(data, smile_functionals_ComParE, save=True, savefile=savefile)

generating features for real files



100%|██████████| 5600/5600 [14:14<00:00,  6.56it/s]



generating features for fake files



100%|██████████| 5600/5600 [14:12<00:00,  6.57it/s]


saved to /home/ubuntu/testing-code/opensmile-feature-importance/smile_dfs/LJ_sample_11200.csv
