# Sampling and Mixing data for experiments

- **TODO** (noted 03/19/2023): the `.mp4` issues still need to be fixed.

In [1]:
import opensmile 
import pandas as pd
import numpy as np
import os
import random
from tqdm import tqdm
from pathlib import Path
base_path = "/home/ubuntu/"

In [2]:
# Locations of the four metadata CSVs (one per audio variant), all under base_path
metadata_original = f"{base_path}data/wavefake_data/LJ_metadata_Original.csv"
metadata_16k = f"{base_path}data/wavefake_data/LJ_metadata_16000KHz.csv"
metadata_16k64 = f"{base_path}data/wavefake_data/LJ_metadata_16000KHz_AAC_64K.csv"
metadata_16k196 = f"{base_path}data/wavefake_data/LJ_metadata_16000KHz_AAC_196K.csv"

In [3]:
# Read each metadata table and discard its 'Unnamed: 0' column
# (presumably a leftover row index written by an earlier to_csv -- TODO confirm)
df_metadata_original = pd.read_csv(metadata_original).drop('Unnamed: 0', axis='columns')
df_metadata_16k = pd.read_csv(metadata_16k).drop('Unnamed: 0', axis='columns')
df_metadata_16k64 = pd.read_csv(metadata_16k64).drop('Unnamed: 0', axis='columns')
df_metadata_16k196 = pd.read_csv(metadata_16k196).drop('Unnamed: 0', axis='columns')

In [4]:
# Keep only the part of each id before its first '.' (presumably a file extension -- TODO confirm)
df_metadata_16k64['id'] = df_metadata_16k64['id'].map(lambda clip_id: clip_id.partition('.')[0])

In [5]:
# Show the column layout of the original metadata table
df_metadata_original.columns

Index(['id', 'Main_ID', 'Secondary_ID', 'Real', 'Full_Band_MelGan', 'HifiGan',
       'MelGan', 'MelGanLarge', 'Multi_Band_MelGan', 'Parallel_WaveGan',
       'Waveglow', 'ElevenLabs', 'transcript_1', 'transcript_2'],
      dtype='object')

In [6]:
# Count missing values per column (in the recorded run, ElevenLabs is the only
# generator column with missing entries)
df_metadata_original.isna().sum()

id                       0
Main_ID                  0
Secondary_ID             0
Real                     0
Full_Band_MelGan         0
HifiGan                  0
MelGan                   0
MelGanLarge              0
Multi_Band_MelGan        0
Parallel_WaveGan         0
Waveglow                 0
ElevenLabs           11389
transcript_1             0
transcript_2            16
dtype: int64

In [7]:
# Check that every variant table lists the same ids, row for row, as the original table
tuple(
    np.all(df_metadata_original['id'] == variant['id'])
    for variant in (df_metadata_16k, df_metadata_16k64, df_metadata_16k196)
)

(True, True, True)

In [8]:
def process_dataframe_for_arch(df, arch, n_sample=800, random_state=42):
    """Sample rows for one architecture and lay them out as real/fake pairs.

    ``n_sample`` rows whose ``arch`` column is not null are drawn from ``df``
    and then REMOVED from ``df`` in place, so repeated calls on the same
    frame never pick the same row twice.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'id', 'Real' and ``arch``. Mutated in place.
    arch : str
        Name of the column holding this architecture's file entries.
    n_sample : int
        Number of rows to draw (without replacement).
    random_state : int
        Seed forwarded to ``DataFrame.sample``.

    Returns
    -------
    pandas.DataFrame
        ``2 * n_sample`` rows with columns 'index', 'id', 'file', 'type',
        'fake'. The first half holds the 'Real' entries (fake=0), the second
        half the ``arch`` entries (fake=1); both halves carry type=``arch``.
        'index' holds the row labels the samples had in ``df``.
    """
    # draw the rows, then take them out of the caller's frame
    picked = df.loc[df[arch].notna()].sample(n=n_sample, random_state=random_state)
    df.drop(index=picked.index, inplace=True)

    def _labelled(source_col, fake_flag):
        # one half of the result: ids plus the chosen file column, tagged
        return (
            picked[['id', source_col]]
            .rename(columns={source_col: 'file'})
            .assign(type=arch, fake=fake_flag)
        )

    # real rows first, fake rows second; reset_index() keeps the old labels as 'index'
    return pd.concat([_labelled('Real', 0), _labelled(arch, 1)]).reset_index()

## Without transcoding samples

In [9]:
# Work on a copy: process_dataframe_for_arch removes the rows it samples from
# the frame it is given, and the original metadata must stay intact.
df_all_samples = df_metadata_original.copy()

# Columns 4..11 are the generator architectures. Reverse them and print the
# list to check that ElevenLabs is sampled first, since it has limited data.
archs = df_all_samples.columns.to_list()[4:12][::-1]
print(archs)

# Sample every architecture in turn; each call also shrinks df_all_samples,
# so no id is shared between architectures.
per_arch_frames = []
for arch in archs:
    per_arch_frames.append(process_dataframe_for_arch(df_all_samples, arch))

# One concat with a fresh 0..n-1 index, then drop the helper's leftover
# 'index' column by name (rather than by position).
df_selected_samples = pd.concat(per_arch_frames, ignore_index=True).drop(columns=['index'])

['ElevenLabs', 'Waveglow', 'Parallel_WaveGan', 'Multi_Band_MelGan', 'MelGanLarge', 'MelGan', 'HifiGan', 'Full_Band_MelGan']


In [10]:
#sanity check
# sanity check: number of selected rows per architecture
df_selected_samples['type'].value_counts()

ElevenLabs           1600
Waveglow             1600
Parallel_WaveGan     1600
Multi_Band_MelGan    1600
MelGanLarge          1600
MelGan               1600
HifiGan              1600
Full_Band_MelGan     1600
Name: type, dtype: int64

## Generate OpenSmile Features

In [11]:
# openSMILE extractor configured with the ComParE_2016 feature set at the
# Functionals feature level; passed to generate_smile_features below.
smile_functionals_ComParE = opensmile.Smile(
    feature_set=opensmile.FeatureSet.ComParE_2016,
    feature_level=opensmile.FeatureLevel.Functionals)

In [12]:
def generate_smile_features(df_sample, smile_object, save=False, savefile=None):
    
    print("Generating openSMILE features\n")
    
    smile_df = pd.DataFrame()
    
    for i in tqdm(range(df_sample.shape[0])):
        
        df = smile_object.process_file(df_sample.file[i]).reset_index()
        
        duration = df['end'] - df['start']
        duration = duration.astype('timedelta64[ms]')/1000
        df.insert(loc=1, column='duration(seconds)', value=duration)
        
        df.drop(columns=['start', 'end'], inplace=True)
        
        smile_df = pd.concat([smile_df, df_sample.merge(df, on='file')]).reset_index(drop=True)
        
    if save==True:
        smile_df.to_csv(savefile, index=False)
        print("\nsaved to {}".format(savefile))        
        
    if save==False:
        return smile_df

In [13]:
# Destination CSV for the extracted features, then run the extraction and save it there
savefile = f"{base_path}testing-code/opensmile-feature-importance/smile_dfs/0319-lj_experimental_data_v2.csv"
generate_smile_features(
    df_selected_samples,
    smile_functionals_ComParE,
    save=True,
    savefile=savefile,
)

Generating openSMILE features



100%|██████████| 12800/12800 [49:16<00:00,  4.33it/s] 



saved to /home/ubuntu/testing-code/opensmile-feature-importance/smile_dfs/0319-lj_experimental_data_v2.csv
