# LJ Data Loading Example 

This notebook demonstrates the features available in the LJ Data Loader and provides an example the team can reference when creating their train/dev/test splits.

In [1]:
import os
import sys

# Root of the MultiModalDeepFake checkout, added to sys.path so the
# `packages.LJDataLoader` import in the next cell can be resolved.
# Defaults to the original hard-coded location; set MULTIMODAL_DEEPFAKE_ROOT
# to override it on machines where the repository lives elsewhere.
# `or` (rather than a .get default) also covers an empty environment variable.
REPO_ROOT = os.environ.get("MULTIMODAL_DEEPFAKE_ROOT") or "/home/ubuntu/MultiModalDeepFake"

# Guard the append so re-running this cell does not stack duplicate entries.
if REPO_ROOT not in sys.path:
    sys.path.append(REPO_ROOT)

In [2]:
from packages.LJDataLoader import LJDataLoader

  from pandas.core.computation.check import NUMEXPR_INSTALLED


## Initialize the Data Loader Object

In [3]:
# Metadata CSV handed to the loader. Judging by the head() previews further down,
# it appears to hold one row per LJ Speech id, with one path column per audio
# source plus transcript columns.
# NOTE(review): absolute path from the original machine — adjust for your environment.
file_path = '/home/ubuntu/data/wavefake_data/LJ_metadata_16000KHz.csv'
# Later cells read the loaded table through `loader.metadata`.
loader = LJDataLoader(data_path=file_path)

## Splitting the Data
First, split the data into train, dev, and test sets. You can optionally provide the percentages for each split; if you do not, they default to 60%, 20%, and 20%.

This will add a column called "type" with the values train, test, and dev

In [4]:
# Split using the loader's defaults (no percentages are passed here).
# NOTE(review): the rows in the preview below look shuffled and no seed is set in
# this notebook — confirm whether LJDataLoader offers a way to make the split reproducible.
loader.splitData()

In [5]:
# Preview the first rows; the split assignment shows up in the "type" column.
loader.metadata.head()

Unnamed: 0.1,index,Unnamed: 0,id,Main_ID,Secondary_ID,Real,Full_Band_MelGan,HifiGan,MelGan,MelGanLarge,Multi_Band_MelGan,Parallel_WaveGan,Waveglow,ElevenLabs,transcript_1,transcript_2,type
216,1379,1379,LJ005-0261,LJ005,261,/home/ubuntu/data/wavefake_data/LJSpeech_1.1/w...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,"As a means of securing this uniformity,","As a means of securing this uniformity,",train
317,2221,2221,LJ008-0254,LJ008,254,/home/ubuntu/data/wavefake_data/LJSpeech_1.1/w...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,It was followed by the deep and solemn booming...,It was followed by the deep and solemn booming...,train
260,1767,1767,LJ007-0042,LJ007,42,/home/ubuntu/data/wavefake_data/LJSpeech_1.1/w...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,"so that men in the most awful situation, daily...","so that men in the most awful situation, daily...",train
205,1300,1300,LJ005-0182,LJ005,182,/home/ubuntu/data/wavefake_data/LJSpeech_1.1/w...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,that for long no idea of interfering with them...,that for long no idea of interfering with them...,train
35,35,35,LJ001-0036,LJ001,36,/home/ubuntu/data/wavefake_data/LJSpeech_1.1/w...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,But about the same year Mentelin at Strasburg ...,But about the same year Mentelin at Strasburg ...,train


## Column Combination

There may be instances where you want to mix data from several architecture columns.

For example, we know that many of the WaveFake architectures are very similar, so you may want to randomly sample from these architectures and store the result in a new column.

In [6]:
# The seven WaveFake architecture columns to sample from. ElevenLabs is not part of
# this list; it is passed as its own fake column when the final frame is generated.
source_architectures = ['Full_Band_MelGan', 'HifiGan', 'MelGan', 'MelGanLarge', 'Multi_Band_MelGan', 'Parallel_WaveGan', 'Waveglow']
# Name of the new column that will receive the sampled entries.
new_col_name = 'RandWaveFake'

In [7]:
# Create the `new_col_name` column by randomly sampling from the source architecture columns.
# NOTE(review): no seed is set in this notebook, so the sampled values presumably
# differ between runs — confirm against LJDataLoader.
loader.selectRandomArchitecture(target_col=new_col_name, source_cols=source_architectures)

In [8]:
# Notice the new RandWaveFake column at the far right of the preview.
loader.metadata.head()

Unnamed: 0.1,index,Unnamed: 0,id,Main_ID,Secondary_ID,Real,Full_Band_MelGan,HifiGan,MelGan,MelGanLarge,Multi_Band_MelGan,Parallel_WaveGan,Waveglow,ElevenLabs,transcript_1,transcript_2,type,RandWaveFake
216,1379,1379,LJ005-0261,LJ005,261,/home/ubuntu/data/wavefake_data/LJSpeech_1.1/w...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,"As a means of securing this uniformity,","As a means of securing this uniformity,",train,/home/ubuntu/data/wavefake_data/generated_audi...
317,2221,2221,LJ008-0254,LJ008,254,/home/ubuntu/data/wavefake_data/LJSpeech_1.1/w...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,It was followed by the deep and solemn booming...,It was followed by the deep and solemn booming...,train,/home/ubuntu/data/wavefake_data/generated_audi...
260,1767,1767,LJ007-0042,LJ007,42,/home/ubuntu/data/wavefake_data/LJSpeech_1.1/w...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,"so that men in the most awful situation, daily...","so that men in the most awful situation, daily...",train,/home/ubuntu/data/wavefake_data/generated_audi...
205,1300,1300,LJ005-0182,LJ005,182,/home/ubuntu/data/wavefake_data/LJSpeech_1.1/w...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,that for long no idea of interfering with them...,that for long no idea of interfering with them...,train,/home/ubuntu/data/wavefake_data/generated_audi...
35,35,35,LJ001-0036,LJ001,36,/home/ubuntu/data/wavefake_data/LJSpeech_1.1/w...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,But about the same year Mentelin at Strasburg ...,But about the same year Mentelin at Strasburg ...,train,/home/ubuntu/data/wavefake_data/generated_audi...


## Generate The Final DF

To get the final dataframe, provide the name of the real column and the names of the fake columns.

We may not want both the real and the fake audio clips for the same id going into the training data. To account for this, there is a boolean parameter called "single_id_entry". When turned on, it samples one audio clip per id.
- This will, of course, significantly reduce the total number of training and testing data points.

In [9]:
# Example where single_id_entry is turned off (the argument is not passed).
# The result shown below has the columns type, path, label and multiclass_label.
# NOTE(review): in that output label looks like 0 = real / 1 = fake, and
# multiclass_label presumably identifies the source column — confirm against LJDataLoader.
df = loader.generateFinalDataFrame(real_col='Real', fake_cols=['RandWaveFake', 'ElevenLabs'])

In [10]:
# Display the (truncated) frame; in this run the index goes from 0 to 5125.
df

Unnamed: 0,type,path,label,multiclass_label
0,train,/home/ubuntu/data/wavefake_data/LJSpeech_1.1/w...,0,0
1,train,/home/ubuntu/data/wavefake_data/LJSpeech_1.1/w...,0,0
2,train,/home/ubuntu/data/wavefake_data/LJSpeech_1.1/w...,0,0
3,train,/home/ubuntu/data/wavefake_data/LJSpeech_1.1/w...,0,0
4,train,/home/ubuntu/data/wavefake_data/LJSpeech_1.1/w...,0,0
...,...,...,...,...
5122,test,/home/ubuntu/data/wavefake_data/generated_audi...,1,2
5123,test,/home/ubuntu/data/wavefake_data/generated_audi...,1,2
5124,test,/home/ubuntu/data/wavefake_data/generated_audi...,1,2
5125,test,/home/ubuntu/data/wavefake_data/generated_audi...,1,2


In [11]:
# Example where single_id_entry is turned on, so one audio clip is sampled per id.
# This rebinds `df`, replacing the frame built in the previous example.
df = loader.generateFinalDataFrame(real_col='Real', fake_cols=['RandWaveFake', 'ElevenLabs'], single_id_entry=True)

In [12]:
# Display the (truncated) frame; the rows appear to keep their metadata index
# (e.g. 216, 317, 260) instead of being renumbered from 0.
df

Unnamed: 0,path,label,multiclass_label,type
216,/home/ubuntu/data/wavefake_data/generated_audi...,1,2,train
317,/home/ubuntu/data/wavefake_data/generated_audi...,1,2,train
260,/home/ubuntu/data/wavefake_data/generated_audi...,1,2,train
205,/home/ubuntu/data/wavefake_data/LJSpeech_1.1/w...,0,0,train
35,/home/ubuntu/data/wavefake_data/generated_audi...,1,2,train
...,...,...,...,...
119,/home/ubuntu/data/wavefake_data/generated_audi...,1,1,test
1009,/home/ubuntu/data/wavefake_data/generated_audi...,1,1,test
1035,/home/ubuntu/data/wavefake_data/generated_audi...,1,2,test
1529,/home/ubuntu/data/wavefake_data/generated_audi...,1,1,test
