# LJ Data Loading Example 

The purpose of this file is to show the available features in the LJ Data Loader and provide an example that the team can reference when creating their train/test/splits

In [1]:
import sys
sys.path.append("/home/ubuntu/MultiModalDeepFake")

In [2]:
from packages.LJDataLoader import LJDataLoader

  from pandas.core.computation.check import NUMEXPR_INSTALLED


## Initialize the Data Loader Object

In [3]:
file_path = '/home/ubuntu/data/wavefake_data/LJ_metadata_16000KHz.csv'
loader = LJDataLoader(data_path=file_path)

## Splitting the Data
You first want to split the data into train, dev, and test. You can provide percentages for these if you want, but it will default to 60%, 20%, 20%

This will add a column called "type" with the values train, test, and dev

In [4]:
loader.splitData()

13055
7833 10444


In [5]:
loader.metadata.head()

Unnamed: 0.1,index,Unnamed: 0,id,Main_ID,Secondary_ID,Real,Full_Band_MelGan,HifiGan,MelGan,MelGanLarge,Multi_Band_MelGan,Parallel_WaveGan,Waveglow,ElevenLabs,UberDuck,transcript_1,transcript_2,type
0,12006,12006,LJ046-0206,LJ046,206,/home/ubuntu/data/wavefake_data/LJSpeech_1.1/w...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,and at the working level with personnel of the...,and at the working level with personnel of the...,train
1,5459,5459,LJ018-0323,LJ018,323,/home/ubuntu/data/wavefake_data/LJSpeech_1.1/w...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,"A year or two after, when the prisoners were u...","A year or two after, when the prisoners were u...",train
2,6282,6282,LJ022-0032,LJ022,32,/home/ubuntu/data/wavefake_data/LJSpeech_1.1/w...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,"to go fishing or back home to Hyde Park, so th...","to go fishing or back home to Hyde Park, so th...",train
3,3953,3953,LJ014-0190,LJ014,190,/home/ubuntu/data/wavefake_data/LJSpeech_1.1/w...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,"Marley, disturbed, picked up a cigar and parce...","Marley, disturbed, picked up a cigar and parce...",train
4,221,221,LJ002-0036,LJ002,36,/home/ubuntu/data/wavefake_data/LJSpeech_1.1/w...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,1. The male debtors' side consisted of a yard ...,one. The male debtors' side consisted of a yar...,train


## Column Combination

There may be instances where you may want to mix the column archicture data. 

For example, we know that many of the WaveFake Architectures are very similar. So you may want to randomly sample from each of the architectures and create a new column.

In [6]:
source_architectures = ['Full_Band_MelGan', 'HifiGan', 'MelGan', 'MelGanLarge', 'Multi_Band_MelGan', 'Parallel_WaveGan', 'Waveglow']
new_col_name = 'RandWaveFake'

In [7]:
loader.selectRandomArchitecture(target_col=new_col_name, source_cols=source_architectures)

In [8]:
#Notice a new column called RandWaveFake
loader.metadata.head()

Unnamed: 0.1,index,Unnamed: 0,id,Main_ID,Secondary_ID,Real,Full_Band_MelGan,HifiGan,MelGan,MelGanLarge,Multi_Band_MelGan,Parallel_WaveGan,Waveglow,ElevenLabs,UberDuck,transcript_1,transcript_2,type,RandWaveFake
0,12006,12006,LJ046-0206,LJ046,206,/home/ubuntu/data/wavefake_data/LJSpeech_1.1/w...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,and at the working level with personnel of the...,and at the working level with personnel of the...,train,/home/ubuntu/data/wavefake_data/generated_audi...
1,5459,5459,LJ018-0323,LJ018,323,/home/ubuntu/data/wavefake_data/LJSpeech_1.1/w...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,"A year or two after, when the prisoners were u...","A year or two after, when the prisoners were u...",train,/home/ubuntu/data/wavefake_data/generated_audi...
2,6282,6282,LJ022-0032,LJ022,32,/home/ubuntu/data/wavefake_data/LJSpeech_1.1/w...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,"to go fishing or back home to Hyde Park, so th...","to go fishing or back home to Hyde Park, so th...",train,/home/ubuntu/data/wavefake_data/generated_audi...
3,3953,3953,LJ014-0190,LJ014,190,/home/ubuntu/data/wavefake_data/LJSpeech_1.1/w...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,"Marley, disturbed, picked up a cigar and parce...","Marley, disturbed, picked up a cigar and parce...",train,/home/ubuntu/data/wavefake_data/generated_audi...
4,221,221,LJ002-0036,LJ002,36,/home/ubuntu/data/wavefake_data/LJSpeech_1.1/w...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,/home/ubuntu/data/wavefake_data/generated_audi...,1. The male debtors' side consisted of a yard ...,one. The male debtors' side consisted of a yar...,train,/home/ubuntu/data/wavefake_data/generated_audi...


## Generate The Final DF

To get the final dataframe, you need to provide the name of the real column and fake columns

We may not want both the real and fake audio clips going into the training data. To account for this, there is a boolean parameter called "single_id_entry". When turned on, it will sample one audio clip per id. 
- This will of course result in a big decrease in the number of total training and testing data points. 

In [9]:
#example where Single ID Entry is turned off
df = loader.generateFinalDataFrame(real_col='Real', fake_cols=['RandWaveFake', 'ElevenLabs'])

In [10]:
df

Unnamed: 0,type,id,architecture,path,label,multiclass_label
0,train,LJ046-0206,Real,/home/ubuntu/data/wavefake_data/LJSpeech_1.1/w...,0,0
1,train,LJ018-0323,Real,/home/ubuntu/data/wavefake_data/LJSpeech_1.1/w...,0,0
2,train,LJ022-0032,Real,/home/ubuntu/data/wavefake_data/LJSpeech_1.1/w...,0,0
3,train,LJ014-0190,Real,/home/ubuntu/data/wavefake_data/LJSpeech_1.1/w...,0,0
4,train,LJ002-0036,Real,/home/ubuntu/data/wavefake_data/LJSpeech_1.1/w...,0,0
...,...,...,...,...,...,...
39160,test,LJ019-0132,ElevenLabs,/home/ubuntu/data/wavefake_data/generated_audi...,1,2
39161,test,LJ016-0426,ElevenLabs,/home/ubuntu/data/wavefake_data/generated_audi...,1,2
39162,test,LJ019-0008,ElevenLabs,/home/ubuntu/data/wavefake_data/generated_audi...,1,2
39163,test,LJ003-0232,ElevenLabs,/home/ubuntu/data/wavefake_data/generated_audi...,1,2


In [13]:
#example where Single ID Entry is turned on
df = loader.generateFinalDataFrame(real_col='Real', fake_cols=['RandWaveFake', 'ElevenLabs', 'UberDuck'])

In [16]:
df.architecture.value_counts()

Real            13055
RandWaveFake    13055
ElevenLabs      13055
UberDuck        13055
Name: architecture, dtype: int64

# LJ Data Loading Example 

The purpose of this file is to show the available features in the LJ Data Loader and provide an example that the team can reference when creating their train/test/splits

In [None]:
import sys
sys.path.append("/home/ubuntu/MultiModalDeepFake")

In [None]:
from packages.LJDataLoader import LJDataLoader

## Initialize the Data Loader Object

In [None]:
file_path = '/home/ubuntu/data/wavefake_data/LJ_metadata_16000KHz.csv'
loader = LJDataLoader(data_path=file_path)

## Splitting the Data
You first want to split the data into train, dev, and test. You can provide percentages for these if you want, but it will default to 60%, 20%, 20%

This will add a column called "type" with the values train, test, and dev

In [None]:
loader.splitData()

In [None]:
loader.metadata.head()

## Column Combination

There may be instances where you may want to mix the column archicture data. 

For example, we know that many of the WaveFake Architectures are very similar. So you may want to randomly sample from each of the architectures and create a new column.

In [None]:
source_architectures = ['Full_Band_MelGan', 'HifiGan', 'MelGan', 'MelGanLarge', 'Multi_Band_MelGan', 'Parallel_WaveGan', 'Waveglow']
new_col_name = 'RandWaveFake'

In [None]:
loader.selectRandomArchitecture(target_col=new_col_name, source_cols=source_architectures)

In [None]:
#Notice a new column called RandWaveFake
loader.metadata.head()

## Generate The Final DF

To get the final dataframe, you need to provide the name of the real column and fake columns

We may not want both the real and fake audio clips going into the training data. To account for this, there is a boolean parameter called "single_id_entry". When turned on, it will sample one audio clip per id. 
- This will of course result in a big decrease in the number of total training and testing data points. 

In [None]:
#example where Single ID Entry is turned off
df = loader.generateFinalDataFrame(real_col='Real', fake_cols=['RandWaveFake', 'ElevenLabs'])

In [None]:
df

In [None]:
#example where Single ID Entry is turned on
df = loader.generateFinalDataFrame(real_col='Real', fake_cols=['RandWaveFake', 'ElevenLabs'], single_id_entry=True)

In [None]:
df