## Create Training Set:

## Create Training Set:

In [60]:
from fastai.vision import *
from tqdm import tqdm
import cv2

In [61]:
source_path = Path('../data/')
save_path = Path('../val_data/')

## Create Validation Set:

### Videos from folder 0->3

In [62]:
def get_annots(folder):
    f = get_files(folder, extensions=['.json']) # Extract the metadata
    a = pd.read_json(f[0]).T
    a.reset_index(inplace=True)
    a.rename({'index':'fname'}, axis=1, inplace=True)
    a.fname = folder.name + '/' + a.fname.astype(str)
    return a

In [63]:
def get_metadata(SOURCE, include=None, exclude=None):
    """
    extract the metadata from all the folders contained in SOURCE.
    """
    meta = []
    
    for i in SOURCE.iterdir(): # iterate over the files in SOURCE
        if i.is_dir() and (i/'metadata.json').is_file(): # Get only the directories
            if include is not None and i.name in include: # Check if in include
                print(f'Extracting data from the {i.name} folder')
                a = get_annots(i)
                meta.append(a)
            if exclude is not None and i.name not in exclude: # Check if not in exlcude
                print(f'Extracting data from the {i.name} folder')
                a = get_annots(i)
                meta.append(a)
    
    metadata = pd.concat(meta)
    metadata.reset_index(drop=True, inplace=True)
    
    return metadata

In [64]:
source_path = Path('../data/')

In [65]:
#train_meta = get_metadata(source_path, exclude=['dfdc_train_part_0', 'dfdc_train_part_1', 'dfdc_train_part_2', 'dfdc_train_part_3'])
val_meta = get_metadata(source_path, include=['dfdc_train_part_0', 'dfdc_train_part_1', 'dfdc_train_part_2', 'dfdc_train_part_3'])

Extracting data from the dfdc_train_part_2 folder
Extracting data from the dfdc_train_part_3 folder
Extracting data from the dfdc_train_part_0 folder
Extracting data from the dfdc_train_part_1 folder


In [66]:
val_meta.head()

Unnamed: 0,fname,label,split,original
0,dfdc_train_part_2/qyyebirxwe.mp4,FAKE,train,ejhhokmvpe.mp4
1,dfdc_train_part_2/ntjlknlcvn.mp4,FAKE,train,nthpnwylxo.mp4
2,dfdc_train_part_2/qivpypiwlp.mp4,FAKE,train,hszwwswewp.mp4
3,dfdc_train_part_2/lpkgabskbw.mp4,FAKE,train,rnxzqumvvl.mp4
4,dfdc_train_part_2/vctemjbusz.mp4,FAKE,train,sznkemeqro.mp4


In [67]:
#export
def runnit(f):
    def _func(*args, **kwargs):
        command = f(*args, **kwargs)
        p = subprocess.run(command.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        return p
    return _func

In [68]:
#export
def _ffmpeg_web_defaults():
    '''
    These are some reasonable values for uploading. i.e. YouTube, etc.
    '''
    return dict(video_encoder='libx264', video_bitrate='1.5M', fps=30, scale=.5, crf=23, #17-28
                audio_encoder='aac', audio_bitrate='128k')

def _ffmpeg_defaults():
    return dict(video_encoder=None, video_bitrate=None, fps=None, scale=None, crf=None,
                audio_encoder=None, audio_bitrate=None)

def _ffmpeg_fmts():
    "ffmpeg options syntax"
    return dict(video_encoder='-c:v {video_encoder:s}', 
                video_bitrate='-b:v {video_bitrate:s}', 
                fps='-r {fps:d}', 
                #scale='-vf scale=iw*{scale:.2f}:ih*{scale:.2f}', 
                scale='-vf scale=iw*{scale:.2f}:-1',
                crf='-crf {crf:d}',
                audio_bitrate='-b:a {audio_bitrate}', 
                audio_encoder='-c:a {audio_encoder}')

#@show_vid_info
@runnit
def run_ffmpeg(fpath_from=None, fpath_to=None, **kwargs):
    '''
    Run ffmpeg
    '''
    ps = _ffmpeg_defaults()
    ps.update(kwargs)
    pstr = []
    for n, s in _ffmpeg_fmts().items():
        if ps[n] is None: pstr.append('')
        else: pstr.append(s.format(**ps))
    pstr = ' '.join(pstr)
    return f'ffmpeg -i {fpath_from} {pstr} {fpath_to}'

In [101]:
def create_degraded_videos(source_path, save_path, metadata):
    
        fnames = metadata.fname
        
        new_meta = metadata.copy()
        new_meta["degradation"] = None
    
        for i,fname in tqdm(enumerate(fnames)):
            degraded = random.randint(1,3)
            
            if degraded==1: # If we have degraded==1 (33% chance)
                f = Path(f'{save_path}/{Path(fname).parents[0].name}').mkdir(parents=True, exist_ok=True)
                
                !cp {source_path/fname} {save_path/fname}
                
                new_meta.degradation[i] = 'copy'
            
            else: # 66% chance
                degradation = random.randint(1,3)
                
                f = Path(f'{save_path}/{Path(fname).parents[0].name}').mkdir(parents=True, exist_ok=True)
                
                if degradation == 1:
                    run_ffmpeg(fpath_from=source_path/fname, fpath_to=save_path/fname, crf=28)
                    new_meta.degradation[i] = 'quality'
                elif degradation == 2:
                    run_ffmpeg(fpath_from=source_path/fname, fpath_to=save_path/fname, crf=20, scale=0.5)
                    new_meta.degradation[i] = 'scale'
                else:
                    run_ffmpeg(fpath_from=source_path/fname, fpath_to=save_path/fname, crf=20, fps=15)
                    new_meta.degradation[i] = 'fps'

                    ## The default crf value is 23, however when saving a video with that value, there is a decrease
                    ## in file size, suggesting a compression. 20 is the value I manually found to have almost no difference
        return new_meta

In [102]:
new_meta = create_degraded_videos(source_path, save_path, val_meta)

6236it [2:53:59,  1.67s/it]


In [106]:
new_meta.head()

Unnamed: 0,fname,label,split,original,degradation
0,dfdc_train_part_2/qyyebirxwe.mp4,FAKE,train,ejhhokmvpe.mp4,quality
1,dfdc_train_part_2/ntjlknlcvn.mp4,FAKE,train,nthpnwylxo.mp4,copy
2,dfdc_train_part_2/qivpypiwlp.mp4,FAKE,train,hszwwswewp.mp4,copy
3,dfdc_train_part_2/lpkgabskbw.mp4,FAKE,train,rnxzqumvvl.mp4,copy
4,dfdc_train_part_2/vctemjbusz.mp4,FAKE,train,sznkemeqro.mp4,copy


In [107]:
assert(len(new_meta) == len(val_meta))

In [105]:
new_meta.to_csv(save_path/'metadata.csv', index = False)

In [76]:
ls ../val_data

[0m[01;34mdfdc_train_part_0[0m/  [01;34mdfdc_train_part_2[0m/  metadata.csv
[01;34mdfdc_train_part_1[0m/  [01;34mdfdc_train_part_3[0m/


In [77]:
SOURCE = Path('../val_data/')

In [91]:
f = get_files(SOURCE, extensions=['.csv'])[0]

In [92]:
annots = pd.read_csv(f)
annots.head()

Unnamed: 0,fname,label,split,original,degradation
0,dfdc_train_part_2/qyyebirxwe.mp4,FAKE,train,ejhhokmvpe.mp4,fps
1,dfdc_train_part_2/ntjlknlcvn.mp4,FAKE,train,nthpnwylxo.mp4,fps
2,dfdc_train_part_2/qivpypiwlp.mp4,FAKE,train,hszwwswewp.mp4,fps
3,dfdc_train_part_2/lpkgabskbw.mp4,FAKE,train,rnxzqumvvl.mp4,fps
4,dfdc_train_part_2/vctemjbusz.mp4,FAKE,train,sznkemeqro.mp4,fps


In [76]:
ls ../val_data

[0m[01;34mdfdc_train_part_0[0m/  [01;34mdfdc_train_part_2[0m/  metadata.csv
[01;34mdfdc_train_part_1[0m/  [01;34mdfdc_train_part_3[0m/


In [77]:
SOURCE = Path('../val_data/')

In [91]:
f = get_files(SOURCE, extensions=['.csv'])[0]

In [92]:
annots = pd.read_csv(f)
annots.head()

Unnamed: 0,fname,label,split,original,degradation
0,dfdc_train_part_2/qyyebirxwe.mp4,FAKE,train,ejhhokmvpe.mp4,fps
1,dfdc_train_part_2/ntjlknlcvn.mp4,FAKE,train,nthpnwylxo.mp4,fps
2,dfdc_train_part_2/qivpypiwlp.mp4,FAKE,train,hszwwswewp.mp4,fps
3,dfdc_train_part_2/lpkgabskbw.mp4,FAKE,train,rnxzqumvvl.mp4,fps
4,dfdc_train_part_2/vctemjbusz.mp4,FAKE,train,sznkemeqro.mp4,fps


In [60]:
from fastai.vision import *
from tqdm import tqdm
import cv2

In [61]:
source_path = Path('../data/')
save_path = Path('../val_data/')

## Create Validation Set:

### Videos from folder 0->3

In [62]:
def get_annots(folder):
    f = get_files(folder, extensions=['.json']) # Extract the metadata
    a = pd.read_json(f[0]).T
    a.reset_index(inplace=True)
    a.rename({'index':'fname'}, axis=1, inplace=True)
    a.fname = folder.name + '/' + a.fname.astype(str)
    return a

In [63]:
def get_metadata(SOURCE, include=None, exclude=None):
    """
    extract the metadata from all the folders contained in SOURCE.
    """
    meta = []
    
    for i in SOURCE.iterdir(): # iterate over the files in SOURCE
        if i.is_dir() and (i/'metadata.json').is_file(): # Get only the directories
            if include is not None and i.name in include: # Check if in include
                print(f'Extracting data from the {i.name} folder')
                a = get_annots(i)
                meta.append(a)
            if exclude is not None and i.name not in exclude: # Check if not in exlcude
                print(f'Extracting data from the {i.name} folder')
                a = get_annots(i)
                meta.append(a)
    
    metadata = pd.concat(meta)
    metadata.reset_index(drop=True, inplace=True)
    
    return metadata

In [64]:
source_path = Path('../data/')

In [65]:
#train_meta = get_metadata(source_path, exclude=['dfdc_train_part_0', 'dfdc_train_part_1', 'dfdc_train_part_2', 'dfdc_train_part_3'])
val_meta = get_metadata(source_path, include=['dfdc_train_part_0', 'dfdc_train_part_1', 'dfdc_train_part_2', 'dfdc_train_part_3'])

Extracting data from the dfdc_train_part_2 folder
Extracting data from the dfdc_train_part_3 folder
Extracting data from the dfdc_train_part_0 folder
Extracting data from the dfdc_train_part_1 folder


In [66]:
val_meta.head()

Unnamed: 0,fname,label,split,original
0,dfdc_train_part_2/qyyebirxwe.mp4,FAKE,train,ejhhokmvpe.mp4
1,dfdc_train_part_2/ntjlknlcvn.mp4,FAKE,train,nthpnwylxo.mp4
2,dfdc_train_part_2/qivpypiwlp.mp4,FAKE,train,hszwwswewp.mp4
3,dfdc_train_part_2/lpkgabskbw.mp4,FAKE,train,rnxzqumvvl.mp4
4,dfdc_train_part_2/vctemjbusz.mp4,FAKE,train,sznkemeqro.mp4


In [67]:
#export
def runnit(f):
    def _func(*args, **kwargs):
        command = f(*args, **kwargs)
        p = subprocess.run(command.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        return p
    return _func

In [68]:
#export
def _ffmpeg_web_defaults():
    '''
    These are some reasonable values for uploading. i.e. YouTube, etc.
    '''
    return dict(video_encoder='libx264', video_bitrate='1.5M', fps=30, scale=.5, crf=23, #17-28
                audio_encoder='aac', audio_bitrate='128k')

def _ffmpeg_defaults():
    return dict(video_encoder=None, video_bitrate=None, fps=None, scale=None, crf=None,
                audio_encoder=None, audio_bitrate=None)

def _ffmpeg_fmts():
    "ffmpeg options syntax"
    return dict(video_encoder='-c:v {video_encoder:s}', 
                video_bitrate='-b:v {video_bitrate:s}', 
                fps='-r {fps:d}', 
                #scale='-vf scale=iw*{scale:.2f}:ih*{scale:.2f}', 
                scale='-vf scale=iw*{scale:.2f}:-1',
                crf='-crf {crf:d}',
                audio_bitrate='-b:a {audio_bitrate}', 
                audio_encoder='-c:a {audio_encoder}')

#@show_vid_info
@runnit
def run_ffmpeg(fpath_from=None, fpath_to=None, **kwargs):
    '''
    Run ffmpeg
    '''
    ps = _ffmpeg_defaults()
    ps.update(kwargs)
    pstr = []
    for n, s in _ffmpeg_fmts().items():
        if ps[n] is None: pstr.append('')
        else: pstr.append(s.format(**ps))
    pstr = ' '.join(pstr)
    return f'ffmpeg -i {fpath_from} {pstr} {fpath_to}'

In [101]:
def create_degraded_videos(source_path, save_path, metadata):
    
        fnames = metadata.fname
        
        new_meta = metadata.copy()
        new_meta["degradation"] = None
    
        for i,fname in tqdm(enumerate(fnames)):
            degraded = random.randint(1,3)
            
            if degraded==1: # If we have degraded==1 (33% chance)
                f = Path(f'{save_path}/{Path(fname).parents[0].name}').mkdir(parents=True, exist_ok=True)
                
                !cp {source_path/fname} {save_path/fname}
                
                new_meta.degradation[i] = 'copy'
            
            else: # 66% chance
                degradation = random.randint(1,3)
                
                f = Path(f'{save_path}/{Path(fname).parents[0].name}').mkdir(parents=True, exist_ok=True)
                
                if degradation == 1:
                    run_ffmpeg(fpath_from=source_path/fname, fpath_to=save_path/fname, crf=28)
                    new_meta.degradation[i] = 'quality'
                elif degradation == 2:
                    run_ffmpeg(fpath_from=source_path/fname, fpath_to=save_path/fname, crf=20, scale=0.5)
                    new_meta.degradation[i] = 'scale'
                else:
                    run_ffmpeg(fpath_from=source_path/fname, fpath_to=save_path/fname, crf=20, fps=15)
                    new_meta.degradation[i] = 'fps'

                    ## The default crf value is 23, however when saving a video with that value, there is a decrease
                    ## in file size, suggesting a compression. 20 is the value I manually found to have almost no difference
        return new_meta

In [102]:
new_meta = create_degraded_videos(source_path, save_path, val_meta)

6236it [2:53:59,  1.67s/it]


In [106]:
new_meta.head()

Unnamed: 0,fname,label,split,original,degradation
0,dfdc_train_part_2/qyyebirxwe.mp4,FAKE,train,ejhhokmvpe.mp4,quality
1,dfdc_train_part_2/ntjlknlcvn.mp4,FAKE,train,nthpnwylxo.mp4,copy
2,dfdc_train_part_2/qivpypiwlp.mp4,FAKE,train,hszwwswewp.mp4,copy
3,dfdc_train_part_2/lpkgabskbw.mp4,FAKE,train,rnxzqumvvl.mp4,copy
4,dfdc_train_part_2/vctemjbusz.mp4,FAKE,train,sznkemeqro.mp4,copy


In [107]:
assert(len(new_meta) == len(val_meta))

In [105]:
new_meta.to_csv(save_path/'metadata.csv', index = False)

In [76]:
ls ../val_data

[0m[01;34mdfdc_train_part_0[0m/  [01;34mdfdc_train_part_2[0m/  metadata.csv
[01;34mdfdc_train_part_1[0m/  [01;34mdfdc_train_part_3[0m/


In [77]:
SOURCE = Path('../val_data/')

In [91]:
f = get_files(SOURCE, extensions=['.csv'])[0]

In [92]:
annots = pd.read_csv(f)
annots.head()

Unnamed: 0,fname,label,split,original,degradation
0,dfdc_train_part_2/qyyebirxwe.mp4,FAKE,train,ejhhokmvpe.mp4,fps
1,dfdc_train_part_2/ntjlknlcvn.mp4,FAKE,train,nthpnwylxo.mp4,fps
2,dfdc_train_part_2/qivpypiwlp.mp4,FAKE,train,hszwwswewp.mp4,fps
3,dfdc_train_part_2/lpkgabskbw.mp4,FAKE,train,rnxzqumvvl.mp4,fps
4,dfdc_train_part_2/vctemjbusz.mp4,FAKE,train,sznkemeqro.mp4,fps


## Create Training Set:

In [2]:
from fastai.vision import *
from tqdm import tqdm
import cv2

In [3]:
source_path = Path('../data/')
save_path = Path('../val_data/')

## Create Validation Set:

### Videos from folder 0->3

In [4]:
def get_annots(folder):
    f = get_files(folder, extensions=['.json']) # Extract the metadata
    a = pd.read_json(f[0]).T
    a.reset_index(inplace=True)
    a.rename({'index':'fname'}, axis=1, inplace=True)
    a.fname = folder.name + '/' + a.fname.astype(str)
    return a

In [23]:
def get_metadata(SOURCE, include=None, exclude=None):
    """
    extract the metadata from all the folders contained in SOURCE.
    """
    meta = []
    
    for i in SOURCE.iterdir(): # iterate over the files in SOURCE
        if i.is_dir() and (i/'metadata.json').is_file(): # Get only the directories
            if include is not None and i.name in include: # Check if in include
                print(f'Extracting data from the {i.name} folder')
                a = get_annots(i)
                meta.append(a)
            if exclude is not None and i.name not in exclude: # Check if not in exlcude
                print(f'Extracting data from the {i.name} folder')
                a = get_annots(i)
                meta.append(a)
    
    metadata = pd.concat(meta)
    metadata.reset_index(drop=True, inplace=True)
    
    return metadata

In [6]:
source_path = Path('../data/')

In [7]:
#train_meta = get_metadata(source_path, exclude=['dfdc_train_part_0', 'dfdc_train_part_1', 'dfdc_train_part_2', 'dfdc_train_part_3'])
val_meta = get_metadata(source_path, include=['dfdc_train_part_0', 'dfdc_train_part_1', 'dfdc_train_part_2', 'dfdc_train_part_3'])

Extracting data from the dfdc_train_part_2 folder
Extracting data from the dfdc_train_part_3 folder
Extracting data from the dfdc_train_part_0 folder
Extracting data from the dfdc_train_part_1 folder


In [8]:
val_meta.head()

Unnamed: 0,fname,label,split,original
0,dfdc_train_part_2/qyyebirxwe.mp4,FAKE,train,ejhhokmvpe.mp4
1,dfdc_train_part_2/ntjlknlcvn.mp4,FAKE,train,nthpnwylxo.mp4
2,dfdc_train_part_2/qivpypiwlp.mp4,FAKE,train,hszwwswewp.mp4
3,dfdc_train_part_2/lpkgabskbw.mp4,FAKE,train,rnxzqumvvl.mp4
4,dfdc_train_part_2/vctemjbusz.mp4,FAKE,train,sznkemeqro.mp4


In [9]:
#export
def runnit(f):
    def _func(*args, **kwargs):
        command = f(*args, **kwargs)
        p = subprocess.run(command.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        return p
    return _func

In [10]:
#export
def _ffmpeg_web_defaults():
    '''
    These are some reasonable values for uploading. i.e. YouTube, etc.
    '''
    return dict(video_encoder='libx264', video_bitrate='1.5M', fps=30, scale=.5, crf=23, #17-28
                audio_encoder='aac', audio_bitrate='128k')

def _ffmpeg_defaults():
    return dict(video_encoder=None, video_bitrate=None, fps=None, scale=None, crf=None,
                audio_encoder=None, audio_bitrate=None)

def _ffmpeg_fmts():
    "ffmpeg options syntax"
    return dict(video_encoder='-c:v {video_encoder:s}', 
                video_bitrate='-b:v {video_bitrate:s}', 
                fps='-r {fps:d}', 
                #scale='-vf scale=iw*{scale:.2f}:ih*{scale:.2f}', 
                scale='-vf scale=iw*{scale:.2f}:-1',
                crf='-crf {crf:d}',
                audio_bitrate='-b:a {audio_bitrate}', 
                audio_encoder='-c:a {audio_encoder}')

#@show_vid_info
@runnit
def run_ffmpeg(fpath_from=None, fpath_to=None, **kwargs):
    '''
    Run ffmpeg
    '''
    ps = _ffmpeg_defaults()
    ps.update(kwargs)
    pstr = []
    for n, s in _ffmpeg_fmts().items():
        if ps[n] is None: pstr.append('')
        else: pstr.append(s.format(**ps))
    pstr = ' '.join(pstr)
    return f'ffmpeg -i {fpath_from} {pstr} {fpath_to}'

In [11]:
def create_degraded_videos(source_path, save_path, fnames):
    
        for fname in tqdm(fnames):
            degraded = random.randint(1,3)
            
            if degraded==1: # If we have degraded==1 (33% chance)
                f = Path(f'{save_path}/{Path(fname).parents[0].name}').mkdir(parents=True, exist_ok=True)
                fpath_to = Path(fname).parts[-2]+'/copy_'+ Path(fname).parts[-1]
                
                !cp {source_path/fname} {save_path/fpath_to}
            
            else: # 66% chance
                
                degradation = random.randint(1,3)
                
                f = Path(f'{save_path}/{Path(fname).parents[0].name}').mkdir(parents=True, exist_ok=True)
                
                
                if degradation == 1:
                    fpath_to = Path(fname).parts[-2]+'/compressed_'+ Path(fname).parts[-1]
                    run_ffmpeg(fpath_from=source_path/fname, fpath_to=save_path/fpath_to, crf=28)
                elif degradation == 2:
                    fpath_to = Path(fname).parts[-2]+'/rescaled_'+ Path(fname).parts[-1]
                    run_ffmpeg(fpath_from=source_path/fname, fpath_to=save_path/fpath_to, crf=20, scale=0.5)
                else:
                    fpath_to = Path(fname).parts[-2]+'/fps_'+ Path(fname).parts[-1]
                    run_ffmpeg(fpath_from=source_path/fname, fpath_to=save_path/fpath_to, crf=20, fps=15)

                    ## The default crf value is 23, however when saving a video with that value, there is a decrease
                    ## in file size, suggesting a compression. 20 is the value I manually found to have almost no difference

In [12]:
create_degraded_videos(source_path, save_path, val_meta.fname)

100%|██████████| 6236/6236 [1:30:13<00:00,  1.15it/s]
