# Look at some samples

In [1]:
%reload_ext autoreload
%autoreload 2

In [19]:
#default_exp data

In [3]:
#export
from fastai.vision import *
import pandas as pd
from IPython.display import display, Video, HTML

In [4]:
# Folder that the cells below search for sample videos (.mp4) and annotation files (.json).
SOURCE = Path('../data/train_sample_videos//')

### Displaying videos

In [5]:
fnames = get_files(SOURCE, extensions=['.mp4'])
fnames[:3], len(fnames)

([PosixPath('../data/train_sample_videos/bxzakyopjf.mp4'),
  PosixPath('../data/train_sample_videos/axwgcsyphv.mp4'),
  PosixPath('../data/train_sample_videos/akvmwkdyuv.mp4')],
 400)

In [38]:
#export
def html_vid(fname, width=300, height=250):
    """Return HTML for video.

    `fname` is the path (or URL) of the video. It is converted with `str` and
    HTML-escaped before being placed in the `src` attribute, so names that
    contain `&`, `<`, `>` or quotes cannot break the generated markup.
    `width` and `height` fill the size attributes of the `<video>` element;
    the defaults reproduce the previously hard-coded 300x250 player.
    """
    from html import escape  # local import so the exported function carries its own dependency
    src = escape(str(fname), quote=True)
    return f'''
    <video width="{width}" height="{height}" controls>
    <source src="{src}" type="video/mp4">
    </video>
    '''

In [None]:
HTML(html_vid(fnames[9]))

In [39]:
#export
def html_titled_vid(fname, title):
    "Return a <div> holding `title` in a paragraph followed by the video markup for `fname`."
    caption = f'<p>{title}</p>'
    player = html_vid(fname)
    return ''.join(['<div>', caption, '<br>', player, '</div>'])

In [40]:
#export
def html_vids(fnames, titles=None, ncols=3):
    "Return an HTML table with one (titled) video per cell and at most `ncols` cells per row."
    total = len(fnames)
    if titles is None:
        titles = [''] * total  # untitled videos get an empty caption
    assert len(titles) == total
    rows = []
    for start in range(0, total, ncols):
        stop = start + ncols
        # One <td> per video in this row; the last row may hold fewer than `ncols` cells.
        cells = [f'<td>{html_titled_vid(f, t)}</td>'
                 for f, t in zip(fnames[start:stop], titles[start:stop])]
        rows.append('<tr>' + ''.join(cells) + '</tr>')
    return '<table>' + ''.join(rows) + '</table>'

In [None]:
HTML(html_vids(fnames[10:15], ['fake', 'real', 'fake', 'carrot', 'muffin'], ncols=3))

0,1,2
fake,real,fake
carrot,muffin,


## Annotations

### FAKE video's original video

In [None]:
# Take the first .json file that get_files finds under SOURCE.
f = get_files(SOURCE, extensions=['.json'])[0]
# Transpose the parsed JSON; the tables shown below have one row per video file name.
annots = pd.read_json(f).T

Note that FAKE videos have the additional annotation field *original*, which presumably indicates the original, real video from which the fake was generated.  For real videos, this field is `NaN`.

In [None]:
annots[annots.label=='REAL'].head()

Unnamed: 0,label,split,original
abarnvbtwb.mp4,REAL,train,
aelfnikyqj.mp4,REAL,train,
afoovlsmtx.mp4,REAL,train,
agrmhtjdlk.mp4,REAL,train,
ahqqqilsxt.mp4,REAL,train,


In [None]:
annots[annots.label=='FAKE'].head()

Unnamed: 0,label,split,original
aagfhgtpmv.mp4,FAKE,train,vudstovrck.mp4
aapnvogymq.mp4,FAKE,train,jdubbvfswz.mp4
abofeumbvv.mp4,FAKE,train,atvmxvwyns.mp4
abqwwspghj.mp4,FAKE,train,qzimuostzz.mp4
acifjvzvpm.mp4,FAKE,train,kbvibjhfzo.mp4


This checks that every fake video's original video is also a video in this dataset.

In [None]:
# Every non-null `original` value must also appear in the index of `annots`.
# NOTE(review): the recorded output of this cell is an AssertionError - either some
# originals are not listed in this annotation file, or `annots` was not the frame
# loaded above when the cell was run; confirm on a fresh kernel.
assert set(annots.original[annots.original.notna()].unique())  - set(annots.index.unique()) == set()

AssertionError: 

Display some fake videos next to their original video.

In [None]:
# Draw a few random FAKE rows; the index holds the fake's file name and
# `original` the file name of the video it is paired with.
n = 5
s = annots[annots.label == 'FAKE'].sample(n)
fs, rs = s.index.values, s.original.values
# Flatten the (fake, original) pairs into fake_0, original_0, fake_1, original_1, ...
vids = [SOURCE/name for pair in zip(fs, rs) for name in pair]
# Two columns per row, so each fake is shown beside its original.
HTML(html_vids(vids, ncols=2))

0,1
,
,
,
,
,


### Loading annotation from multiple dataset directories

In [58]:
#export
def get_annots(SOURCE):
    """
    Extract the metadata from all the folders contained in SOURCE.

    Every direct sub-directory of SOURCE that contains a `metadata.json` file
    is read. Each frame gets one row per video, with the video name moved
    from the index into an `fname` column. `fname` and, for rows labelled
    FAKE, `original` are prefixed with the sub-directory name so that they are
    paths relative to SOURCE.

    SOURCE may be a `Path` or a string.

    Returns the concatenation of all per-folder frames with a fresh 0..n-1 index.
    Raises `ValueError` if no sub-directory with a `metadata.json` is found.
    """
    annots = []

    for d in Path(SOURCE).iterdir():  # iterate over the entries in SOURCE
        meta = d/'metadata.json'
        if not (d.is_dir() and meta.is_file()):  # keep only folders that ship metadata
            continue
        print(f'Extracting data from the {d.name} folder')

        # Read the very file that was checked for above, rather than whichever
        # .json file a directory listing happens to return first.
        a = (pd.read_json(meta).T
               .reset_index()
               .rename({'index': 'fname'}, axis=1))
        a['fname'] = d.name + '/' + a.fname.astype(str)
        # Only FAKE rows are rewritten; other rows keep their `original` value as read.
        is_fake = a.label == 'FAKE'
        a.loc[is_fake, 'original'] = d.name + '/' + a.original[is_fake]

        annots.append(a)

    if not annots:
        # pd.concat([]) would raise a ValueError too, but with an opaque message.
        raise ValueError(f'No sub-directory with a metadata.json file found in {SOURCE}')
    return pd.concat(annots).reset_index(drop=True)

In [59]:
SOURCE = Path('../data/')

In [60]:
annots = get_annots(SOURCE)

Extracting data from the dfdc_train_part_0 folder
Extracting data from the train_sample_videos folder
Extracting data from the dfdc_train_part_40 folder
Extracting data from the dfdc_train_part_45 folder
Extracting data from the dfdc_train_part_10 folder
Extracting data from the dfdc_train_part_5 folder


In [63]:
# Keep only the rows labelled FAKE (the selection keeps the row labels of `annots`).
annots_fake = annots[annots.label=='FAKE']
# (rows, columns) of the combined annotations, fakes and reals together.
annots.shape

(12175, 4)

In [67]:
idx = 218

# `annots_fake` keeps the row labels of `annots`, so `idx` is a row label that
# must belong to a FAKE row, not a position; `.loc` makes that lookup explicit.
pair = annots_fake.loc[idx]
fns = [SOURCE/pair[col] for col in ('fname', 'original')]
HTML(html_vids(fns, titles=['FAKE', 'REAL']))

0,1
FAKE,REAL


# -fin

In [1]:
from nbdev.export import *

In [2]:
notebook2script()

Converted 00_data-Copy1.ipynb.
Converted 00_data.ipynb.
Converted 01_face_detection.ipynb.
Converted 01a_faces_probs_examples-Copy1.ipynb.
Converted 01a_faces_probs_examples.ipynb.
Converted 02_fix_luminosity.ipynb.
Converted 02a_create_faceimage_dataset.ipynb.
Converted 02bis_Create_Dataset.ipynb.
Converted 02c_faces_different_dfdc_zips.ipynb.
Converted 03_models.ipynb.
Converted 04_Baseline_Classification.ipynb.
Converted 04_Classification.ipynb.
Converted 04a_classification_videolist.ipynb.
Converted 05_Class_Imbalance.ipynb.
Converted 06_Focal_Loss.ipynb.
Converted 07_full_classification.ipynb.
This cell doesn't have an export destination and was ignored:
e
Converted 07a_classify_video_margin.ipynb.
Converted 07b_classify_resize.ipynb.
Converted deepfake_submission.ipynb.
Converted export_kernel_module.ipynb.
Converted test_submission.ipynb.
