We would load dataset from github directly into pandas dataframe
- github repo - https://github.com/MKLab-ITI/fake-video-corpus
- fvc_dup.csv - https://github.com/MKLab-ITI/fake-video-corpus/blob/master/FVC_dup.csv

# Import libraries

In [1]:
!pip install jsonlines

Collecting jsonlines
  Downloading jsonlines-2.0.0-py3-none-any.whl (6.3 kB)
Installing collected packages: jsonlines
Successfully installed jsonlines-2.0.0


In [2]:
import os, math
import requests
import shutil
from pathlib import Path
import jsonlines
from urllib.parse import urlparse, parse_qs
import json
import ast
import pandas as pd
import numpy as np
import matplotlib as plt
import re

# Load data

In [3]:
df = pd.read_csv('https://raw.githubusercontent.com/MKLab-ITI/fake-video-corpus/master/FVC_dup.csv')
df.head()

Unnamed: 0,cascade_id,video_url,label
0,f0,https://www.facebook.com/palasqueseaoficial/vi...,fake
1,f0,https://www.facebook.com/lioncityfeed/videos/9...,fake
2,f0,https://www.youtube.com/watch?v=uqZeCGo0LnE,fake
3,f0,https://www.youtube.com/watch?v=hZ5Kt-ffkA0,fake
4,f0,https://www.youtube.com/watch?v=0ai_IKEmMz8,fake


# Data exploration and preprocessing

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5199 entries, 0 to 5198
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   cascade_id  5199 non-null   object
 1   video_url   5199 non-null   object
 2   label       5199 non-null   object
dtypes: object(3)
memory usage: 122.0+ KB


The video url contains url other than of youtube as well, here we will focus only on fake video on youtube hence we would be filtering out the video url for youtube and discarding others using the regular expression string match. 
The below link is the visualization for the regex we would be using.

https://regexper.com/#https*%3A%5B%2F%2F%5Dwww%5B.%5D%7B1%7Dyoutube%5B.%5D%7B1%7Dcom%5B%2F%5D%7B1%7D%5Cw%2B

In [5]:
def match_url(url):
    url = url.lower() 
    return int(bool(re.match("https*://www[.]{1}youtube[.]{1}com[/]{1}\w+",url)))


In [6]:
df['is_youtube_link'] = df['video_url'].apply(lambda x : match_url(x))
df.sample(10)

Unnamed: 0,cascade_id,video_url,label,is_youtube_link
3003,r22,https://www.facebook.com/VertigoPolitico/video...,real,0
5163,f198,https://www.youtube.com/watch?v=tKhEdbKeefQ,uncertain,1
1892,f104,https://www.youtube.com/watch?v=AWvByC53lFg,fake,1
854,f66,https://www.facebook.com/476980619162254/video...,fake,0
4566,r178,https://www.facebook.com/marchequipmentstore/v...,real,0
2101,f137,https://www.facebook.com/mod.mil.rus/videos/16...,fake,0
340,f22,https://www.facebook.com/564221067107475/video...,fake,0
17,f1,https://www.youtube.com/watch?v=XSRhFjc--_s,fake,1
3718,r91,https://www.facebook.com/238703019587268/video...,real,0
1580,f97,https://www.facebook.com/100017190224921/video...,fake,0


In [7]:
df.is_youtube_link.value_counts()

1    3028
0    2171
Name: is_youtube_link, dtype: int64

In [8]:
df_filtered = df.loc[(df.loc[:,'is_youtube_link']==1),['video_url', 'label']]

In [9]:
df_filtered = df_filtered.loc[(df_filtered['label'].isin(['fake','real']))]

In [10]:
df_filtered.label = df_filtered.label.astype('category')
df_filtered.label.value_counts()

fake    1679
real    1015
Name: label, dtype: int64

In [11]:
df_filtered.label = df_filtered.label.cat.codes

In [12]:
#Shuffle the dataframe for randomness
df_filtered_new = df_filtered.sample(frac=1).reset_index(drop=True)
df_filtered_new.head()

Unnamed: 0,video_url,label
0,https://www.youtube.com/watch?v=Ohn6_teeZKo,0
1,https://www.youtube.com/watch?v=7rkoZBQYyoc,0
2,https://www.youtube.com/watch?v=GZCldPKyZgs,0
3,https://www.youtube.com/watch?v=H-PvRS7OCco,1
4,https://www.youtube.com/watch?v=fV4w-PzMwl8,0


In [13]:
#Save the filtered csv
df_filtered_new.to_csv('FVC_filtered.csv', index=False)

# Download YouTube metadata 

We would be using YouTube data api v3 for fetching the metadata of videos from video url such as video title, descriptions, thumbnails e.t.c.
Get the api key, follow below links for obtaining api key.
- https://developers.google.com/maps/documentation/javascript/get-api-key
- https://console.cloud.google.com/marketplace/product/google/youtube.googleapis.com?q=search&referrer=search&project=decisive-octane-311811

In [14]:
api_key = 'AIzaSyCa6UjQBuMpG9q8O70bkk1QYyCMEzu2oa0'

In [15]:
df = pd.read_csv('FVC_filtered.csv')

In [16]:
fakes = np.array(df.loc[(df.label==0), ['video_url']]).reshape(-1)
reals = np.array(df.loc[(df.label==1), ['video_url']]).reshape(-1)

df.shape, fakes.shape, reals.shape

((2694, 2), (1679,), (1015,))

In [17]:
fvc_fakes_metadata = Path('fvc_fakes_metadata.json')
fvc_reals_metadata = Path('fvc_reals_metadata.json')

fvc_fakes_metadata.touch(exist_ok=True)
fvc_reals_metadata.touch(exist_ok=True)

In [18]:
def extract_video_metadata(urls):
  metadata_list=[]
  for url in urls:
    url_data = urlparse(url)
    query = parse_qs(url_data.query)
    video_id = query["v"][0] 
    youtube_api_url = f'https://www.googleapis.com/youtube/v3/videos?part=snippet&id={video_id}&key={api_key}'
    r = requests.get(youtube_api_url, stream = True)      
    if r.status_code == 200:
       r.raw.decode_content = True
       data = r.json()
       if len(data['items'])>0:
         metadata_list.append(json.dumps(r.json(), ensure_ascii=False))
  return np.array(metadata_list)

def write_json_data(path, data):
    with open(path,'w') as f:
        f.writelines("%s\n" % line for line in data)


In [19]:
fake_videos_metadata = extract_video_metadata(fakes)
real_videos_metadata = extract_video_metadata(reals)
fake_videos_metadata.shape, real_videos_metadata.shape

((1115,), (708,))

In [20]:
write_json_data(fvc_fakes_metadata, fake_videos_metadata)
write_json_data(fvc_reals_metadata, real_videos_metadata)

In [21]:
fake_data_dict_array = []
real_data_dict_array = []
with jsonlines.open(fvc_fakes_metadata) as reader:
    for obj in reader:
      fake_data_dict_array.append(obj)

with jsonlines.open(fvc_reals_metadata) as reader:
    for obj in reader:
      real_data_dict_array.append(obj)

len(real_data_dict_array), len(fake_data_dict_array)

(708, 1115)

In [22]:
df_real_metadata = pd.DataFrame.from_dict(real_data_dict_array)
df_fake_metadata = pd.DataFrame.from_dict(fake_data_dict_array)
df_fake_metadata.shape, df_real_metadata.shape

((1115, 4), (708, 4))

In [23]:
df_real_metadata['is_fake'] = 0
df_fake_metadata['is_fake'] = 1

FVC_df = pd.concat([df_real_metadata, df_fake_metadata], ignore_index=True)
FVC_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1823 entries, 0 to 1822
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   kind      1823 non-null   object
 1   etag      1823 non-null   object
 2   items     1823 non-null   object
 3   pageInfo  1823 non-null   object
 4   is_fake   1823 non-null   int64 
dtypes: int64(1), object(4)
memory usage: 71.3+ KB


In [24]:
FVC_df = FVC_df.sample(frac=1).reset_index(drop=True)
FVC_df.head(10)

Unnamed: 0,kind,etag,items,pageInfo,is_fake
0,youtube#videoListResponse,CqqHVvvYuGS6B520CmOO9UZzE4M,"[{'kind': 'youtube#video', 'etag': 'UXvsfkDpw5...","{'totalResults': 1, 'resultsPerPage': 1}",1
1,youtube#videoListResponse,qMD3kriUs5ILG53K9iiOKEDdIR4,"[{'kind': 'youtube#video', 'etag': '17eBZ_q_K5...","{'totalResults': 1, 'resultsPerPage': 1}",1
2,youtube#videoListResponse,fUyUFDU9jU7M7zmmsPNHGMFGnu8,"[{'kind': 'youtube#video', 'etag': 'oEUtvc3s8f...","{'totalResults': 1, 'resultsPerPage': 1}",1
3,youtube#videoListResponse,cLS9pIqfiuWjElyDMy177tL1cJE,"[{'kind': 'youtube#video', 'etag': '0DIOoZVK6B...","{'totalResults': 1, 'resultsPerPage': 1}",1
4,youtube#videoListResponse,iroXExYgXAdFx0tk6xSuUEfKlE0,"[{'kind': 'youtube#video', 'etag': '3K8qsl4tNw...","{'totalResults': 1, 'resultsPerPage': 1}",1
5,youtube#videoListResponse,dT7_vjTHK8WjABwCAWjax486QEs,"[{'kind': 'youtube#video', 'etag': 'ZOAmRcFhbI...","{'totalResults': 1, 'resultsPerPage': 1}",0
6,youtube#videoListResponse,o-vxQuClgMpwhznUXMOHWUhFzBQ,"[{'kind': 'youtube#video', 'etag': '-ExntCSlaS...","{'totalResults': 1, 'resultsPerPage': 1}",1
7,youtube#videoListResponse,Jmt3bh3P7GpcQFnBRfbtfTAx3sw,"[{'kind': 'youtube#video', 'etag': 'hR4AZJINYs...","{'totalResults': 1, 'resultsPerPage': 1}",0
8,youtube#videoListResponse,Dvhc4kATYtZcWDah3gM3dHDsFEY,"[{'kind': 'youtube#video', 'etag': 'KaaV8h2YPb...","{'totalResults': 1, 'resultsPerPage': 1}",1
9,youtube#videoListResponse,Uhou84aFBooaxA6uxjOI42-nDgE,"[{'kind': 'youtube#video', 'etag': 'ROFkJQCsRd...","{'totalResults': 1, 'resultsPerPage': 1}",1


In [25]:
FVC_df.to_csv('FVC_metadata.csv', index=False)

# Explore & Process metadata

In [26]:
FVC_metadata_df = pd.read_csv('FVC_metadata.csv')

In [27]:
FVC_metadata_df = FVC_metadata_df.loc[:,['items','is_fake']]
FVC_metadata_df.head()

Unnamed: 0,items,is_fake
0,"[{'kind': 'youtube#video', 'etag': 'UXvsfkDpw5...",1
1,"[{'kind': 'youtube#video', 'etag': '17eBZ_q_K5...",1
2,"[{'kind': 'youtube#video', 'etag': 'oEUtvc3s8f...",1
3,"[{'kind': 'youtube#video', 'etag': '0DIOoZVK6B...",1
4,"[{'kind': 'youtube#video', 'etag': '3K8qsl4tNw...",1


In [28]:
def preprocess_string(text):
    text_cleaned = re.sub(r"http\S+", "", text, flags=re.IGNORECASE|re.MULTILINE)  # remove any url in the text
    text_cleaned = re.sub(r"\w+@\w{1,}\.\w{1,}","", text_cleaned, flags=re.IGNORECASE|re.MULTILINE) # remove emails from text
    text_cleaned = re.sub(r'[^a-z\s]+',' ',text_cleaned,flags=re.IGNORECASE|re.MULTILINE) # remove non alphabetics char
    text_cleaned = re.sub(r'(\s+)',' ',text_cleaned, flags=re.IGNORECASE|re.MULTILINE)  # remove spaces

    text_cleaned = text_cleaned.lstrip()
    text_cleaned = text_cleaned.rstrip()
    text_cleaned = text_cleaned.lower() 

    return text_cleaned 

def extract_list_data(row, column_name, sub_column='', clean_text=True):
  row = ast.literal_eval(row)
  items = row[0]
  if len(sub_column)>0:
    if sub_column in items[column_name]:
      text = items[column_name][sub_column]
    else:
      text='NA'
  else:
    text = items[column_name]
  
  if clean_text:
      text = preprocess_string(text)

  return text

def extract_from_dict(row, column, subcolumn):
  return row[column][subcolumn]

  

In [29]:
FVC_metadata_df['id'] = FVC_metadata_df.loc[:,'items'].apply(lambda x : extract_list_data(x,'id', clean_text=False))
FVC_metadata_df['title'] = FVC_metadata_df.loc[:,'items'].apply(lambda x : extract_list_data(x,'snippet','title'))
FVC_metadata_df['description'] = FVC_metadata_df.loc[:,'items'].apply(lambda x : extract_list_data(x,'snippet','title'))
FVC_metadata_df['tags'] = FVC_metadata_df.loc[:,'items'].apply(lambda x : extract_list_data(x,'snippet','tags', clean_text=False))
FVC_metadata_df['thumbnails'] = FVC_metadata_df.loc[:,'items'].apply(lambda x : extract_list_data(x,'snippet','thumbnails', clean_text=False))
FVC_metadata_df['thumbnail_320_180'] = FVC_metadata_df.loc[:,'thumbnails'].apply(lambda x : extract_from_dict(x,'medium','url'))
FVC_metadata_df['thumbnail_120_90'] = FVC_metadata_df.loc[:,'thumbnails'].apply(lambda x : extract_from_dict(x,'default','url'))
FVC_metadata_df['thumbnail_480_360'] = FVC_metadata_df.loc[:,'thumbnails'].apply(lambda x : extract_from_dict(x,'high','url'))

In [30]:
FVC_metadata_df = FVC_metadata_df.drop(['items','thumbnails'], axis=1)

In [31]:
FVC_metadata_filtered_df = FVC_metadata_df.loc[~((FVC_metadata_df.title.isin([''])) & (FVC_metadata_df.description.isin([''])))]

In [32]:
#FVC_metadata_filtered_df.title = FVC_metadata_filtered_df.title.astype(str)
#FVC_metadata_filtered_df.description = FVC_metadata_filtered_df.description.astype(str)
#FVC_metadata_filtered_df.thumbnail_320_180 = FVC_metadata_filtered_df.thumbnail_320_180.astype(str)
#FVC_metadata_filtered_df.thumbnail_120_90 = FVC_metadata_filtered_df.thumbnail_120_90.astype(str)
#FVC_metadata_filtered_df.thumbnail_480_360 = FVC_metadata_filtered_df.thumbnail_480_360.astype(str)

FVC_metadata_filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1553 entries, 0 to 1822
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   is_fake            1553 non-null   int64 
 1   id                 1553 non-null   object
 2   title              1553 non-null   object
 3   description        1553 non-null   object
 4   tags               1553 non-null   object
 5   thumbnail_320_180  1553 non-null   object
 6   thumbnail_120_90   1553 non-null   object
 7   thumbnail_480_360  1553 non-null   object
dtypes: int64(1), object(7)
memory usage: 109.2+ KB


In [33]:
FVC_metadata_filtered_df = FVC_metadata_filtered_df.sample(frac=1).reset_index(drop=True)
FVC_metadata_filtered_df.head(5)

Unnamed: 0,is_fake,id,title,description,tags,thumbnail_320_180,thumbnail_120_90,thumbnail_480_360
0,1,AEcVeNnBtag,ufo hiding behind the cloud s o paulo brasil feb,ufo hiding behind the cloud s o paulo brasil feb,"[ufo, Brazil, plasma, probe, fantasy]",https://i.ytimg.com/vi/AEcVeNnBtag/mqdefault.jpg,https://i.ytimg.com/vi/AEcVeNnBtag/default.jpg,https://i.ytimg.com/vi/AEcVeNnBtag/hqdefault.jpg
1,1,0VWw5m-lMZ0,bicicleta fantasma,bicicleta fantasma,,https://i.ytimg.com/vi/0VWw5m-lMZ0/mqdefault.jpg,https://i.ytimg.com/vi/0VWw5m-lMZ0/default.jpg,https://i.ytimg.com/vi/0VWw5m-lMZ0/hqdefault.jpg
2,0,2bqJNIQPoKY,fbi video enhancement of walter scott shooting,fbi video enhancement of walter scott shooting,,https://i.ytimg.com/vi/2bqJNIQPoKY/mqdefault.jpg,https://i.ytimg.com/vi/2bqJNIQPoKY/default.jpg,https://i.ytimg.com/vi/2bqJNIQPoKY/hqdefault.jpg
3,0,xfAx-GEjRjs,aramis ayala florida s first black female stat...,aramis ayala florida s first black female stat...,"[aramis ayala, racial profiling, florida, stat...",https://i.ytimg.com/vi/xfAx-GEjRjs/mqdefault.jpg,https://i.ytimg.com/vi/xfAx-GEjRjs/default.jpg,https://i.ytimg.com/vi/xfAx-GEjRjs/hqdefault.jpg
4,0,tF57RrMpmu0,tiger gets loose on a doha freeway in qatar th...,tiger gets loose on a doha freeway in qatar th...,"[Radio mango, fm, radio, malayalam, new, malay...",https://i.ytimg.com/vi/tF57RrMpmu0/mqdefault.jpg,https://i.ytimg.com/vi/tF57RrMpmu0/default.jpg,https://i.ytimg.com/vi/tF57RrMpmu0/hqdefault.jpg


In [34]:
# Save the filtered dataset
FVC_metadata_filtered_df.to_csv('FVC_metadata_filtered.csv', index=False)

# Download and map thumbnails

In [35]:
FVC = pd.read_csv('FVC_metadata_filtered.csv')

In [36]:
fakeImage_dir = './fake_images'
realImage_dir = './real_images'

if not os.path.exists(fakeImage_dir):
    os.makedirs(fakeImage_dir)

if not os.path.exists(realImage_dir):
    os.makedirs(realImage_dir)

In [37]:
def download_save_image(df):
  df_new = df.copy(deep=True)

  for i in range(len(df_new)):
    row = df_new.iloc[i]

    id = row['id']  
    url = row['thumbnail_320_180'] 
    is_fake = row['is_fake'] 


    filename = f'{id}' + '_' + url.split("/")[-1]      

    if is_fake:      
        image_path = os.path.join(fakeImage_dir,filename) 
    else:
        image_path = os.path.join(realImage_dir,filename) 

     
    if not os.path.exists(image_path):                       
            r = requests.get(url, stream = True)             
            
            if r.status_code == 200:                         
                r.raw.decode_content = True

                with open(image_path,'wb') as f:
                    shutil.copyfileobj(r.raw, f)             

                print(i, ': success')

                df_new.loc[(df_new['id'] == id), 'thumbnail_320_180_path'] = image_path        

            else:
                print(i, ': failed')

                df_new.loc[(df_new['id'] == id), 'thumbnail_320_180_path'] = '?'                  

    else:
        df_new.loc[(df_new['id'] == id), 'thumbnail_320_180_path'] = image_path 
        print('file exist', image_path)

  return df_new 

In [38]:
#Download thumbnails and map the downloaded file path
FVC_mapped = download_save_image(FVC)

0 : success
1 : success
2 : success
3 : success
4 : success
5 : success
6 : success
7 : success
8 : success
9 : success
10 : success
11 : success
12 : success
13 : success
14 : success
15 : success
16 : success
17 : success
18 : success
19 : success
20 : success
21 : success
22 : success
23 : success
24 : success
25 : success
26 : success
27 : success
28 : success
29 : success
30 : success
31 : success
32 : success
33 : success
34 : success
35 : success
36 : success
37 : success
38 : success
39 : success
40 : success
41 : success
42 : success
43 : success
44 : success
45 : success
46 : success
47 : success
48 : success
49 : success
50 : success
51 : success
52 : success
53 : success
54 : success
55 : success
56 : success
57 : success
58 : success
59 : success
60 : success
61 : success
62 : success
63 : success
64 : success
65 : success
66 : success
67 : success
68 : success
69 : success
70 : success
71 : success
72 : success
73 : success
74 : success
75 : success
76 : success
77 : succ

In [39]:
FVC_mapped = FVC_mapped.sample(frac=1).reset_index(drop=True)

In [40]:
FVC_mapped.to_csv('FVC_mapped.csv',index=True)

In [41]:
#import shutil
#shutil.make_archive('fake_images', 'zip', './fake_images')