# Downloading and Extracting the Fake-or-Real Dataset 

Manually authenticating to Kaggle, download dataset, unzipping into local project directory (included progress bar!) 

In [18]:
# importing 
import os
import json
import requests
from tqdm import tqdm  # for progress bar

# manually setting kaggle credentials due to restrictions
# could not use 'kaggle' CLI directly due to permission issues on the server
# manually read the Kaggle API key from the kaggle.json file
KAGGLE_USERNAME = "sharonlobo23"

# load API key from .kaggle folder
with open(os.path.expanduser('~/.kaggle/kaggle.json'), 'r') as f:
    kaggle_token = json.load(f)
    
KAGGLE_KEY = kaggle_token['key'] # extract key

# download url manually
dataset = "mohammedabdeldayem/the-fake-or-real-dataset"
dataset_url = f"https://{KAGGLE_USERNAME}:{KAGGLE_KEY}@www.kaggle.com/api/v1/datasets/download/{dataset}"

# setting up output folder directory
# downloaded .zip file and extracted contents will go here
output_folder = "deepfake_audio_project/data/raw"
os.makedirs(output_folder, exist_ok=True) # create directory

# defining path for downloaded zip file
output_zip = os.path.join(output_folder, "fake_or_real.zip")
print(f"downloading dataset into {output_zip}...")

# downloading with progress bar
response = requests.get(dataset_url, stream=True)

if response.status_code == 200:
    total_size_in_bytes = int(response.headers.get('content-length', 0))
    block_size = 1024  # 1 kb at a time
    progress_bar = tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True)

    with open(output_zip, 'wb') as f:
        for data in response.iter_content(block_size):
            progress_bar.update(len(data))
            f.write(data)
    progress_bar.close()

    # check if downloaded file size matches the expected size
    if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:
        print("ERROR, something went wrong in the download!")
    else:
        print("download complete!")
else:
    print(f"failed to download: Status code {response.status_code}") # debugging

# unzipping the dataset
import zipfile

print("unzipping...")
with zipfile.ZipFile(output_zip, 'r') as zip_ref:
    zip_ref.extractall(output_folder)

print("extraction complete! :)")


downloading dataset into deepfake_audio_project/data/raw/fake_or_real.zip...


100%|██████████| 17.2G/17.2G [03:16<00:00, 87.5MiB/s]    


download complete!
unzipping...
extraction complete! :)
