## Download CSVs from URLs

In [1]:
import time
import urllib3
import os, zipfile
import pandas as pd
from tqdm import tqdm

You need to get a list of all the urls to download from gdelt (this is what we used: http://data.gdeltproject.org/gdeltv2/masterfilelist.txt). All information is available here: https://blog.gdeltproject.org/gdelt-2-0-our-global-world-in-realtime/ 

Each url is then used to actually download said zip file with the GDELT data and make them into csv files.

In [4]:
# get df with urls and filenames
gdelt_url_df = pd.read_csv("GDELT_GKG/extras/GDELT_2022_urls.csv")
# format date
gdelt_url_df["Date_formatted"] = pd.to_datetime(gdelt_url_df['Date'].astype('str').str.slice(start=0,stop=8), infer_datetime_format=True)


### Sample URLs

In [None]:
# get random subset of rows
#samples = gdelt_url_df.sample(n=10, random_state=42)

In [4]:
# get month subset
samples = gdelt_url_df.loc[(gdelt_url_df['Date_formatted'] >= '2022-07-01')
                         & (gdelt_url_df['Date_formatted'] <= '2022-12-30')]


In [5]:
samples.shape

(17566, 3)

In [6]:
print("This should take {} minutes".format(((samples.shape[0] * 142)/100)/60))

This should take 415.7286666666667 minutes


### Download CSV files

In [7]:
# define paths
save_path = "/home/insert_user/GDELT_GKG/"
os.chdir(save_path) # change directory from working dir to dir with files

Quick function for executing bash commands, for source see https://www.scrapingbee.com/blog/python-wget/

In [8]:
import subprocess

def runcmd(cmd, verbose = False, *args, **kwargs):
    """
    Function for running bash commands more easily.
    """
    process = subprocess.Popen(
        cmd,
        stdout = subprocess.PIPE,
        stderr = subprocess.PIPE,
        #text = True,
        shell = True
    )
    std_out, std_err = process.communicate()
    if verbose:
        print(std_out.strip(), std_err)
    pass

In [20]:
print("Now downloading files...")
#samples = samples.head(10) # take subset for time testing

start = time.time()
for filename, url in tqdm(zip(samples["Date"],samples["URL"])):
    # define item name
    item_name = str(filename) + ".gkg.csv.zip"
    csv_name = "/home/insert_user/GDELT_GKG/gkg_csvs/" + str(filename) + ".gkg.csv"
    # file has already been downloaded, either exists as zip or csv, we skip
    if os.path.exists(item_name) or os.path.exists(csv_name):
        continue
    # download URL with bash cmd function
    runcmd("wget {}".format(url), verbose = False)

end = time.time()
print("This took {} s".format(end-start)) 

Now downloading files...


17566it [22:30, 13.01it/s]   

This took 1350.4307177066803 s





There's a lot of variance in how long each month takes to download: ex. January 2022 took 1h10mins, but February took over 2h.

Now we check if we get as many files as we expected: (if not, we can simply rerun the code above)

In [21]:
total_downloads = len(os.listdir(save_path)) + len(os.listdir("/home/insert_user/GDELT_GKG/gkg_csvs"))
print( "We're missing {} files".format(samples.shape[0] - total_downloads) )

We're missing -15581 files


One week's worth took 7mins 50s to download with urllib3, and 6mins 11s with !wget.



### Unzip downloaded files in directory

In [22]:
print("Now unzipping and deleting zips...")

start = time.time()

for item in tqdm(os.listdir(save_path)): # loop through items in dir
    if item.endswith(".zip"): # check for ".zip" extension
        file_name = os.path.abspath(item) # get full path of files
        try:
            zip_ref = zipfile.ZipFile(file_name) # create zipfile object
            zip_ref.extractall(save_path + "/gkg_csvs") # extract file to dir
            zip_ref.close() # close file
            os.remove(file_name) # delete zipped file
        except:
            print("Something went wrong with: ")
            print(file_name)
            

end = time.time()
print("This took {} s".format(end-start))

Now unzipping and deleting zips...


100%|████████████████████████████████████████████| 7/7 [00:00<00:00, 968.73it/s]

Something went wrong with: 
/home/ronja/GDELT_GKG/20221021171500.gkg.csv.zip
This took 0.009716272354125977 s





In [23]:
# let's check if there are files that were in the samples, but are not downloaded correctly
# just to make sure

start = time.time()
i = 0
for filename, url in tqdm(zip(samples["Date"],samples["URL"])):
    # define item name
    item_name = str(filename) + ".gkg.csv.zip"
    csv_name = "/home/insert_user/GDELT_GKG/gkg_csvs/" + str(filename) + ".gkg.csv"
    
    if not (os.path.exists(item_name) or os.path.exists(csv_name)):
        print("this file doesn't exist either as zip or csv", filename)
        i += 1
        continue

print(i)

17566it [00:00, 305076.29it/s]

this file doesn't exist either as zip or csv 20220704113000
this file doesn't exist either as zip or csv 20220705023000
this file doesn't exist either as zip or csv 20220706130000
this file doesn't exist either as zip or csv 20220706131500
this file doesn't exist either as zip or csv 20220706133000
this file doesn't exist either as zip or csv 20220706134500
this file doesn't exist either as zip or csv 20220706140000
this file doesn't exist either as zip or csv 20220706141500
this file doesn't exist either as zip or csv 20220706143000
this file doesn't exist either as zip or csv 20220706144500
this file doesn't exist either as zip or csv 20220706150000
this file doesn't exist either as zip or csv 20220706151500
this file doesn't exist either as zip or csv 20220706153000
this file doesn't exist either as zip or csv 20220706154500
this file doesn't exist either as zip or csv 20220706160000
this file doesn't exist either as zip or csv 20220706161500
this file doesn't exist either as zip or




In [17]:
# testing
os.path.exists("20221226074500.csv")

False

In [19]:
# testing
samples[samples["Date"] == 20221226074500]

Unnamed: 0,Date,URL,Date_formatted
34490,20221226074500,http://data.gdeltproject.org/gdeltv2/202212260...,2022-12-26
