# Test GDELT downloads


In [1]:
%reset -f

In [2]:
%reload_ext autoreload
%autoreload 2

In [3]:
import requests
import json
import os
import sys
from pathlib import Path
from bs4 import BeautifulSoup
from time import sleep
import requests
from tqdm import tqdm
from parfive import Downloader
from parfive import SessionConfig


In [4]:
# Set filepaths
PROJ = Path(os.path.realpath("."))
if str(PROJ) == "/n/home10/shreyasgm":
    PROJ = Path(
        "/n/holystore01/LABS/hausmann_lab/lab/glocal_aggregations/shreyas/proj/2023-02-05 - Pipeline/gdelt"
    )
ROOT = PROJ.parents[2]
DATA = ROOT / "data/"


In [5]:
# Import custom modules
sys.path.append(str(PROJ))
sys.path.append(str(ROOT / "src/"))
from general_utils import *

# from download_fao import *


In [6]:
# Download file with all the links
# masterfile_url = "http://data.gdeltproject.org/gdeltv2/masterfilelist.txt"
masterfile_url = PROJ / "masterfilelist.txt"
links_df = pd.read_csv(masterfile_url, sep=" ", header=None, dtype=str)
links_df.columns = ["size", "hash", "url"]
# Make sure they're all string
links_df = links_df.astype(str)
links_df["filename"] = links_df.url.apply(lambda x: x.split("/")[-1])
# Split filename by . into date, type, filetype and compression
links_df[["time", "type", "filetype", "compression"]] = links_df.filename.str.split(
    ".", expand=True
)
links_df.head()


Unnamed: 0,size,hash,url,filename,time,type,filetype,compression
0,150383,297a16b493de7cf6ca809a7cc31d0b93,http://data.gdeltproject.org/gdeltv2/201502182...,20150218230000.export.CSV.zip,20150218230000,export,CSV,zip
1,318084,bb27f78ba45f69a17ea6ed7755e9f8ff,http://data.gdeltproject.org/gdeltv2/201502182...,20150218230000.mentions.CSV.zip,20150218230000,mentions,CSV,zip
2,10768507,ea8dde0beb0ba98810a92db068c0ce99,http://data.gdeltproject.org/gdeltv2/201502182...,20150218230000.gkg.csv.zip,20150218230000,gkg,csv,zip
3,149211,2a91041d7e72b0fc6a629e2ff867b240,http://data.gdeltproject.org/gdeltv2/201502182...,20150218231500.export.CSV.zip,20150218231500,export,CSV,zip
4,339037,dec3f427076b716a8112b9086c342523,http://data.gdeltproject.org/gdeltv2/201502182...,20150218231500.mentions.CSV.zip,20150218231500,mentions,CSV,zip


In [7]:
links_df.type.value_counts()


gkg         274577
export      274572
mentions    274572
Name: type, dtype: int64

In [8]:
# Select files of the type "export"
links_df = links_df[links_df.type == "export"]


In [9]:
# Set the destination folder
gdelt_outdir = DATA / "raw/rasters/gdelt_v2/"
# Download the files
links_to_download = links_df["url"].head(20).to_list()
download_urls_to_dir(
    urls=links_to_download,
    outdir=gdelt_outdir,
    max_conn=20,
    max_splits=5,
    overwrite=False,
    retries=2,
)


Downloading files:
http://data.gdeltproject.org/gdeltv2/20150218230000.export.CSV.zip
http://data.gdeltproject.org/gdeltv2/20150218231500.export.CSV.zip
http://data.gdeltproject.org/gdeltv2/20150218233000.export.CSV.zip
http://data.gdeltproject.org/gdeltv2/20150218234500.export.CSV.zip
http://data.gdeltproject.org/gdeltv2/20150219000000.export.CSV.zip
http://data.gdeltproject.org/gdeltv2/20150219001500.export.CSV.zip
http://data.gdeltproject.org/gdeltv2/20150219003000.export.CSV.zip
http://data.gdeltproject.org/gdeltv2/20150219004500.export.CSV.zip
http://data.gdeltproject.org/gdeltv2/20150219010000.export.CSV.zip
http://data.gdeltproject.org/gdeltv2/20150219011500.export.CSV.zip
http://data.gdeltproject.org/gdeltv2/20150219013000.export.CSV.zip
http://data.gdeltproject.org/gdeltv2/20150219014500.export.CSV.zip
http://data.gdeltproject.org/gdeltv2/20150219020000.export.CSV.zip
http://data.gdeltproject.org/gdeltv2/20150219021500.export.CSV.zip
http://data.gdeltproject.org/gdeltv2/201502

Files Downloaded:   0%|          | 0/20 [00:00<?, ?file/s]

20150219030000.export.CSV.zip:   0%|          | 0.00/120k [00:00<?, ?B/s]

20150219014500.export.CSV.zip:   0%|          | 0.00/165k [00:00<?, ?B/s]

20150218230000.export.CSV.zip:   0%|          | 0.00/150k [00:00<?, ?B/s]

20150218231500.export.CSV.zip:   0%|          | 0.00/149k [00:00<?, ?B/s]

20150219024500.export.CSV.zip:   0%|          | 0.00/148k [00:00<?, ?B/s]

20150219000000.export.CSV.zip:   0%|          | 0.00/363k [00:00<?, ?B/s]

20150219003000.export.CSV.zip:   0%|          | 0.00/255k [00:00<?, ?B/s]

20150219011500.export.CSV.zip:   0%|          | 0.00/185k [00:00<?, ?B/s]

20150219010000.export.CSV.zip:   0%|          | 0.00/225k [00:00<?, ?B/s]

20150219001500.export.CSV.zip:   0%|          | 0.00/252k [00:00<?, ?B/s]

20150219013000.export.CSV.zip:   0%|          | 0.00/175k [00:00<?, ?B/s]

20150219031500.export.CSV.zip:   0%|          | 0.00/117k [00:00<?, ?B/s]

20150219033000.export.CSV.zip:   0%|          | 0.00/145k [00:00<?, ?B/s]

20150219020000.export.CSV.zip:   0%|          | 0.00/156k [00:00<?, ?B/s]

20150219040000.export.CSV.zip:   0%|          | 0.00/133k [00:00<?, ?B/s]

20150219023000.export.CSV.zip:   0%|          | 0.00/175k [00:00<?, ?B/s]

20150219021500.export.CSV.zip:   0%|          | 0.00/184k [00:00<?, ?B/s]

20150218233000.export.CSV.zip:   0%|          | 0.00/150k [00:00<?, ?B/s]

20150218234500.export.CSV.zip:   0%|          | 0.00/159k [00:00<?, ?B/s]

20150219004500.export.CSV.zip:   0%|          | 0.00/219k [00:00<?, ?B/s]

1/0 files failed to download. Please check `.errors` for details
Errors encountered while downloading: 

------------------
Filepath function: functools.partial(<function default_name at 0x2b5b06fb72e0>, PosixPath('/n/holystore01/LABS/hausmann_lab/lab/glocal_aggregations/shreyas/data/raw/rasters/gdelt_v2'))
URL: http://data.gdeltproject.org/gdeltv2/20150218233000.export.CSV.zip
Exception: Timeout on reading data from socket
Trying to delete: /n/holystore01/LABS/hausmann_lab/lab/glocal_aggregations/shreyas/data/raw/rasters/gdelt_v2/20150218233000.export.CSV.zip
Error deleting file:  [Errno 2] No such file or directory: '/n/holystore01/LABS/hausmann_lab/lab/glocal_aggregations/shreyas/data/raw/rasters/gdelt_v2/20150218233000.export.CSV.zip'
Download failed. Retrying 1 of 2 times


Files Downloaded:   0%|          | 0/1 [00:00<?, ?file/s]

20150218233000.export.CSV.zip:   0%|          | 0.00/150k [00:00<?, ?B/s]

In [11]:
# Try to read csv directly
df = pd.read_csv(DATA / "raw/rasters/gdelt_v2/20150218230000.export.CSV", sep="\t")
df.head()

Unnamed: 0,410412347,20140218,201402,2014,2014.1315,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,4.1,"Waterkloof, Free State, South Africa.1",SF.1,SF03.1,77359.1,-30.3098.1,25.2971.1,-1299321.1,20150218230000,http://www.dailymaverick.co.za/article/2015-02-19-sona2015-jamming-state-security-agency-steps-forward-blames-low-ranking-operator/
0,410412348,20140218,201402,2014,2014.1315,,,,,,...,4,"Bengaluru, Karnataka, India",IN,IN19,70159.0,12.9833,77.5833,-2090174,20150218230000,http://timesofindia.indiatimes.com/city/bengal...
1,410412349,20140218,201402,2014,2014.1315,,,,,,...,4,"Great Southern, Victoria, Australia",AS,AS07,5387.0,-36.0667,146.483,-1576477,20150218230000,http://www.voxy.co.nz/entertainment/coast-new-...
2,410412350,20140218,201402,2014,2014.1315,,,,,,...,1,New Zealand,NZ,NZ,,-41.0,174.0,NZ,20150218230000,http://www.voxy.co.nz/entertainment/coast-new-...
3,410412351,20140218,201402,2014,2014.1315,,,,,,...,2,"Idaho, United States",US,USID,,44.2394,-114.51,ID,20150218230000,http://www.eastidahonews.com/2015/02/neil-patr...
4,410412352,20140218,201402,2014,2014.1315,AUS,AUSTRALIA,AUS,,,...,4,"Brisbane, Queensland, Australia",AS,AS04,154654.0,-27.5,153.017,-1561728,20150218230000,http://www.businessspectator.com.au/article/20...


In [13]:
df2 = pd.read_csv(DATA / "raw/rasters/gdelt_v2/20150218230000.export.CSV.zip", sep="\t")
df2.head()

Unnamed: 0,410412347,20140218,201402,2014,2014.1315,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,4.1,"Waterkloof, Free State, South Africa.1",SF.1,SF03.1,77359.1,-30.3098.1,25.2971.1,-1299321.1,20150218230000,http://www.dailymaverick.co.za/article/2015-02-19-sona2015-jamming-state-security-agency-steps-forward-blames-low-ranking-operator/
0,410412348,20140218,201402,2014,2014.1315,,,,,,...,4,"Bengaluru, Karnataka, India",IN,IN19,70159.0,12.9833,77.5833,-2090174,20150218230000,http://timesofindia.indiatimes.com/city/bengal...
1,410412349,20140218,201402,2014,2014.1315,,,,,,...,4,"Great Southern, Victoria, Australia",AS,AS07,5387.0,-36.0667,146.483,-1576477,20150218230000,http://www.voxy.co.nz/entertainment/coast-new-...
2,410412350,20140218,201402,2014,2014.1315,,,,,,...,1,New Zealand,NZ,NZ,,-41.0,174.0,NZ,20150218230000,http://www.voxy.co.nz/entertainment/coast-new-...
3,410412351,20140218,201402,2014,2014.1315,,,,,,...,2,"Idaho, United States",US,USID,,44.2394,-114.51,ID,20150218230000,http://www.eastidahonews.com/2015/02/neil-patr...
4,410412352,20140218,201402,2014,2014.1315,AUS,AUSTRALIA,AUS,,,...,4,"Brisbane, Queensland, Australia",AS,AS04,154654.0,-27.5,153.017,-1561728,20150218230000,http://www.businessspectator.com.au/article/20...
