In [1]:
import os
import numpy as np
import pandas as pd
import requests
import sys
import yaml
import gc

from zipfile import ZipFile, ZIP_DEFLATED
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
from glob import glob, iglob
from pathlib import Path
from tqdm import tqdm
from multiprocessing import Pool
from functools import partial

import shapefile
import shapely
from shapely.geometry import Point

In [2]:
def get_time(file):
    t = file.split('.')[0]
    
    if len(t) == 8:
        t = datetime.strptime(t, '%Y%m%d') 
    elif len(t) == 6:
        t = datetime.strptime(t, '%Y%m')
    else:
        t = datetime.strptime(t, '%Y')
        
    return t
    
    
def download(url, name):
    r = requests.get(url, allow_redirects=True)
    with open(name, 'wb') as f:
        f.write(r.content)
    return True


def get_current_batch_dict():
    c_files = [Path(x).name.replace('.feather', '') for x in glob(str(save_folder / '*.feather'))]
    batches = {}
    for file in c_files:
        date, batch = file.split('_')
        batches[date] = max(batches.get(date, 0), int(batch))
    return batches


def load_processed(file):
    try:
        with open(file, 'r') as f:
            processed = yaml.safe_load(f)
    except FileNotFoundError:
        print(file, 'not found')
        processed = []
    return processed


def save_processed(content, file):
    with open(file, 'w') as f:
        yaml.safe_dump(content, f)
        
        
def match_geo(locs, lon_col, lat_col, country_col, use_pool=True):
    f = partial(match_single_geo, lon_col=lon_col, lat_col=lat_col, country_col=country_col)
    
    if use_pool:
        with Pool(30) as p:
            results = p.map(f, locs.iterrows())
    else:
        results = [f(x) for x in locs.iterrows()]
    
    for result in results:
        idx = result['idx']
        locs.loc[idx, 'adm2'] = result.get('adm2')
        locs.loc[idx, 'adm1'] = result.get('adm1')
        locs.loc[idx, 'adm0'] = result.get('adm0')
        locs.loc[idx, 'nearest_loc'] = result['nearest_loc']
        
    return locs


def country_shapes(country):
    if country in shapes_by_country:
        yield from shapes_by_country[country]
    else:
        for shapes in shapes_by_country.values():
            yield from shapes
        
        
def match_single_geo(row, lon_col, lat_col, country_col):
    if isinstance(row, tuple):
        row = row[1]
        
    if isinstance(row['adm1'], str):
        return {'nearest_loc': row['nearest_loc'], 
                'adm1': row['adm1'],
                'adm2': row['adm2'], 
                'adm0': row['adm0'],
                'idx': row.name}
    
    results = {'nearest_loc': False, 'idx': row.name}

    point = Point((float(row[lon_col]), float(row[lat_col])))
    nearest_record = None
    nearest_distance = np.inf
    for shape, country, adm1, adm2 in country_shapes(row[country_col]):
        if point.within(shape):
            results['adm0'] = country
            results['adm2'] = adm2
            results['adm1'] = adm1
            break
    else:
        for shape, country, adm1, adm2 in country_shapes(row[country_col]):
            distance = point.distance(shape)
            if distance < nearest_distance:
                nearest_distance = distance
                nearest_record = (country, adm1, adm2)

        if nearest_record:
            results['nearest_loc'] = True
            results['adm0'] = nearest_record[0]
            results['adm1'] = nearest_record[1]
            results['adm2'] = nearest_record[2]
    
    return results    


def get_locs(df, cols):
    locs = df.loc[df[cols].notnull().all(axis=1), cols].drop_duplicates()
    locs['adm1'] = np.nan
    locs['adm2'] = np.nan
    locs['adm0'] = np.nan
    locs['nearest_loc'] = False
    return locs


def append_adm_codes(df, lat_col, lon_col, country_col, geo_type_col, use_pool=True):
    # Get locs
    on_cols = [lat_col, lon_col, country_col]
    locs = get_locs(df, on_cols)

    # Append adm codes
    locs = match_geo(locs, lon_col, lat_col, country_col, use_pool=use_pool)

    # Merge them back
    df = df.merge(locs, on=on_cols, how='left')
    
    df['adm0'].fillna(df[country_col], inplace=True)
    
    # Handle geo precision
    null_mask = df[geo_type_col].isnull()
    no_adm1_mask = df[geo_type_col] <= 1
    no_adm2_mask = np.logical_or(df[geo_type_col] <= 2, df[geo_type_col] >= 5)
    df.loc[no_adm1_mask, 'adm1'] = np.nan
    df.loc[no_adm2_mask, 'adm2'] = np.nan
    df.loc[null_mask, ['adm1', 'adm2']] = np.nan
    
    return df


def convert_cols_to_numeric(df, cols):
    for col in cols:
        df[col] = pd.to_numeric(df[col], errors='coerce')
    return df

In [3]:
# Constants for later
keep_cols = ['SQLDATE', 'ActionGeo_Lat', 'ActionGeo_Long', 'ActionGeo_Type', 'ActionGeo_CountryCode', 'EventCode',
             'NumMentions', 'IsRootEvent', 'NumSources', 'NumArticles', 'AvgTone']
download_folder = Path('../Downloads')
save_folder = Path('../Output/GDELT/')
collapse_folder = Path('../Output/GDELT_collapsed/')
batch_size = 50
downloaded = [Path(x).name for x in glob(f'../Downloads/*.zip')]

processed_file = save_folder / 'processed.yml'
processed = load_processed(processed_file)

lat_col = 'ActionGeo_Lat'
lon_col = 'ActionGeo_Long'
geo_type_col = 'ActionGeo_Type'
country_col = 'ActionGeo_CountryCode'
group_cols = ['EventCode']
dist_cols = ['NumMentions', 'IsRootEvent', 'NumSources', 'NumArticles', 'AvgTone']

../Output/GDELT/processed.yml not found


In [4]:
gdelt = 'http://data.gdeltproject.org/events/'
response = requests.get(gdelt + 'index.html')
soup = BeautifulSoup(response.text, 'html.parser')

In [5]:
# Current header row
header = requests.get('https://www.gdeltproject.org/data/lookups/CSV.header.dailyupdates.txt')
header = header.text.strip().split('\t')
# Header row up until march 2013
ignore_cols = {'SOURCEURL'}
header_old = [x for x in header if x not in ignore_cols]

In [6]:
def get_shape_tuple(shape_record_tuple, i):
    shape, record = shape_record_tuple
    country = record[0]
    shape = shapely.geometry.shape(shape)  

    adm1, adm2 = None, None
    if i >= 1:
        adm1 = record[2]
    if i == 2:
        adm2 = record[5]
        
    return (shape, country, adm1, adm2)

In [7]:
shps = glob('../Shapefiles/preprocessed/*.shp')
shapes_by_country = {}
for i, shp in tqdm(enumerate(shps), total=len(shps)):
    codes = Path(shp).name.replace('.shp', '').split('_')
    shp = shapefile.Reader(shp)
    shapes = [shapely.geometry.shape(s) for s in shp.shapes()]
    records = shp.records()
    
    for code in codes:
        shapes_by_country[code] = []
        for shape, record in zip(shapes, records):
            shapes_by_country[code].append((shape, *record))
    
assert 'GT' in shapes_by_country and 'PM' in shapes_by_country

0: 100%|██████████| 256/256 [05:21<00:00,  1.26s/it]
1: 100%|██████████| 3610/3610 [05:50<00:00, 10.30it/s]  
2: 100%|██████████| 45962/45962 [04:25<00:00, 173.15it/s] 


In [8]:
all_a = soup.find_all('a')
all_files = {}

header_cutoff_date = datetime.strptime('201303', '%Y%m')
for a in all_a:
    name = a.text
    try:
        t = get_time(name)
    except ValueError:
        print(f'Skipping file {a}')
        continue
        
    if t.year >= 2021:
        continue
        
    if t.year not in all_files:
        all_files[t.year] = []
        
    all_files[t.year].append({
        'date': t,
        'name': name,
        'href': a['href'],
        'header': header if t > header_cutoff_date else header_old,
        'append': set() if t > header_cutoff_date else ignore_cols
    })
    
print('Total years:', len(all_files))
print('Total files:', len(all_a))

Skipping file <a href="md5sums">md5sums</a>
Skipping file <a href="filesizes">filesizes</a>
Skipping file <a href="GDELT.MASTERREDUCEDV2.1979-2013.zip">GDELT.MASTERREDUCEDV2.1979-2013.zip</a>
Skipping file <a href="GDELT.MASTERREDUCEDV2.1979-2013.zip">GDELT.MASTERREDUCEDV2.1979-2013.zip</a>
Total years: 42
Total files: 3068


In [9]:
def download_and_combine(elements, batch_size, batch, download_folder):
    dfs = []
    for element in tqdm(elements[batch_size * (batch - 1): batch_size * batch], 
                        desc=f'Year {year}. Batch {batch} of {n_batches}'): 
        zfile = download_folder / element['name']

        if element['name'] not in downloaded:
            download(gdelt + element['href'], zfile)

        with ZipFile(zfile, 'r') as f:
            files = f.namelist()
            f.extractall(download_folder)

        for file in files:
            df = pd.read_csv(download_folder / files[0], sep='\t', names=element['header'], dtype=str)
            for col in element['append']:
                df[col] = np.nan
            dfs.append(df[keep_cols])
            os.remove(download_folder / file)       

    # Combine in a single df
    print('Combining...')
    df = dfs[0].append(dfs[1:], ignore_index=True)
    return df

In [10]:
# Process files
for year, elements in all_files.items():
    elements = sorted(elements, key=lambda x: x['date'])
    n_batches = len(elements) // batch_size + int(len(elements) % batch_size > 0)
    
    for batch in range(1, n_batches + 1):
        batch_name = f'{year}_b{batch:02}.feather'
        
        # If the batch exists already, skip it
        if batch_name in processed:
            continue
        
        df = download_and_combine(elements, batch_size, batch, download_folder)        
        
        # Parse numerics
        df = convert_cols_to_numeric(df, dist_cols + [geo_type_col])
        
        # Append ADM
        print('Matching geo...')
        df = append_adm_codes(df, lat_col, lon_col, country_col, geo_type_col)
        
        # Save by sqldate
        print('Saving...')
        batch_dict = get_current_batch_dict()
        for date, subdf in df.groupby('SQLDATE'):
            n = batch_dict.get(date, -1) + 1
            subdf.reset_index(drop=True).to_feather(save_folder / f'{date}_{n:02}.feather')
            
        # Add to processed
        processed.append(batch_name)
        save_processed(processed, processed_file)

Year 2020. Batch 1 of 8: 100%|██████████| 50/50 [01:10<00:00,  1.41s/it]


Combining...
Matching geo...
Saving...


Year 2020. Batch 2 of 8: 100%|██████████| 50/50 [01:10<00:00,  1.41s/it]


Combining...
Matching geo...
Saving...


Year 2020. Batch 3 of 8: 100%|██████████| 50/50 [01:00<00:00,  1.21s/it]


Combining...
Matching geo...
Saving...


Year 2020. Batch 4 of 8: 100%|██████████| 50/50 [01:05<00:00,  1.31s/it]


Combining...
Matching geo...
Saving...


Year 2020. Batch 5 of 8: 100%|██████████| 50/50 [01:03<00:00,  1.28s/it]


Combining...
Matching geo...
Saving...


Year 2020. Batch 6 of 8: 100%|██████████| 50/50 [00:57<00:00,  1.14s/it]


Combining...
Matching geo...
Saving...


Year 2020. Batch 7 of 8: 100%|██████████| 50/50 [00:45<00:00,  1.10it/s]


Combining...
Matching geo...
Saving...


Year 2020. Batch 8 of 8: 100%|██████████| 16/16 [00:15<00:00,  1.01it/s]


Combining...
Matching geo...
Saving...


Year 2019. Batch 1 of 8: 100%|██████████| 50/50 [01:14<00:00,  1.50s/it]


Combining...
Matching geo...
Saving...


Year 2019. Batch 2 of 8: 100%|██████████| 50/50 [01:24<00:00,  1.68s/it]


Combining...
Matching geo...
Saving...


Year 2019. Batch 3 of 8: 100%|██████████| 50/50 [01:21<00:00,  1.64s/it]


Combining...
Matching geo...
Saving...


Year 2019. Batch 4 of 8: 100%|██████████| 50/50 [01:16<00:00,  1.54s/it]


Combining...
Matching geo...
Saving...


Year 2019. Batch 5 of 8: 100%|██████████| 50/50 [01:12<00:00,  1.45s/it]


Combining...
Matching geo...
Saving...


Year 2019. Batch 6 of 8: 100%|██████████| 50/50 [01:17<00:00,  1.56s/it]


Combining...
Matching geo...
Saving...


Year 2019. Batch 7 of 8: 100%|██████████| 50/50 [01:12<00:00,  1.44s/it]


Combining...
Matching geo...
Saving...


Year 2019. Batch 8 of 8: 100%|██████████| 15/15 [00:17<00:00,  1.16s/it]


Combining...
Matching geo...
Saving...


Year 2018. Batch 1 of 8: 100%|██████████| 50/50 [01:20<00:00,  1.60s/it]


Combining...
Matching geo...
Saving...


Year 2018. Batch 2 of 8: 100%|██████████| 50/50 [01:26<00:00,  1.72s/it]


Combining...
Matching geo...
Saving...


Year 2018. Batch 3 of 8: 100%|██████████| 50/50 [01:27<00:00,  1.76s/it]


Combining...
Matching geo...
Saving...


Year 2018. Batch 4 of 8: 100%|██████████| 50/50 [01:28<00:00,  1.76s/it]


Combining...
Matching geo...
Saving...


Year 2018. Batch 5 of 8: 100%|██████████| 50/50 [01:21<00:00,  1.63s/it]


Combining...
Matching geo...
Saving...


Year 2018. Batch 6 of 8: 100%|██████████| 50/50 [01:27<00:00,  1.75s/it]


Combining...
Matching geo...
Saving...


Year 2018. Batch 7 of 8: 100%|██████████| 50/50 [01:22<00:00,  1.64s/it]


Combining...
Matching geo...
Saving...


Year 2018. Batch 8 of 8: 100%|██████████| 15/15 [00:18<00:00,  1.22s/it]


Combining...
Matching geo...
Saving...


Year 2017. Batch 1 of 8: 100%|██████████| 50/50 [01:40<00:00,  2.01s/it]


Combining...
Matching geo...
Saving...


Year 2017. Batch 2 of 8: 100%|██████████| 50/50 [01:39<00:00,  1.99s/it]


Combining...
Matching geo...
Saving...


Year 2017. Batch 3 of 8: 100%|██████████| 50/50 [01:33<00:00,  1.86s/it]


Combining...
Matching geo...
Saving...


Year 2017. Batch 4 of 8: 100%|██████████| 50/50 [01:27<00:00,  1.76s/it]


Combining...
Matching geo...
Saving...


Year 2017. Batch 5 of 8: 100%|██████████| 50/50 [01:20<00:00,  1.61s/it]


Combining...
Matching geo...
Saving...


Year 2017. Batch 6 of 8: 100%|██████████| 50/50 [01:28<00:00,  1.76s/it]


Combining...
Matching geo...
Saving...


Year 2017. Batch 7 of 8: 100%|██████████| 50/50 [01:30<00:00,  1.81s/it]


Combining...
Matching geo...
Saving...


Year 2017. Batch 8 of 8: 100%|██████████| 15/15 [00:19<00:00,  1.33s/it]


Combining...
Matching geo...
Saving...


Year 2016. Batch 1 of 8: 100%|██████████| 50/50 [01:44<00:00,  2.08s/it]


Combining...
Matching geo...
Saving...


Year 2016. Batch 2 of 8: 100%|██████████| 50/50 [01:38<00:00,  1.98s/it]


Combining...
Matching geo...
Saving...


Year 2016. Batch 3 of 8: 100%|██████████| 50/50 [01:39<00:00,  1.99s/it]


Combining...
Matching geo...
Saving...


Year 2016. Batch 4 of 8: 100%|██████████| 50/50 [01:35<00:00,  1.91s/it]


Combining...
Matching geo...
Saving...


Year 2016. Batch 5 of 8: 100%|██████████| 50/50 [01:33<00:00,  1.86s/it]


Combining...
Matching geo...
Saving...


Year 2016. Batch 6 of 8: 100%|██████████| 50/50 [01:40<00:00,  2.01s/it]


Combining...
Matching geo...
Saving...


Year 2016. Batch 7 of 8: 100%|██████████| 50/50 [01:32<00:00,  1.85s/it]


Combining...
Matching geo...
Saving...


Year 2016. Batch 8 of 8: 100%|██████████| 16/16 [00:25<00:00,  1.59s/it]


Combining...
Matching geo...
Saving...


Year 2015. Batch 1 of 8: 100%|██████████| 50/50 [01:12<00:00,  1.45s/it]


Combining...
Matching geo...
Saving...


Year 2015. Batch 2 of 8: 100%|██████████| 50/50 [01:27<00:00,  1.75s/it]


Combining...
Matching geo...
Saving...


Year 2015. Batch 3 of 8: 100%|██████████| 50/50 [01:25<00:00,  1.71s/it]


Combining...
Matching geo...
Saving...


Year 2015. Batch 4 of 8: 100%|██████████| 50/50 [01:30<00:00,  1.81s/it]


Combining...
Matching geo...
Saving...


Year 2015. Batch 5 of 8: 100%|██████████| 50/50 [01:30<00:00,  1.82s/it]


Combining...
Matching geo...
Saving...


Year 2015. Batch 6 of 8: 100%|██████████| 50/50 [01:33<00:00,  1.86s/it]


Combining...
Matching geo...
Saving...


Year 2015. Batch 7 of 8: 100%|██████████| 50/50 [01:33<00:00,  1.88s/it]


Combining...
Matching geo...
Saving...


Year 2015. Batch 8 of 8: 100%|██████████| 15/15 [00:27<00:00,  1.80s/it]


Combining...
Matching geo...
Saving...


Year 2014. Batch 1 of 8: 100%|██████████| 50/50 [00:54<00:00,  1.08s/it]


Combining...
Matching geo...
Saving...


Year 2014. Batch 2 of 8: 100%|██████████| 50/50 [01:01<00:00,  1.24s/it]


Combining...
Matching geo...
Saving...


Year 2014. Batch 3 of 8: 100%|██████████| 50/50 [01:03<00:00,  1.27s/it]


Combining...
Matching geo...
Saving...


Year 2014. Batch 4 of 8: 100%|██████████| 50/50 [01:06<00:00,  1.33s/it]


Combining...
Matching geo...
Saving...


Year 2014. Batch 5 of 8: 100%|██████████| 50/50 [01:10<00:00,  1.40s/it]


Combining...
Matching geo...
Saving...


Year 2014. Batch 6 of 8: 100%|██████████| 50/50 [01:18<00:00,  1.56s/it]


Combining...
Matching geo...
Saving...


Year 2014. Batch 7 of 8: 100%|██████████| 50/50 [01:12<00:00,  1.46s/it]


Combining...
Matching geo...
Saving...


Year 2014. Batch 8 of 8: 100%|██████████| 11/11 [00:12<00:00,  1.12s/it]


Combining...
Matching geo...
Saving...


Year 2013. Batch 1 of 6: 100%|██████████| 50/50 [01:20<00:00,  1.61s/it]


Combining...
Matching geo...
Saving...


Year 2013. Batch 2 of 6: 100%|██████████| 50/50 [00:33<00:00,  1.51it/s]


Combining...
Matching geo...
Saving...


Year 2013. Batch 3 of 6: 100%|██████████| 50/50 [01:01<00:00,  1.22s/it]


Combining...
Matching geo...
Saving...


Year 2013. Batch 4 of 6: 100%|██████████| 50/50 [01:09<00:00,  1.39s/it]


Combining...
Matching geo...
Saving...


Year 2013. Batch 5 of 6: 100%|██████████| 50/50 [01:06<00:00,  1.33s/it]


Combining...
Matching geo...
Saving...


Year 2013. Batch 6 of 6: 100%|██████████| 28/28 [00:31<00:00,  1.12s/it]


Combining...
Matching geo...
Saving...


Year 2012. Batch 1 of 1: 100%|██████████| 12/12 [04:26<00:00, 22.25s/it]


Combining...
Matching geo...
Saving...


Year 2011. Batch 1 of 1: 100%|██████████| 12/12 [04:14<00:00, 21.23s/it]


Combining...
Matching geo...
Saving...


Year 2010. Batch 1 of 1: 100%|██████████| 12/12 [02:54<00:00, 14.56s/it]


Combining...
Matching geo...
Saving...


Year 2009. Batch 1 of 1: 100%|██████████| 12/12 [02:59<00:00, 14.93s/it]


Combining...
Matching geo...
Saving...


Year 2008. Batch 1 of 1: 100%|██████████| 12/12 [01:51<00:00,  9.30s/it]


Combining...
Matching geo...
Saving...


Year 2007. Batch 1 of 1: 100%|██████████| 12/12 [01:32<00:00,  7.72s/it]


Combining...
Matching geo...
Saving...


Year 2006. Batch 1 of 1: 100%|██████████| 12/12 [00:49<00:00,  4.15s/it]


Combining...
Matching geo...
Saving...


Year 2005. Batch 1 of 1: 100%|██████████| 1/1 [00:27<00:00, 27.13s/it]


Combining...
Matching geo...
Saving...


Year 2004. Batch 1 of 1: 100%|██████████| 1/1 [00:37<00:00, 37.13s/it]


Combining...
Matching geo...
Saving...


Year 2003. Batch 1 of 1: 100%|██████████| 1/1 [00:47<00:00, 47.41s/it]


Combining...
Matching geo...
Saving...


Year 2002. Batch 1 of 1: 100%|██████████| 1/1 [00:35<00:00, 35.31s/it]


Combining...
Matching geo...
Saving...


Year 2001. Batch 1 of 1: 100%|██████████| 1/1 [00:45<00:00, 45.42s/it]


Combining...
Matching geo...
Saving...


Year 2000. Batch 1 of 1: 100%|██████████| 1/1 [00:36<00:00, 36.33s/it]


Combining...
Matching geo...
Saving...


Year 1999. Batch 1 of 1: 100%|██████████| 1/1 [00:34<00:00, 34.89s/it]


Combining...
Matching geo...
Saving...


Year 1998. Batch 1 of 1: 100%|██████████| 1/1 [00:34<00:00, 34.75s/it]


Combining...
Matching geo...
Saving...


Year 1997. Batch 1 of 1: 100%|██████████| 1/1 [00:28<00:00, 28.21s/it]


Combining...
Matching geo...
Saving...


Year 1996. Batch 1 of 1: 100%|██████████| 1/1 [00:20<00:00, 20.84s/it]


Combining...
Matching geo...
Saving...


Year 1995. Batch 1 of 1: 100%|██████████| 1/1 [00:13<00:00, 13.23s/it]


Combining...
Matching geo...
Saving...


Year 1994. Batch 1 of 1: 100%|██████████| 1/1 [00:13<00:00, 13.88s/it]


Combining...
Matching geo...
Saving...


Year 1993. Batch 1 of 1: 100%|██████████| 1/1 [00:09<00:00,  9.46s/it]


Combining...
Matching geo...
Saving...


Year 1992. Batch 1 of 1: 100%|██████████| 1/1 [00:07<00:00,  7.97s/it]


Combining...
Matching geo...
Saving...


Year 1991. Batch 1 of 1: 100%|██████████| 1/1 [00:11<00:00, 11.58s/it]


Combining...
Matching geo...
Saving...


Year 1990. Batch 1 of 1: 100%|██████████| 1/1 [00:08<00:00,  8.35s/it]


Combining...
Matching geo...
Saving...


Year 1989. Batch 1 of 1: 100%|██████████| 1/1 [00:07<00:00,  7.89s/it]


Combining...
Matching geo...
Saving...


Year 1988. Batch 1 of 1: 100%|██████████| 1/1 [00:08<00:00,  8.64s/it]


Combining...
Matching geo...
Saving...


Year 1987. Batch 1 of 1: 100%|██████████| 1/1 [00:08<00:00,  8.16s/it]


Combining...
Matching geo...
Saving...


Year 1986. Batch 1 of 1: 100%|██████████| 1/1 [00:08<00:00,  8.09s/it]


Combining...
Matching geo...
Saving...


Year 1985. Batch 1 of 1: 100%|██████████| 1/1 [00:07<00:00,  7.34s/it]


Combining...
Matching geo...
Saving...


Year 1984. Batch 1 of 1: 100%|██████████| 1/1 [00:06<00:00,  6.51s/it]


Combining...
Matching geo...
Saving...


Year 1983. Batch 1 of 1: 100%|██████████| 1/1 [00:06<00:00,  6.36s/it]


Combining...
Matching geo...
Saving...


Year 1982. Batch 1 of 1: 100%|██████████| 1/1 [00:05<00:00,  5.91s/it]


Combining...
Matching geo...
Saving...


Year 1981. Batch 1 of 1: 100%|██████████| 1/1 [00:05<00:00,  5.50s/it]


Combining...
Matching geo...
Saving...


Year 1980. Batch 1 of 1: 100%|██████████| 1/1 [00:04<00:00,  4.15s/it]


Combining...
Matching geo...
Saving...


Year 1979. Batch 1 of 1: 100%|██████████| 1/1 [00:03<00:00,  3.40s/it]


Combining...
Matching geo...
Saving...


In [11]:
# def add_missing_locations(file, cols):
#     df = pd.read_feather(file)
    
#     mask = np.logical_and(df['adm1'].isnull(), df[country_col].notnull())
#     if mask.sum() == 0:
#         return False
    
#     idx = df[mask].index
#     subdf = append_adm_codes(df.loc[mask, cols], lat_col, lon_col, country_col, geo_type_col, use_pool=False)
#     subdf.index = idx
#     df[mask] = subdf
    
#     df.to_feather(file)
#     return True

# files = glob(str(save_folder / '*.feather'))
# df = pd.read_feather(files[0])
# cols = [x for x in df.columns if x not in ['adm0', 'adm1', 'adm2', 'nearest_loc']]
# del df

# f = partial(add_missing_locations, cols=cols)
# with Pool(20) as p:
#     results = [x for x in tqdm(p.imap_unordered(f, files), total=len(files))]
# print(sum(results), 'files changed')
# del results

# Process

In [8]:
def get_collapsed(df, group_cols, dist_cols):    
    pivot = pd.pivot_table(df, 
                           index=group_cols, 
                           values=dist_cols,
                           aggfunc=[np.mean, np.sum, np.median, np.std])
    pivot.columns = ['_'.join(x[::-1]) for x in pivot.columns]
    pivot['count'] = df.groupby(group_cols)[dist_cols[0]].count()
    return pivot.reset_index()
        
        
def increase_month(date: int):    
    month = date % 100
    if month == 12:
        new_date = ((date // 100) + 1) * 100 + 1
    else:
        new_date = date + 1
    return new_date


def increase_week(date: int):    
    week = date % 100
    assert 0 < week < 54
    
    if week < 52:
        new_date = date + 1
    elif week == 53:
        new_date = (date // 100 + 1) * 100 + 1
    else:
        year = date // 100
        _, to_ = week_range(date)
        if to_ == datetime(year, 12, 31):
            new_date = (year + 1) * 100 + 1
        else:
            new_date = date + 1
        
    return new_date


def increase_day(date: int):
    date = datetime.strptime(str(date), '%Y%m%d')
    date += timedelta(days=1)
    return format_date(date)


def format_date(date):
    return int(datetime.strftime(date, '%Y%m%d'))


def week_range(year_week):
    year = year_week // 100
    week = year_week % 100
    assert 0 < week < 54
    date = datetime.strptime(f'{year}01-0', '%Y%U-%w')
    
    if date.day == 1:
        from_ = date
        to_ = date + timedelta(days=6)
    else:
        from_ = datetime(year, 1, 1)
        to_ = date
    
    if week > 1:
        from_ = to_ + timedelta(days=1 + (7 * (week - 2)))
        to_ = from_ + timedelta(days=6)
        
    if to_.year > year:
        to_ = datetime(year, 12, 31)
    
    return format_date(from_), format_date(to_)    


def load_data(g, save_folder):
    dfs = [pd.read_feather(file) for file in glob(str(save_folder / g))]
    return dfs[0].append(dfs[1:], ignore_index=True)


def load_yearly(save_folder, from_=1979, to_=2020):
    for i in range(from_, to_ + 1):
        yield i, load_data(f'{i}*.feather', save_folder)


def load_monthly(save_folder, from_=197901, to_=202012):
    while from_ <= to_:
        yield from_, load_data(f'{from_}*.feather', save_folder)
        from_ = increase_month(from_)
        
def load_weekly(save_folder, from_=197901, to_=202053):    
    while from_ <= to_:
        d1, d2 = week_range(from_)
        
        dfs = []
        while d1 <= d2:
            dfs.append(load_data(f'{d1}*.feather', save_folder))
            d1 = increase_day(d1)
            
        yield from_, dfs[0].append(dfs[1:], ignore_index=True)
        
        from_ = increase_week(from_)
        

In [5]:
def get_collapsed_group(group, group_cols, dist_cols):
    time_value, df = group
    
    df[group_cols] = df[group_cols].fillna('Not Available')
    collapsed = get_collapsed(df, group_cols, dist_cols)
    collapsed['time_value'] = time_value
    
    return collapsed

In [6]:
time_funcs = {
    'yearly': {'f': load_yearly},
    'monthly': {'f': load_monthly},
    'weekly': {'f': load_weekly, 'batch': 42}
}
groups = {
    'country': list(set(group_cols + ['adm0'])),
    'edo': list(set(group_cols + ['adm0', 'adm1'])),
    'mun': list(set(group_cols + ['adm0', 'adm1', 'adm2']))
}
all_group_cols = list(set(group_cols + [country_col, 'adm0', 'adm1', 'adm2']))

In [9]:
processed = [Path(x).name for x in glob(str(collapse_folder / '*.feather'))]

# For each time aggregation
for time_name, time_attrs in time_funcs.items():
    n_batches = time_attrs.get('batch', 1)
    batch_size = 42 // n_batches + int(42 % n_batches > 0)
    
    collapsed = {group_name: [] for group_name in groups}
    
    # If all groups for a time value has been processed, no need to load the data
    if sum([time_name in x for x in processed]) == len(groups):
        continue
    
    # Load the data and collapse each group
    for group in tqdm(time_attrs['f'](save_folder), desc=f"{time_name}"):

        for group_name, group_cols in groups.items():  
            fname = f'GDELT_{group_name}_{time_name}.feather'
            if fname in processed:
                continue

            collapsed[group_name].append(
                get_collapsed_group(group, group_cols=group_cols, dist_cols=dist_cols)
            )

    # Append and save collapsed
    for group_name, dfs in collapsed.items():
        fname = f'GDELT_{group_name}_{time_name}.feather'
        print(f'Appending {group_name}')
        df = dfs[0].append(dfs[1:], ignore_index=True)

        print('Saving df with shape', df.shape)
        df.to_feather(collapse_folder / fname)
        del df

        gc.collect()

monthly: 504it [1:24:44, 10.09s/it]


Appending country
Saving df with shape (6588341, 24)
Appending edo
Saving df with shape (24228382, 25)
Appending mun
Saving df with shape (46140368, 26)


weekly: 2226it [1:56:28,  3.14s/it]


Appending country
Saving df with shape (16993811, 24)
Appending edo
Saving df with shape (50478266, 25)
Appending mun
Saving df with shape (83162071, 26)


In [11]:
# for key in groups.keys():
#     print(f'GDELT {key} weekly')
#     df = load_data(f'GDELT_{key}_weekly_*.feather', collapse_folder)
#     print(df.shape)
#     df.to_csv(collapse_folder / f'GDELT_{key}_weekly.csv')
#     del df

In [13]:
# Transform feather to csv
all_files = [Path(f) for f in glob(str(collapse_folder / '*.feather'))]
feather_files = [x for x in all_files if x.suffix == '.feather']
names = {x.name for x in feather_files}
for file in feather_files:
    csv_file = file.with_suffix('.csv')
    if csv_file.name in names or '_b' in file.name:
        continue
    print(file.name)
    df = pd.read_feather(file)
    df.to_csv(csv_file, index=False)
    del df

GDELT_edo_yearly.feather
GDELT_country_weekly.feather
GDELT_mun_weekly.feather
GDELT_edo_weekly.feather
GDELT_country_monthly.feather
GDELT_mun_monthly.feather
GDELT_country_yearly.feather
GDELT_edo_monthly.feather
GDELT_mun_yearly.feather


In [14]:
df = pd.read_csv(collapse_folder / f'GDELT_mun_weekly.csv')

no_adm = df.loc[df['adm1'] == 'Not Available', 'adm0'].unique()
with_adm = df.loc[df['adm1'] != 'Not Available', 'adm0'].unique()
not_available = [x for x in set(no_adm).difference(with_adm) if len(x) < 3]
print(len(not_available))
print(', '.join(sorted(not_available)))

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


4
GG, RB, US, YI


In [4]:
glob(str(collapse_folder / '*.csv'))

['../Output/GDELT_collapsed/GDELT_edo_monthly.csv',
 '../Output/GDELT_collapsed/GDELT_country_yearly.csv',
 '../Output/GDELT_collapsed/GDELT_mun_monthly.csv',
 '../Output/GDELT_collapsed/GDELT_mun_yearly.csv',
 '../Output/GDELT_collapsed/GDELT_mun_weekly.csv',
 '../Output/GDELT_collapsed/GDELT_country_weekly.csv',
 '../Output/GDELT_collapsed/GDELT_edo_weekly.csv',
 '../Output/GDELT_collapsed/GDELT_country_monthly.csv',
 '../Output/GDELT_collapsed/GDELT_edo_yearly.csv']

In [None]:
country_map = {'GG': 'DEU', 'US': 'USA'}
for file in tqdm(glob(str(collapse_folder / '*.csv'))):
    df = pd.read_csv(file, dtype=str)
    
    try:
        df.drop(columns='Unnamed: 0', inplace=True)
    except KeyError:
        pass
    
    df['adm0'] = df['adm0'].apply(lambda x: country_map.get(x, x))
    df.to_csv(file, index=False)
    del df

 44%|████▍     | 4/9 [19:23<25:22, 304.54s/it]

# Compress

In [5]:
files_by_month = {}

for file in iglob(str(save_folder / '*.feather')):
    file = Path(file)
    date = file.name[:6]
    if not date.isnumeric():
        print('Skipping', file.name)
        continue
        
    if date in files_by_month:
        files_by_month[date].append(file)
    else:
        files_by_month[date] = [file]
        
for date, files in tqdm(files_by_month.items()):
    with ZipFile(save_folder / f'{date}.zip', 'w', compression=ZIP_DEFLATED) as zf:
        for file in files:
            zf.write(file, file.name)
            
    for file in files:
        os.remove(file)            

100%|██████████| 505/505 [59:24<00:00,  7.06s/it]  


# Sanity Check

In [6]:
files = glob(str(save_folder) + '/*.zip')

dfs = []
np.random.seed(43)
for zfile in np.random.choice(files, 20):
    with ZipFile(str(zfile), 'r') as zf:
        dfiles = zf.namelist()
        if len(dfiles) > 5:
            dfiles = [str(x) for x in np.random.choice(dfiles, 1)]
        for dfile in dfiles:
            zf.extract(str(dfile), '')
            dfile = Path(dfile)
            dfs.append(pd.read_feather(str(dfile)))
            os.remove(str(dfile))
            
df = dfs[0].append(dfs[1:])
del dfs

df['check_passed'] = np.nan
df[df['adm2'].notnull()].sample(100).to_excel(save_folder / 'sanity_check_gdelt.xlsx', index=False)
# subdf.reset_index(drop=True).to_feather(save_folder / f'{date}_{n:02}.feather')