In [1]:
import os
import numpy as np
import pandas as pd
import geopandas as gpd
import sys
import yaml
import gc

from datetime import datetime, timedelta
from zipfile import ZipFile
from glob import glob, iglob
from pathlib import Path
from tqdm import tqdm

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from projections import geometry



In [2]:
input_folder = Path('../Output/GDELT')
output_folder = Path('../Output/GDELT_Ethnic')
collapse_folder = output_folder / 'collapsed'
lat = 'ActionGeo_Lat'
lon = 'ActionGeo_Long'
geo_type_col = 'ActionGeo_Type'

In [3]:
adm = gpd.read_file('../Shapefiles/ethnic_preprocessed/tribe_adm0.shp')
countries = set(adm['GID_0'])
adm.head()

Unnamed: 0,NAME,TRIBE_CODE,LAT,LON,GID_0,NAME_0,area_tribe,area_adm,area_inter,geometry
0,GUANCHE,250,28.3354,-15.6735,ESP,Spain,7485274000.0,506043800000.0,7112255000.0,"MULTIPOLYGON (((-17.89487 27.78681, -17.89514 ..."
1,JEBALA,312,34.8506,-5.28036,ESP,Spain,16737560000.0,506043800000.0,20285420.0,"MULTIPOLYGON (((-5.37708 35.91704, -5.37708 35..."
2,RIF,651,34.7908,-3.71829,ESP,Spain,20272920000.0,506043800000.0,7560911.0,"MULTIPOLYGON (((-2.92593 35.29208, -2.92708 35..."
3,ADANGME,6,6.076851,0.270457,GHA,Ghana,4986379000.0,238324300000.0,4803454000.0,"MULTIPOLYGON (((0.69465 5.77336, 0.69328 5.775..."
4,ADELE,8,8.244284,0.673651,GHA,Ghana,1413803000.0,238324300000.0,677916100.0,"POLYGON ((0.45975 8.06680, 0.46512 8.07837, 0...."


In [4]:
zfiles = sorted(glob(str(input_folder / '*.zip')))

try:
    with open(output_folder / 'shapes_counts.yml', 'r') as f:
        shapes = yaml.safe_load(f)
except FileNotFoundError:
    shapes = {}

for zfile in tqdm(zfiles):
    zfile = Path(zfile)
    
    # Extract files from ZIP
    with ZipFile(zfile, 'r') as zf:
        files = zf.namelist()
        if all([f.split('.')[0] in shapes for f in files]):
            continue
        
        zf.extractall()

        dfs = []    
        for f in files:
            df = pd.read_feather(f)

            # Store counts in dict
            adm1_mask = df[geo_type_col] > 1
            adm2_mask = np.logical_and(df[geo_type_col] > 2, df[geo_type_col] < 5)
            shapes[f.split('.')[0]] = {'total': df.shape[0], 
                                       'adm1-2': int(adm1_mask.sum()),
                                       'adm2': int(adm2_mask.sum())}

            df = df[adm2_mask]
            dfs.append(df[df['adm0'].apply(lambda x: x in countries)].copy())
            os.remove(f)

    # Join files together
    if len(dfs) > 1:
        df = dfs[0].append(dfs[1:], ignore_index=True)
    else:
        df = dfs[0]
    del dfs
    
    df.drop(columns=['adm1', 'adm2', 'nearest_loc'], inplace=True)
    
    # Get unique locations
    locs = df[[lat, lon]].drop_duplicates()
    locs = gpd.GeoDataFrame(locs, geometry=gpd.points_from_xy(locs[lon], locs[lat]))
    locs = locs.set_crs('EPSG:4326')

    # Match geometries
    locs = geometry.loc_match(adm, locs, ['NAME', 'TRIBE_CODE', 'GID_0'], verbose=False)
    locs.drop(columns=['index_left', 'LAT', 'LON', 'NAME_0'], inplace=True)
    
    # Match locations back and save
    df = df.merge(locs, on=[lat, lon])
    df.drop(columns=['geometry', 'adm0'], inplace=True)
    df.rename(columns={'GID_0': 'adm0'}, inplace=True)
    df.to_csv(output_folder / zfile.with_suffix('.csv').name, index=False)

    # Save shape statistics
    with open(output_folder / 'shapes_counts.yml', 'w') as f:
        yaml.dump(shapes, f)

100%|██████████| 505/505 [7:10:24<00:00, 51.14s/it]   


# Save shapes as csv

In [5]:
with open(output_folder / 'shapes_counts.yml', 'r') as f:
    shapes = yaml.safe_load(f)

shape_stats = []
for k, v in shapes.items():
    shape_stats.append({'date': k.split('_')[0], **v})
    
shape_stats = pd.DataFrame(shape_stats)
shape_stats = shape_stats.groupby('date').sum()
shape_stats.to_csv(output_folder / 'shapes_counts.csv')

# Collapse

In [9]:
def get_collapsed(df, group_cols, dist_cols):   
    for col in dist_cols:
        df[col] = pd.to_numeric(df[col])
        
    pivot = pd.pivot_table(df, 
                           index=group_cols, 
                           values=dist_cols,
                           aggfunc=[np.mean, np.sum, np.median, np.std])
    pivot.columns = ['_'.join(x[::-1]) for x in pivot.columns]
    pivot['count'] = df.groupby(group_cols)[dist_cols[0]].count()
    return pivot.reset_index()
        
    
def get_collapsed_group(group, group_cols, dist_cols):
    time_value, df = group
    
    df[group_cols] = df[group_cols].fillna('Not Available')
    collapsed = get_collapsed(df, group_cols, dist_cols)
    collapsed['time_value'] = time_value
    
    return collapsed

        
def increase_month(date: int):    
    month = date % 100
    if month == 12:
        new_date = ((date // 100) + 1) * 100 + 1
    else:
        new_date = date + 1
    return new_date


def increase_week(date: int):    
    week = date % 100
    assert 0 < week < 54
    
    if week < 52:
        new_date = date + 1
    elif week == 53:
        new_date = (date // 100 + 1) * 100 + 1
    else:
        year = date // 100
        _, to_ = week_range(date)
        if to_ == datetime(year, 12, 31):
            new_date = (year + 1) * 100 + 1
        else:
            new_date = date + 1
        
    return new_date


def increase_day(date: int):
    date = datetime.strptime(str(date), '%Y%m%d')
    date += timedelta(days=1)
    return format_date(date)


def format_date(date):
    return int(datetime.strftime(date, '%Y%m%d'))


def week_range(year_week):
    year = year_week // 100
    week = year_week % 100
    assert 0 < week < 54
    date = datetime.strptime(f'{year}01-0', '%Y%U-%w')
    
    if date.day == 1:
        from_ = date
        to_ = date + timedelta(days=6)
    else:
        from_ = datetime(year, 1, 1)
        to_ = date
    
    if week > 1:
        from_ = to_ + timedelta(days=1 + (7 * (week - 2)))
        to_ = from_ + timedelta(days=6)
        
    if to_.year > year:
        to_ = datetime(year, 12, 31)
    
    return format_date(from_), format_date(to_)


def load_data(g, save_folder):
    dfs = [pd.read_csv(file, dtype=str) for file in glob(str(save_folder / g))]
    return dfs[0].append(dfs[1:], ignore_index=True)


def load_yearly(save_folder, from_=1979, to_=2020):
    for i in range(from_, to_ + 1):
        yield i, load_data(f'{i}*.csv', save_folder)


def load_monthly(save_folder, from_=197901, to_=202012):
    while from_ <= to_:
        yield from_, load_data(f'{from_}*.csv', save_folder)
        from_ = increase_month(from_)
        

def load_weekly(save_folder, from_=197901, to_=202053):    
    while from_ <= to_:
        d1, d2 = week_range(from_)
        m1, m2 = d1 // 100, d2 // 100
        
        df = load_data(f'{m1}.csv', save_folder)
        if m1 != m2:
            df = df.append(load_data(f'{m2}.csv', save_folder), ignore_index=True)
        
        df['SQLDATE'] = pd.to_numeric(df['SQLDATE'])
        df = df.loc[np.logical_and(df['SQLDATE'] >= d1, df['SQLDATE'] <= d2)]        
            
        yield from_, df
        
        from_ = increase_week(from_)

In [10]:
group_cols = ['EventCode']
dist_cols = ['NumMentions', 'IsRootEvent', 'NumSources', 'NumArticles', 'AvgTone']
time_funcs = {
    'yearly': {'f': load_yearly},
    'monthly': {'f': load_monthly},
    'weekly': {'f': load_weekly}
}
groups = {
    'ethnic': list(set(group_cols + ['adm0', 'NAME', 'TRIBE_CODE'])),
}

In [11]:
processed = [Path(x).name for x in glob(str(collapse_folder / '*.csv'))]

# For each time aggregation
for time_name, time_attrs in time_funcs.items():
    n_batches = time_attrs.get('batch', 1)
    batch_size = 42 // n_batches + int(42 % n_batches > 0)
    
    collapsed = {group_name: [] for group_name in groups}
    
    # If all groups for a time value has been processed, no need to load the data
    if sum([time_name in x for x in processed]) == len(groups):
        continue
    
    # Load the data and collapse each group
    for group in tqdm(time_attrs['f'](output_folder), desc=f"{time_name}"):

        for group_name, group_cols in groups.items():  
            fname = f'GDELT_{group_name}_{time_name}.csv'
            if fname in processed:
                continue

            collapsed[group_name].append(
                get_collapsed_group(group, group_cols=group_cols, dist_cols=dist_cols)
            )

    # Append and save collapse
#     df = df.merge(locs, on=[lat, lon])
#     df.drop(columns=['geometry', 'adm0'], inplace=True)
#     df.rename(columns={'GID_0': 'adm0'}, inplace=True)
#     df.to_csv(output_folder / zfile.with_suffix('.csv').name, index=False)

    # Save shape statistics
    with open(output_folder / 'shapes_counts.yml', 'w') as f:
        yaml.dump(shapes, f)
        
    for group_name, dfs in collapsed.items():
        fname = f'GDELT_{group_name}_{time_name}.csv'
        print(f'Appending {group_name}')
        df = dfs[0].append(dfs[1:], ignore_index=True)
        df.drop_duplicates(groups[group_name], inplace=True)

        print('Saving df with shape', df.shape)
        df.to_csv(collapse_folder / fname, index=False)
        del df

        gc.collect()

yearly: 42it [06:40,  9.53s/it]


Appending ethnic
Saving df with shape (127303, 26)


monthly: 504it [06:57,  1.21it/s]


Appending ethnic
Saving df with shape (127303, 26)


weekly: 2226it [20:31,  1.81it/s]


Appending ethnic
Saving df with shape (127303, 26)
