In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
import ftfy
import re
from glob import glob
from tqdm import tqdm
from gensim.utils import deaccent
from pathlib import Path

import shapefile
import shapely
from shapely.geometry import Point

import spacy
from collections import Counter
from string import punctuation
import yaml

from projections.constants import FLAT_CRS

# Shared spaCy pipeline used by the keyword helpers defined later in the notebook.
nlp = spacy.load("en_core_web_md")
# Tokens ignored during keyword extraction: spaCy's stop words plus the characters
# listed in string.punctuation.
STOP = nlp.Defaults.stop_words | set(punctuation)

# Show every column when a frame is displayed.  The fully qualified option name is
# needed: the bare 'max_columns' pattern is ambiguous on pandas versions that also
# define 'styler.render.max_columns', and set_option then raises OptionError.
pd.set_option('display.max_columns', None)

In [2]:
# Read the two regional SCAD event tables and lower-case their column names.
# The Africa file spells one column 'lgtbq_issue'; it is renamed to 'lgbtq_issue',
# the spelling that appears in the Latin America table.
africa = pd.read_csv('../Data/SCAD_Africa.csv', encoding='latin')
africa = africa.rename(columns=str.lower).rename(columns={'lgtbq_issue': 'lgbtq_issue'})
print(africa.shape)

latam = pd.read_csv('../Data/SCAD_Latam.csv', encoding='latin')
latam = latam.rename(columns=str.lower)
print(latam.shape)

latam.head()

(17644, 44)
(5602, 44)


Unnamed: 0,eventid,id,ccode,countryname,startdate,enddate,duration,stday,stmo,styr,eday,emo,eyr,etype,escalation,actor1,actor2,actor3,target1,target2,cgovtarget,rgovtarget,npart,ndeath,repress,elocal,ilocal,sublocal,locnum,gislocnum,issue1,issue2,issue3,issuenote,nsource,notes,female_event,lgbtq_issue,coder,acd_questionable,latitude,longitude,geo_comments,location_precision
0,400001,1,40,Cuba,28-Jan-90,28-Jan-90,1,28,1,1990,28,1,1990,2,0,Citizens,,,U.S. government,,0,0,5,0,0,Unknown,Unknown,1,-99,-99,8,,,Citizens demonstrated against U.S. plans for t...,AP,,0,0,MW,0,21.950001,-79.550003,Unknown location,no
1,400002,2,40,Cuba,5-Mar-90,5-Mar-90,1,5,3,1990,5,3,1990,2,0,Government supporters,,,Sebastian Arcos - dissident,Other dissidents,0,0,-99,0,0,Unknown,Unknown,1,-99,-99,11,,,Pro-government demonstration occurred in front...,AP,,0,0,MW,0,21.950001,-79.550003,Unknown location,no
2,400003,3,40,Cuba,8-Mar-90,8-Mar-90,1,8,3,1990,8,3,1990,2,0,Government supporters,,,Gustavo Arcos - human rights activist and diss...,Three other dissidents,0,0,3,0,0,Unknown,Unknown,1,-99,-99,11,,,Pro-government demonstration occurred outside ...,AP,,0,0,MW,0,21.950001,-79.550003,Unknown location,no
3,400004,4,40,Cuba,9-Jul-90,26-Jul-90,18,9,7,1990,26,7,1990,2,4,Cuban asylum-seekers,,,Czechoslovak/Italian/Spanish/Swiss governments,Cuban government,1,0,2,0,1,Havana,Havana,1,1,1,10,,,"Asylum-seekers sought refuge in Czechoslovak, ...",AP,Is asylum-seeking at embassies a codeable even...,0,0,MW,0,23.05407,-82.345192,,
4,400005,5,40,Cuba,17-Jul-90,17-Jul-90,1,17,7,1990,17,7,1990,7,0,Police,,,Citizen,,0,0,-99,0,0,Havana,Havana,1,1,1,11,,,Man was beaten and taken away by police after ...,AP,Exact date not specified; event date approximated,0,0,MW,0,23.05407,-82.345192,,


In [3]:
# Stack both regions into a single table with a fresh 0..n-1 index.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent
# and ignore_index=True replaces the separate reset_index call.
df = pd.concat([africa, latam], ignore_index=True)
print(df.shape)
# Columns are aligned by name, so a naming mismatch between the two files would
# show up here as extra columns.
assert df.shape[1] == latam.shape[1]

(23246, 44)


In [4]:
def clean_text(s):
    """Normalise a piece of text so it can be used as a join key.

    The text is repaired with ``ftfy.fix_text``, de-accented, stripped of
    surrounding whitespace and lower-cased.  It is additionally re-encoded as
    cp1252 and decoded as UTF-8 (presumably to undo leftover mojibake — confirm);
    when that round trip is not possible the text is only lower-cased.

    Parameters
    ----------
    s : str
        Raw text, e.g. a country name.

    Returns
    -------
    str
        Lower-case, whitespace-trimmed text.
    """
    # Strip before the round trip so both branches below return trimmed text;
    # previously the fallback branch kept the surrounding whitespace.
    s = deaccent(ftfy.fix_text(s)).strip()
    try:
        s = deaccent(s.upper().encode('cp1252').decode().lower())
    except UnicodeError:
        # encode/decode raise UnicodeEncodeError/UnicodeDecodeError for text that
        # cannot make the round trip; keep the text as it is in that case.
        s = s.lower()

    return s

# Country names are the key for the ISO-code merge, so normalise them first.
df['countryname'] = df['countryname'].map(clean_text)

In [5]:
# ISO lookup table, with country names normalised the same way as the event table.
iso_codes = pd.read_csv('../Data/iso_codes.csv', usecols=['country', 'numeric', 'iso2'])
iso_codes['country'] = iso_codes['country'].map(clean_text)

# Namibia's two-letter code "NA" is read as a missing value; restore it through
# the numeric code 516.
iso_codes.loc[iso_codes['numeric'].eq(516), 'iso2'] = 'NA'

df = df.merge(iso_codes, how='left', left_on='countryname', right_on='country')

# Manual codes for the names that found no match in the ISO table.
country_map = {
    'gambia': 'GM',
    'niger': 'NE',
    'central african republic': 'CF',
    'republic of congo': 'CG',
    'democratic republic of the congo': 'CD',
    'democratic republic of congo': 'CD',
    'tanzania': 'TZ',
    'swaziland': 'SZ',
    'sudan': 'SD',
    'dominican republic': 'DO',
}
df['iso2'] = df['iso2'].fillna(df['countryname'].map(country_map))

# Every event must have a country code from here on.
assert df['iso2'].isnull().sum() == 0

In [6]:
# Load every preprocessed shapefile and index its shapes by country code.
# The file stem holds one or more codes joined by "_"; each code is mapped to the
# full list of (shape, *record) tuples found in that file.
shps = glob('../Shapefiles/preprocessed/*.shp')
shapes_by_country = {}
for shp_path in tqdm(shps, total=len(shps)):
    codes = Path(shp_path).stem.split('_')

    # The reader keeps the shapefile's component files open; the context manager
    # releases them as soon as shapes and records have been materialised.
    with shapefile.Reader(shp_path) as reader:
        shapes = [shapely.geometry.shape(s) for s in reader.shapes()]
        records = reader.records()

    entries = [(shape, *record) for shape, record in zip(shapes, records)]
    for code in codes:
        # A code seen in a later file replaces the earlier entry (unchanged behaviour).
        shapes_by_country[code] = list(entries)

assert 'GT' in shapes_by_country and 'PM' in shapes_by_country

100%|██████████| 276/276 [02:27<00:00,  1.87it/s]


In [7]:
# One row per distinct (longitude, latitude, iso2) combination, with empty slots for
# the administrative codes and a flag for nearest-shape matches.
locs = (
    df[['longitude', 'latitude', 'iso2']]
    .drop_duplicates()
    .assign(adm1=np.nan, adm2=np.nan, adm0=np.nan, nearest_loc=False)
)

In [8]:
def country_shapes(country):
    """Yield the shape tuples to test for ``country``.

    When the code is a key of ``shapes_by_country`` only that country's entries
    are yielded; otherwise the entries of every country are yielded in turn.
    """
    own_shapes = shapes_by_country.get(country)
    if own_shapes is not None:
        yield from own_shapes
        return

    for other_shapes in shapes_by_country.values():
        yield from other_shapes
            

# Resolve every location to administrative codes: take the first shape that contains
# the point, otherwise fall back to the closest shape and flag the row.
# NOTE(review): distances are computed on the raw longitude/latitude values.
pbar = tqdm(total=len(locs), desc='Finding codes')
for idx, row in locs.iterrows():
    # Rows that already hold a string adm1 are left untouched.
    if not isinstance(row['adm1'], str):
        point = Point((row['longitude'], row['latitude']))
        match = None
        closest = None
        closest_distance = np.inf
        for shape, country, adm1, adm2 in country_shapes(row['iso2']):
            if point.within(shape):
                match = (country, adm1, adm2)
                break

            distance = point.distance(shape)
            if distance < closest_distance:
                closest_distance = distance
                closest = (country, adm1, adm2)

        if match is None and closest is not None:
            # No shape contains the point: use the nearest one and record that.
            locs.loc[idx, 'nearest_loc'] = True
            match = closest

        if match is not None:
            locs.loc[idx, 'adm2'] = match[2]
            locs.loc[idx, 'adm1'] = match[1]
            locs.loc[idx, 'adm0'] = match[0]

    pbar.update(1)
pbar.close()
print('Total nearest:', locs['nearest_loc'].sum())
print('Missing:', locs.loc[locs['adm1'].isnull(), 'iso2'].unique())

Finding codes: 100%|██████████| 5898/5898 [03:26<00:00, 28.50it/s]  

Total nearest: 213
Missing: []





In [9]:
# Persist the resolved locations so the lookup loop does not have to be repeated.
locs.to_csv('../Output/SCAD_locs.csv', index=False)
# Uncomment to reload the saved table instead of recomputing it:
# locs = pd.read_csv('../Output/SCAD_locs.csv')

In [10]:
# Attach the administrative codes to every event through its coordinates and country code.
df = df.merge(locs, on=['longitude', 'latitude', 'iso2'], how='left')

In [11]:
# Lookup table that gives a name to each administrative code pair.
adm = (
    pd.read_csv('../Shapefiles/ADM/POLIO_ADMINISTRATIVE_BOUNDARIES.csv')
    .rename(columns=str.lower)
    .loc[:, ['adm0_name', 'adm1_name', 'adm2_name', 'adm1_code', 'adm2_code']]
    .rename(columns={'adm1_code': 'adm1', 'adm2_code': 'adm2'})
)

# Left join: events keep their row even when their codes have no name entry.
df = df.merge(adm, on=['adm1', 'adm2'], how='left')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [12]:
# Any event left without a flag counts as "not a nearest-shape match".
# Assign the result back: fillna(inplace=True) on a selected column is chained
# assignment and leaves df unchanged when pandas Copy-on-Write is active.
df['nearest_loc'] = df['nearest_loc'].fillna(False)
df.to_csv('../Output/SCAD.csv', index=False)

# Keywords

In [13]:
# Part-of-speech tags a token must carry to count as a keyword.
POS_TAGS = {'PROPN', 'ADJ', 'NOUN'}


def get_hotwords(doc, is_doc=False):
    """Return the lemmas of the keyword tokens in ``doc``.

    ``doc`` is raw text that is run through ``nlp`` first, unless ``is_doc`` is
    true, in which case it is iterated as an already-processed document.  A token
    is kept when its lower-cased text is not in ``STOP`` and its ``pos_`` is one of
    ``POS_TAGS``.  Lemmas are returned in document order, duplicates included.
    """
    tokens = doc if is_doc else nlp(doc)

    hotwords = []
    for token in tokens:
        if token.text.lower() in STOP:
            continue
        if token.pos_ in POS_TAGS:
            hotwords.append(token.lemma_)

    return hotwords

def get_hotwords_list(series):
    """Return one keyword list per text in ``series``, in the same order.

    The texts are streamed through ``nlp.pipe`` with the 'ner', 'senter' and
    'parser' components disabled, and a progress bar counts the processed documents.
    """
    docs = nlp.pipe(series, batch_size=10000, disable=['ner', 'senter', 'parser'])
    return [get_hotwords(doc, is_doc=True) for doc in tqdm(docs)]

def get_hotword_count(series, verbose=False):
    """Count how often each keyword lemma occurs across all texts in ``series``.

    Parameters
    ----------
    series : iterable of str
        Texts to process; they are streamed through ``nlp.pipe`` with the 'ner',
        'senter' and 'parser' components disabled.
    verbose : bool, default False
        Show a progress bar while the texts are processed.  (The flag used to be
        accepted but ignored.)

    Returns
    -------
    collections.Counter
        Keyword lemma -> number of occurrences.
    """
    docs = nlp.pipe(series, batch_size=5000, disable=['ner', 'senter', 'parser'])
    if verbose:
        docs = tqdm(docs)

    counter = Counter()
    for doc in docs:
        counter.update(get_hotwords(doc, is_doc=True))

    return counter

In [14]:
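# One-off step, kept for reference: it writes top50_keywords_scad.yml, the file that
# load_keywords() reads. Uncomment the lines below to rebuild that file.
# mask = df['issuenote'].notnull()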
# counter = get_hotword_count(df.loc[mask, 'issuenote'], verbose=True)    
# top50 = counter.most_common()[:50]
    
# with open('top50_keywords_scad.yml', 'w') as f:
#     yaml.dump({k: v for k, v in top50}, f)

# Collapse

In [18]:
def load_keywords(path='top50_keywords_scad.yml'):
    """Return the keywords stored in a YAML mapping of keyword -> count.

    Parameters
    ----------
    path : str or path-like, default 'top50_keywords_scad.yml'
        YAML file to read; the default is the file this notebook has always used.

    Returns
    -------
    list
        The mapping's keys in file order; an empty list when the file is empty.
    """
    with open(path) as f:
        top50 = yaml.safe_load(f)

    # An empty YAML document loads as None; treat that as "no keywords".
    return list(top50 or {})


def get_groups(loc_groups):
    """Combine every location grouping with every time resolution.

    ``loc_groups`` maps a name to a list of location columns.  The result maps
    '<location name>_<yearly|monthly|weekly>' to the location columns followed by
    the matching time columns, ordered by location first and resolution second.
    """
    time_groups = {
        'yearly': ['year'],
        'monthly': ['year', 'month'],
        'weekly': ['year', 'week'],
    }
    extra_columns = []

    return {
        f'{loc_name}_{time_name}': loc_columns + time_columns + extra_columns
        for loc_name, loc_columns in loc_groups.items()
        for time_name, time_columns in time_groups.items()
    }


def _most_frequent_actor(values):
    """Return the most common actor in ``values``, ignoring missing entries.

    Missing values and the 'Not available' placeholder are skipped; when nothing
    else is left the placeholder itself is returned.
    """
    counts = Counter(v for v in values if pd.notna(v) and v != 'Not available')
    return counts.most_common(1)[0][0] if counts else 'Not available'


def collapse(df, loc_groups, suffix=''):
    """Aggregate events per location/time group and write one CSV per grouping.

    For every grouping produced by ``get_groups(loc_groups)`` a file
    '../Output/SCAD/SCAD_<grouping><suffix>.csv' is written, with one row per group
    holding: the group keys, a 'keyword_<k>' count for every keyword returned by
    ``load_keywords()``, the sum of 'ndeath', the mean of the columns in ``mean_``
    and the most frequent 'actor1' / 'actor2'.  A grouping whose file already
    exists is skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Event table.  It needs 'startdate' (e.g. '28-Jan-90'), a 'words' column of
        keyword lists, the location columns named in ``loc_groups`` and the columns
        aggregated below.  The frame is modified in place: 'startdate' becomes
        datetime, 'year' / 'month' / 'week' are added and missing values in the
        grouping columns are replaced by 'Not available'.
    loc_groups : dict
        Grouping name -> list of location columns.
    suffix : str, default ''
        Optional file-name suffix; a leading '_' is added when missing.
    """
    if suffix and not suffix.startswith('_'):
        suffix = '_' + suffix

    most_freqs = ['actor1', 'actor2']
    mean_ = ['duration', 'repress', 'cgovtarget', 'rgovtarget', 'female_event', 'lgbtq_issue']
    # NOTE(review): the sample rows show negative placeholder values (e.g. -77, -99)
    # in count columns; they are summed/averaged as-is here — confirm that is intended.
    sum_ = ['ndeath']

    top50 = load_keywords()

    df['startdate'] = pd.to_datetime(df['startdate'], format='%d-%b-%y')
    df['year'] = df['startdate'].dt.year
    df['month'] = df['startdate'].dt.month
    df['week'] = df['startdate'].dt.isocalendar().week

    groups = get_groups(loc_groups)

    # Fill NAs so groupby does not drop rows with a missing key.  The result is
    # assigned back because fillna(inplace=True) on a selected column does not
    # update df when pandas Copy-on-Write is active.
    group_columns = dict.fromkeys(col for group in groups.values() for col in group)
    for col in group_columns:
        if df[col].isna().any():
            df[col] = df[col].fillna('Not available')

    out_dir = Path('../Output/SCAD')
    out_dir.mkdir(parents=True, exist_ok=True)

    for group_name, group in groups.items():
        # The suffix is part of the file name, so it has to be part of the
        # "already done" check as well (it used to be left out).
        out_path = out_dir / f'SCAD_{group_name}{suffix}.csv'
        if out_path.exists():
            print('Skipping', group_name)
            continue

        n_groups = len(df.drop_duplicates(group))
        collapsed = []

        for key_values, subdf in tqdm(df.groupby(group), total=n_groups, desc=group_name):
            # Some pandas versions yield a bare scalar for a single grouping column.
            if not isinstance(key_values, tuple):
                key_values = (key_values,)

            counter = Counter()
            for row in subdf['words']:
                counter.update(row)

            sub_group = dict(zip(group, key_values))
            sub_group.update({f'keyword_{k}': counter.get(k, 0) for k in top50})

            for col in sum_:
                sub_group[col] = subdf[col].sum()

            for col in mean_:
                sub_group[col] = subdf[col].mean()

            for actor in most_freqs:
                # Missing actors are NaN, not 'Not available', so they have to be
                # skipped explicitly; otherwise NaN wins whenever it is most common.
                sub_group[f'most_freq_{actor}'] = _most_frequent_actor(subdf[actor])

            collapsed.append(sub_group)

        pd.DataFrame(collapsed).to_csv(out_path, index=False)

In [16]:
%%time
# Keyword lemmas per event, stored in the 'words' column that collapse() reads.
# NOTE(review): the commented-out cell that builds top50_keywords_scad.yml reads
# 'issuenote', while this reads 'notes' — confirm which column is intended.
df['words'] = get_hotwords_list(df['notes'].fillna(''))

23246it [00:11, 1944.38it/s]

CPU times: user 10.9 s, sys: 1.1 s, total: 12 s
Wall time: 12 s





In [None]:
# Aggregate at three nested location levels, keyed by adm0, adm0+adm1 and adm0+adm1+adm2.
collapse(
    df,
    loc_groups={
        'country': ['adm0'],
        'edo': ['adm0', 'adm1'],
        'mun': ['adm0', 'adm1', 'adm2'],
    },
)

country_yearly: 100%|██████████| 1440/1440 [00:01<00:00, 821.92it/s]
country_monthly: 100%|██████████| 7248/7248 [00:08<00:00, 850.07it/s]
country_weekly: 100%|██████████| 12797/12797 [00:14<00:00, 855.08it/s]
edo_yearly: 100%|██████████| 6206/6206 [00:06<00:00, 987.10it/s] 
edo_monthly: 100%|██████████| 14629/14629 [00:14<00:00, 991.26it/s] 
edo_weekly: 100%|██████████| 18729/18729 [00:18<00:00, 993.73it/s] 
mun_yearly: 100%|██████████| 9081/9081 [00:09<00:00, 994.37it/s] 
mun_monthly: 100%|██████████| 16682/16682 [00:17<00:00, 980.27it/s] 
mun_weekly:  47%|████▋     | 9423/19971 [00:09<00:10, 986.78it/s] 

# Ethnic

In [2]:
# Reload the exported event table, keep adm0 under the name 'iso3' and drop the
# administrative-match columns.
# NOTE(review): the export cell writes '../Output/SCAD.csv' while this cell reads
# '../Output/SCAD/SCAD.csv' — confirm the file is where this cell expects it.
df = pd.read_csv('../Output/SCAD/SCAD.csv')
df = df.assign(iso3=df['adm0']).drop(
    columns=['adm0', 'adm1', 'adm2', 'nearest_loc', 'adm0_name', 'adm1_name', 'adm2_name']
)

# Distinct event coordinates as point geometries in EPSG:4326 (longitude/latitude).
locs = df[['iso3', 'latitude', 'longitude']].drop_duplicates()
points = gpd.points_from_xy(locs['longitude'], locs['latitude'])
locs = gpd.GeoDataFrame(locs, geometry=points).set_crs('EPSG:4326')

df.head()

Unnamed: 0,eventid,id,ccode,countryname,startdate,enddate,duration,stday,stmo,styr,eday,emo,eyr,etype,escalation,actor1,actor2,actor3,target1,target2,cgovtarget,rgovtarget,npart,ndeath,repress,elocal,ilocal,sublocal,locnum,gislocnum,issue1,issue2,issue3,issuenote,nsource,notes,female_event,lgbtq_issue,coder,acd_questionable,latitude,longitude,geo_comments,location_precision,country,iso2,numeric,iso3
0,4040001.0,1.0,404,guinea-bissau,13-Apr-95,15-Apr-95,3,13,4,1995,15,4,1995,5,0.0,National Workers' Union members,,,central government,,1,0,-99,0,0,nationwide,nationwide,1,7,8,2,,,Workers strike to protest government salary pl...,1.0,,0,0,CC,0,12.0758,-14.6407,,,guinea-bissau,GW,624.0,GNB
1,4040002.0,2.0,404,guinea-bissau,2-Sep-96,2-Sep-96,1,2,9,1996,2,9,1996,4,0.0,African nationals expelled from Spain,,,central government,,1,0,2,0,1,Bissau,Bissau,1,1,1,10,,,African nationals expelled from Spain violentl...,1.0,,0,0,CC,0,11.8667,-15.6,,,guinea-bissau,GW,624.0,GNB
2,4040003.0,3.0,404,guinea-bissau,3-Sep-96,3-Sep-96,1,3,9,1996,3,9,1996,2,0.0,African nationals expelled from Spain,,,central government,Spain,1,0,2,0,0,Bissau,Bissau,1,1,1,10,13.0,,African nationals expelled from Spain peaceful...,0.0,,0,0,CC,0,11.8667,-15.6,,,guinea-bissau,GW,624.0,GNB
3,4040004.0,4.0,404,guinea-bissau,1-May-97,31-May-97,31,1,5,1997,31,5,1997,2,0.0,state employees,,,central government,,1,0,-99,0,0,Bissau and other cities,Bissau,1,4,4,2,,,State employees riot and protest over unpaid s...,0.0,Start and end dates are estimates. Protests o...,0,0,CC,0,11.8667,-15.6,,,guinea-bissau,GW,624.0,GNB
4,-4040005.0,5.0,404,guinea-bissau,7-Jun-98,10-May-99,338,7,6,1998,10,5,1999,-9,0.0,Military Junta for the Consolidation of Democr...,,,Government,"Guinea, Senegal",1,0,-99,-77,2,Nationwide,Nationwide,1,7,7,13,,,,,UPPSALA CONFLICT #216,0,0,CH,0,12.0758,-14.6407,,no,guinea-bissau,GW,624.0,GNB


In [3]:
# Polygons used for the ethnic match (columns include NAME, TRIBE_CODE and GID_0).
# Note that this rebinds `adm`, which earlier held the administrative-boundary table.
adm = gpd.read_file('../Shapefiles/ethnic_preprocessed/tribe_adm0.shp')
adm.head()

Unnamed: 0,NAME,TRIBE_CODE,LAT,LON,GID_0,NAME_0,geometry
0,BAKWE,60,5.278991,-6.94432,LBR,Liberia,"MULTIPOLYGON (((-7.37926 5.36592, -7.38769 5.3..."
1,BASSA,81,6.206145,-9.67354,LBR,Liberia,"MULTIPOLYGON (((-10.13304 6.66386, -10.03429 6..."
2,GBANDE,231,7.774502,-9.90888,LBR,Liberia,"POLYGON ((-9.73738 7.26119, -9.94259 7.13166, ..."
3,GOLA,244,7.141137,-10.5386,LBR,Liberia,"POLYGON ((-10.43874 7.52385, -10.43260 7.51209..."
4,GREBO,248,4.92476,-7.65541,LBR,Liberia,"POLYGON ((-7.38769 5.36889, -7.37926 5.36592, ..."


In [4]:
# Find those that are within a shape
locs = gpd.sjoin(adm, locs, how='right', op='contains')
mask = locs['index_left'].isnull()
print((~mask).sum(), 'exact matches')

# To Flat CRS
adm = adm.to_crs(FLAT_CRS)
locs = locs.to_crs(FLAT_CRS)

locs['nearest_loc'] = False
cols = ['NAME', 'TRIBE_CODE']
for idx, row in tqdm(locs[mask].iterrows(), total=mask.sum(), desc='Finding codes'):
    argmin = adm.distance(row['geometry']).argmin()
    match = adm.iloc[argmin]
    
    for col in cols:
        locs.loc[idx, col] = match[col]
    locs.loc[idx, 'nearest_loc'] = True

3311 exact matches


Finding codes: 100%|██████████| 2587/2587 [02:12<00:00, 19.53it/s]


In [5]:
# Attach the matched NAME / TRIBE_CODE to every event through its coordinates and
# iso3 code, then export the result.
merge_keys = ['longitude', 'latitude', 'iso3']
tribe_columns = ['NAME', 'TRIBE_CODE']
df = df.merge(locs[merge_keys + tribe_columns], on=merge_keys, how='left')
df.to_csv('../Output/SCAD/SCAD_ethnic.csv', index=False)

In [14]:
%%time
# The reloaded table has no 'words' column, so rebuild it before calling collapse().
# NOTE(review): the commented-out cell that builds top50_keywords_scad.yml reads
# 'issuenote', while this reads 'notes' — confirm which column is intended.
df['words'] = get_hotwords_list(df['notes'].fillna(''))

23246it [00:11, 1997.71it/s]

CPU times: user 10.6 s, sys: 1.09 s, total: 11.7 s
Wall time: 11.6 s





In [19]:
# Aggregate per NAME / TRIBE_CODE pair at each time resolution.
collapse(
    df,
    loc_groups={'ethnic': ['NAME', 'TRIBE_CODE']},
)

ethnic_yearly: 100%|██████████| 3855/3855 [00:04<00:00, 862.89it/s]
ethnic_monthly: 100%|██████████| 10531/10531 [00:10<00:00, 1000.14it/s]
ethnic_weekly: 100%|██████████| 15073/15073 [00:14<00:00, 1014.09it/s]
