In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
import pickle
import re
from tqdm import tqdm
from glob import glob
from pathlib import Path

import shapefile
import shapely
from shapely.geometry import Point

import spacy
from collections import Counter
from string import punctuation
import yaml

nlp = spacy.load("en_core_web_sm")
STOP = nlp.Defaults.stop_words.union(set(punctuation))

pd.set_option('max_columns', None)

In [2]:
files = glob('../Data/ACLED/*.xlsx')
feather_file = Path('../Data/ACLED/acled.feather')
totals_feather_file = Path('../Data/ACLED/acled_totals.feather')
columns = None
df = []
totals = []

if feather_file.exists() and totals_feather_file.exists():
    df = pd.read_feather(feather_file)
    totals = pd.read_feather(totals_feather_file)
else:
    for file in tqdm(files):
        print(file)
        continent = pd.read_excel(file)
        continent.columns = [x.lower() for x in continent]

        if columns is None:
            columns = set(continent.columns)
        elif 'Totals' not in file:
            difference = columns.symmetric_difference(continent.columns)
            assert len(difference) == 0, difference

        continent['origin_filename'] = Path(file).name

        if 'Totals' in file:
            totals.append(continent)
        else:
            print('Appending')
            df.append(continent)

    df = df[0].append(df[1:], ignore_index=True)
    totals = totals[0].append(totals[1:], ignore_index=True)

    df.to_feather(feather_file)
    totals.to_feather(totals_feather_file)
    
df.drop_duplicates(['event_id_cnty'], inplace=True)

In [3]:
iso_codes = pd.read_csv('../Data/iso_codes.csv', usecols=['numeric', 'iso2'])
iso_codes = iso_codes.append({'iso2': 'XK', 'numeric': 0}, ignore_index=True)

# Namibia is interpreted as NaN
iso_codes.loc[iso_codes['numeric'] == 516, 'iso2'] = 'NA'

df = df.merge(iso_codes, left_on='iso', right_on='numeric', how='left')
assert df['iso2'].isnull().sum() == 0

In [4]:
shps = glob('../Shapefiles/preprocessed/*.shp')
shapes_by_country = {}
for i, shp in tqdm(enumerate(shps), total=len(shps)):
    codes = Path(shp).name.replace('.shp', '').split('_')
    shp = shapefile.Reader(shp)
    shapes = [shapely.geometry.shape(s) for s in shp.shapes()]
    records = shp.records()
    
    for code in codes:
        shapes_by_country[code] = []
        for shape, record in zip(shapes, records):
            shapes_by_country[code].append((shape, *record))
    
assert 'GT' in shapes_by_country and 'PM' in shapes_by_country

100%|██████████| 276/276 [02:31<00:00,  1.82it/s]


In [5]:
locs = df[['longitude', 'latitude', 'iso2']].drop_duplicates()
locs['adm1'] = np.nan
locs['adm2'] = np.nan
locs['adm0'] = np.nan
locs['nearest_loc'] = False

In [6]:
def country_shapes(country):
    if country in shapes_by_country:
        yield from shapes_by_country[country]
    else:
        for shapes in shapes_by_country.values():
            yield from shapes

In [7]:
pbar = tqdm(total=locs.shape[0], desc='Finding codes')
for idx, row in locs.iterrows():
    if isinstance(row['adm1'], str):
        pbar.update(1)
        continue
    
    coordinates = (row['longitude'], row['latitude'])
    point = Point(coordinates)
    nearest_record = None
    nearest_distance = np.inf
    for shape, country, adm1, adm2 in country_shapes(row['iso2']):
        if point.within(shape):
            locs.loc[idx, 'adm0'] = country
            locs.loc[idx, 'adm2'] = adm2
            locs.loc[idx, 'adm1'] = adm1
            break
            
        distance = point.distance(shape)
        if distance < nearest_distance:
            nearest_distance = distance
            nearest_record = (shape, country, adm1, adm2)
    else:
        if nearest_record:
            locs.loc[idx, 'nearest_loc'] = True
            locs.loc[idx, 'adm2'] = nearest_record[3]
            locs.loc[idx, 'adm1'] = nearest_record[2]
            locs.loc[idx, 'adm0'] = nearest_record[1]
        
    pbar.update(1)
pbar.close()
print('Total nearest:', locs['nearest_loc'].sum())
print('Missing:', 
      set(locs.loc[locs['adm1'].isnull(), 'iso2'].unique()).difference(locs.loc[locs['adm1'].notnull(), 'iso2'].unique()))

Finding codes: 100%|██████████| 96060/96060 [57:27<00:00, 27.87it/s]   


Total nearest: 1458
Missing: set()


In [8]:
locs.to_csv('../Output/ACLED/ACLED_locs.csv', index=False)
# locs = pd.read_csv('../Output/ACLED/ACLED_locs.csv')

In [9]:
df = df.merge(locs, on=['longitude', 'latitude', 'iso2'], how='left')
assert df['adm0'].isnull().sum() == 0
print(df.shape)

(1073947, 36)


In [10]:
# https://reliefweb.int/sites/reliefweb.int/files/resources/ACLED_Codebook_2017FINAL%20%281%29.pdf
# Page 26
print(df.groupby('geo_precision')['geo_precision'].count())
df.loc[df['geo_precision'] == 3, 'adm2'] = np.nan

geo_precision
0         1
1    768921
2    276043
3     28982
Name: geo_precision, dtype: int64


In [11]:
df.to_csv('../Output/ACLED/ACLED.csv', index=False)

# Keywords

In [None]:
df = pd.read_csv('../Output/ACLED/ACLED.csv')
# df.drop(columns=['adm0_name', 'adm1_name', 'adm2_name'], inplace=True)

In [8]:
POS_TAGS = {'PROPN', 'ADJ', 'NOUN'}

def get_hotwords(doc, is_doc=False):    
    if not is_doc:
        doc = nlp(doc)
        
    result = [token.lemma_ 
              for token in doc 
              if token.text.lower() not in STOP and token.pos_ in POS_TAGS]
                
    return result

def get_hotwords_list(series):
    words = []
    for doc in tqdm(nlp.pipe(series, 
                        batch_size=10000, 
                        disable=['ner', 'senter', 'parser']),
                   total=len(series)):
        words.append(get_hotwords(doc, is_doc=True))
    
    return words

def get_hotwords_set(series):
    words = get_hotwords_list(series)    
    return [list(set(w)) for w in words]

def get_hotword_count(series, verbose=False):    
    counter = Counter()
    for doc in nlp.pipe(series, 
                        batch_size=5000, 
                        disable=['ner', 'senter', 'parser']):
        words = get_hotwords(doc, is_doc=True)
        counter.update(words)
    
    return counter

In [None]:
mask = df['notes'].notnull()
counter = get_hotword_count(df.loc[mask, 'notes'], verbose=True)    
top50 = counter.most_common()[:50]
    
with open('top50_keywords_acled.yml', 'w') as f:
    yaml.dump({k: v for k, v in top50}, f)

# Collapse

In [None]:
with open('top50_keywords_acled.yml') as f:
    top50 = yaml.safe_load(f)
    
top50 = list(top50.keys()) + ['Venezuela', 'Venezuelan']

In [9]:
def load_keywords():
    with open('top50_keywords_scad.yml') as f:
        top50 = yaml.safe_load(f)

    top50 = list(top50.keys())
    return top50


def get_groups(loc_groups):
    time_groups = {'yearly': ['year'], 'monthly': ['year', 'month'], 'weekly': ['year', 'week']}
    additional = []

    groups = {}
    for loc_name, loc_group in loc_groups.items():
        for time_name, time_group in time_groups.items():
            groups[f'{loc_name}_{time_name}'] = loc_group + time_group + additional
            
    return groups


def collapse(df, loc_groups, suffix=''):
    if suffix and not suffix.startswith('_'):
        suffix = '_' + suffix
        
    most_freqs = ['actor1', 'actor2', 'source']
    additional = ['event_type', 'sub_event_type']
    
    top50 = load_keywords()
    
    df['event_date'] = pd.to_datetime(df['event_date'], format='%d-%b-%y')
    df['year'] = df['event_date'].dt.year
    df['month'] = df['event_date'].dt.month
    df['week'] = df['event_date'].dt.isocalendar().week
    
    groups = get_groups(loc_groups)
    
    # Fill NAs
    for group in groups.values():
        for col in group:
            df[col].fillna('Not available', inplace=True)
            
    done = [re.findall(r'ACLED_([^\.]+)\.csv', x)[0] for x in glob('../Output/ACLED/ACLED_*.csv')]
    
    for group_name, group in groups.items():
        n_groups = len(df.drop_duplicates(group))
        collapsed = []

        if group_name in done:
            print('Skipping', group_name)
            continue

        for key_values, subdf in tqdm(df.groupby(group), total=n_groups, desc=group_name):
            counter = Counter()
            for row in subdf['words']:
                counter.update(row)

            sub_group = {k: v for k, v in zip(group, key_values)}
            sub_group['count'] = subdf.shape[0]
            sub_group.update({f'keyword_{k}': counter.get(k, 0) for k in top50})
            sub_group['fatalities'] = subdf['fatalities'].sum()

            collapsed.append(sub_group)

        collapsed = pd.DataFrame(collapsed)
        collapsed.to_csv(f'../Output/ACLED/ACLED_{group_name}{suffix}.csv', index=False)

In [None]:
%%time
df['words'] = get_hotwords_set(df['notes'].fillna(''))

In [None]:
collapse(df, loc_groups={'country': ['adm0'], 
                         'edo': ['adm0', 'adm1'], 
                         'mun': ['adm0', 'adm1', 'adm2']})

# Ethnic

In [3]:
df = pd.read_csv('../Output/ACLED/ACLED.csv')
df['iso3'] = df['adm0']
df.drop(columns=['adm0', 'adm1', 'adm2', 'nearest_loc'], 
        inplace=True)

locs = df[['iso3', 'latitude', 'longitude']].drop_duplicates()
locs = gpd.GeoDataFrame(locs, geometry=gpd.points_from_xy(locs['longitude'], locs['latitude']))
locs = locs.set_crs('EPSG:4326')

df.head()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,iso,event_id_cnty,event_id_no_cnty,event_date,year,time_precision,event_type,sub_event_type,actor1,assoc_actor_1,inter1,actor2,assoc_actor_2,inter2,interaction,region,country,admin1,admin2,admin3,location,latitude,longitude,geo_precision,source,source_scale,notes,fatalities,timestamp,origin_filename,iso2,numeric,iso3
0,48,BHR324,324,2016-01-01,2016,1,Protests,Peaceful protest,Protesters (Bahrain),,6,,,0,60,Middle East,Bahrain,Southern,,,Al Maamir,26.133,50.609,1,Press TV,Regional,Small protest have been held in the village of...,0,1572403770,ACLED_MEast.xlsx,BH,48,BHR
1,48,BHR328,328,2016-01-01,2016,1,Protests,Excessive force against protesters,Protesters (Bahrain),,6,Police Forces of Bahrain (1999-),,1,16,Middle East,Bahrain,Northern,,,Ash Shakhurah,26.215,50.507,1,Middle East Eye,International,Bahrainis took part in an anti-regime protest ...,0,1615250536,ACLED_MEast.xlsx,BH,48,BHR
2,48,BHR1980,1980,2016-01-01,2016,1,Protests,Peaceful protest,Protesters (Bahrain),,6,,,0,60,Middle East,Bahrain,Muharraq,,,Samahij,26.28,50.635,1,Revolution Bahrain,New media,"On 1/1/2016, a number of people protested in S...",0,1584127715,ACLED_MEast.xlsx,BH,48,BHR
3,48,BHR326,326,2016-01-01,2016,1,Riots,Violent demonstration,Rioters (Bahrain),,5,Police Forces of Bahrain (1999-),,1,15,Middle East,Bahrain,Capital,,,Sitrah,26.155,50.621,1,Press TV,Regional,Clashes erupted between police and demonstrato...,0,1572403770,ACLED_MEast.xlsx,BH,48,BHR
4,48,BHR327,327,2016-01-01,2016,1,Protests,Excessive force against protesters,Protesters (Bahrain),,6,Police Forces of Bahrain (1999-),,1,16,Middle East,Bahrain,Northern,,,Abu Saybi,26.218,50.507,1,Middle East Eye,International,Bahrainis took part in an anti-regime protest ...,0,1615250536,ACLED_MEast.xlsx,BH,48,BHR


In [4]:
adm = gpd.read_file('../Shapefiles/ethnic_preprocessed/tribe_adm0.shp')
adm.head()

Unnamed: 0,NAME,TRIBE_CODE,LAT,LON,GID_0,NAME_0,area_tribe,area_adm,area_inter,geometry
0,BERABISH,94,19.8857,-4.72439,MRT,Mauritania,288603800000.0,1041176000000.0,58713460000.0,"POLYGON ((-5.62671 16.72143, -5.82874 17.01325..."
1,DELIM,170,21.5158,-15.112,MRT,Mauritania,116364000000.0,1041176000000.0,51916050000.0,"POLYGON ((-14.38610 19.01855, -14.54711 18.990..."
2,DUAISH,186,17.3523,-12.1479,MRT,Mauritania,64521580000.0,1041176000000.0,64521580000.0,"POLYGON ((-11.39385 16.00822, -11.39759 15.997..."
3,IMRAGEN,299,24.276,-14.8975,MRT,Mauritania,44112490000.0,1041176000000.0,7165069000.0,"MULTIPOLYGON (((-16.38461 19.62125, -16.38417 ..."
4,MASINA,501,15.2041,-3.8538,MRT,Mauritania,57279010000.0,1041176000000.0,5140482000.0,"POLYGON ((-6.51868 15.49704, -6.52111 15.49812..."


In [5]:
# Find those that are within a shape
locs = gpd.sjoin(adm, locs, how='right', op='contains')
mask = locs['index_left'].isnull()
print((~mask).sum(), 'exact matches')

# To Flat CRS
adm = adm.to_crs(epsg=3035)
locs = locs.to_crs(epsg=3035)

locs['nearest_loc'] = False
cols = ['NAME', 'TRIBE_CODE']
for idx, row in tqdm(locs[mask].iterrows(), total=mask.sum(), desc='Finding codes'):
    argmin = adm.distance(row['geometry']).argmin()
    match = adm.iloc[argmin]
    
    for col in cols:
        locs.loc[idx, col] = match[col]
    locs.loc[idx, 'nearest_loc'] = True

27756 exact matches


Finding codes: 100%|██████████| 68304/68304 [1:07:43<00:00, 16.81it/s]


In [6]:
id_cols = ['iso3', 'NAME', 'TRIBE_CODE']
df = df.merge(locs[['longitude', 'latitude'] + id_cols], 
              on=['longitude', 'latitude', 'iso3'], 
              how='left')
df.to_csv('../Output/ACLED/ACLED_ethnic.csv', index=False)

In [10]:
df['words'] = get_hotwords_set(df['notes'].fillna(''))

100%|██████████| 1073947/1073947 [46:41<00:00, 383.39it/s]  


In [11]:
collapse(df, loc_groups={'ethnic': id_cols})

ValueError: time data '2016-01-01' does not match format '%d-%b-%y' (match)

# Gen shapefiles codes

In [None]:
codes = {}
shps = ['../Shapefiles/GADM/gadm36_0.shp', '../Shapefiles/GADM/gadm36_1.shp', '../Shapefiles/GADM/gadm36_2.shp']
for i, shp in enumerate(shps):
    shp = shapefile.Reader(shp)
    records = shp.records()
    for record in records:
        name = record[0]
        r = {'adm0': record[0], 
             'adm0_name': record[1]}
        
        if i > 0:
            r.update({'adm1': record[2],
                      'adm1_name': record[3]})
            name = record[2]
            
        if i == 2:
            r.update({'adm2': record[5],
                      'adm2_name': record[6]})
            name = record[5]
        codes[name] = r
        
print(len(codes))

In [None]:
pd.DataFrame(codes.values()).to_csv('../Data/GADM_codes.csv', index=False)