In [1]:
import pandas as pd
import numpy as np
import yaml
from tqdm import tqdm
from glob import glob
from pathlib import Path

import shapefile
import shapely
from shapely.geometry import Point

# Show every column when a DataFrame is displayed.
# The fully-qualified option name is required: on recent pandas the bare
# 'max_columns' pattern matches more than one option and raises OptionError.
pd.set_option('display.max_columns', None)

In [2]:
# Read the leaders workbook into a DataFrame and preview its first rows.
leaders_xlsx = '../Leaders/Leaders_v2.xlsx'
df = pd.read_excel(leaders_xlsx)
df.head()

Unnamed: 0,country,year,leader_name,leader_birthplace,llatitude,llongitude,split,abroad,born_capital
0,Angola,1980,José Eduardo dos Santos,"Luanda, Sambizanga District",-8.806,13.269,0.0,0.0,1.0
1,Angola,1981,José Eduardo dos Santos,"Luanda, Sambizanga District",-8.806,13.269,0.0,0.0,1.0
2,Angola,1982,José Eduardo dos Santos,"Luanda, Sambizanga District",-8.806,13.269,0.0,0.0,1.0
3,Angola,1983,José Eduardo dos Santos,"Luanda, Sambizanga District",-8.806,13.269,0.0,0.0,1.0
4,Angola,1984,José Eduardo dos Santos,"Luanda, Sambizanga District",-8.806,13.269,0.0,0.0,1.0


In [3]:
# Build `shapes_by_country`: country code -> list of (geometry, *record fields)
# tuples, one per polygon found in the preprocessed shapefiles.
#
# A file name may carry several codes separated by '_'; every code in the
# name is mapped to that file's polygons. If a code appears in more than one
# file, the file processed last wins (same as before).
#
# The paths are sorted so the dictionary's insertion order, and therefore the
# order in which polygons are scanned later, does not depend on the
# filesystem's listing order.
shps = sorted(glob('../Shapefiles/preprocessed/*.shp'))
shapes_by_country = {}
for shp_path in tqdm(shps):
    codes = Path(shp_path).stem.split('_')

    # Use the reader as a context manager so the underlying .shp/.dbf/.shx
    # file handles are closed as soon as the data has been loaded.
    with shapefile.Reader(shp_path) as reader:
        shapes = [shapely.geometry.shape(s) for s in reader.shapes()]
        records = reader.records()

    # Build the tuples once per file instead of once per country code.
    entries = [(shape, *record) for shape, record in zip(shapes, records)]
    for code in codes:
        # Each code gets its own list object, as in the original loop.
        shapes_by_country[code] = list(entries)

# Sanity check: these two codes are expected to be present after loading.
assert 'GT' in shapes_by_country and 'PM' in shapes_by_country

100%|██████████| 276/276 [02:26<00:00,  1.89it/s]


In [4]:
# Names of the coordinate columns in `df`.
lon_col = 'llongitude'
lat_col = 'llatitude'


def to_num(x):
    """Return ``float(x.strip())`` for string input; return any other value unchanged."""
    return float(x.strip()) if isinstance(x, str) else x


# Normalise coordinates that were read as strings (e.g. with stray whitespace).
df[lon_col] = df[lon_col].apply(to_num)
df[lat_col] = df[lat_col].apply(to_num)

# One row per unique (longitude, latitude, country) combination.
# `.copy()` makes `locs` an independent frame so the column assignments below
# do not trigger chained-assignment warnings.
locs = df[[lon_col, lat_col, 'country']].drop_duplicates().copy()

# The administrative codes are filled in later with values taken from the
# shapefile records. Create the columns with object dtype so those values can
# be stored without an incompatible-dtype assignment into a float column.
for adm_col in ('adm0', 'adm1', 'adm2'):
    locs[adm_col] = pd.Series(np.nan, index=locs.index, dtype=object)

# Set to True later for rows resolved by nearest polygon rather than containment.
locs['nearest_loc'] = False

In [5]:
# Load the country definitions from YAML.
with open('../Data/Countries_edited.yml') as handle:
    all_countries = yaml.safe_load(handle)

# Map title-cased country name -> first element of the entry's 'iso3' value.
# Entries with a falsy name or a falsy 'iso3' value are left out.
# NOTE(review): 'iso3' is indexed with [0], so it is presumably a list of
# codes; if it were a plain string this would keep only its first character.
countries = {}
for entry in all_countries:
    country_name, iso_codes = entry['name'], entry['iso3']
    if country_name and iso_codes:
        countries[country_name.title()] = iso_codes[0]

# Country names without a match in `countries` become NaN in 'iso3'.
locs['iso3'] = locs['country'].map(countries)

In [17]:
def country_shapes(country):
    """Yield the entries stored in ``shapes_by_country`` for one country or for all.

    If ``country`` is a key of ``shapes_by_country``, only that country's
    entries are yielded. If ``country`` is None or not a known key, the
    entries of every country are yielded, in the dictionary's iteration order.
    """
    is_known = country is not None and country in shapes_by_country
    groups = [shapes_by_country[country]] if is_known else shapes_by_country.values()
    for group in groups:
        yield from group
            

# Resolve administrative codes (adm0/adm1/adm2) for every row of `locs`.
#
# For each location the polygons of all countries are scanned:
#   * the first polygon that contains the point supplies the codes;
#   * if no polygon contains it, the codes of the polygon with the smallest
#     distance to the point are used and 'nearest_loc' is set to True.
# Rows whose 'adm1' is already a string are skipped, so the cell can be
# re-run to resume an interrupted pass.
adm_cols = ['adm0', 'adm1', 'adm2']

# The code columns may have been created as float NaN columns; cast them to
# object so the codes can be stored without an incompatible-dtype assignment.
for adm_col in adm_cols:
    locs[adm_col] = locs[adm_col].astype(object)

# Using tqdm as the iterator keeps the bar in sync on `continue` and closes it
# even if the loop body raises.
for idx, row in tqdm(locs.iterrows(), total=locs.shape[0], desc='Finding codes'):
    if isinstance(row['adm1'], str):
        continue

    coordinates = (float(row[lon_col]), float(row[lat_col]))
    point = Point(coordinates)
    containing_record = None
    nearest_record = None
    nearest_distance = np.inf

    # Search worldwide
    for shape, adm0, adm1, adm2 in country_shapes(None):
        if point.within(shape):
            containing_record = (adm0, adm1, adm2)
            break

        distance = point.distance(shape)
        if distance < nearest_distance:
            nearest_distance = distance
            nearest_record = (adm0, adm1, adm2)

    used_nearest = containing_record is None
    record = nearest_record if used_nearest else containing_record
    if record is None:
        # No polygon produced a usable distance; leave the row unresolved.
        continue

    if used_nearest:
        locs.loc[idx, 'nearest_loc'] = True
    for adm_col, code in zip(adm_cols, record):
        locs.loc[idx, adm_col] = code

print('Total nearest:', locs['nearest_loc'].sum())
print('Missing:', locs.loc[locs['adm1'].isnull(), 'iso3'].unique())


Finding codes:   0%|          | 0/164 [00:00<?, ?it/s][A
Finding codes:  91%|█████████ | 149/164 [00:06<00:00, 23.28it/s][A
Finding codes:  91%|█████████▏| 150/164 [00:12<00:27,  1.95s/it][A
Finding codes:  92%|█████████▏| 151/164 [00:18<00:41,  3.23s/it][A
Finding codes:  93%|█████████▎| 152/164 [00:22<00:40,  3.41s/it][A
Finding codes:  93%|█████████▎| 153/164 [00:26<00:38,  3.55s/it][A
Finding codes:  94%|█████████▍| 154/164 [00:30<00:36,  3.63s/it][A
Finding codes:  95%|█████████▍| 155/164 [00:36<00:39,  4.44s/it][A
Finding codes:  95%|█████████▌| 156/164 [00:40<00:34,  4.26s/it][A
Finding codes:  96%|█████████▌| 157/164 [00:47<00:34,  4.89s/it][A
Finding codes:  96%|█████████▋| 158/164 [00:53<00:31,  5.27s/it][A
Finding codes:  97%|█████████▋| 159/164 [00:59<00:27,  5.56s/it][A
Finding codes:  98%|█████████▊| 160/164 [01:05<00:22,  5.74s/it][A
Finding codes:  98%|█████████▊| 161/164 [01:09<00:15,  5.15s/it][A
Finding codes:  99%|█████████▉| 162/164 [01:11<00:08,  4.

Total nearest: 4
Missing: []





In [18]:
# Attach the per-location columns from `locs` to every row of `df`.
merge_keys = [lon_col, lat_col, 'country']

# Make the cell safe to re-run: if a previous execution already merged the
# `locs` columns into `df`, drop them first instead of accumulating
# '_x' / '_y' suffixed duplicates.
stale_cols = [col for col in locs.columns if col not in merge_keys and col in df.columns]

# `validate` makes pandas raise if `locs` ever holds duplicate keys, which
# would otherwise silently multiply rows of `df`.
df = df.drop(columns=stale_cols).merge(
    locs, on=merge_keys, how='left', validate='many_to_one'
)

In [19]:
# Write the table to CSV, omitting the DataFrame index.
output_csv = '../Leaders/Leaders_with_locs.csv'
df.to_csv(output_csv, index=False)