In [9]:
import pandas as pd
import numpy as np
import yaml

from glob import glob
from tqdm import tqdm
from pathlib import Path
import shapefile
import shapely
from shapely.geometry import Point

from gensim.utils import deaccent

In [2]:
# Load the Facebook location listing from its Excel export.
fb_locations_path = '../Data/FB_locations_210405.xlsx'
df = pd.read_excel(fb_locations_path)

# Show the first rows as this cell's output.
df.head()

Unnamed: 0,loc_type,FB_key,loc_name,country_code,country_name,FB_region,FB_region_key,loc_targeting
0,countries,AG,Antigua,,,,,"home, recent"
1,countries,AR,Argentina,,,,,"home, recent"
2,countries,AW,Aruba,,,,,"home, recent"
3,countries,BB,Barbados,,,,,"home, recent"
4,countries,BM,Bermuda,,,,,"home, recent"


In [3]:
def _deaccent_if_str(value):
    """Strip accents from strings; hand back any non-string value untouched."""
    if isinstance(value, str):
        return deaccent(value)
    return value


# Add a lower-cased, accent-free '<column>_c' companion for each text column below.
for source_col in ('loc_name', 'FB_region'):
    lowered = df[source_col].str.lower()
    df[f'{source_col}_c'] = lowered.apply(_deaccent_if_str)

In [4]:
# Read the country list; each entry is indexed below by its 'codes' and 'iso3' keys.
# The encoding is given explicitly so decoding does not depend on the platform default.
with open('../Data/Countries_edited.yml', encoding='utf-8') as f:
    countries = yaml.safe_load(f)

# Map every alias listed under a country's 'codes' to the first entry of its 'iso3' list.
# If an alias appears under several countries, the last one wins (same as a plain loop).
codes = {
    code: country['iso3'][0]
    for country in countries
    for code in country['codes']
}

# Hand-written entries; update() lets them override anything loaded from the YAML file.
codes.update({
    'AG': 'ATG', 'BO': 'BOL', 'DO': 'DOM', 'ES': 'ESP',
    'KN': 'KNA', 'KY': 'CYM', 'NI': 'NIC', 'PA': 'PAN',
    'TT': 'TTO', 'VI': 'VIR', 'SX': 'SXM', 'TC': 'TCA',
})

# ADM0

In [5]:
# Country rows are looked up through FB_key; every other row through country_code.
is_country = df['loc_type'].eq('countries')
own_code = df['FB_key'].map(codes)
parent_code = df['country_code'].map(codes)
df['adm0'] = np.where(is_country, own_code, parent_code)

# Every row must have resolved to a code.
assert not df['adm0'].isnull().any()

# ADM1

In [6]:
# Load the GADM code table and lower-case its header.
adm = pd.read_csv('../Data/GADM_codes.csv')
adm.columns = adm.columns.str.lower()

# Lower-case and de-accent every column whose label matches '.*_name$';
# non-string cells are passed through unchanged.
name_cols = list(adm.filter(regex=r'.*_name$'))
for name_col in name_cols:
    lowered = adm[name_col].str.lower()
    adm[name_col] = lowered.apply(lambda cell: deaccent(cell) if isinstance(cell, str) else cell)

In [7]:
# Match 'regions' rows to GADM level-1 units on (adm0, normalised name).
mask = df['loc_type'] == 'regions'
idx = df[mask].index

# Distinct (adm0, adm1_name, adm1) triples among GADM rows that carry a level-1 code.
reg = adm.loc[adm['adm1'].notnull(), ['adm0', 'adm1_name', 'adm1']].drop_duplicates()

# Only the key columns of df go into the merge. If df already has an 'adm1'
# column (e.g. when this cell is re-run), merging the full frame would produce
# 'adm1_x'/'adm1_y' and the lookup of 'adm1' below would fail.
merge = df.loc[mask, ['adm0', 'loc_name_c']].merge(
    reg,
    left_on=['adm0', 'loc_name_c'],
    right_on=['adm0', 'adm1_name'],
    how='left',
)

# A left merge returns more rows than it was given when a key matches several
# GADM rows; fail with a readable message instead of a length mismatch below.
assert len(merge) == len(idx), 'Some regions match more than one GADM adm1 row'

# merge() returns a fresh index; restore the original labels so the
# assignment aligns row by row.
merge.index = idx
df.loc[mask, 'adm1'] = merge['adm1']

# Number of region rows left without an adm1 code.
df.loc[mask, 'adm1'].isnull().sum()

125

# ADM2

In [8]:
# Match 'cities' rows to GADM level-2 units on (adm0, region name, city name).
mask = df['loc_type'] == 'cities'
idx = df[mask].index

# Keep only the key and code columns before de-duplicating, so that other
# GADM columns cannot make one (adm0, adm1_name, adm2_name) key appear twice.
reg = adm.loc[
    adm['adm2'].notnull(),
    ['adm0', 'adm1_name', 'adm2_name', 'adm1', 'adm2'],
].drop_duplicates()

# Only the key columns of df go into the merge, so 'adm1' and 'adm2' always
# come from GADM under their plain names. Merging the full frame made the
# result depend on whether df already had 'adm1'/'adm2' ('adm1_y' existed
# only if an earlier cell had added 'adm1'; a re-run broke 'adm2').
merge = df.loc[mask, ['adm0', 'FB_region_c', 'loc_name_c']].merge(
    reg,
    left_on=['adm0', 'FB_region_c', 'loc_name_c'],
    right_on=['adm0', 'adm1_name', 'adm2_name'],
    how='left',
)

# A left merge returns more rows than it was given when a key matches several
# GADM rows; fail with a readable message instead of a length mismatch below.
assert len(merge) == len(idx), 'Some cities match more than one GADM adm2 row'

# merge() returns a fresh index; restore the original labels so the
# assignments align row by row.
merge.index = idx
df.loc[mask, 'adm1'] = merge['adm1']
df.loc[mask, 'adm2'] = merge['adm2']

# Number of city rows left without an adm2 code.
df.loc[mask, 'adm2'].isnull().sum()

138

In [9]:
# Save the automatically matched table (version 0), without the index column.
v0_path = '../Output/FB/FB_locs_v0.csv'
df.to_csv(v0_path, index=False)

# Reviewed

In [15]:
# Replace df with the contents of the v1 file.
# NOTE(review): v1 is not written by any cell visible here (only v0 is) —
# presumably it is a reviewed edit of v0; confirm.
v1_path = '../Output/FB/FB_locs_v1.csv'
df = pd.read_csv(v1_path)

In [16]:
# Build {code: [(geometry, *record_fields), ...]} from every preprocessed shapefile.
shps = glob('../Shapefiles/preprocessed/*.shp')
shapes_by_country = {}
for shp_path in tqdm(shps):
    # The file stem is an underscore-separated list of codes; each code gets
    # the shapes of this file. A separate name is used so the global `codes`
    # mapping built earlier in the notebook is not overwritten by a list.
    file_codes = Path(shp_path).stem.split('_')

    # Read everything inside the context manager so the file handles are closed.
    with shapefile.Reader(shp_path) as reader:
        shapes = [shapely.geometry.shape(s) for s in reader.shapes()]
        records = reader.records()

    # One tuple per feature: the geometry followed by the record's fields.
    entries = [(shape, *record) for shape, record in zip(shapes, records)]

    for code in file_codes:
        # A code seen in an earlier file is replaced, not extended.
        shapes_by_country[code] = list(entries)

assert 'GT' in shapes_by_country and 'PM' in shapes_by_country

100%|██████████| 276/276 [02:31<00:00,  1.83it/s]


In [17]:
# For every row that has a latitude, find the shape containing its point and
# copy that shape's record fields into adm0 / adm1 / adm2.
mask = df['lat'].notnull()

# df[mask] is a separate frame, so writing to df inside the loop is safe.
# Wrapping the iterator in tqdm keeps the progress bar tied to the loop.
for idx, row in tqdm(df[mask].iterrows(), total=mask.sum(), desc='Finding codes'):
    coordinates = (row['lon'], row['lat'])
    point = Point(coordinates)

    # Only shapes filed under the row's current adm0 are searched; a KeyError
    # here means no shapefile was loaded under that code.
    for shape, country, adm1, adm2 in shapes_by_country[row['adm0']]:
        if point.within(shape):
            df.loc[idx, 'adm0'] = country
            df.loc[idx, 'adm2'] = adm2
            df.loc[idx, 'adm1'] = adm1
            break
    else:
        # No shape contained the point: report the row and leave it unchanged.
        print('Missing:', row)

Finding codes: 100%|██████████| 2/2 [00:00<00:00, 240.77it/s]


In [18]:
df.to_csv('../Output/FB/FB_locs_v2.csv', index=False)