In [1]:
import pandas as pd
import numpy as np

from gensim.utils import deaccent

In [2]:
# Load the municipality code table (sheet 'adm2') and lower-case its headers.
codes = pd.read_excel('../Data/ven_codes.xls', sheet_name='adm2')
codes.columns = [x.lower() for x in codes.columns]

# Lower-case and strip accents from the name columns that are later used
# as join keys.
for col in ['estado', 'municipio']:
    codes[col] = codes[col].str.lower().apply(deaccent)

# The pattern is anchored with ^...$, so it has to be interpreted as a regular
# expression. pandas >= 2.0 defaults to regex=False, which would search for the
# literal text '^distrito federal$' and silently replace nothing.
codes['estado'] = codes['estado'].str.replace(
    r'^distrito federal$', 'distrito capital', regex=True)

In [3]:
# Read the GADM code list and lower-case its headers.
adm = pd.read_csv('../Data/GADM_codes.csv').rename(columns=str.lower)

# Keep only Venezuelan rows that carry a second-level (adm2) code, and only
# the name/code columns for levels 1 and 2.
is_venezuela = adm['adm0'] == 'VEN'
has_adm2 = adm['adm2'].notnull()
adm = adm.loc[is_venezuela & has_adm2,
              ['adm1_name', 'adm2_name', 'adm1', 'adm2']].copy()

In [4]:
# Lower-case and strip accents from both GADM name columns.
for name_col in ('adm1_name', 'adm2_name'):
    lowered = adm[name_col].str.lower()
    adm[name_col] = lowered.map(deaccent)

# Drop every row whose state is 'dependencias federales'.
adm = adm.loc[adm['adm1_name'].ne('dependencias federales')].copy()

In [5]:
# Both sources must list exactly the same set of state names; on failure,
# report the names that appear in only one of them.
state_mismatch = set(adm['adm1_name'].unique()).symmetric_difference(codes['estado'].unique())
assert len(state_mismatch) == 0, \
    'state names present in only one source: {}'.format(state_mismatch)

In [6]:
# Outer-join GADM rows with the code table on (state, municipality) name.
# The `_merge` indicator column records which side each row came from.
adm = pd.merge(
    adm,
    codes,
    how='outer',
    left_on=['adm1_name', 'adm2_name'],
    right_on=['estado', 'municipio'],
    indicator=True,
)

# Count rows found only in GADM (left) and only in the code table (right).
n_gadm_only = adm['_merge'].eq('left_only').sum()
n_codes_only = adm['_merge'].eq('right_only').sum()
print(n_gadm_only, n_codes_only)

64 62


In [7]:
# After the outer merge, one-sided rows have the state name on one side only;
# copy it across so both state columns are populated on every row.
# Assign the result back instead of calling fillna(inplace=True) on a column
# selection: that chained in-place form is deprecated and has no effect on
# `adm` under pandas Copy-on-Write.
adm['adm1_name'] = adm['adm1_name'].fillna(adm['estado'])
adm['estado'] = adm['estado'].fillna(adm['adm1_name'])
assert adm['adm1_name'].isnull().sum() == 0
assert adm['estado'].isnull().sum() == 0

In [8]:
def either_contains(s1, s2):
    """Return True if `s1` occurs inside `s2` or `s2` occurs inside `s1`."""
    if s1 in s2:
        return True
    return s2 in s1

In [9]:
# Second matching pass: for every GADM-only row, look for a code-only row in
# the same state whose municipality name contains, or is contained in, the
# GADM municipality name. The first such candidate wins, and both rows are
# filled in with the other side's values.
# NOTE(review): a code-only row is not removed from the candidate pool once it
# has been used, so it looks like several GADM rows could claim the same code
# row - confirm whether that is intended.
is_code_only = adm['_merge'] == 'right_only'
gadm_only_rows = adm[adm['_merge'] == 'left_only']

for gadm_idx, gadm_row in gadm_only_rows.iterrows():
    same_state = adm['estado'] == gadm_row['adm1_name']
    candidates = adm[is_code_only & same_state]

    for cand_idx, cand_row in candidates.iterrows():
        if not either_contains(gadm_row['adm2_name'], cand_row['municipio']):
            continue

        # Complete the GADM-only row with the code-table values.
        adm.loc[gadm_idx, 'municipio'] = cand_row['municipio']
        adm.loc[gadm_idx, 'code'] = cand_row['code']

        # Complete the code-only row with the GADM values.
        adm.loc[cand_idx, 'adm2_name'] = gadm_row['adm2_name']
        adm.loc[cand_idx, 'adm1'] = gadm_row['adm1']
        adm.loc[cand_idx, 'adm2'] = gadm_row['adm2']

        break

# Rows still missing a GADM name / a code-table name after the second pass.
n_missing_gadm = adm['adm2_name'].isna().sum()
n_missing_code = adm['municipio'].isna().sum()
print(n_missing_gadm, n_missing_code)

19 21


In [11]:
# Write the crosswalk to CSV without the row index. All columns of `adm` are
# written, including the `_merge` indicator, and rows that are still unmatched
# are written as they are.
adm.to_csv('../Output/ve_crosswalk.csv', index=False)

In [12]:
# Show the rows that still have no GADM municipality name.
adm.loc[adm['adm2_name'].isna()]

Unnamed: 0,adm1_name,adm2_name,adm1,adm2,estado,municipio,code,_merge
345,anzoategui,,,,anzoategui,francisco del carm,VE0304,right_only
346,anzoategui,,,,anzoategui,francisco de miran,VE0305,right_only
347,anzoategui,,,,anzoategui,juan antonio sotil,VE0308,right_only
348,anzoategui,,,,anzoategui,juan manuel cajiga,VE0309,right_only
349,anzoategui,,,,anzoategui,jose gregorio mona,VE0310,right_only
350,anzoategui,,,,anzoategui,manuel ezequiel br,VE0312,right_only
351,anzoategui,,,,anzoategui,pedro maria freite,VE0313,right_only
352,anzoategui,,,,anzoategui,san jose de guanip,VE0315,right_only
355,anzoategui,,,,anzoategui,sir artur mc grego,VE0320,right_only
357,aragua,,,,aragua,jose rafael reveng,VE0506,right_only
