In [1]:
import pandas as pd
import numpy as np
from glob import iglob
from gensim.utils import deaccent
import ftfy
import textdistance
import yaml
from tqdm import tqdm
from pathlib import Path

import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from projections.shapefiles import load_shapes_by_country
from projections import geometry

pd.set_option('display.max_columns', None)
# pd.set_option(“max_colwidth”, None)

In [2]:
output_path = Path('../Output/Latinobarometro/')

In [3]:
def clean_text(s):
    if not isinstance(s, str):
        return s
    
    s = deaccent(ftfy.fix_text(s))
    try:
        s = s.strip().upper().encode('cp1252').decode().lower()
        s = deaccent(s)
    except Exception as e:
        s = s.lower()
        
    return s


def find_closest_match(codes, adm0, adm1, adm2, max_allowed_distance=3):
    sub_codes = codes[codes['adm0'] == adm0].copy()
    sub2_codes = sub_codes[np.logical_and(sub_codes['adm1_name'] == adm1, 
                                          sub_codes['adm2'] != 'NA')].copy()
    if sub2_codes.shape[0] > 0:
        sub_codes = sub2_codes.copy()
        s = adm2
        col = 'adm2_name'
    else:
        s = adm1
        col = 'adm1_name'
        
    sub_codes['fuzzy_match_on'] = col
    
    sub_codes['distance'] = sub_codes[col].apply(lambda x: textdistance.damerau_levenshtein(x, s))
    min_distance = sub_codes['distance'].min()
    
    if min_distance > max_allowed_distance:
        return None

    sub_codes = sub_codes[sub_codes['distance'] == min_distance]
    
    if sub_codes.shape[0] == 1:
        return sub_codes.iloc[0]
    elif sub_codes.shape[0] == 0:
        return None
    else:
        sub_codes['distance'] = sub_codes[col].apply(lambda x: textdistance.mra(x, s))
        return sub_codes.iloc[sub_codes['distance'].argmin()]
        
    return sub_codes

In [4]:
default_cols = ['numinves', 'idenpa', 'reg', 'ciudad']
latinobarometro = pd.DataFrame()
for file in iglob('../Data/Latinobarometro/*.dta'):
    if '2004' in file:
        cols = ['numinves', 'idenpa', 'reg', 'Ciudad']
    elif '1995' in file:
        cols = ['numero', 'pais', 'region']
    elif '1996' in file:
        cols = ['numero', 'pais', 'region', 'ciu']
    else:
        cols = default_cols.copy()
        
    try:
        df = pd.read_stata(file, columns=cols)
    except ValueError:
        print('FAILED -->', file)
        continue
        
    print(file)
    if '1995' in file:
        df['ciudad'] = ''
    
    df.columns = default_cols    
    for col in default_cols[2:]:
        df[col] = df[col].astype(str)
    
    latinobarometro = latinobarometro.append(df)
    
print('Original:', latinobarometro.shape)

../Data/Latinobarometro/2011.dta
../Data/Latinobarometro/2008.dta
../Data/Latinobarometro/1996.dta
../Data/Latinobarometro/2006.dta
../Data/Latinobarometro/2003.dta
../Data/Latinobarometro/2001.dta
../Data/Latinobarometro/1998.dta
../Data/Latinobarometro/2007.dta
../Data/Latinobarometro/1995.dta
../Data/Latinobarometro/2018.dta
../Data/Latinobarometro/2017.dta
../Data/Latinobarometro/2009.dta
../Data/Latinobarometro/2015.dta
../Data/Latinobarometro/2013.dta
../Data/Latinobarometro/2010.dta
../Data/Latinobarometro/2002.dta
../Data/Latinobarometro/1997.dta
../Data/Latinobarometro/2016.dta
../Data/Latinobarometro/2004.dta
../Data/Latinobarometro/2000.dta
../Data/Latinobarometro/2005.dta
Original: (431148, 4)


In [5]:
latinobarometro['idenpa'] = latinobarometro['idenpa'].str.strip().str.lower().apply(deaccent)
iso_map = {
    'argentina': 'ARG',
    'bolivia': 'BOL',
    'brasil': 'BRA',
    'colombia': 'COL',
    'costa rica': 'CRI',
    'chile': 'CHL',
    'ecuador': 'ECU',
    'el salvador': 'SLV',
    'guatemala': 'GLP',
    'honduras': 'HND',
    'mexico': 'MEX',
    'nicaragua': 'NIC',
    'panama': 'PAN',
    'paraguay': 'PRY',
    'peru': 'PER',
    'rep. dominicana': 'DOM',
    'uruguay': 'URY',
    'venezuela': 'VEN',
    'dominican rep.': 'DOM',
    'brazil': 'BRA',
    'spain': 'ESP',
    'ma©xico': 'MEX',
    'panama¡': 'PAN',
    'peraº': 'PER',
    'repaºblica dominicana': 'DOM'
}
latinobarometro['iso'] = latinobarometro['idenpa'].map(iso_map)
iso_len = latinobarometro['iso'].str.len()
assert(iso_len.max() == 3)
assert(iso_len.min() == 3)
assert(iso_len.isnull().sum() == 0)

In [6]:
with open('../Data/Countries_edited.yml') as f:
    ctr = yaml.safe_load(f)
    
countries = {}
for country in ctr:
    iso3 = country['iso3']
    if iso3:
        for c in country['codes']:
            countries[c] = iso3[0]
            
latinobarometro['iso'] = latinobarometro['iso'].map(countries)

In [None]:
df = latinobarometro

for col in ['reg', 'ciudad']:
    df[col] = df[col].apply(clean_text)
    
df['reg'] = df['reg'].str.replace('/', '-')
df['reg'] = df['reg'].str.replace('mx: norte-', '')
df['reg'] = df['reg'].str.replace('mx: sur-', '')

df['ciudad'] = df['ciudad'].str.replace('/', '-')
df['ciudad'] = df['ciudad'].apply(lambda x: x.split(': ', 1)[-1])

for rep in ('norte-', 'sur-', 'sudeste-', 'nordeste-', 'centro oeste-', 'occidente-', 
            'centro-', 'metropolitana-'):
    df['ciudad'] = df['ciudad'].str.replace(rep, '')
df['ciudad'] = df['ciudad'].apply(lambda x: x[1:] if x.startswith('-') else x)

In [None]:
df.drop_duplicates(['idenpa', 'reg', 'ciudad'], inplace=True)
print('After duplicates:', df.shape)
df.sample(5)

In [None]:
edo_mun = df['ciudad'].str.split('-', 1, expand=True)
df['estado'] = edo_mun[0]
df['municipio'] = edo_mun[1]

In [None]:
codes = pd.read_csv('../Data/GADM_codes.csv')
codes.fillna('NA', inplace=True)

for col in ['adm1_name', 'adm2_name']:
    codes[col] = codes[col].apply(clean_text)
    
codes.columns = [x.lower() for x in codes.columns]
codes.head()

In [None]:
codes_adm1 = codes[['adm0', 'adm1_name', 'adm1']].drop_duplicates()
df = df.merge(codes_adm1,
              left_on = ['iso', 'estado'],
              right_on = ['adm0', 'adm1_name'],
              how = 'left')
df.drop(columns=['adm1_name'], inplace=True)

In [None]:
df = df.merge(codes,
              left_on = ['iso', 'estado', 'municipio'],
              right_on = ['adm0', 'adm1_name', 'adm2_name'],
              how = 'left',
              suffixes = ['', '_y'])
df.drop(columns=['adm0_name', 'adm1_name', 'adm2_name', 'adm1_y', 'adm0_y'], inplace=True)

In [13]:
df['fuzzy_match_on'] = np.nan
replace_cols = ['adm0', 'adm1', 'adm2', 'fuzzy_match_on']
mask = df['adm0'].isnull()
not_found = 0
for idx, row in tqdm(df[mask].iterrows(), total=mask.sum()):
    match = find_closest_match(codes, row['iso'], row['estado'], row['municipio'])
    
    if match is not None:
        for col in replace_cols:
            df.loc[idx, col] = match[col]
    else:
        not_found += 1
            
print(f'{not_found} locations not found')

100%|██████████| 2102/2102 [00:54<00:00, 38.75it/s] 

1587 locations not found





In [22]:
df['adm0'].fillna(df['iso'], inplace=True)
df['adm1'].fillna('NA', inplace=True)
df['adm2'].fillna('NA', inplace=True)

df = df.drop_duplicates(['idenpa', 'reg', 'ciudad', 'adm0', 'adm1', 'adm2'])

In [15]:
no_adm2 = codes.loc[codes['adm2'] == 'NA', 'adm0'].unique()
with_adm2 = codes.loc[codes['adm2'] != 'NA', 'adm0'].unique()
no_adm2 = set(no_adm2).difference(with_adm2)

no_adm1 = codes.loc[codes['adm1'] == 'NA', 'adm0'].unique()
with_adm1 = codes.loc[codes['adm1'] != 'NA', 'adm0'].unique()
no_adm1 = set(no_adm1).difference(with_adm1)

In [24]:
no_adm2_mask = np.logical_and(~df['iso'].isin(no_adm2), df['adm2']=='NA')
no_adm2_mask = np.logical_and(no_adm2_mask, df['municipio'].notnull())
no_adm1_mask = np.logical_and(~df['iso'].isin(no_adm1), df['adm1']=='NA')
mask = np.logical_or(no_adm1_mask, no_adm2_mask)
print(mask.sum())

df.loc[mask, 
       ['iso', 'idenpa', 'reg', 'ciudad']].to_excel('../Output/Latinobarometro/missing_locs.xlsx', 
                                                    index=False)

3382


In [18]:
df = df[['idenpa', 'reg', 'ciudad', 'adm0', 'adm1', 'adm2']]
df.to_csv(output_path / 'Latinobarometro_locs.csv', index=False)

In [15]:
df['adm1'].isnull().sum()

1587

# Append missing locs

In [4]:
df = pd.read_csv(output_path / 'Latinobarometro_locs.csv')
df.head()

Unnamed: 0,idenpa,reg,ciudad,adm0,adm1,adm2,geo_precision
0,argentina,ar: pampeana-buenos aires,buenos aires-gran buenos aires,ARG,ARG.1_1,ARG.5.1_1,ciudad
1,argentina,ar: capital federal,capital federal-ciudad de buenos aires,ARG,ARG.5_1,ARG.5.1_1,ciudad
2,argentina,ar: pampeana-cordoba,cordoba-gran cordoba,ARG,ARG.6_1,ARG.6.2_1,ciudad
3,argentina,ar: pampeana-santa fe,santa fe-gran rosario,ARG,ARG.21_1,ARG.21.12_1,ciudad
4,argentina,ar: cuyo-mendoza,mendoza-gran mendoza,ARG,ARG.13_1,ARG.13.7_1,ciudad


In [5]:
# V2: matched by name
missing_locs = pd.read_excel(output_path / 'missing_locs_v2.xlsx')
lats = missing_locs['coordenadas'].str.split(', ', expand=True)
missing_locs['lat'] = lats[0]
missing_locs['lon'] = lats[1]

missing_locs['adm0'].fillna(missing_locs['iso'], inplace=True)

missing_locs.drop(columns=['coordenadas', 'iso'], inplace=True)

missing_locs.head()

Unnamed: 0,idenpa,reg,ciudad,adm0,adm1,adm2,lat,lon
0,dominican rep.,do: montecristi,montecristi-guayubin,DOM,DOM.17_1,DOM.17.2_1,,
1,dominican rep.,do: el seibo,el seibo-el seibo (mun.),DOM,DOM.7_1,DOM.7.2_1,,
2,bolivia,bo: beni,beni-trinidad,BOL,BOL.3_1,BOL.3.1_1,,
3,bolivia,bo: beni,beni-riberalta,BOL,BOL.3_1,BOL.3.6_1,,
4,bolivia,bo: beni,beni-la cruz,BOL,BOL.3_1,,-15.38312512659179,-64.76692465967898


In [6]:
shapes_by_country = load_shapes_by_country('../Shapefiles/preprocessed')

Loading shapes: 100%|██████████| 276/276 [02:30<00:00,  1.84it/s]


In [7]:
for idx, row in missing_locs[missing_locs['lat'].notnull()].iterrows():
    country = row['adm0']
    record = geometry.find_record(row['lat'], row['lon'], shapes_by_country[country])
    if record:
        missing_locs.loc[idx, 'adm1'] = record[2]
        missing_locs.loc[idx, 'adm2'] = record[3]
    else:
        print('Not found', row)

In [8]:
df = df.merge(missing_locs, on=['idenpa', 'reg', 'ciudad'], how='left', suffixes=['', '_y'])

for i in range(3):
    df[f'adm{i}'] = np.where(df[f'adm{i}']=='NA', df[f'adm{i}_y'], df[f'adm{i}'])
    
df.drop(columns=['adm0_y', 'adm1_y', 'adm2_y', 'lat', 'lon'], inplace=True)

In [9]:
# V3: Matched by coords
missing_locs = pd.read_excel(output_path / 'missing_locs_v3.xlsx')

# Clean
missing_locs['geo_precision'] = np.where(missing_locs['region_coors'].notnull(), 
                                         'region', 
                                         np.nan)
missing_locs['geo_precision'] = np.where(missing_locs['coordenadas'].notnull(), 
                                         'ciudad', 
                                         missing_locs['geo_precision'])
missing_locs['coordenadas'].fillna(missing_locs['region_coors'], inplace=True)
missing_locs['reg'].fillna('NA', inplace=True)
missing_locs['ciudad'].fillna('NA', inplace=True)
missing_locs['adm0'] = missing_locs['iso']

missing_locs = missing_locs[missing_locs['coordenadas'].notnull()].copy()

# Split coordinates
lats = missing_locs['coordenadas'].str.split(', ', expand=True)
missing_locs['lat'] = lats[0]
missing_locs['lon'] = lats[1]

missing_locs.drop(columns=['coordenadas', 'region_coors', 'idenpa', 'iso'], inplace=True)

# Map coordinates to codes
missing_locs['adm1'] = np.nan
missing_locs['adm2'] = np.nan

for idx, row in tqdm(missing_locs[missing_locs['lat'].notnull()].iterrows()):
    country = row['adm0']
    record = geometry.find_record(row['lat'], row['lon'], shapes_by_country[country])
    if record:
        missing_locs.loc[idx, 'adm1'] = record[2]
        missing_locs.loc[idx, 'adm2'] = record[3]
    else:
        print('Not found', row)

mask = missing_locs['adm2'].notnull()
adm_mismatches = missing_locs.loc[mask, 'adm0'] != missing_locs.loc[mask, 'adm0'].str.slice(0, 3)
print('Mistmatches:', adm_mismatches.sum(), adm_mismatches.mean())
        
missing_locs.head()

164it [00:00, 180.49it/s]

Not found reg                          uy: rio negro
ciudad           rio negro-rio negro rural
geo_precision                       ciudad
adm0                                   URY
lat                     -32.48331052724436
lon                    -58.233212660427505
adm1                                   NaN
adm2                                   NaN
Name: 147, dtype: object
Not found reg                       bo: tarija
ciudad                tarija-yacuiba
geo_precision                 ciudad
adm0                             BOL
lat              -22.004455806442174
lon               -63.67483841576406
adm1                             NaN
adm2                             NaN
Name: 201, dtype: object


649it [00:10, 40.36it/s] 

Not found reg                  cr: puntarenas
ciudad           puntarenas-central
geo_precision                ciudad
adm0                            CRI
lat               9.980832658008849
lon              -84.84762704727747
adm1                            NaN
adm2                            NaN
Name: 616, dtype: object


1754it [00:21, 217.18it/s]

Not found reg                py: ciudades pequenas
ciudad           presidente hayes-nanawa
geo_precision                     ciudad
adm0                                 PRY
lat                  -25.294989641677986
lon                  -57.684430020950224
adm1                                 NaN
adm2                                 NaN
Name: 1693, dtype: object
Not found reg              do: resto del pais
ciudad            elias pina-banica
geo_precision                ciudad
adm0                            DOM
lat               19.08021757476525
lon              -71.70233496084059
adm1                            NaN
adm2                            NaN
Name: 1716, dtype: object


2063it [00:25, 79.79it/s] 

Not found reg                  baja california sur
ciudad           baja california-comondu
geo_precision                     ciudad
adm0                                 MEX
lat                    32.66159984564221
lon                  -115.51850823177713
adm1                                 NaN
adm2                                 NaN
Name: 2074, dtype: object


3317it [00:47, 69.33it/s] 

Mistmatches: 0 0.0





Unnamed: 0,reg,ciudad,geo_precision,adm0,lat,lon,adm1,adm2
0,ar: pampeana-buenos aires,buenos aires-gran buenos aires,ciudad,ARG,-34.612413095262305,-58.44050994242362,ARG.5_1,ARG.5.1_1
1,ar: capital federal,capital federal-ciudad de buenos aires,ciudad,ARG,-34.60331062979878,-58.38799008712614,ARG.5_1,ARG.5.1_1
2,ar: pampeana-cordoba,cordoba-gran cordoba,ciudad,ARG,-31.40594952247841,-64.19985286963004,ARG.6_1,ARG.6.2_1
3,ar: pampeana-santa fe,santa fe-gran rosario,ciudad,ARG,-32.930053196164856,-60.72147181933543,ARG.21_1,ARG.21.12_1
4,ar: cuyo-mendoza,mendoza-gran mendoza,ciudad,ARG,-32.56452951825084,-68.76146298779408,ARG.13_1,ARG.13.7_1


In [9]:
df['reg'].fillna('NA', inplace=True)
df['ciudad'].fillna('NA', inplace=True)
df = df.merge(missing_locs, on=['adm0', 'reg', 'ciudad'], how='left', suffixes=['', '_y'])

for i in range(1, 3):
    df[f'adm{i}'].fillna('NA', inplace=True)
    df[f'adm{i}'] = np.where(df[f'adm{i}']=='NA', df[f'adm{i}_y'], df[f'adm{i}'])
    
df.drop(columns=['adm1_y', 'adm2_y', 'lat', 'lon'], inplace=True)

In [12]:
df.to_csv(output_path / 'Latinobarometro_locs.csv', index=False)

# Collapse files

In [6]:
invalid = ['No contesta', 'No sabe', 'Ninguna']
default_cols = ['numinves', 'idenpa', 'reg', 'ciudad']

# 1995
cues = pd.read_csv('../Data/Latinobarometro/csv/1995.csv', encoding='latin', dtype=str)
cues.columns = [x.lower() for x in cues.columns]
df = cues[['numero', 'pais', 'region']].copy()
df.columns = default_cols[:-1]
df['izquierda-derecha'] = cues['p31']
df['empleo_pais'] = cues['p16']
df['empleo_mujer_bueno'] = cues['p66a']
df['empleo_mujer_mismo_salario'] = cues['p66b']
df['empleo_mujer_educacion'] = cues['p66c']
df['empleo_perder'] = cues['s12b']
df['mercosur'] = cues['p51']
df['confianza_argentina'] = cues['p54a']
df['confianza_chile'] = cues['p54b']
df['confianza_brasil'] = cues['p54c']
df['confianza_uruguay'] = cues['p54d']
df['confianza_ecuador'] = cues['p54e']
df['confianza_paraguay'] = cues['p54f']
df['confianza_venezuela'] = cues['p54g']
df['confianza_mexico'] = cues['p54h']
df['confianza_peru'] = cues['p54i']
df['confianza_colombia'] = cues['p54j']
df['confianza_bolivia'] = cues['p54k']
df['confianza_eeuu'] = cues['p54l']
df['confianza_japon'] = cues['p54m']
df['confianza_europa'] = cues['p54n']
df['sexo'] = cues['s1']
df['edad'] = cues['s2']
df['estado_civil'] = cues['s6']
df['ocupacion'] = cues['s10']

# 1996
cues = pd.read_csv('../Data/Latinobarometro/csv/1996.csv', encoding='latin', dtype=str)
cues.columns = [x.lower() for x in cues.columns]
sdf = cues[['numero', 'pais', 'region', 'ciu']].copy()
sdf.columns = default_cols
sdf['izquierda-derecha'] = cues['p38']
sdf['empleo_mujer_bueno'] = cues['p69a']
sdf['empleo_mujer_mismo_salario'] = cues['p69b']
sdf['empleo_mujer_educacion'] = cues['p69c']
sdf['empleo_perder'] = cues['p14']
sdf['mercosur'] = cues['p58']
sdf['confianza_argentina'] = cues['p601a']
sdf['confianza_chile'] = cues['p601b']
sdf['confianza_brasil'] = cues['p601c']
sdf['confianza_uruguay'] = cues['p601d']
sdf['confianza_paraguay'] = cues['p601e']
sdf['confianza_venezuela'] = cues['p601f']
sdf['confianza_ecuador'] = cues['p601g']
sdf['confianza_mexico'] = cues['p601h']
sdf['confianza_peru'] = cues['p601i']
sdf['confianza_colombia'] = cues['p601j']
sdf['confianza_bolivia'] = cues['p601k']
sdf['confianza_eeuu'] = cues['p601l']
sdf['confianza_japon'] = cues['p601m']
sdf['confianza_europa'] = cues['p601n']
sdf['confianza_china'] = cues['p601o']
sdf['sexo'] = cues['s1']
sdf['edad'] = cues['s2']
sdf['estado_civil'] = cues['s5']
sdf['ocupacion'] = cues['s7a']
df = df.append(sdf, ignore_index=True)

# 1997
cues = pd.read_csv('../Data/Latinobarometro/csv/1997.csv', encoding='latin', dtype=str)
cues.columns = [x.lower() for x in cues.columns]
sdf = cues[default_cols].copy()
sdf['izquierda-derecha'] = cues['sp56']
sdf['empleo_encontrar'] = cues['sp82']
sdf['empleo_mujer_bueno'] = cues['sp69a']
sdf['empleo_mujer_mismo_salario'] = cues['sp69b']
sdf['empleo_mujer_educacion'] = cues['sp69c']
sdf['empleo_mujer_cargos_publicos'] = cues['sp69d']
sdf['empleo_perder'] = cues['sp81']
sdf['mercosur'] = cues['sp53']
p = 'sp66'
sdf['confianza_argentina'] = cues[f'{p}a']
sdf['confianza_chile'] = cues[f'{p}b']
sdf['confianza_brasil'] = cues[f'{p}c']
sdf['confianza_uruguay'] = cues[f'{p}d']
sdf['confianza_paraguay'] = cues[f'{p}e']
sdf['confianza_venezuela'] = cues[f'{p}f']
sdf['confianza_ecuador'] = cues[f'{p}g']
sdf['confianza_mexico'] = np.where(cues[f'{p}h'].isin(invalid), cues['sp67a'], cues[f'{p}h'])
sdf['confianza_peru'] = cues[f'{p}i']
sdf['confianza_colombia'] = cues[f'{p}j']
sdf['confianza_bolivia'] = cues[f'{p}k']
sdf['confianza_eeuu'] = np.where(cues[f'{p}l'].isin(invalid), cues['sp67e'], cues[f'{p}l'])
p = 'sp67'
sdf['confianza_honduras'] = cues[f'{p}b']
sdf['confianza_guatemala'] = cues[f'{p}c']
sdf['confianza_salvador'] = cues[f'{p}d']
sdf['confianza_nicaragua'] = cues[f'{p}f']
sdf['confianza_panama'] = cues[f'{p}g']
sdf['confianza_costa_rica'] = cues[f'{p}h']
sdf['sexo'] = cues['s1']
sdf['edad'] = cues['s2']
sdf['estado_civil'] = cues['s5']
sdf['ocupacion'] = cues['s7a']
sdf['religion'] = cues['sp85']
sdf['salario'] = cues['sp87']
df = df.append(sdf, ignore_index=True)

# 1998
cues = pd.read_csv('../Data/Latinobarometro/csv/1998.csv', encoding='latin', dtype=str)
cues.columns = [x.lower() for x in cues.columns]
sdf = cues[default_cols].copy()
sdf['izquierda-derecha'] = cues['sp52']
sdf['empleo_encontrar'] = cues['sp79']
sdf['sexo'] = cues['s1']
sdf['edad'] = cues['s2']
sdf['estado_civil'] = cues['s5']
sdf['ocupacion'] = cues['s7a']
sdf['religion'] = cues['sp80']
sdf['salario'] = cues['sp83']
df = df.append(sdf, ignore_index=True)

# 2000
cues = pd.read_csv('../Data/Latinobarometro/csv/2000.csv', encoding='latin', dtype=str)
cues.columns = [x.lower() for x in cues.columns]
sdf = cues[default_cols].copy()
sdf['izquierda-derecha'] = cues['p52st']
sdf['empleo_mujer_bueno'] = cues['p70st_a']
sdf['empleo_mujer_mismo_salario'] = cues['p70st_b']
sdf['empleo_mujer_educacion'] = cues['p70st_c']
sdf['empleo_mujer_cargos_publicos'] = cues['p70st_d']
sdf['sexo'] = cues['s1']
sdf['edad'] = cues['s2']
sdf['estado_civil'] = cues['s4']
sdf['ocupacion'] = cues['s8a']
sdf['religion'] = cues['p76st']
sdf['salario'] = cues['p78st']
df = df.append(sdf, ignore_index=True)

# 2001
cues = pd.read_csv('../Data/Latinobarometro/csv/2001.csv', encoding='latin', dtype=str)
cues.columns = [x.lower() for x in cues.columns]
sdf = cues[default_cols].copy()
sdf['izquierda-derecha'] = cues['p54st']
sdf['empleo_encontrar'] = cues['p34st']
sdf['sentimiento_racial'] = cues['p72njl']
sdf['sexo'] = cues['s1']
sdf['edad'] = cues['s2']
sdf['estado_civil'] = cues['s4']
sdf['ocupacion'] = cues['s8a']
sdf['religion'] = cues['p86st']
sdf['salario'] = cues['p88st']
df = df.append(sdf, ignore_index=True)

# 2002
cues = pd.read_csv('../Data/Latinobarometro/csv/2002.csv', encoding='latin', dtype=str)
cues.columns = [x.lower() for x in cues.columns]
sdf = cues[default_cols].copy()
sdf['izquierda-derecha'] = cues['p64st']
sdf['impacto_extrangeros_trabajo'] = cues['p62no2a']
sdf['impacto_extrangeros_ideas'] = cues['p62no2b']
sdf['impacto_extrangeros_no_entrar'] = cues['p62no2c']
sdf['impacto_extrangeros_aceptarlos'] = cues['p62no2d']
sdf['sexo'] = cues['s1']
sdf['edad'] = cues['s2']
sdf['estado_civil'] = cues['s4']
sdf['ocupacion'] = cues['s8a']
sdf['religion'] = cues['p65st']
sdf['salario'] = cues['p67st']
df = df.append(sdf, ignore_index=True)

# 2003
cues = pd.read_csv('../Data/Latinobarometro/csv/2003.csv', encoding='latin', dtype=str)
cues.columns = [x.lower() for x in cues.columns]
sdf = cues[default_cols].copy()
sdf['izquierda-derecha'] = cues['p60st']
sdf['sexo'] = cues['s1']
sdf['edad'] = cues['s2']
sdf['estado_civil'] = cues['s4']
sdf['ocupacion'] = cues['s8a']
sdf['religion'] = cues['p91st']
sdf['salario'] = cues['p90st']
df = df.append(sdf, ignore_index=True)

# 2004
cues = pd.read_csv('../Data/Latinobarometro/csv/2004.csv', encoding='latin', dtype=str)
cues.columns = [x.lower() for x in cues.columns]
sdf = cues[['numinves', 'idenpa', 'reg', 'ciudad']].copy()
sdf.columns = default_cols
sdf['izquierda-derecha'] = cues['p87st']
sdf['discriminacion'] = cues['p37n']
sdf['sexo'] = cues['s1']
sdf['edad'] = cues['s2']
sdf['estado_civil'] = cues['s4']
sdf['ocupacion'] = cues['s8a']
sdf['religion'] = cues['p90st']
sdf['salario'] = cues['p89st']
sdf['idioma'] = cues['s13']
df = df.append(sdf, ignore_index=True)

# 2005
cues = pd.read_csv('../Data/Latinobarometro/csv/2005.csv', encoding='latin', dtype=str)
cues.columns = [x.lower() for x in cues.columns]
sdf = cues[default_cols].copy()
sdf['izquierda-derecha'] = cues['p34st']
sdf['discriminacion'] = cues['p93st']
sdf['salario'] = cues['s1']
sdf['religion'] = cues['s2']
sdf['emigrar'] = cues['s4']
sdf['sexo'] = cues['s6']
sdf['edad'] = cues['s7']
sdf['estado_civil'] = cues['s9']
sdf['ocupacion'] = cues['s13a']
sdf['idioma'] = cues['s18']
df = df.append(sdf, ignore_index=True)

# 2006
cues = pd.read_csv('../Data/Latinobarometro/csv/2006.csv', encoding='latin', dtype=str)
cues.columns = [x.lower() for x in cues.columns]
sdf = cues[default_cols].copy()
sdf['izquierda-derecha'] = cues['p47st']
sdf['discriminacion'] = cues['p46stm']
p = 'p61n'
sdf['confianza_venezuela'] = cues[f'{p}_a']
sdf['confianza_bolivia'] = cues[f'{p}_b']
sdf['confianza_argentina'] = cues[f'{p}_c']
sdf['confianza_ecuador'] = cues[f'{p}_d']
sdf['confianza_mexico'] = cues[f'{p}_e']
sdf['salario'] = cues['s1']
sdf['religion'] = cues['s2']
sdf['emigrar'] = cues['s4']
sdf['sexo'] = cues['s6']
sdf['edad'] = cues['s7']
sdf['estado_civil'] = cues['s9']
sdf['ocupacion'] = cues['s13a']
sdf['idioma'] = cues['s18']
df = df.append(sdf, ignore_index=True)

# 2007
cues = pd.read_csv('../Data/Latinobarometro/csv/2007.csv', encoding='latin', dtype=str)
cues.columns = [x.lower() for x in cues.columns]
sdf = cues[default_cols].copy()
sdf['izquierda-derecha'] = cues['p67st']
sdf['empleo_perder'] = cues['s1']
sdf['salario'] = cues['s2']
sdf['religion'] = cues['s4']
sdf['emigrar'] = cues['s6']
sdf['nacional'] = cues['s8n']
sdf['raza'] = cues['s9']
sdf['sexo'] = cues['s10']
sdf['edad'] = cues['s11']
sdf['estado_civil'] = cues['s9']
sdf['ocupacion'] = cues['s17a']
sdf['idioma'] = cues['s22']
df = df.append(sdf, ignore_index=True)

# 2010
cues = pd.read_csv('../Data/Latinobarometro/csv/2010.csv', encoding='latin', dtype=str)
cues.columns = [x.lower() for x in cues.columns]
sdf = cues[default_cols].copy()
sdf['izquierda-derecha'] = cues['p60st']
sdf['impacto_extrangeros_trabajo'] = cues['p69st_a']
sdf['impacto_extrangeros_aceptarlos'] = cues['p69st_b']
sdf['empleo_perder'] = cues['s3']
sdf['nacimiento'] = cues['s1ncc']
sdf['salario'] = cues['s4']
sdf['religion'] = cues['s9']
sdf['raza'] = cues['s20']
sdf['sexo'] = cues['s7']
sdf['edad'] = cues['s8']
sdf['estado_civil'] = cues['s5']
sdf['ocupacion'] = cues['s16a']
sdf['idioma'] = cues['p79_a']
sdf['nacional'] = cues['s18']
sdf['emigrar'] = cues['s19']
df = df.append(sdf, ignore_index=True)

# 2011
cues = pd.read_csv('../Data/Latinobarometro/csv/2011.csv', encoding='latin', dtype=str)
cues.columns = [x.lower() for x in cues.columns]
sdf = cues[default_cols].copy()
sdf['izquierda-derecha'] = cues['p76st']
sdf['empleo_perder'] = cues['s9']
sdf['salario'] = cues['s10icc12']
sdf['religion'] = cues['s18']
sdf['raza'] = cues['s27']
sdf['sexo'] = cues['s16']
sdf['edad'] = cues['s17']
sdf['estado_civil'] = cues['s15']
sdf['ocupacion'] = cues['s16']
sdf['idioma'] = cues['s11_a']
sdf['nacional'] = cues['s25']
sdf['emigrar'] = cues['s26']
df = df.append(sdf, ignore_index=True)


# 2013
cues = pd.read_csv('../Data/Latinobarometro/csv/2013.csv', encoding='latin', dtype=str)
cues.columns = [x.lower() for x in cues.columns]
sdf = cues[default_cols].copy()
sdf['izquierda-derecha'] = cues['p41st']
sdf['empleo_perder'] = cues['s5']
sdf['salario'] = cues['s6']
sdf['religion'] = cues['s14']
sdf['raza'] = cues['s21']
sdf['sexo'] = cues['s10']
sdf['edad'] = cues['s11']
sdf['estado_civil'] = cues['s9']
sdf['ocupacion'] = cues['s19_a']
sdf['idioma'] = cues['s7_a']
sdf['nacional'] = cues['s12']
sdf['emigrar'] = cues['s13']
df = df.append(sdf, ignore_index=True)

# 2015
cues = pd.read_csv('../Data/Latinobarometro/csv/2015.csv', encoding='latin', dtype=str)
cues.columns = [x.lower() for x in cues.columns]
sdf = cues[default_cols].copy()
sdf['izquierda-derecha'] = cues['p27st']
sdf['empleo_perder'] = cues['s3']
sdf['impacto_extrangeros_trabajo'] = cues['p44st_a']
sdf['impacto_extrangeros_no_entrar'] = cues['p44st_b']
sdf['impacto_extrangeros_aceptarlos'] = cues['p44stm_c']
sdf['salario'] = cues['s4']
sdf['religion'] = cues['s16']
sdf['raza'] = cues['s23']
sdf['sexo'] = cues['s12']
sdf['edad'] = cues['s13']
sdf['estado_civil'] = cues['s11']
sdf['ocupacion'] = cues['s21_a']
sdf['idioma'] = cues['s5a']
sdf['nacional'] = cues['s14']
sdf['emigrar'] = cues['s15']
df = df.append(sdf, ignore_index=True)

# 2016
cues = pd.read_csv('../Data/Latinobarometro/csv/2016.csv', encoding='latin', dtype=str)
cues.columns = [x.lower() for x in cues.columns]
sdf = cues[default_cols].copy()
sdf['izquierda-derecha'] = cues['p17st']
sdf['empleo_perder'] = cues['s3']
sdf['salario'] = cues['s4']
sdf['religion'] = cues['s8']
sdf['raza'] = cues['s9']
sdf['fecha_de_nacimiento'] = cues['s6']
sdf['estado_civil'] = cues['s5']
sdf['ocupacion'] = cues['s18a']
sdf['idioma'] = cues['s17a']
sdf['nacional'] = cues['s7']
sdf['emigrar'] = cues['s10']
df = df.append(sdf, ignore_index=True)

# 2017
cues = pd.read_csv('../Data/Latinobarometro/csv/2017.csv', encoding='latin', dtype=str)
cues.columns = [x.lower() for x in cues.columns]
sdf = cues[default_cols].copy()
sdf['izquierda-derecha'] = cues['p19stc']
sdf['empleo_perder'] = cues['s4']
sdf['salario'] = cues['s5']
sdf['religion'] = cues['s9']
sdf['raza'] = cues['s10']
sdf['fecha_de_nacimiento'] = cues['s7']
sdf['estado_civil'] = cues['s6']
sdf['ocupacion'] = cues['s18_a']
sdf['idioma'] = cues['s24_a']
sdf['nacional'] = cues['s8']
sdf['emigrar'] = cues['s11']
df = df.append(sdf, ignore_index=True)

# 2018
cues = pd.read_csv('../Data/Latinobarometro/csv/2018.csv', encoding='latin', dtype=str)
cues.columns = [x.lower() for x in cues.columns]
sdf = cues[default_cols].copy()
sdf['izquierda-derecha'] = cues['p22st']
sdf['empleo_perder'] = cues['s3']
sdf['impacto_extranjeros_personal'] = cues['p43n']
sdf['salario'] = cues['s4']
sdf['religion'] = cues['s5']
sdf['raza'] = cues['s6']
sdf['estado_civil'] = cues['s23']
sdf['ocupacion'] = cues['s14a']
sdf['idioma'] = cues['s13a']
sdf['nacional'] = cues['s16']
sdf['emigrar'] = cues['s7']
df = df.append(sdf, ignore_index=True)

In [8]:
for col in ['idenpa', 'reg', 'ciudad']:
    df[col] = df[col].apply(clean_text)
    
df['reg'] = df['reg'].str.replace('/', '-')
df['reg'] = df['reg'].str.replace('mx: norte-', '')
df['reg'] = df['reg'].str.replace('mx: sur-', '')

df['ciudad'] = df['ciudad'].str.replace('/', '-')
df['ciudad'] = df['ciudad'].apply(lambda x: x.split(': ', 1)[-1] if isinstance(x, str) else x)

for rep in ('norte-', 'sur-', 'sudeste-', 'nordeste-', 'centro oeste-', 'occidente-', 
            'centro-', 'metropolitana-'):
    df['ciudad'] = df['ciudad'].str.replace(rep, '')
df['ciudad'] = df['ciudad'].apply(lambda x: x[1:] if isinstance(x, str) and x.startswith('-') else x)
    
df.head()

Unnamed: 0,numinves,idenpa,reg,izquierda-derecha,empleo_pais,empleo_mujer_bueno,empleo_mujer_mismo_salario,empleo_mujer_educacion,empleo_perder,mercosur,confianza_argentina,confianza_chile,confianza_brasil,confianza_uruguay,confianza_ecuador,confianza_paraguay,confianza_venezuela,confianza_mexico,confianza_peru,confianza_colombia,confianza_bolivia,confianza_eeuu,confianza_japon,confianza_europa,sexo,edad,estado_civil,ocupacion,ciudad,confianza_china,empleo_encontrar,empleo_mujer_cargos_publicos,confianza_honduras,confianza_guatemala,confianza_salvador,confianza_nicaragua,confianza_panama,confianza_costa_rica,religion,salario,sentimiento_racial,impacto_extrangeros_trabajo,impacto_extrangeros_ideas,impacto_extrangeros_no_entrar,impacto_extrangeros_aceptarlos,discriminacion,idioma,emigrar,nacional,raza,nacimiento,fecha_de_nacimiento,impacto_extranjeros_personal
0,1995,chile,cl: i region: tarapaca,5,Mejorara un poco,Si,No,Si,No aplicada,Se beneficia,Algo de confianza,No aplica,Algo de confianza,Algo de confianza,Algo de confianza,Algo de confianza,Algo de confianza,Algo de confianza,Poca confianza,Algo de confianza,Poca confianza,Algo de confianza,Algo de confianza,Algo de confianza,Hombre,23,Casado/conviviente,Estudiante,,,,,,,,,,,,,,,,,,,,,,,,,
1,1995,chile,cl: i region: tarapaca,9,Empeorara mucho,No,Si,Si,No aplicada,Se beneficia,Algo de confianza,No aplica,Algo de confianza,Poca confianza,Algo de confianza,Algo de confianza,Algo de confianza,Algo de confianza,Poca confianza,Algo de confianza,Poca confianza,Algo de confianza,Algo de confianza,Algo de confianza,Mujer,21,Soltero,Estudiante,,,,,,,,,,,,,,,,,,,,,,,,,
2,1995,chile,cl: i region: tarapaca,5,Mejorara un poco,No,No,Si,No preocupado,No sabe,Algo de confianza,No aplica,Algo de confianza,Algo de confianza,Algo de confianza,Algo de confianza,Algo de confianza,Algo de confianza,Algo de confianza,Algo de confianza,Algo de confianza,Algo de confianza,Algo de confianza,Algo de confianza,Hombre,34,Soltero,Asalariado privado,,,,,,,,,,,,,,,,,,,,,,,,,
3,1995,chile,cl: i region: tarapaca,Izquierda,Mejorara un poco,No,No,Si,No aplicada,Se beneficia,Poca confianza,No aplica,Poca confianza,Poca confianza,Poca confianza,Poca confianza,Poca confianza,Algo de confianza,Poca confianza,Poca confianza,Algo de confianza,Algo de confianza,Algo de confianza,Algo de confianza,Mujer,30,Casado/conviviente,Responsable casa,,,,,,,,,,,,,,,,,,,,,,,,,
4,1995,chile,cl: i region: tarapaca,5,Mejorara un poco,No,No,Si,No aplicada,Se beneficia,Poca confianza,No aplica,Mucha confianza,Poca confianza,Algo de confianza,Algo de confianza,Mucha confianza,Mucha confianza,Algo de confianza,Algo de confianza,Poca confianza,Mucha confianza,Mucha confianza,Mucha confianza,Mujer,44,Casado/conviviente,Responsable casa,,,,,,,,,,,,,,,,,,,,,,,,,


In [11]:
locs = pd.read_csv('../Output/Latinobarometro/Latinobarometro_locs.csv')
locs.fillna('NA', inplace=True)

on = ['idenpa', 'reg', 'ciudad']
df = df.fillna('NA').merge(locs, on=on, how='left')

print(df['adm1'].isnull().sum())
cols = on + [x for x in df if x not in on]
df[cols].to_csv('../Output/Latinobarometro/Latinobarometro.csv', index=False)

1601
