# ENIGH databases

@roman

25 June, 2024

In [68]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from tqdm import tqdm

In [13]:
# Settings
# show 100 columns in pandas
pd.set_option('display.max_columns', 500)

In [3]:
# data root
ENIGH_ROOT = "../../data/catalogues/adamuz_data/enigh_{enigh_year}/"

---
# Data

Try with 2018 ENIGH

## Read a file

In [18]:
# year
YEAR_ENIGH = 2018

In [19]:
# important files
files = [
    'enigh{enigh_year}_ns_concentradohogar_csv.zip',
    'enigh{enigh_year}_ns_gastoshogar_csv.zip',
    'enigh{enigh_year}_ns_ingresos_csv.zip',
    'enigh{enigh_year}_ns_viviendas_csv.zip',
]

# append year to files
files = [file.format(enigh_year=YEAR_ENIGH) for file in files]

# unzip files
ENIGH_ROOT = ENIGH_ROOT.format(enigh_year=YEAR_ENIGH)
for file in files:
    os.system(f"unzip -o {ENIGH_ROOT + file} -d {ENIGH_ROOT}")


Archive:  ../../data/catalogues/adamuz_data/enigh_2018/enigh2018_ns_concentradohogar_csv.zip
  inflating: ../../data/catalogues/adamuz_data/enigh_2018/concentradohogar.csv  
  inflating: ../../data/catalogues/adamuz_data/enigh_2018/nota_bases_datos_enigh2018_ns.txt  
Archive:  ../../data/catalogues/adamuz_data/enigh_2018/enigh2018_ns_gastoshogar_csv.zip
  inflating: ../../data/catalogues/adamuz_data/enigh_2018/gastoshogar.csv  
Archive:  ../../data/catalogues/adamuz_data/enigh_2018/enigh2018_ns_ingresos_csv.zip
  inflating: ../../data/catalogues/adamuz_data/enigh_2018/ingresos.csv  
Archive:  ../../data/catalogues/adamuz_data/enigh_2018/enigh2018_ns_viviendas_csv.zip
  inflating: ../../data/catalogues/adamuz_data/enigh_2018/viviendas.csv  
  inflating: ../../data/catalogues/adamuz_data/enigh_2018/nota_bases_datos_enigh2018_ns.txt  


In [21]:
# read all csv and save them in a dictionary
dict_df = {}

# get only files which are .csv
files = [file for file in os.listdir(ENIGH_ROOT) if file.endswith(".csv")]

# read all files
for file in files:
    dict_df[file.split(".")[0]] = pd.read_csv(ENIGH_ROOT + file, encoding="latin1")

# see all keys
dict_df.keys()

  dict_df[file.split(".")[0]] = pd.read_csv(ENIGH_ROOT + file, encoding="latin1")
  dict_df[file.split(".")[0]] = pd.read_csv(ENIGH_ROOT + file, encoding="latin1")


dict_keys(['ingresos', 'gastoshogar', 'viviendas', 'concentradohogar'])

## Check Ingresos

In [24]:
# see a sample
dict_df['ingresos'].sample(5)

Unnamed: 0,ï»¿folioviv,foliohog,numren,clave,mes_1,mes_2,mes_3,mes_4,mes_5,mes_6,ing_1,ing_2,ing_3,ing_4,ing_5,ing_6,ing_tri
231244,2201182801,1,1,P008,,,,,,,80,,,,,,19.67
346165,3260310005,1,5,P009,,,,,,,1800,,,,,,440.21
118084,1100837301,1,2,P053,8.0,7.0,6.0,5.0,4.0,3.0,1500,0.0,0.0,0.0,0.0,0.0,733.69
196425,1860360007,1,4,P009,,,,,,,1800,,,,,,440.21
210451,2000321603,1,3,P001,9.0,8.0,7.0,6.0,5.0,4.0,2800,2800.0,2800.0,2800.0,2800.0,2800.0,8262.29


## Check Gastos

In [25]:
# see a sample
dict_df['gastoshogar'].sample(5)

Unnamed: 0,ï»¿folioviv,foliohog,clave,tipo_gasto,mes_dia,forma_pag1,forma_pag2,forma_pag3,lugar_comp,orga_inst,frecuencia,fecha_adqu,fecha_pago,cantidad,gasto,pago_mp,costo,inmujer,inst_1,inst_2,num_meses,num_pagos,ultim_pago,gasto_tri,gasto_nm,gas_nm_tri,imujer_tri
684875,1203980209,1,F003,G1,0,1,0,0,0,0,0,0,0,,400.0,,,,,,,,,1161.29,,,
1445066,2604276002,1,J021,G1,0,1,0,0,5,0,0,0,0,,200.0,120.0,,200.0,,,,,,195.65,,,195.65
3926980,3002475005,1,A124,G1,1108,1,0,0,4,0,0,0,0,0.5,11.0,,,,,,,,,141.42,,,
4362264,1301544302,1,G102,G7,0,0,0,0,0,0,0,0,0,,,,,,,,,,,,800.0,2322.58,
2168303,101807005,1,A129,G1,1113,1,0,0,4,0,0,0,0,0.5,8.0,,,,,,,,,102.85,,,


In [30]:
# see a sample
dict_df['gastoshogar'][dict_df['gastoshogar']["ï»¿folioviv"] == 100013601]

Unnamed: 0,ï»¿folioviv,foliohog,clave,tipo_gasto,mes_dia,forma_pag1,forma_pag2,forma_pag3,lugar_comp,orga_inst,frecuencia,fecha_adqu,fecha_pago,cantidad,gasto,pago_mp,costo,inmujer,inst_1,inst_2,num_meses,num_pagos,ultim_pago,gasto_tri,gasto_nm,gas_nm_tri,imujer_tri
0,100013601,1,C002,G1,0,1,0,0,6,0,0,0,0,,30.0,,,,,,,,,90.0,,,
1,100013601,1,C004,G1,0,1,0,0,10,0,0,0,0,,84.0,,,,,,,,,252.0,,,
2,100013601,1,C005,G1,0,1,0,0,10,0,0,0,0,,60.0,,,,,,,,,180.0,,,
3,100013601,1,D001,G1,0,1,0,0,6,0,0,0,0,,33.0,,,-1.0,,,,,,99.0,,,-1.0
4,100013601,1,F007,G1,0,1,0,0,0,0,0,0,0,,2400.0,,,,,,,,,7200.0,,,
2076553,100013601,1,J016,G6,0,0,0,0,0,3,3,0,0,,,,30.0,,,,,,,,30.0,29.34,
2076554,100013601,1,J031,G6,0,0,0,0,0,3,3,0,0,,,,500.0,,,,,,,,500.0,489.13,
2139125,100013601,1,A004,G1,930,1,0,0,4,0,0,0,0,0.5,15.0,,,,,,,,,192.85,,,
2139126,100013601,1,A004,G1,1001,1,0,0,4,0,0,0,0,0.5,15.0,,,,,,,,,192.85,,,
2139127,100013601,1,A004,G1,1002,1,0,0,4,0,0,0,0,0.5,15.0,,,,,,,,,192.85,,,


## Check Viviendas

In [36]:
# see size
dict_df['viviendas'].shape

(73405, 64)

In [31]:
# see a sample
dict_df['viviendas'].sample(5)

Unnamed: 0,ï»¿folioviv,tipo_viv,mat_pared,mat_techos,mat_pisos,antiguedad,antigua_ne,cocina,cocina_dor,cuart_dorm,num_cuarto,disp_agua,dotac_agua,excusado,uso_compar,sanit_agua,biodigest,bano_comp,bano_excus,bano_regad,drenaje,disp_elect,focos_inca,focos_ahor,combustible,estufa_chi,eli_basura,tenencia,renta,estim_pago,pago_viv,pago_mesp,tipo_adqui,viv_usada,tipo_finan,num_dueno1,hog_dueno1,num_dueno2,hog_dueno2,escrituras,lavadero,fregadero,regadera,tinaco_azo,cisterna,pileta,calent_sol,calent_gas,medidor_luz,bomba_agua,tanque_gas,aire_acond,calefacc,tot_resid,tot_hom,tot_muj,tot_hog,ubica_geo,tam_loc,est_socio,est_dis,upm,factor,procaptar
7020,301383016,1,8,10,3,,1.0,1,2,1,5,1,5.0,1,2,1,2,1,0,0,1,1,0,8,5,,1,1,4000.0,,,1.0,,,,,,,,,1,1,1,1,2,1,2,2,1,2,2,2,2,2,1,1,1,3008,3,3,39,806,113,0
12078,506564021,1,8,3,2,51.0,,1,2,3,4,1,1.0,1,2,3,2,0,1,0,5,1,3,0,3,,1,4,,700.0,,,3.0,,5.0,1.0,1.0,,,3.0,1,1,2,2,2,2,2,1,1,2,2,2,2,4,3,1,1,5020,3,2,80,1546,202,0
52069,2260526918,1,8,10,2,10.0,,1,2,2,4,1,2.0,1,2,2,2,1,0,0,1,1,0,5,3,,1,4,,1500.0,,,3.0,,5.0,1.0,1.0,,,3.0,1,1,1,1,2,2,2,2,1,2,2,2,2,5,3,2,1,22014,4,2,360,5899,131,0
49094,2161215320,1,8,10,2,4.0,,1,2,3,4,6,,1,2,2,2,0,2,0,1,1,0,5,3,,4,3,,1500.0,900.0,1.0,2.0,,1.0,1.0,1.0,,,3.0,1,2,2,2,1,2,2,2,1,2,1,2,2,4,2,2,1,21114,4,1,342,5578,1016,0
49644,2200827906,1,8,10,3,,1.0,1,2,3,6,1,1.0,1,2,1,2,1,1,0,1,1,0,14,3,,1,2,,6000.0,,,,,,,,,,,1,1,1,1,2,2,2,1,1,2,1,2,2,4,2,2,1,22014,1,4,346,5663,513,0


In [35]:
# see a sample
dict_df['viviendas'][dict_df['viviendas']["ï»¿folioviv"] == 2200827906]

Unnamed: 0,ï»¿folioviv,tipo_viv,mat_pared,mat_techos,mat_pisos,antiguedad,antigua_ne,cocina,cocina_dor,cuart_dorm,num_cuarto,disp_agua,dotac_agua,excusado,uso_compar,sanit_agua,biodigest,bano_comp,bano_excus,bano_regad,drenaje,disp_elect,focos_inca,focos_ahor,combustible,estufa_chi,eli_basura,tenencia,renta,estim_pago,pago_viv,pago_mesp,tipo_adqui,viv_usada,tipo_finan,num_dueno1,hog_dueno1,num_dueno2,hog_dueno2,escrituras,lavadero,fregadero,regadera,tinaco_azo,cisterna,pileta,calent_sol,calent_gas,medidor_luz,bomba_agua,tanque_gas,aire_acond,calefacc,tot_resid,tot_hom,tot_muj,tot_hog,ubica_geo,tam_loc,est_socio,est_dis,upm,factor,procaptar
49644,2200827906,1,8,10,3,,1,1,2,3,6,1,1,1,2,1,2,1,1,0,1,1,0,14,3,,1,2,,6000,,,,,,,,,,,1,1,1,1,2,2,2,1,1,2,1,2,2,4,2,2,1,22014,1,4,346,5663,513,0


## Check Concentrado Hogar

In [37]:
# see size
dict_df['concentradohogar'].shape

(74647, 126)

In [38]:
# see a sample
dict_df['concentradohogar'].sample(5)

Unnamed: 0,ï»¿folioviv,foliohog,ubica_geo,tam_loc,est_socio,est_dis,upm,factor,clase_hog,sexo_jefe,edad_jefe,educa_jefe,tot_integ,hombres,mujeres,mayores,menores,p12_64,p65mas,ocupados,percep_ing,perc_ocupa,ing_cor,ingtrab,trabajo,sueldos,horas_extr,comisiones,aguinaldo,indemtrab,otra_rem,remu_espec,negocio,noagrop,industria,comercio,servicios,agrope,agricolas,pecuarios,reproducc,pesca,otros_trab,rentas,utilidad,arrenda,transfer,jubilacion,becas,donativos,...,vestido,calzado,vivienda,alquiler,pred_cons,agua,energia,limpieza,cuidados,utensilios,enseres,salud,atenc_ambu,hospital,medicinas,transporte,publico,foraneo,adqui_vehi,mantenim,refaccion,combus,comunica,educa_espa,educacion,esparci,paq_turist,personales,cuida_pers,acces_pers,otros_gas,transf_gas,percep_tot,retiro_inv,prestamos,otras_perc,ero_nm_viv,ero_nm_hog,erogac_tot,cuota_viv,mater_serv,material,servicio,deposito,prest_terc,pago_tarje,deudas,balance,otras_erog,smg
58665,2503868704,1,25012,1,4,396,6739,299,3,1,71,6,4,2,2,4,0,2,2,1,3,1,172485.17,112745.84,112745.84,85573.77,0.0,0.0,4672.12,0.0,0.0,22499.95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,52239.33,51639.33,0.0,0.0,...,2445.64,782.6,4245.0,0.0,1020.0,225.0,3000.0,3930.0,3930.0,0.0,0.0,1467.38,0.0,0.0,1467.38,23424.54,0.0,0.0,0.0,16947.54,147.54,16800.0,6477.0,20250.0,18900.0,1350.0,0.0,3135.0,3135.0,0.0,0.0,0.0,1285.71,0.0,0.0,0.0,0.0,1285.71,944.26,0.0,0.0,0.0,0.0,0.0,0.0,944.26,0.0,0.0,0.0,7952.4
25777,1060630016,1,10004,4,2,155,3131,148,3,1,72,4,5,1,4,3,2,1,2,2,3,2,47785.8,37839.11,22010.85,12521.73,2934.78,0.0,2152.17,0.0,4402.17,0.0,15828.26,15828.26,0.0,15828.26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7043.47,7043.47,0.0,0.0,...,1672.8,2064.1,4981.93,0.0,0.0,240.0,4741.93,2798.43,841.91,1956.52,0.0,1408.68,1408.68,0.0,0.0,3408.13,0.0,1956.52,0.0,0.0,0.0,0.0,1451.61,660.0,0.0,660.0,0.0,1393.53,1393.53,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,968.47,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,968.47,0.0,7952.4
13259,600296201,1,6002,1,4,86,1642,149,1,2,59,4,1,0,1,1,0,1,0,1,1,1,12648.0,8477.89,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2262.43,2262.43,0.0,0.0,2262.43,0.0,0.0,0.0,0.0,0.0,6215.46,0.0,0.0,0.0,4170.11,0.0,0.0,1491.71,...,0.0,0.0,4391.51,4064.51,0.0,0.0,327.0,400.62,400.62,0.0,0.0,0.0,0.0,0.0,0.0,8245.25,6634.25,0.0,0.0,0.0,0.0,0.0,1611.0,630.0,0.0,630.0,0.0,145.16,145.16,0.0,0.0,0.0,2699.98,0.0,0.0,0.0,0.0,2699.98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7952.4
22179,910581602,1,9007,1,3,138,2681,1538,3,1,31,10,3,1,2,3,0,2,1,2,3,2,85131.08,63590.1,44409.79,32459.01,2360.65,0.0,5655.72,0.0,3934.41,0.0,14754.09,14754.09,0.0,0.0,14754.09,0.0,0.0,0.0,0.0,0.0,4426.22,0.0,0.0,0.0,3540.98,0.0,0.0,0.0,...,0.0,586.95,670.5,0.0,300.0,70.5,300.0,1107.0,1107.0,0.0,0.0,1960.4,1486.94,0.0,473.46,15097.76,2057.12,1475.4,0.0,8325.24,885.24,7440.0,3240.0,0.0,0.0,0.0,0.0,4207.62,645.0,0.0,3562.62,0.0,0.0,0.0,0.0,0.0,0.0,0.0,127.86,0.0,127.86,127.86,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7952.4
44513,1910575407,1,19031,2,2,305,5168,769,2,1,43,6,5,4,1,3,2,3,0,2,2,2,37599.92,31793.47,31793.47,31793.47,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2080.64,0.0,0.0,0.0,2080.64,435.46,435.46,0.0,0.0,0.0,0.0,0.0,0.0,4573.8,3702.84,0.0,0.0,0.0,0.0,0.0,870.96,0.0,0.0,0.0,0.0,1071.26,1071.26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7952.4


In [40]:
# see columns
dict_df['concentradohogar'].info(True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74647 entries, 0 to 74646
Data columns (total 126 columns):
 #    Column       Dtype  
---   ------       -----  
 0    ï»¿folioviv  int64  
 1    foliohog     int64  
 2    ubica_geo    int64  
 3    tam_loc      int64  
 4    est_socio    int64  
 5    est_dis      int64  
 6    upm          int64  
 7    factor       int64  
 8    clase_hog    int64  
 9    sexo_jefe    int64  
 10   edad_jefe    int64  
 11   educa_jefe   int64  
 12   tot_integ    int64  
 13   hombres      int64  
 14   mujeres      int64  
 15   mayores      int64  
 16   menores      int64  
 17   p12_64       int64  
 18   p65mas       int64  
 19   ocupados     int64  
 20   percep_ing   int64  
 21   perc_ocupa   int64  
 22   ing_cor      float64
 23   ingtrab      float64
 24   trabajo      float64
 25   sueldos      float64
 26   horas_extr   float64
 27   comisiones   float64
 28   aguinaldo    float64
 29   indemtrab    float64
 30   otra_rem     float64

In [None]:
# see a sample
dict_df['concentradohogar'][dict_df['concentradohogar']["ï»¿folioviv"] == 2200827906]

Unnamed: 0,ï»¿folioviv,tipo_viv,mat_pared,mat_techos,mat_pisos,antiguedad,antigua_ne,cocina,cocina_dor,cuart_dorm,num_cuarto,disp_agua,dotac_agua,excusado,uso_compar,sanit_agua,biodigest,bano_comp,bano_excus,bano_regad,drenaje,disp_elect,focos_inca,focos_ahor,combustible,estufa_chi,eli_basura,tenencia,renta,estim_pago,pago_viv,pago_mesp,tipo_adqui,viv_usada,tipo_finan,num_dueno1,hog_dueno1,num_dueno2,hog_dueno2,escrituras,lavadero,fregadero,regadera,tinaco_azo,cisterna,pileta,calent_sol,calent_gas,medidor_luz,bomba_agua,tanque_gas,aire_acond,calefacc,tot_resid,tot_hom,tot_muj,tot_hog,ubica_geo,tam_loc,est_socio,est_dis,upm,factor,procaptar
49644,2200827906,1,8,10,3,,1,1,2,3,6,1,1,1,2,1,2,1,1,0,1,1,0,14,3,,1,2,,6000,,,,,,,,,,,1,1,1,1,2,2,2,1,1,2,1,2,2,4,2,2,1,22014,1,4,346,5663,513,0


^^^
with `concentradohogar` is enough

---
# Concentrado Hogar

The data diccionary is found in the [INEGI](https://www.inegi.org.mx/rnm/index.php/catalog/685/data-dictionary/F28?file_name=concentradohogar)

## S1: Read

In [23]:
# year
YEAR_ENIGH = 2018

In [24]:
# important files
files = [
    'enigh{enigh_year}_ns_concentradohogar_csv.zip'
]

# append year to files
files = [file.format(enigh_year=YEAR_ENIGH) for file in files]

# unzip files
ENIGH_ROOT = ENIGH_ROOT.format(enigh_year=YEAR_ENIGH)
for file in files:
    os.system(f"unzip -o {ENIGH_ROOT + file} -d {ENIGH_ROOT}")


Archive:  ../../data/catalogues/adamuz_data/enigh_2018/enigh2018_ns_concentradohogar_csv.zip
  inflating: ../../data/catalogues/adamuz_data/enigh_2018/concentradohogar.csv  
  inflating: ../../data/catalogues/adamuz_data/enigh_2018/nota_bases_datos_enigh2018_ns.txt  


In [47]:
# read only concentradohogar
file = ENIGH_ROOT.format(enigh_year=YEAR_ENIGH) + 'concentradohogar.csv'

# read file
df_concentradohogar = pd.read_csv(file)
df_concentradohogar.shape

(74647, 126)

In [48]:
# see columns
df_concentradohogar.info(True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74647 entries, 0 to 74646
Data columns (total 126 columns):
 #    Column      Dtype  
---   ------      -----  
 0    folioviv    int64  
 1    foliohog    int64  
 2    ubica_geo   int64  
 3    tam_loc     int64  
 4    est_socio   int64  
 5    est_dis     int64  
 6    upm         int64  
 7    factor      int64  
 8    clase_hog   int64  
 9    sexo_jefe   int64  
 10   edad_jefe   int64  
 11   educa_jefe  int64  
 12   tot_integ   int64  
 13   hombres     int64  
 14   mujeres     int64  
 15   mayores     int64  
 16   menores     int64  
 17   p12_64      int64  
 18   p65mas      int64  
 19   ocupados    int64  
 20   percep_ing  int64  
 21   perc_ocupa  int64  
 22   ing_cor     float64
 23   ingtrab     float64
 24   trabajo     float64
 25   sueldos     float64
 26   horas_extr  float64
 27   comisiones  float64
 28   aguinaldo   float64
 29   indemtrab   float64
 30   otra_rem    float64
 31   remu_espec  float64
 32   

In [49]:
# see columns
print(df_concentradohogar.columns.to_list())

['folioviv', 'foliohog', 'ubica_geo', 'tam_loc', 'est_socio', 'est_dis', 'upm', 'factor', 'clase_hog', 'sexo_jefe', 'edad_jefe', 'educa_jefe', 'tot_integ', 'hombres', 'mujeres', 'mayores', 'menores', 'p12_64', 'p65mas', 'ocupados', 'percep_ing', 'perc_ocupa', 'ing_cor', 'ingtrab', 'trabajo', 'sueldos', 'horas_extr', 'comisiones', 'aguinaldo', 'indemtrab', 'otra_rem', 'remu_espec', 'negocio', 'noagrop', 'industria', 'comercio', 'servicios', 'agrope', 'agricolas', 'pecuarios', 'reproducc', 'pesca', 'otros_trab', 'rentas', 'utilidad', 'arrenda', 'transfer', 'jubilacion', 'becas', 'donativos', 'remesas', 'bene_gob', 'transf_hog', 'trans_inst', 'estim_alqu', 'otros_ing', 'gasto_mon', 'alimentos', 'ali_dentro', 'cereales', 'carnes', 'pescado', 'leche', 'huevo', 'aceites', 'tuberculo', 'verduras', 'frutas', 'azucar', 'cafe', 'especias', 'otros_alim', 'bebidas', 'ali_fuera', 'tabaco', 'vesti_calz', 'vestido', 'calzado', 'vivienda', 'alquiler', 'pred_cons', 'agua', 'energia', 'limpieza', 'cuida

In [50]:
# important columns (deprecated)
cols_important = [
    'factor',
    'ubica_geo',
    'est_socio',
    'tot_integ', 'hombres', 'mujeres', 'mayores', 'menores',
    'p12_64', 'p65mas', 'ocupados', 'percep_ing', 'perc_ocupa',
    'ing_cor',  'arrenda', 'rentas', 'utilidad', 'arrenda', 'transfer',
    'jubilacion', 'remesas', 'bene_gob', 'estim_alqu', 'alimentos', 'ali_dentro',
    'transporte', 'medicinas', 'mantenim', 'combus', 'educa_espa', 'percep_tot',
    'deposito', 'deudas'
]

notes:
- subset by `foliohog == 1` (primary home)
- ubica_geo should be string with zfill(5)

## S2: Mini Wrangling

In [51]:
# subset and wrangle
df_concentradohogar = (
    df_concentradohogar
    .query("foliohog == 1")
    .reset_index(drop=True)
    .set_index("folioviv")
    .loc[:, cols_important]
        .assign(
        ubica_geo=lambda x: x["ubica_geo"].astype(str).str.zfill(5)
    )
)
df_concentradohogar.head()

Unnamed: 0_level_0,factor,ubica_geo,est_socio,tot_integ,hombres,mujeres,mayores,menores,p12_64,p65mas,ocupados,percep_ing,perc_ocupa,ing_cor,arrenda,rentas,utilidad,arrenda,transfer,jubilacion,remesas,bene_gob,estim_alqu,alimentos,ali_dentro,transporte,medicinas,mantenim,combus,educa_espa,percep_tot,deposito,deudas
folioviv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1
100013601,179,1001,3,3,2,1,3,0,1,2,2,3,2,76403.7,0.0,0.0,0.0,0.0,11288.96,9147.54,0.0,1622.95,12000.0,5618.47,4075.63,8400.0,0.0,7200.0,7200.0,0.0,0.0,0.0,0.0
100013602,179,1001,3,5,2,3,5,0,5,0,2,5,2,42987.73,0.0,0.0,0.0,0.0,3752.67,0.0,98.36,0.0,24000.0,20930.29,8587.46,7628.56,341.4,3600.0,3600.0,17567.05,6073.09,9.83,0.0
100013603,179,1001,3,2,1,1,2,0,2,0,2,2,2,580697.74,29508.19,29508.19,0.0,29508.19,391304.34,0.0,0.0,0.0,18000.0,37594.06,25251.25,12325.68,0.0,4500.0,4500.0,0.0,3857.14,66393.44,14754.09
100013604,179,1001,3,2,1,1,2,0,1,1,0,2,0,46252.71,0.0,0.0,0.0,0.0,34252.71,23606.55,5901.63,0.0,12000.0,2892.84,2892.84,7350.0,322.82,6000.0,6000.0,639.34,1380.55,0.0,0.0
100013606,179,1001,3,4,1,3,3,1,3,0,2,2,2,53837.09,0.0,0.0,0.0,0.0,107.6,0.0,0.0,0.0,10500.0,7367.09,4795.67,600.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [45]:
# shape
df_concentradohogar.info()

<class 'pandas.core.frame.DataFrame'>
Index: 73405 entries, 100013601 to 3260798906
Data columns (total 33 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   mujeres     73405 non-null  int64  
 1   transporte  73405 non-null  float64
 2   remesas     73405 non-null  float64
 3   menores     73405 non-null  int64  
 4   bene_gob    73405 non-null  float64
 5   mantenim    73405 non-null  float64
 6   hombres     73405 non-null  int64  
 7   percep_ing  73405 non-null  int64  
 8   jubilacion  73405 non-null  float64
 9   ali_dentro  73405 non-null  float64
 10  rentas      73405 non-null  float64
 11  medicinas   73405 non-null  float64
 12  percep_tot  73405 non-null  float64
 13  p65mas      73405 non-null  int64  
 14  arrenda     73405 non-null  float64
 15  estim_alqu  73405 non-null  float64
 16  factor      73405 non-null  int64  
 17  deudas      73405 non-null  float64
 18  tot_integ   73405 non-null  int64  
 19  deposito    73405

## S3: Summary Statistics

In [61]:
# get weighted mean for all columns
def weighted_mean(df, cols, weight_col):
    return (df[cols].mul(df[weight_col], axis=0).sum() / df[weight_col].sum())


# get weighted mean
table_concentradohogar = (
    df_concentradohogar
    .groupby("ubica_geo")
    .apply(
        lambda x: weighted_mean(x, cols_important[2:], "factor"),
        include_groups=False
        )
    .reset_index()
)

table_concentradohogar

Unnamed: 0,ubica_geo,est_socio,tot_integ,hombres,mujeres,mayores,menores,p12_64,p65mas,ocupados,percep_ing,perc_ocupa,ing_cor,arrenda,arrenda.1,rentas,utilidad,arrenda.2,arrenda.3,transfer,jubilacion,remesas,bene_gob,estim_alqu,alimentos,ali_dentro,transporte,medicinas,mantenim,combus,educa_espa,percep_tot,deposito,deudas
0,01001,2.830593,3.721097,1.780415,1.940683,2.999458,0.721640,2.691494,0.307964,1.803984,2.391824,1.774901,63956.301534,934.380755,934.380755,4540.471227,3606.090472,934.380755,934.380755,9947.056421,5419.655226,295.724280,299.683788,6351.823899,11954.944213,8940.815992,8381.883266,214.566731,3749.066387,3393.212646,5514.063280,4509.766233,2451.678069,787.065174
1,01002,2.000000,3.604238,1.983979,1.620260,2.760799,0.843439,2.500406,0.260393,1.721456,2.213243,1.621882,56859.429450,44.637700,44.637700,23675.976623,23631.338923,44.637700,44.637700,5451.380160,443.291597,1569.348306,624.306757,3056.926434,8219.964496,6756.400139,3832.240098,86.010496,1672.733856,1548.282020,2771.949981,1629.274389,832.381317,153.873705
2,01003,2.000000,4.186397,1.901232,2.285165,3.038676,1.147721,2.690045,0.348631,1.640464,2.563270,1.625559,39359.516268,78.811118,78.811118,873.710450,794.899332,78.811118,78.811118,9950.411071,3087.059927,2957.713092,549.696547,2773.515153,10146.433728,8589.532845,5432.821375,92.096187,2595.354250,2362.333267,1976.826164,2812.032452,1030.745406,487.868450
3,01005,2.289274,4.025102,2.112810,1.912293,3.090068,0.935034,2.915252,0.174815,1.974036,2.538047,1.958301,62425.257471,1009.285988,1009.285988,3309.967804,2300.681816,1009.285988,1009.285988,6057.136014,2772.023940,398.966046,321.244494,4832.711173,12182.177261,8921.586678,9133.930340,176.550500,4029.594480,3727.519677,5778.835839,11572.415422,1400.846615,781.861625
4,01006,2.263761,4.607061,2.227474,2.379587,3.416530,1.190531,3.179800,0.236730,1.923411,2.834617,1.897690,56094.247226,715.270632,715.270632,3318.522814,2603.252181,715.270632,715.270632,8487.889536,3968.025996,984.991148,498.572452,3988.718748,11482.237200,8961.447306,10876.918543,160.279906,2891.605356,2678.916568,4927.467233,9956.567912,2643.608893,1078.976099
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
991,32051,1.647511,3.843737,1.969687,1.874050,2.980958,0.862779,2.532491,0.448467,1.647511,2.671249,1.568440,40082.133101,569.521579,569.521579,3819.643237,3250.121658,569.521579,569.521579,8436.755174,1581.369893,1909.844823,2195.124737,5170.449559,10096.277873,7940.405039,5505.348080,108.134043,3437.907065,3201.493354,1760.580282,1527.192372,726.568682,238.542139
992,32052,2.000000,3.781420,1.848385,1.933035,2.984063,0.797358,2.489095,0.494967,1.826646,2.822452,1.802391,31810.506229,52.806616,52.806616,393.517413,340.710797,52.806616,52.806616,6801.598844,2366.563064,54.314990,1577.642648,2135.733241,8889.174727,7654.783363,3159.557716,18.494614,1392.505853,1311.380914,854.618212,997.911037,320.233406,356.248510
993,32053,2.000000,3.980587,1.898449,2.082138,3.021656,0.958931,2.724417,0.297239,1.724417,2.899912,1.702761,35816.245265,205.695961,205.695961,840.177073,634.481112,205.695961,205.695961,7112.281395,108.046713,1826.344042,1358.375522,2429.185390,9678.855319,7705.112781,2768.881374,50.788028,1456.738466,1418.148826,1788.645135,1598.613159,839.851450,81.797179
994,32054,1.650739,4.225957,2.053587,2.172370,3.080246,1.145711,2.762435,0.317811,1.450429,2.798340,1.426537,26251.858576,0.000000,0.000000,824.532602,824.532602,0.000000,0.000000,7341.604626,1004.517257,2091.341471,1996.116097,2478.150317,7099.878353,6488.594857,2552.759443,74.230333,1447.614891,1342.728791,821.287808,2162.023695,297.367283,94.309401


In [65]:
# see nans
table_concentradohogar.isnull().sum().to_frame().T

Unnamed: 0,ubica_geo,est_socio,tot_integ,hombres,mujeres,mayores,menores,p12_64,p65mas,ocupados,percep_ing,perc_ocupa,ing_cor,arrenda,arrenda.1,rentas,utilidad,arrenda.2,arrenda.3,transfer,jubilacion,remesas,bene_gob,estim_alqu,alimentos,ali_dentro,transporte,medicinas,mantenim,combus,educa_espa,percep_tot,deposito,deudas
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## S4: All the ENIGHs

In [74]:
# function to extract, transform and load each enigh file
def weighted_mean(df, cols, weight_col):
    return (df[cols].mul(df[weight_col], axis=0).sum() / df[weight_col].sum())


def get_enigh(file, cols_summarize):
    # s1:read file
    df = pd.read_csv(file)
    
    # s2: subset and wrangle
    df = (
        df
        .query("foliohog == 1")
        .reset_index(drop=True)
        .assign(
            ubica_geo=lambda x: x["ubica_geo"].astype(str).str.zfill(5)
        )
    )

    # s3: get weighted mean
    table = (
        df
        .groupby("ubica_geo")
        .apply(
            lambda x: weighted_mean(x, cols_summarize, "factor"),
            include_groups=False
        )
        .reset_index()
    )
    
    return table


# get params
years_enigh = [2018, 2020, 2020]
cols_to_summarize = [
    'tam_loc', 'est_socio', 'est_dis',
    'clase_hog', 'sexo_jefe', 'edad_jefe',
    'educa_jefe', 'tot_integ', 'hombres', 'mujeres',
    'mayores', 'menores', 'p12_64', 'p65mas', 'ocupados',
    'percep_ing', 'perc_ocupa', 'ing_cor', 'ingtrab',
    'trabajo', 'sueldos', 'horas_extr', 'comisiones',
    'aguinaldo', 'indemtrab', 'otra_rem', 'remu_espec',
    'negocio', 'noagrop', 'industria', 'comercio', 'servicios',
    'agrope', 'agricolas', 'pecuarios', 'reproducc', 'pesca',
    'otros_trab', 'rentas', 'utilidad', 'arrenda', 'transfer',
    'jubilacion', 'becas', 'donativos', 'remesas', 'bene_gob',
    'transf_hog', 'trans_inst', 'estim_alqu', 'otros_ing', 'gasto_mon',
    'alimentos', 'ali_dentro', 'cereales', 'carnes', 'pescado',
    'leche', 'huevo', 'aceites', 'tuberculo', 'verduras', 'frutas',
    'azucar', 'cafe', 'especias', 'otros_alim', 'bebidas', 'ali_fuera',
    'tabaco', 'vesti_calz', 'vestido', 'calzado', 'vivienda',
    'alquiler', 'pred_cons', 'agua', 'energia', 'limpieza', 'cuidados',
    'utensilios', 'enseres', 'salud', 'atenc_ambu', 'hospital',
    'medicinas', 'transporte', 'publico', 'foraneo', 'adqui_vehi',
    'mantenim', 'refaccion', 'combus', 'comunica', 'educa_espa',
    'educacion', 'esparci', 'paq_turist', 'personales', 'cuida_pers',
    'acces_pers', 'otros_gas', 'transf_gas', 'percep_tot', 'retiro_inv',
    'prestamos', 'otras_perc', 'ero_nm_viv', 'ero_nm_hog', 'erogac_tot',
    'cuota_viv', 'mater_serv', 'material', 'servicio', 'deposito',
    'prest_terc', 'pago_tarje', 'deudas', 'balance', 'otras_erog',
]
FILE_TEMPLATE = 'enigh{enigh_year}_ns_concentradohogar_csv.zip'
ENIGH_ROOT = "../../data/catalogues/adamuz_data/enigh_{enigh_year}/"

# get all tables
list_tables = []
for year in tqdm(years_enigh):
    # get file
    dir_file = ENIGH_ROOT.format(enigh_year=year)
    file = dir_file + FILE_TEMPLATE.format(enigh_year=year)

    # unzip
    os.system(f"unzip -o {file} -d {dir_file}")

    # get table
    table = get_enigh(dir_file + 'concentradohogar.csv', cols_to_summarize)

    # add year
    table["year"] = year

    # append
    list_tables.append(table)



  0%|          | 0/3 [00:00<?, ?it/s]

Archive:  ../../data/catalogues/adamuz_data/enigh_2018/enigh2018_ns_concentradohogar_csv.zip
  inflating: ../../data/catalogues/adamuz_data/enigh_2018/concentradohogar.csv  
  inflating: ../../data/catalogues/adamuz_data/enigh_2018/nota_bases_datos_enigh2018_ns.txt  


 33%|███▎      | 1/3 [00:03<00:06,  3.29s/it]

Archive:  ../../data/catalogues/adamuz_data/enigh_2020/enigh2020_ns_concentradohogar_csv.zip
  inflating: ../../data/catalogues/adamuz_data/enigh_2020/concentradohogar.csv  


 67%|██████▋   | 2/3 [00:06<00:03,  3.07s/it]

Archive:  ../../data/catalogues/adamuz_data/enigh_2020/enigh2020_ns_concentradohogar_csv.zip
  inflating: ../../data/catalogues/adamuz_data/enigh_2020/concentradohogar.csv  


100%|██████████| 3/3 [00:09<00:00,  3.03s/it]


In [75]:
# concat
df_concentradohogar_all = (
    pd.concat(list_tables)
    .sort_values(["ubica_geo", "year"], ignore_index=True)
    )
df_concentradohogar_all

Unnamed: 0,ubica_geo,tam_loc,est_socio,est_dis,clase_hog,sexo_jefe,edad_jefe,educa_jefe,tot_integ,hombres,mujeres,mayores,menores,p12_64,p65mas,ocupados,percep_ing,perc_ocupa,ing_cor,ingtrab,trabajo,sueldos,horas_extr,comisiones,aguinaldo,indemtrab,otra_rem,remu_espec,negocio,noagrop,industria,comercio,servicios,agrope,agricolas,pecuarios,reproducc,pesca,otros_trab,rentas,utilidad,arrenda,transfer,jubilacion,becas,donativos,remesas,bene_gob,transf_hog,trans_inst,estim_alqu,otros_ing,gasto_mon,alimentos,ali_dentro,cereales,carnes,pescado,leche,huevo,aceites,tuberculo,verduras,frutas,azucar,cafe,especias,otros_alim,bebidas,ali_fuera,tabaco,vesti_calz,vestido,calzado,vivienda,alquiler,pred_cons,agua,energia,limpieza,cuidados,utensilios,enseres,salud,atenc_ambu,hospital,medicinas,transporte,publico,foraneo,adqui_vehi,mantenim,refaccion,combus,comunica,educa_espa,educacion,esparci,paq_turist,personales,cuida_pers,acces_pers,otros_gas,transf_gas,percep_tot,retiro_inv,prestamos,otras_perc,ero_nm_viv,ero_nm_hog,erogac_tot,cuota_viv,mater_serv,material,servicio,deposito,prest_terc,pago_tarje,deudas,balance,otras_erog,year
0,01001,1.313242,2.830593,2.749283,2.162275,1.285027,50.089514,6.479373,3.721097,1.780415,1.940683,2.999458,0.721640,2.691494,0.307964,1.803984,2.391824,1.774901,63956.301534,43070.626604,37697.351960,31202.010734,495.394570,1317.133005,2206.442086,252.169473,852.605979,1371.596112,3683.768570,3046.691033,415.123858,1268.234312,1363.332863,637.077537,0.783675,633.157600,3.136262,0.0,1689.506073,4540.471227,3606.090472,934.380755,9947.056421,5419.655226,159.891602,989.171367,295.724280,299.683788,1599.486996,1183.443163,6351.823899,46.323383,39119.615453,11954.944213,8940.815992,1409.190847,1791.346954,141.805814,1001.070483,256.830314,64.707445,116.593703,888.009195,411.459735,58.341930,74.699891,70.812195,1791.369493,864.577994,2928.156080,85.972141,1740.256643,1088.568451,651.688192,3114.482487,1146.301370,175.330648,644.507169,1148.343300,2429.241635,1727.998294,274.058488,427.184852,1639.883555,1227.447356,197.869469,214.566731,8381.883266,1171.234546,335.669626,1267.892256,3749.066387,355.853740,3393.212646,1858.020452,5514.063280,3462.857938,1320.678289,730.527053,2968.421188,2140.535891,117.545545,710.339753,1376.439186,4509.766233,906.226031,512.698217,1570.168785,10.959857,1509.713343,8311.227510,1231.299092,479.004866,209.606931,269.397936,2451.678069,257.427359,2074.223392,787.065174,316.650080,713.879477,2018
1,01001,1.191967,2.848673,2.498772,2.110258,1.307766,50.307018,6.518359,3.511532,1.670060,1.841472,2.877479,0.634053,2.578268,0.299212,1.696833,2.295080,1.642091,60912.502414,39994.510208,36177.707945,30166.188042,278.128582,943.278159,2202.367386,323.549054,1224.366678,1039.830043,2677.040577,2616.953301,378.597045,837.298788,1401.057468,60.087276,2.420698,57.607004,0.059573,0.0,1139.761686,3339.710125,2725.802014,613.908110,10562.410583,6146.652668,104.825597,1051.749094,361.698340,898.020269,1472.286234,527.178381,6937.108693,78.762806,37361.636895,12577.730151,10739.532593,1660.287784,1981.896889,180.714402,1105.435781,366.783710,92.778479,130.161860,1079.879732,523.272579,81.819202,84.717259,85.751375,2324.198133,1041.835407,1720.387090,117.810468,1351.179617,878.251851,472.927765,3523.484832,1395.231161,184.045724,789.587590,1154.620356,2287.033608,1552.910254,271.907183,462.216171,1778.678539,1277.235951,174.258678,327.183910,8257.743571,929.733346,119.353027,1820.589293,3162.720259,342.765384,2819.954876,2225.347646,3700.139892,2536.653595,776.041132,387.445165,2749.710175,1969.424468,112.600106,667.685601,1135.936510,4979.077693,1640.427443,570.140620,1558.225213,42.339486,1167.944931,10520.616231,1834.717706,532.231593,265.539319,266.692273,4694.411677,129.864933,1556.153613,757.984491,463.801093,551.451125,2020
2,01001,1.191967,2.848673,2.498772,2.110258,1.307766,50.307018,6.518359,3.511532,1.670060,1.841472,2.877479,0.634053,2.578268,0.299212,1.696833,2.295080,1.642091,60912.502414,39994.510208,36177.707945,30166.188042,278.128582,943.278159,2202.367386,323.549054,1224.366678,1039.830043,2677.040577,2616.953301,378.597045,837.298788,1401.057468,60.087276,2.420698,57.607004,0.059573,0.0,1139.761686,3339.710125,2725.802014,613.908110,10562.410583,6146.652668,104.825597,1051.749094,361.698340,898.020269,1472.286234,527.178381,6937.108693,78.762806,37361.636895,12577.730151,10739.532593,1660.287784,1981.896889,180.714402,1105.435781,366.783710,92.778479,130.161860,1079.879732,523.272579,81.819202,84.717259,85.751375,2324.198133,1041.835407,1720.387090,117.810468,1351.179617,878.251851,472.927765,3523.484832,1395.231161,184.045724,789.587590,1154.620356,2287.033608,1552.910254,271.907183,462.216171,1778.678539,1277.235951,174.258678,327.183910,8257.743571,929.733346,119.353027,1820.589293,3162.720259,342.765384,2819.954876,2225.347646,3700.139892,2536.653595,776.041132,387.445165,2749.710175,1969.424468,112.600106,667.685601,1135.936510,4979.077693,1640.427443,570.140620,1558.225213,42.339486,1167.944931,10520.616231,1834.717706,532.231593,265.539319,266.692273,4694.411677,129.864933,1556.153613,757.984491,463.801093,551.451125,2020
3,01002,3.502535,2.000000,9.502535,2.027581,1.140641,49.600994,4.982965,3.604238,1.983979,1.620260,2.760799,0.843439,2.500406,0.260393,1.721456,2.213243,1.621882,56859.429450,24675.146233,22639.608039,18628.511903,886.846242,0.000000,1274.586249,223.188603,460.445814,1166.029227,1191.927595,762.882436,295.869884,236.925137,230.087414,429.045159,164.702685,264.342474,0.000000,0.0,843.610599,23675.976623,23631.338923,44.637700,5451.380160,443.291597,172.418478,802.779294,1569.348306,624.306757,1420.621389,418.614339,3056.926434,0.000000,21993.396234,8219.964496,6756.400139,1514.529793,1135.884369,40.070674,525.476606,283.662692,113.508012,157.457868,937.194774,351.052311,61.911357,16.976191,52.826062,900.659416,665.190014,1397.342622,66.221735,1495.827830,1033.302822,462.525008,1339.606533,104.804880,16.754183,205.183786,1012.863684,1680.446001,1177.354195,292.371691,210.720115,552.179860,365.701076,100.468288,86.010496,3832.240098,551.075067,305.880559,525.023184,1672.733856,124.451836,1548.282020,777.527433,2771.949981,2014.941112,563.664797,193.344071,1748.037800,1547.277353,71.133487,129.626959,353.143635,1629.274389,646.414495,139.685770,38.114105,0.000000,805.060019,3019.599582,0.000000,474.202007,281.918093,192.283914,832.381317,10.572054,63.432649,153.873705,1376.595836,108.542015,2018
4,01002,4.000000,2.000000,11.000000,2.217098,1.187896,51.105312,5.020217,4.385703,2.075317,2.310386,3.304836,1.080867,2.881739,0.423097,1.839984,2.478858,1.762685,42676.932291,28949.530690,23455.044724,20353.217110,289.514035,100.071924,1067.784121,44.078525,344.034133,1256.344874,4318.092303,2342.715008,772.794728,1331.904181,238.016099,1975.377295,725.629626,1249.747669,0.000000,0.0,1176.393663,2627.893293,2461.790284,166.103009,7652.633981,1284.361472,76.407115,1160.626070,1561.976716,1818.070979,1634.229198,116.962430,3409.293809,37.580518,24535.163513,10057.882685,8938.334839,1917.737784,1633.174320,78.674667,729.702141,359.271783,149.514010,163.616924,1091.038517,265.520017,143.136826,57.307307,61.904314,1377.758239,909.977990,1115.317676,4.230170,1494.923783,1034.875998,460.047785,1404.263898,60.937721,45.485267,296.539376,1001.301534,1987.917123,1279.634680,241.193533,467.088910,1378.318923,862.202386,135.480009,380.636527,4183.672015,463.693001,13.089827,556.921541,1915.928905,110.701546,1805.227359,1234.038742,1400.986002,1074.667875,231.788400,94.529726,2049.032189,1764.058932,34.384234,250.589024,578.166895,3222.723660,741.477315,229.954656,823.146531,0.000000,1428.145157,6509.449112,0.000000,411.181687,284.149183,127.032504,3711.725764,105.886901,96.435736,477.849499,1453.620311,252.749214,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3171,32056,1.000000,3.121463,534.121463,2.224621,1.328251,49.197496,6.822089,3.601669,1.665266,1.936403,2.852469,0.749200,2.590485,0.261984,1.628154,2.315565,1.589373,55085.904844,37182.589227,33735.645602,29123.971502,273.605223,379.742611,2400.885246,56.785040,1200.563703,300.092277,2238.058870,2235.425816,386.108005,854.130182,995.187629,2.633055,2.633055,0.000000,0.000000,0.0,1208.884755,2412.983816,2122.930466,290.053351,9574.600989,5724.228150,73.889191,1372.881016,139.626881,124.287193,1803.017708,336.670850,5902.144881,13.585930,38690.924533,12177.812142,9525.896973,1716.677559,2276.284782,162.405882,1062.445210,336.511595,70.616010,181.838652,1063.534248,601.785219,72.442441,86.147371,93.620170,1052.050132,749.537703,2574.916543,76.998626,1958.866636,1186.128380,772.738256,3643.958548,1281.776251,230.871188,476.140019,1655.171089,2397.357415,1783.908968,188.222743,425.225704,580.579726,474.720868,27.390822,78.468035,8405.320650,1393.687516,272.935277,1136.945871,3480.897713,314.331250,3166.566463,2120.854274,4798.121175,2986.074865,1148.528048,663.518262,3365.773304,2343.375739,161.642753,860.754813,1363.134937,2880.187041,735.663821,333.610710,410.923172,0.578034,1399.411305,4764.464668,1046.107343,381.115241,257.688072,123.427169,1832.686194,32.392271,260.538527,781.372840,48.882618,381.369632,2018
3172,32056,1.304299,2.869344,549.883672,2.112586,1.266458,51.285341,6.921710,3.476833,1.619356,1.857477,2.810872,0.665961,2.492319,0.318553,1.646438,2.269363,1.596140,55014.665779,36505.050889,33838.092465,28755.712744,128.562844,735.237669,2369.888449,141.182729,896.194720,811.313310,1856.478083,1776.542714,301.785154,329.609842,1145.147718,79.935370,53.860343,26.075027,0.000000,0.0,810.480340,1614.974293,1201.209951,413.764342,10270.100338,5869.566011,389.268545,748.078502,205.886613,1197.694437,1404.900171,454.706060,6585.553384,38.986875,35399.754305,12699.998593,10969.371202,1922.716242,2236.065702,362.289924,1271.791466,437.846604,119.514398,167.626135,1270.954746,660.372516,94.326412,106.745433,118.590103,1405.106779,795.424742,1683.669796,46.957596,1313.012224,825.147501,487.864724,3420.632375,1097.936340,213.986509,584.822099,1523.887427,2343.205878,1866.998963,250.985089,225.221826,1467.795344,1206.070680,33.937266,227.787399,7497.728964,530.838525,174.210212,984.761938,3564.979946,465.645861,3099.334085,2242.938342,2676.227530,1947.676059,707.429678,21.121792,3164.498434,1944.527637,73.576044,1146.394754,816.654964,4214.936653,1176.292445,309.546061,1045.660455,15.721967,1667.715724,7554.701874,857.553680,732.588291,466.876764,265.711527,3784.249126,15.747269,1016.664578,382.865113,211.719145,553.314672,2020
3173,32056,1.304299,2.869344,549.883672,2.112586,1.266458,51.285341,6.921710,3.476833,1.619356,1.857477,2.810872,0.665961,2.492319,0.318553,1.646438,2.269363,1.596140,55014.665779,36505.050889,33838.092465,28755.712744,128.562844,735.237669,2369.888449,141.182729,896.194720,811.313310,1856.478083,1776.542714,301.785154,329.609842,1145.147718,79.935370,53.860343,26.075027,0.000000,0.0,810.480340,1614.974293,1201.209951,413.764342,10270.100338,5869.566011,389.268545,748.078502,205.886613,1197.694437,1404.900171,454.706060,6585.553384,38.986875,35399.754305,12699.998593,10969.371202,1922.716242,2236.065702,362.289924,1271.791466,437.846604,119.514398,167.626135,1270.954746,660.372516,94.326412,106.745433,118.590103,1405.106779,795.424742,1683.669796,46.957596,1313.012224,825.147501,487.864724,3420.632375,1097.936340,213.986509,584.822099,1523.887427,2343.205878,1866.998963,250.985089,225.221826,1467.795344,1206.070680,33.937266,227.787399,7497.728964,530.838525,174.210212,984.761938,3564.979946,465.645861,3099.334085,2242.938342,2676.227530,1947.676059,707.429678,21.121792,3164.498434,1944.527637,73.576044,1146.394754,816.654964,4214.936653,1176.292445,309.546061,1045.660455,15.721967,1667.715724,7554.701874,857.553680,732.588291,466.876764,265.711527,3784.249126,15.747269,1016.664578,382.865113,211.719145,553.314672,2020
3174,32057,3.000000,2.000000,555.000000,2.277524,1.362521,53.898383,4.906748,3.987730,1.832571,2.155159,2.987730,1.000000,2.544785,0.442945,1.627217,2.390184,1.604685,38356.532724,23657.500922,17149.797839,15399.858634,274.308465,156.185936,743.362355,140.633701,68.005025,367.443723,5111.540443,3720.278004,0.000000,2776.569074,943.708930,1391.262438,395.404466,688.683230,307.174742,0.0,1396.162640,2670.968916,2612.371524,58.597392,7414.052798,1919.602213,26.450621,1022.843601,1018.419866,1465.073917,1613.879409,347.783170,4579.829404,34.180684,22879.875547,9574.317147,8902.291951,1922.260073,2016.183374,31.866429,724.230685,554.246288,118.065153,174.821142,1434.542019,270.215896,195.957573,43.590077,24.532465,850.914570,540.866206,599.003066,73.022129,1693.216442,1082.075633,611.140809,1889.812325,0.000000,46.472862,288.869046,1554.470416,1926.785901,1465.720116,181.043818,280.021967,776.682410,619.555731,0.000000,157.126679,2792.653828,761.400301,9.869588,151.376696,745.457973,71.400335,674.057639,1124.549269,948.416273,605.947630,342.468644,0.000000,2901.275825,2189.836589,26.146564,685.292672,376.715397,1866.535375,937.030566,0.000000,74.454397,0.000000,855.050413,2938.500055,0.000000,805.346972,632.174673,173.172299,1316.292377,4.909771,123.526572,129.744587,558.679776,0.000000,2020


In [77]:
# see nans
df_concentradohogar_all.isnull().sum()[df_concentradohogar_all.isnull().sum() > 0]

Series([], dtype: int64)

---
# Vivienda

Note:
- A Hogar can have multiple viviendas

## S1: Read

In [78]:
# year
YEAR_ENIGH = 2018

In [79]:
# important files
files = [
    'enigh{enigh_year}_ns_viviendas_csv.zip'
]

# append year to files
files = [file.format(enigh_year=YEAR_ENIGH) for file in files]

# unzip files
ENIGH_ROOT = ENIGH_ROOT.format(enigh_year=YEAR_ENIGH)
for file in files:
    os.system(f"unzip -o {ENIGH_ROOT + file} -d {ENIGH_ROOT}")


Archive:  ../../data/catalogues/adamuz_data/enigh_2018/enigh2018_ns_viviendas_csv.zip
  inflating: ../../data/catalogues/adamuz_data/enigh_2018/viviendas.csv  
  inflating: ../../data/catalogues/adamuz_data/enigh_2018/nota_bases_datos_enigh2018_ns.txt  


In [85]:
# read only concentradohogar
file = ENIGH_ROOT.format(enigh_year=YEAR_ENIGH) + 'viviendas.csv'

# read file
df_viviendas = pd.read_csv(file, na_values=[' '])
df_viviendas.shape

  df_viviendas = pd.read_csv(file, na_values=[' '])


(73405, 64)

In [86]:
# see
df_viviendas.isna().sum().to_frame().T

Unnamed: 0,folioviv,tipo_viv,mat_pared,mat_techos,mat_pisos,antiguedad,antigua_ne,cocina,cocina_dor,cuart_dorm,num_cuarto,disp_agua,dotac_agua,excusado,uso_compar,sanit_agua,biodigest,bano_comp,bano_excus,bano_regad,drenaje,disp_elect,focos_inca,focos_ahor,combustible,estufa_chi,eli_basura,tenencia,renta,estim_pago,pago_viv,pago_mesp,tipo_adqui,viv_usada,tipo_finan,num_dueno1,hog_dueno1,num_dueno2,hog_dueno2,escrituras,lavadero,fregadero,regadera,tinaco_azo,cisterna,pileta,calent_sol,calent_gas,medidor_luz,bomba_agua,tanque_gas,aire_acond,calefacc,tot_resid,tot_hom,tot_muj,tot_hog,ubica_geo,tam_loc,est_socio,est_dis,upm,factor,procaptar
0,0,0,0,0,0,10904,62501,0,5819,0,0,0,6007,0,1997,1997,1997,1997,1997,1997,0,0,493,493,0,60193,0,0,64308,9097,67045,57950,20987,59862,24504,20987,20987,70432,70432,20987,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [89]:
# see some sample
df_viviendas.sample(10).to_

Unnamed: 0,folioviv,tipo_viv,mat_pared,mat_techos,mat_pisos,antiguedad,antigua_ne,cocina,cocina_dor,cuart_dorm,num_cuarto,disp_agua,dotac_agua,excusado,uso_compar,sanit_agua,biodigest,bano_comp,bano_excus,bano_regad,drenaje,disp_elect,focos_inca,focos_ahor,combustible,estufa_chi,eli_basura,tenencia,renta,estim_pago,pago_viv,pago_mesp,tipo_adqui,viv_usada,tipo_finan,num_dueno1,hog_dueno1,num_dueno2,hog_dueno2,escrituras,lavadero,fregadero,regadera,tinaco_azo,cisterna,pileta,calent_sol,calent_gas,medidor_luz,bomba_agua,tanque_gas,aire_acond,calefacc,tot_resid,tot_hom,tot_muj,tot_hog,ubica_geo,tam_loc,est_socio,est_dis,upm,factor,procaptar
38100,1660219905,1,8,10,2,,1.0,1,2.0,2,3,2,1.0,1,1.0,3.0,2.0,0.0,1.0,0.0,2,1,0.0,2.0,3,,1,4,,1000.0,,,2.0,,5.0,1.0,1.0,,,1.0,1,2,2,2,2,1,2,2,1,2,2,2,2,4,1,3,1,16066,4,2,266,4420,751,0
53121,2302085202,2,8,10,3,9.0,,1,2.0,1,2,1,1.0,1,2.0,1.0,2.0,1.0,0.0,0.0,1,1,0.0,6.0,3,,3,3,,1500.0,1600.0,2.0,1.0,2.0,1.0,1.0,1.0,,,1.0,1,1,1,1,2,2,2,2,1,2,2,2,2,2,0,2,1,23005,1,2,362,6101,334,0
53085,2301983903,1,8,10,2,14.0,,1,2.0,2,3,1,1.0,1,2.0,1.0,1.0,1.0,0.0,0.0,1,1,3.0,2.0,3,,1,3,,1500.0,1200.0,1.0,1.0,1.0,1.0,1.0,1.0,,,1.0,1,1,1,1,2,2,2,2,1,2,2,1,2,3,1,2,1,23005,1,2,362,6092,316,0
8246,400466605,1,8,10,3,5.0,,2,,3,4,2,1.0,1,2.0,2.0,2.0,0.0,1.0,0.0,2,1,2.0,2.0,3,,1,4,,1500.0,,,3.0,,5.0,2.0,1.0,,,1.0,1,2,2,2,2,1,2,2,1,2,2,2,2,6,5,1,1,4002,1,2,42,920,156,0
36711,1562125709,1,8,10,2,10.0,,1,2.0,2,3,6,,1,1.0,3.0,2.0,0.0,1.0,0.0,2,1,5.0,0.0,3,,1,1,800.0,,,1.0,,,,,,,,,2,2,2,2,2,2,2,2,2,2,2,2,2,4,3,1,1,15121,4,1,247,4289,522,0
37755,1605019409,1,8,10,2,,1.0,1,2.0,1,2,2,2.0,1,2.0,2.0,2.0,1.0,0.0,0.0,1,1,0.0,3.0,3,,1,1,600.0,,,1.0,,,,,,,,,1,2,1,2,2,1,2,2,1,2,2,2,2,4,2,2,1,16052,3,2,262,4401,785,0
18868,806308305,1,8,10,2,22.0,,1,2.0,4,5,1,1.0,1,2.0,1.0,2.0,1.0,0.0,0.0,1,1,0.0,6.0,3,,1,1,1400.0,,,1.0,,,,,,,,,1,1,1,2,2,2,2,1,1,2,2,1,2,5,3,2,1,8037,1,2,116,2361,446,0
25480,1100029204,1,8,10,3,45.0,,1,2.0,2,6,1,1.0,1,2.0,1.0,2.0,2.0,1.0,0.0,1,1,0.0,9.0,3,,1,4,,4000.0,,,3.0,,5.0,1.0,1.0,,,1.0,1,1,1,2,2,1,2,2,1,2,2,2,2,3,2,1,1,11007,1,2,164,3133,686,0
4075,260048806,1,8,3,3,12.0,,1,2.0,1,2,2,1.0,1,1.0,1.0,2.0,0.0,1.0,0.0,2,1,2.0,0.0,3,,1,4,,1200.0,,,2.0,,5.0,1.0,1.0,,,1.0,1,2,2,2,2,2,2,2,1,2,2,1,2,4,2,2,1,2002,4,2,28,570,43,0
56138,2461056404,1,8,3,2,18.0,,1,2.0,2,5,2,4.0,1,2.0,3.0,2.0,0.0,1.0,0.0,2,1,0.0,5.0,1,2.0,1,4,,1500.0,,,3.0,,4.0,1.0,1.0,,,1.0,2,2,2,2,2,1,2,2,1,2,2,2,2,3,1,2,1,24057,4,1,388,6436,289,0


In [88]:
# see sum of procaptar
df_viviendas["procaptar"].sum()

711

In [110]:
# see columns
print(df_viviendas.columns.to_list())

['folioviv', 'tipo_viv', 'mat_pared', 'mat_techos', 'mat_pisos', 'antiguedad', 'antigua_ne', 'cocina', 'cocina_dor', 'cuart_dorm', 'num_cuarto', 'disp_agua', 'dotac_agua', 'excusado', 'uso_compar', 'sanit_agua', 'biodigest', 'bano_comp', 'bano_excus', 'bano_regad', 'drenaje', 'disp_elect', 'focos_inca', 'focos_ahor', 'combustible', 'estufa_chi', 'eli_basura', 'tenencia', 'renta', 'estim_pago', 'pago_viv', 'pago_mesp', 'tipo_adqui', 'viv_usada', 'tipo_finan', 'num_dueno1', 'hog_dueno1', 'num_dueno2', 'hog_dueno2', 'escrituras', 'lavadero', 'fregadero', 'regadera', 'tinaco_azo', 'cisterna', 'pileta', 'calent_sol', 'calent_gas', 'medidor_luz', 'bomba_agua', 'tanque_gas', 'aire_acond', 'calefacc', 'tot_resid', 'tot_hom', 'tot_muj', 'tot_hog', 'ubica_geo', 'tam_loc', 'est_socio', 'est_dis', 'upm', 'factor', 'procaptar']


## S2: Wrangle

In [95]:
# yes-no columns
yes_no_columns = [
    'cocina',
    'cocina_dor',
    'excusado',
    'uso_compar',
    'biodigest',
    'combustible',
    'estufa_chi',
    'lavadero',
    'fregadero',
    'regadera',
    'tinaco_azo',
    'cisterna',
    'pileta',
    'calent_sol',
    'calent_gas',
    'medidor_luz',
    'bomba_agua',
    'tanque_gas',
    'aire_acond',
    'calefacc',
    'pago_mesp',
    'viv_usada'
]

# map 1 for 1 and 2 for 0
df_viviendas[yes_no_columns] = df_viviendas[yes_no_columns].replace({2: 0})
df_viviendas[yes_no_columns].describe().loc['max', :].gt(1).astype(int)

cocina         0
cocina_dor     0
excusado       0
uso_compar     0
biodigest      0
estufa_chi     0
lavadero       0
fregadero      0
regadera       0
tinaco_azo     0
cisterna       0
pileta         0
calent_sol     0
calent_gas     0
medidor_luz    0
bomba_agua     0
tanque_gas     0
aire_acond     0
calefacc       0
Name: max, dtype: int64

In [97]:
# see some sample
df_viviendas.sample(10)

Unnamed: 0,folioviv,tipo_viv,mat_pared,mat_techos,mat_pisos,antiguedad,antigua_ne,cocina,cocina_dor,cuart_dorm,num_cuarto,disp_agua,dotac_agua,excusado,uso_compar,sanit_agua,biodigest,bano_comp,bano_excus,bano_regad,drenaje,disp_elect,focos_inca,focos_ahor,combustible,estufa_chi,eli_basura,tenencia,renta,estim_pago,pago_viv,pago_mesp,tipo_adqui,viv_usada,tipo_finan,num_dueno1,hog_dueno1,num_dueno2,hog_dueno2,escrituras,lavadero,fregadero,regadera,tinaco_azo,cisterna,pileta,calent_sol,calent_gas,medidor_luz,bomba_agua,tanque_gas,aire_acond,calefacc,tot_resid,tot_hom,tot_muj,tot_hog,ubica_geo,tam_loc,est_socio,est_dis,upm,factor,procaptar
41832,1860087710,1,8,10,3,18.0,,1,0.0,3,4,1,1.0,1,0.0,1.0,0.0,0.0,0.0,0.0,0,1,5.0,3.0,3,,1,4,,2000.0,,,2.0,,5.0,1.0,1.0,,,1.0,1,1,1,0,0,1,0,0,1,0,0,0,0,4,2,2,1,18012,4,2,295,4828,194,0
35754,1560277219,1,7,9,2,30.0,,1,0.0,2,3,2,2.0,1,0.0,2.0,0.0,0.0,1.0,0.0,0,1,0.0,3.0,3,,1,4,,1200.0,,,2.0,,5.0,1.0,1.0,,,3.0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,1,15047,4,1,247,4244,449,0
2681,202620902,1,8,10,2,13.0,,1,0.0,1,2,1,1.0,1,0.0,1.0,0.0,1.0,0.0,0.0,1,1,7.0,0.0,3,,2,3,,2100.0,2100.0,1.0,1.0,1.0,1.0,1.0,1.0,,,3.0,0,1,1,0,0,0,0,0,1,0,0,1,0,1,1,0,1,2002,1,2,16,378,781,0
36060,1560797423,1,8,10,2,15.0,,1,0.0,1,2,2,1.0,0,,,,,,,5,1,0.0,0.0,3,,4,2,,600.0,,,,,,,,,,,0,0,0,0,0,1,0,0,1,0,0,0,0,2,1,1,1,15124,4,1,247,4259,530,0
38522,1661588809,1,8,3,2,3.0,,1,0.0,2,3,2,2.0,1,0.0,1.0,0.0,0.0,1.0,0.0,0,1,0.0,4.0,3,,1,4,,1200.0,,,3.0,,5.0,1.0,2.0,,,1.0,1,0,0,1,0,0,0,0,0,0,0,0,0,4,2,2,2,16102,4,2,266,4439,1121,0
8337,400573203,1,8,3,2,4.0,,1,0.0,1,3,2,2.0,1,0.0,2.0,0.0,0.0,1.0,0.0,0,4,0.0,3.0,1,0.0,1,4,,900.0,,,2.0,,5.0,1.0,1.0,,,3.0,0,0,0,0,0,1,0,0,0,0,0,0,0,3,1,2,1,4002,1,2,42,940,160,0
2575,201984706,1,8,10,3,10.0,,1,0.0,1,2,1,1.0,1,0.0,1.0,0.0,0.0,0.0,0.0,1,1,8.0,0.0,3,,1,1,1000.0,,,1.0,,,,,,,,,0,1,1,0,0,0,0,0,1,0,0,1,0,2,1,1,1,2002,1,2,16,352,712,0
16966,761636916,1,8,3,2,11.0,,1,0.0,1,2,2,1.0,1,1.0,2.0,0.0,0.0,1.0,0.0,0,1,0.0,0.0,1,0.0,4,4,,500.0,,,4.0,,,1.0,1.0,,,3.0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,3,6,1,7065,4,1,114,2021,730,1
6520,301014815,1,8,10,3,,1.0,1,1.0,1,1,1,5.0,1,0.0,1.0,0.0,1.0,0.0,0.0,1,1,0.0,0.0,5,,1,1,1500.0,,,1.0,,,,,,,,,1,1,1,1,1,0,0,0,1,1,0,0,0,2,1,1,1,3008,2,2,32,782,127,0
14541,660016316,1,8,10,2,7.0,,1,0.0,1,3,1,2.0,1,0.0,1.0,0.0,0.0,1.0,0.0,1,1,5.0,1.0,3,,1,2,,1500.0,,,,,,,,,,,1,0,0,1,0,1,0,0,1,0,0,0,0,3,1,2,1,6005,4,2,95,1845,26,0


## S3: Summary Statistics

In [115]:
def weighted_mode(df, columns_to_mode, weighting_column):
    # Initialize a DataFrame to store the mode values
    mode_values = pd.DataFrame(columns=['Column', 'Weighted_Mode'])

    # Calculate weighted mode for each column in columns_to_mode
    for col in columns_to_mode:
        # Group by the column values and calculate weighted counts
        weighted_counts = df.groupby(col)[weighting_column].sum()

        # Find the index of maximum weighted count
        mode_index = weighted_counts.idxmax()

        # Get the mode value
        mode_value = df.loc[df[col] == mode_index, col].iloc[0]

        # Append mode value to mode_values DataFrame
        mode_values.loc[len(mode_values)] = [col, mode_value]

    # return a Series
    return mode_values.set_index('Column')['Weighted_Mode']


def weighted_mean(df, cols, weight_col):
    return (df[cols].mul(df[weight_col], axis=0).sum() / df[weight_col].sum())


def get_statistics(df, cols_mode, cols_mean, weight_col):
    # s1: get weighted mode
    mode_values = weighted_mode(df, cols_mode, weight_col)

    # s2: get weighted mean
    mean_values = weighted_mean(df, cols_mean, weight_col)

    # s3: get total viviendas
    total_viviendas = df[weight_col].sum()

    # return all the values in a pd series
    melt_values = pd.concat([mode_values, mean_values])
    melt_values['total_viviendas'] = total_viviendas

    return melt_values

In [131]:
# mode columns
mode_columns = [
    'mat_pared', 'mat_techos', 'mat_pisos', 'disp_agua',
    'combustible', 'eli_basura', 'tenencia', 'tipo_adqui',
    'tipo_adqui', 'tipo_finan', 'escrituras', 'disp_elect',
    'tipo_viv'
]

# continuous columns
cont_columns = [
    'regadera', 'pago_mesp', 'tinaco_azo',
    'lavadero', 'procaptar', 'tot_resid', 'antiguedad',
    'bano_comp', 'calent_sol', 'cocina',
    'aire_acond', 'tot_hom', 'cocina_dor', 'renta',
    'fregadero', 'focos_inca', 'sanit_agua',
    'uso_compar', 'medidor_luz', 'est_socio',
    'tot_muj', 'dotac_agua', 'bano_regad', 'estufa_chi',
    'tam_loc', 'tanque_gas', 'focos_ahor',
    'cisterna', 'cuart_dorm', 'drenaje', 'excusado', 'pileta',
    'num_cuarto', 'calent_gas', 'calefacc',
    'bano_excus', 'pago_viv',
    'bomba_agua', 'viv_usada', 'biodigest', 'tot_hog',
    'estim_pago'
]

# get statistics
table_vivienda = (
    df_viviendas
    .groupby("ubica_geo")
    .apply(
        lambda x: get_statistics(x, mode_columns, cont_columns, "factor"),
        include_groups=False
    )
)

In [133]:
# see if there are a column repeated between mode and continuous
set(mode_columns).intersection(set(cont_columns))

set()

In [132]:
# see summary
table_vivienda

Unnamed: 0_level_0,mat_pared,mat_techos,mat_pisos,disp_agua,combustible,eli_basura,tenencia,tipo_adqui,tipo_adqui,tipo_finan,escrituras,disp_elect,tipo_viv,regadera,pago_mesp,tinaco_azo,lavadero,procaptar,tot_resid,antiguedad,bano_comp,calent_sol,cocina,aire_acond,tot_hom,cocina_dor,renta,fregadero,focos_inca,sanit_agua,uso_compar,medidor_luz,est_socio,tot_muj,dotac_agua,bano_regad,estufa_chi,tam_loc,tanque_gas,focos_ahor,cisterna,cuart_dorm,drenaje,excusado,pileta,num_cuarto,calent_gas,calefacc,bano_excus,pago_viv,bomba_agua,viv_usada,biodigest,tot_hog,estim_pago,total_viviendas
ubica_geo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1
1001,8.0,10.0,3.0,1.0,3.0,3.0,4.0,1.0,1.0,1,1.0,1,1,0.945602,0.395334,0.782853,0.895975,0.0,3.781875,19.145343,0.811764,0.263397,0.979702,0.015126,1.806226,0.023304,385.384175,0.882548,1.340795,1.069981,0.029129,0.972374,2.830593,1.975650,1.091257,0.010228,0.001665,1.313242,0.146555,7.603346,0.493936,2.460400,1.016775,0.996631,0.133018,4.529099,0.515284,0.010715,0.229074,442.032669,0.416913,0.635324,0.014665,1.019717,2165.050435,234164
1002,8.0,10.0,2.0,1.0,3.0,3.0,4.0,3.0,3.0,5,1.0,1,1,0.858142,0.033360,0.825390,0.900527,0.0,3.604238,19.959440,0.770026,0.559116,0.966640,0.000000,1.983979,0.110221,35.956195,0.604847,1.853985,1.075745,0.064997,0.978503,2.000000,1.620260,1.121476,0.032752,0.000000,3.502535,0.193673,3.343034,0.108497,2.040256,1.011357,0.989252,0.097141,3.856520,0.383695,0.000000,0.205638,0.000000,0.066112,0.089333,0.010748,1.000000,1048.428311,9862
1003,8.0,10.0,2.0,1.0,3.0,3.0,4.0,3.0,3.0,5,1.0,1,1,0.921236,0.194712,0.945791,0.845611,0.0,4.218875,18.664862,0.765278,0.517141,0.950420,0.005413,1.917471,0.031301,165.882168,0.791637,1.660704,1.071468,0.050835,0.955911,2.000000,2.301404,1.188436,0.027379,0.043383,3.491645,0.109438,4.622343,0.117910,2.284616,0.987919,1.000000,0.077822,4.166235,0.224994,0.000000,0.231113,12.238174,0.097513,0.068565,0.037499,1.010826,946.097121,12747
1005,8.0,10.0,3.0,1.0,3.0,3.0,4.0,1.0,1.0,5,1.0,1,1,0.923233,0.397962,0.769360,0.923120,0.0,4.099135,17.957064,0.759095,0.439699,0.937282,0.002810,2.150613,0.021505,316.383800,0.790641,1.173579,1.108876,0.057248,0.953917,2.289274,1.948522,1.238732,0.048144,0.010415,3.006706,0.102394,6.985726,0.379341,2.421640,1.046533,0.992919,0.093927,4.051403,0.454760,0.002810,0.270690,640.096362,0.320857,0.509385,0.010940,1.024653,1650.605073,26691
1006,8.0,10.0,3.0,1.0,3.0,3.0,4.0,3.0,3.0,5,1.0,1,1,0.908175,0.244839,0.868611,0.891219,0.0,4.706995,19.819463,0.759010,0.494020,0.960436,0.000000,2.280554,0.071019,248.869594,0.783339,1.922018,1.040056,0.052507,0.979931,2.263761,2.426442,1.237877,0.006881,0.006881,2.316514,0.065367,4.733372,0.293742,2.540220,0.993119,1.000000,0.059469,4.178080,0.349934,0.000000,0.254997,45.412844,0.294397,0.215596,0.013188,1.033093,1356.364679,12208
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32051,8.0,10.0,2.0,1.0,3.0,1.0,4.0,3.0,3.0,5,1.0,1,1,0.956878,0.126548,0.971565,0.632824,0.0,3.843737,21.579284,0.915635,0.758432,0.986252,0.013748,1.969687,0.014687,86.115618,0.599693,1.761165,1.176245,0.000000,0.955939,1.647511,1.874050,1.155922,0.000000,0.058748,3.352489,0.121254,4.644351,0.058748,2.349074,0.853130,1.000000,0.193749,4.284092,0.345316,0.000000,0.170950,43.305439,0.041243,0.196226,0.146870,1.000000,1764.819401,11711
32052,7.0,10.0,2.0,1.0,3.0,1.0,4.0,3.0,3.0,5,1.0,1,1,0.577800,0.074724,0.648749,0.839648,0.0,3.933664,25.465958,0.548441,0.267091,0.943171,0.000000,1.917447,0.012792,49.909129,0.342514,1.180204,1.355725,0.028799,0.926395,2.000000,2.016217,1.707675,0.000000,0.195163,3.662379,0.058157,3.745072,0.102754,2.309101,1.380749,0.951978,0.220607,4.034671,0.188382,0.000000,0.432895,48.021809,0.162240,0.030686,0.000000,1.041591,731.427373,14306
32053,7.0,10.0,2.0,1.0,3.0,1.0,4.0,3.0,3.0,5,1.0,1,1,0.796117,0.023900,0.793874,0.637791,0.0,3.980587,26.301141,0.750561,0.273339,0.932787,0.000000,1.898449,0.021656,9.560043,0.542191,2.494391,1.251683,0.091113,0.956687,2.000000,2.082138,1.973173,0.021656,0.067213,3.000000,0.088869,2.344259,0.069457,2.182226,1.000000,1.000000,0.208370,3.546678,0.451078,0.000000,0.362209,0.000000,0.165057,0.023900,0.000000,1.000000,825.418008,10251
32054,7.0,10.0,2.0,1.0,3.0,1.0,4.0,3.0,3.0,5,1.0,1,1,0.592968,0.026186,0.619154,0.752109,0.0,4.225957,21.759600,0.501451,0.499359,0.958629,0.000000,2.053587,0.034960,19.639603,0.379362,1.961801,1.396976,0.048053,0.917190,1.650739,2.172370,1.973139,0.034960,0.075926,3.698859,0.065465,2.122562,0.106904,2.197813,1.467031,0.895795,0.121887,3.747722,0.144091,0.000000,0.381251,0.000000,0.200715,0.056759,0.015185,1.000000,841.580617,14817


In [None]:
## S4: All tables (WIP9)

---
# Sandbox

In [56]:
df_concentradohogar

Unnamed: 0_level_0,factor,ubica_geo,est_socio,tot_integ,hombres,mujeres,mayores,menores,p12_64,p65mas,ocupados,percep_ing,perc_ocupa,ing_cor,arrenda,rentas,utilidad,arrenda,transfer,jubilacion,remesas,bene_gob,estim_alqu,alimentos,ali_dentro,transporte,medicinas,mantenim,combus,educa_espa,percep_tot,deposito,deudas
folioviv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1
100013601,179,01001,3,3,2,1,3,0,1,2,2,3,2,76403.70,0.00,0.00,0.0,0.00,11288.96,9147.54,0.00,1622.95,12000.00,5618.47,4075.63,8400.00,0.00,7200.00,7200.00,0.00,0.00,0.00,0.00
100013602,179,01001,3,5,2,3,5,0,5,0,2,5,2,42987.73,0.00,0.00,0.0,0.00,3752.67,0.00,98.36,0.00,24000.00,20930.29,8587.46,7628.56,341.40,3600.00,3600.00,17567.05,6073.09,9.83,0.00
100013603,179,01001,3,2,1,1,2,0,2,0,2,2,2,580697.74,29508.19,29508.19,0.0,29508.19,391304.34,0.00,0.00,0.00,18000.00,37594.06,25251.25,12325.68,0.00,4500.00,4500.00,0.00,3857.14,66393.44,14754.09
100013604,179,01001,3,2,1,1,2,0,1,1,0,2,0,46252.71,0.00,0.00,0.0,0.00,34252.71,23606.55,5901.63,0.00,12000.00,2892.84,2892.84,7350.00,322.82,6000.00,6000.00,639.34,1380.55,0.00,0.00
100013606,179,01001,3,4,1,3,3,1,3,0,2,2,2,53837.09,0.00,0.00,0.0,0.00,107.60,0.00,0.00,0.00,10500.00,7367.09,4795.67,600.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3260798902,192,32046,2,2,1,1,2,0,2,0,1,1,1,26772.78,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.00,2903.22,7926.20,7926.20,6074.79,0.00,4354.83,4354.83,0.00,0.00,0.00,0.00
3260798903,192,32046,2,2,1,1,2,0,0,2,0,2,0,11590.16,0.00,0.00,0.0,0.00,8686.94,0.00,0.00,3404.34,2903.22,5926.95,5926.95,2201.61,0.00,1451.61,1451.61,0.00,1285.71,0.00,0.00
3260798904,192,32046,2,7,3,4,5,2,3,2,4,4,4,81981.88,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.00,2032.25,10876.91,10876.91,10297.70,0.00,6344.48,5806.45,1248.38,0.00,0.00,0.00
3260798905,192,32046,2,1,1,0,1,0,1,0,1,1,1,28048.06,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.00,0.00,6351.29,3137.02,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00


In [90]:
# see some sample
df_viviendas.sample(10).to_csv("viviendas_sample.csv")

In [109]:
import pandas as pd
import numpy as np

def weighted_mode(df, columns_to_mode, weighting_column):
    # Initialize a DataFrame to store the mode values
    mode_values = pd.DataFrame(columns=['Column', 'Weighted_Mode'])

    # Calculate weighted mode for each column in columns_to_mode
    for col in columns_to_mode:
        # Group by the column values and calculate weighted counts
        weighted_counts = df.groupby(col)[weighting_column].sum()

        # Find the index of maximum weighted count
        mode_index = weighted_counts.idxmax()

        # Get the mode value
        mode_value = df.loc[df[col] == mode_index, col].iloc[0]

        # Append mode value to mode_values DataFrame
        mode_values.loc[len(mode_values)] = [col, mode_value]

    # return a Series
    return mode_values.set_index('Column')['Weighted_Mode']

# Example usage:
# Assuming df is your DataFrame, columns_to_mode are the columns you want to find mode for,
# and weighting_column is the column containing the weights.
# Replace 'df', 'columns_to_mode', and 'weighting_column' with your actual DataFrame and column names.

# Sample DataFrame (you would replace this with your actual DataFrame)
data = {
    'A': [1, 2, 2, 3, 3, np.nan],
    'B': [4, 5, 5, 6, 6, 6],
    'factor': [1, 2, 1, 2, 1, 3]  # Weighting factor column
}

df = pd.DataFrame(data)

# Columns to find mode for
columns_to_mode = ['A', 'B']

# Weighting column
weighting_column = 'factor'

# Calculate weighted mode for each column
result = weighted_mode(df, columns_to_mode, weighting_column)
result


Column
A    2.0
B    6.0
Name: Weighted_Mode, dtype: float64

In [100]:
df

Unnamed: 0,A,B,factor
0,1,4,1
1,2,5,2
2,2,5,1
3,3,6,2
4,3,6,1
