# ENIGH Database

@roman

26 June, 2024

Code to create an ETL of the Enigh database, which is a database of the National Institute of Statistics and Geography (INEGI) of Mexico. The database contains information on the income and expenses of Mexican households.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from tqdm import tqdm

In [2]:
# Settings
# show 100 columns in pandas
pd.set_option('display.max_columns', 500)

---
# Concentrado Hogar

The data diccionary is found [here](https://www.inegi.org.mx/rnm/index.php/catalog/685/data-dictionary/F28?file_name=concentradohogar)

## S1: Extract & Transform

In [3]:
# function to extract, transform and load each enigh file
def weighted_mean(df, cols, weight_col):
    return (df[cols].mul(df[weight_col], axis=0).sum() / df[weight_col].sum())


def weighted_mode(df, columns_to_mode, weighting_column):
    # Initialize a DataFrame to store the mode values
    mode_values = pd.DataFrame(columns=['Column', 'Weighted_Mode'])

    # Calculate weighted mode for each column in columns_to_mode
    for col in columns_to_mode:
        # Group by the column values and calculate weighted counts
        weighted_counts = df.groupby(col)[weighting_column].sum()

        # Find the index of maximum weighted count
        mode_index = weighted_counts.idxmax()

        # Get the mode value
        mode_value = df.loc[df[col] == mode_index, col].iloc[0]

        # Append mode value to mode_values DataFrame
        mode_values.loc[len(mode_values)] = [col, mode_value]

    # return a Series
    return mode_values.set_index('Column')['Weighted_Mode']


def get_statistics(df, cols_mode, cols_mean, weight_col):
    # s1: get weighted mode
    mode_values = weighted_mode(df, cols_mode, weight_col)

    # s2: get weighted mean
    mean_values = weighted_mean(df, cols_mean, weight_col)

    # s3: get total viviendas
    total_hogares = df[weight_col].sum()

    # return all the values in a pd series
    melt_values = pd.concat([mode_values, mean_values])
    melt_values['total_hogares'] = total_hogares

    return melt_values


def get_enigh_concentrado_hogar(file, cols_mode, cols_mean):
    # s1:read file
    df = pd.read_csv(file, na_values=[' '])
    
    # s2: subset and wrangle
    df = (
        df
        .query("foliohog == 1")
        .reset_index(drop=True)
        .assign(
            ubica_geo=lambda x: x["ubica_geo"].astype(str).str.zfill(5)
        )
    )

    # s3: get weighted mean
    table = (
        df
        .groupby("ubica_geo", as_index=False)
        .apply(
            lambda x: get_statistics(x, cols_mode, cols_mean, "factor"),
            include_groups=False
        )
    )
    
    return table


In [4]:
# get params
years_enigh = [2018, 2020, 2022]

cols_mode = [
    'est_dis', 'clase_hog', 'sexo_jefe',
]
cols_mean = [
    'tam_loc', 'est_socio',  'edad_jefe',
    'educa_jefe', 'tot_integ', 'hombres', 'mujeres',
    'mayores', 'menores', 'p12_64', 'p65mas', 'ocupados',
    'percep_ing', 'perc_ocupa', 'ing_cor', 'ingtrab',
    'trabajo', 'sueldos', 'horas_extr', 'comisiones',
    'aguinaldo', 'indemtrab', 'otra_rem', 'remu_espec',
    'negocio', 'noagrop', 'industria', 'comercio', 'servicios',
    'agrope', 'agricolas', 'pecuarios', 'reproducc', 'pesca',
    'otros_trab', 'rentas', 'utilidad', 'arrenda', 'transfer',
    'jubilacion', 'becas', 'donativos', 'remesas', 'bene_gob',
    'transf_hog', 'trans_inst', 'estim_alqu', 'otros_ing', 'gasto_mon',
    'alimentos', 'ali_dentro', 'cereales', 'carnes', 'pescado',
    'leche', 'huevo', 'aceites', 'tuberculo', 'verduras', 'frutas',
    'azucar', 'cafe', 'especias', 'otros_alim', 'bebidas', 'ali_fuera',
    'tabaco', 'vesti_calz', 'vestido', 'calzado', 'vivienda',
    'alquiler', 'pred_cons', 'agua', 'energia', 'limpieza', 'cuidados',
    'utensilios', 'enseres', 'salud', 'atenc_ambu', 'hospital',
    'medicinas', 'transporte', 'publico', 'foraneo', 'adqui_vehi',
    'mantenim', 'refaccion', 'combus', 'comunica', 'educa_espa',
    'educacion', 'esparci', 'paq_turist', 'personales', 'cuida_pers',
    'acces_pers', 'otros_gas', 'transf_gas', 'percep_tot', 'retiro_inv',
    'prestamos', 'otras_perc', 'ero_nm_viv', 'ero_nm_hog', 'erogac_tot',
    'cuota_viv', 'mater_serv', 'material', 'servicio', 'deposito',
    'prest_terc', 'pago_tarje', 'deudas', 'balance', 'otras_erog',
]

FILE_TEMPLATE = 'enigh{enigh_year}_ns_concentradohogar_csv.zip'
ENIGH_ROOT = "../../data/catalogues/adamuz_data/enigh_{enigh_year}/"

In [5]:
# get all tables
list_tables = []
for year in tqdm(years_enigh):
    # get file
    dir_file = ENIGH_ROOT.format(enigh_year=year)
    file = dir_file + FILE_TEMPLATE.format(enigh_year=year)

    # unzip
    os.system(f"unzip -o {file} -d {dir_file}")

    # get table
    table = get_enigh_concentrado_hogar(
        dir_file + 'concentradohogar.csv',
        cols_mode,
        cols_mean
        )

    # add year
    table["year"] = year

    # append
    list_tables.append(table)


  0%|          | 0/3 [00:00<?, ?it/s]

Archive:  ../../data/catalogues/adamuz_data/enigh_2018/enigh2018_ns_concentradohogar_csv.zip
  inflating: ../../data/catalogues/adamuz_data/enigh_2018/concentradohogar.csv  
  inflating: ../../data/catalogues/adamuz_data/enigh_2018/nota_bases_datos_enigh2018_ns.txt  


 33%|███▎      | 1/3 [00:04<00:09,  4.64s/it]

Archive:  ../../data/catalogues/adamuz_data/enigh_2020/enigh2020_ns_concentradohogar_csv.zip
  inflating: ../../data/catalogues/adamuz_data/enigh_2020/concentradohogar.csv  


 67%|██████▋   | 2/3 [00:09<00:04,  4.76s/it]

Archive:  ../../data/catalogues/adamuz_data/enigh_2022/enigh2022_ns_concentradohogar_csv.zip
  inflating: ../../data/catalogues/adamuz_data/enigh_2022/concentradohogar.csv  


100%|██████████| 3/3 [00:14<00:00,  4.82s/it]


In [6]:
# concat
df_concentradohogar_all = (
    pd.concat(list_tables)
    .sort_values(["ubica_geo", "year"], ignore_index=True)
    )
df_concentradohogar_all.shape

(3218, 123)

## S2: EDA

In [7]:
# see sample
df_concentradohogar_all.head()

Unnamed: 0,ubica_geo,est_dis,clase_hog,sexo_jefe,tam_loc,est_socio,edad_jefe,educa_jefe,tot_integ,hombres,mujeres,mayores,menores,p12_64,p65mas,ocupados,percep_ing,perc_ocupa,ing_cor,ingtrab,trabajo,sueldos,horas_extr,comisiones,aguinaldo,indemtrab,otra_rem,remu_espec,negocio,noagrop,industria,comercio,servicios,agrope,agricolas,pecuarios,reproducc,pesca,otros_trab,rentas,utilidad,arrenda,transfer,jubilacion,becas,donativos,remesas,bene_gob,transf_hog,trans_inst,estim_alqu,otros_ing,gasto_mon,alimentos,ali_dentro,cereales,carnes,pescado,leche,huevo,aceites,tuberculo,verduras,frutas,azucar,cafe,especias,otros_alim,bebidas,ali_fuera,tabaco,vesti_calz,vestido,calzado,vivienda,alquiler,pred_cons,agua,energia,limpieza,cuidados,utensilios,enseres,salud,atenc_ambu,hospital,medicinas,transporte,publico,foraneo,adqui_vehi,mantenim,refaccion,combus,comunica,educa_espa,educacion,esparci,paq_turist,personales,cuida_pers,acces_pers,otros_gas,transf_gas,percep_tot,retiro_inv,prestamos,otras_perc,ero_nm_viv,ero_nm_hog,erogac_tot,cuota_viv,mater_serv,material,servicio,deposito,prest_terc,pago_tarje,deudas,balance,otras_erog,total_hogares,year
0,1001,2.0,2.0,1.0,1.313242,2.830593,50.089514,6.479373,3.721097,1.780415,1.940683,2.999458,0.72164,2.691494,0.307964,1.803984,2.391824,1.774901,63956.301534,43070.626604,37697.35196,31202.010734,495.39457,1317.133005,2206.442086,252.169473,852.605979,1371.596112,3683.76857,3046.691033,415.123858,1268.234312,1363.332863,637.077537,0.783675,633.1576,3.136262,0.0,1689.506073,4540.471227,3606.090472,934.380755,9947.056421,5419.655226,159.891602,989.171367,295.72428,299.683788,1599.486996,1183.443163,6351.823899,46.323383,39119.615453,11954.944213,8940.815992,1409.190847,1791.346954,141.805814,1001.070483,256.830314,64.707445,116.593703,888.009195,411.459735,58.34193,74.699891,70.812195,1791.369493,864.577994,2928.15608,85.972141,1740.256643,1088.568451,651.688192,3114.482487,1146.30137,175.330648,644.507169,1148.3433,2429.241635,1727.998294,274.058488,427.184852,1639.883555,1227.447356,197.869469,214.566731,8381.883266,1171.234546,335.669626,1267.892256,3749.066387,355.85374,3393.212646,1858.020452,5514.06328,3462.857938,1320.678289,730.527053,2968.421188,2140.535891,117.545545,710.339753,1376.439186,4509.766233,906.226031,512.698217,1570.168785,10.959857,1509.713343,8311.22751,1231.299092,479.004866,209.606931,269.397936,2451.678069,257.427359,2074.223392,787.065174,316.65008,713.879477,234164.0,2018
1,1001,2.0,2.0,1.0,1.191967,2.848673,50.307018,6.518359,3.511532,1.67006,1.841472,2.877479,0.634053,2.578268,0.299212,1.696833,2.29508,1.642091,60912.502414,39994.510208,36177.707945,30166.188042,278.128582,943.278159,2202.367386,323.549054,1224.366678,1039.830043,2677.040577,2616.953301,378.597045,837.298788,1401.057468,60.087276,2.420698,57.607004,0.059573,0.0,1139.761686,3339.710125,2725.802014,613.90811,10562.410583,6146.652668,104.825597,1051.749094,361.69834,898.020269,1472.286234,527.178381,6937.108693,78.762806,37361.636895,12577.730151,10739.532593,1660.287784,1981.896889,180.714402,1105.435781,366.78371,92.778479,130.16186,1079.879732,523.272579,81.819202,84.717259,85.751375,2324.198133,1041.835407,1720.38709,117.810468,1351.179617,878.251851,472.927765,3523.484832,1395.231161,184.045724,789.58759,1154.620356,2287.033608,1552.910254,271.907183,462.216171,1778.678539,1277.235951,174.258678,327.18391,8257.743571,929.733346,119.353027,1820.589293,3162.720259,342.765384,2819.954876,2225.347646,3700.139892,2536.653595,776.041132,387.445165,2749.710175,1969.424468,112.600106,667.685601,1135.93651,4979.077693,1640.427443,570.14062,1558.225213,42.339486,1167.944931,10520.616231,1834.717706,532.231593,265.539319,266.692273,4694.411677,129.864933,1556.153613,757.984491,463.801093,551.451125,267473.0,2020
2,1001,2.0,2.0,1.0,1.260126,2.843422,50.853007,6.667584,3.485117,1.630896,1.854221,2.844748,0.640369,2.536844,0.307904,1.688633,2.319125,1.663681,85273.9755,54618.627742,46768.736235,38819.779869,589.695894,1155.565731,2649.834336,264.17195,1727.459018,1562.229437,6391.428561,5888.420235,766.776615,2344.601644,2777.041975,503.008326,0.0,503.008326,0.0,0.0,1458.462947,6910.592874,5149.578103,1761.014771,14780.159263,7191.002235,108.277001,1569.589724,673.929321,1579.764943,2139.159845,1518.436194,8849.450081,115.14554,47870.29216,15429.203656,12588.366776,2001.663323,2403.058055,175.872044,1287.168127,468.580378,134.253503,186.934718,1206.943568,589.087442,95.506481,104.592349,95.286204,2631.073397,1208.347185,2707.329485,133.507395,1836.530237,1121.475143,715.055094,4335.68938,1732.631168,335.986748,842.009523,1425.061941,3344.99517,2252.075444,580.827866,512.091861,1720.222464,1223.844712,246.189409,250.188344,9865.144573,1220.680673,311.682692,1854.674742,4071.129026,399.856847,3671.272179,2406.977439,6212.240008,3870.336809,1398.845743,943.057456,3802.91017,2583.958871,155.036877,1063.914422,1323.356501,6767.566527,2160.745485,464.943617,2153.185749,327.203453,1661.488223,13991.693495,1978.273587,288.019917,145.017387,143.00253,5997.86469,156.027955,2643.188077,744.784574,457.460912,1726.073781,267713.0,2022
3,1002,10.0,2.0,1.0,3.502535,2.0,49.600994,4.982965,3.604238,1.983979,1.62026,2.760799,0.843439,2.500406,0.260393,1.721456,2.213243,1.621882,56859.42945,24675.146233,22639.608039,18628.511903,886.846242,0.0,1274.586249,223.188603,460.445814,1166.029227,1191.927595,762.882436,295.869884,236.925137,230.087414,429.045159,164.702685,264.342474,0.0,0.0,843.610599,23675.976623,23631.338923,44.6377,5451.38016,443.291597,172.418478,802.779294,1569.348306,624.306757,1420.621389,418.614339,3056.926434,0.0,21993.396234,8219.964496,6756.400139,1514.529793,1135.884369,40.070674,525.476606,283.662692,113.508012,157.457868,937.194774,351.052311,61.911357,16.976191,52.826062,900.659416,665.190014,1397.342622,66.221735,1495.82783,1033.302822,462.525008,1339.606533,104.80488,16.754183,205.183786,1012.863684,1680.446001,1177.354195,292.371691,210.720115,552.17986,365.701076,100.468288,86.010496,3832.240098,551.075067,305.880559,525.023184,1672.733856,124.451836,1548.28202,777.527433,2771.949981,2014.941112,563.664797,193.344071,1748.0378,1547.277353,71.133487,129.626959,353.143635,1629.274389,646.414495,139.68577,38.114105,0.0,805.060019,3019.599582,0.0,474.202007,281.918093,192.283914,832.381317,10.572054,63.432649,153.873705,1376.595836,108.542015,9862.0,2018
4,1002,11.0,2.0,1.0,4.0,2.0,51.105312,5.020217,4.385703,2.075317,2.310386,3.304836,1.080867,2.881739,0.423097,1.839984,2.478858,1.762685,42676.932291,28949.53069,23455.044724,20353.21711,289.514035,100.071924,1067.784121,44.078525,344.034133,1256.344874,4318.092303,2342.715008,772.794728,1331.904181,238.016099,1975.377295,725.629626,1249.747669,0.0,0.0,1176.393663,2627.893293,2461.790284,166.103009,7652.633981,1284.361472,76.407115,1160.62607,1561.976716,1818.070979,1634.229198,116.96243,3409.293809,37.580518,24535.163513,10057.882685,8938.334839,1917.737784,1633.17432,78.674667,729.702141,359.271783,149.51401,163.616924,1091.038517,265.520017,143.136826,57.307307,61.904314,1377.758239,909.97799,1115.317676,4.23017,1494.923783,1034.875998,460.047785,1404.263898,60.937721,45.485267,296.539376,1001.301534,1987.917123,1279.63468,241.193533,467.08891,1378.318923,862.202386,135.480009,380.636527,4183.672015,463.693001,13.089827,556.921541,1915.928905,110.701546,1805.227359,1234.038742,1400.986002,1074.667875,231.7884,94.529726,2049.032189,1764.058932,34.384234,250.589024,578.166895,3222.72366,741.477315,229.954656,823.146531,0.0,1428.145157,6509.449112,0.0,411.181687,284.149183,127.032504,3711.725764,105.886901,96.435736,477.849499,1453.620311,252.749214,7568.0,2020


In [8]:
# shape
df_concentradohogar_all.shape

(3218, 123)

In [9]:
# see empty values
df_concentradohogar_all.isnull().sum()[df_concentradohogar_all.isnull().sum() > 0]

Series([], dtype: int64)

In [10]:
# count # of ubica_geo
df_concentradohogar_all["ubica_geo"].value_counts().value_counts()

count
3    664
1    502
2    362
Name: count, dtype: int64

In [11]:
# describe
df_concentradohogar_all.describe()

Unnamed: 0,est_dis,clase_hog,sexo_jefe,tam_loc,est_socio,edad_jefe,educa_jefe,tot_integ,hombres,mujeres,mayores,menores,p12_64,p65mas,ocupados,percep_ing,perc_ocupa,ing_cor,ingtrab,trabajo,sueldos,horas_extr,comisiones,aguinaldo,indemtrab,otra_rem,remu_espec,negocio,noagrop,industria,comercio,servicios,agrope,agricolas,pecuarios,reproducc,pesca,otros_trab,rentas,utilidad,arrenda,transfer,jubilacion,becas,donativos,remesas,bene_gob,transf_hog,trans_inst,estim_alqu,otros_ing,gasto_mon,alimentos,ali_dentro,cereales,carnes,pescado,leche,huevo,aceites,tuberculo,verduras,frutas,azucar,cafe,especias,otros_alim,bebidas,ali_fuera,tabaco,vesti_calz,vestido,calzado,vivienda,alquiler,pred_cons,agua,energia,limpieza,cuidados,utensilios,enseres,salud,atenc_ambu,hospital,medicinas,transporte,publico,foraneo,adqui_vehi,mantenim,refaccion,combus,comunica,educa_espa,educacion,esparci,paq_turist,personales,cuida_pers,acces_pers,otros_gas,transf_gas,percep_tot,retiro_inv,prestamos,otras_perc,ero_nm_viv,ero_nm_hog,erogac_tot,cuota_viv,mater_serv,material,servicio,deposito,prest_terc,pago_tarje,deudas,balance,otras_erog,total_hogares,year
count,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0
mean,306.894655,2.024549,1.018334,3.126627,1.738642,51.777733,5.09802,3.626452,1.756526,1.869925,2.874127,0.752324,2.516746,0.357382,1.737205,2.361901,1.659605,42832.739771,27342.263322,21022.137858,18360.965488,142.262048,387.370905,918.352346,100.772285,375.463892,736.950893,5144.973552,3571.022166,965.558829,1307.249235,1298.214102,1573.951386,839.207739,648.441564,35.62828,50.673804,1175.151912,2560.332132,2155.66974,404.662392,8356.073,2834.563251,69.764822,1111.201224,842.251982,1682.114427,1294.96088,521.216414,4540.223875,33.847443,26978.456638,10722.503389,9178.216533,1815.776905,1985.935418,191.479814,810.754164,399.759513,179.801073,158.835174,1166.499288,364.515518,132.1625,100.298404,85.964536,1004.160293,782.273934,1500.58238,43.704476,1089.014242,663.350024,425.664218,2143.950353,492.10169,105.836001,233.313465,1312.699197,1745.702951,1326.950768,154.707186,264.044997,1090.296698,805.042192,113.984523,171.269983,5031.318885,1135.877534,138.677962,654.763285,1912.605288,190.388716,1722.216572,1189.394816,2289.39638,1641.266439,485.30151,162.828431,2108.788971,1643.044461,70.585569,395.158941,757.484768,3404.853373,887.634839,423.18689,563.459168,41.937477,1488.634998,5091.98567,281.405813,382.341654,246.206399,136.135255,2710.776539,81.868545,453.112205,378.826458,479.237772,324.416685,32945.253263,2020.084525
std,145.922871,0.170082,0.134178,0.94244,0.599341,4.652554,1.114301,0.55671,0.310097,0.320843,0.361939,0.301188,0.414995,0.163814,0.338901,0.363624,0.302988,22079.261206,13624.868297,12552.947856,10571.895894,273.596572,730.809214,1129.441988,492.834326,561.607693,1319.945559,5602.526469,3210.407989,1566.704211,2023.140395,1581.801258,4704.270603,3594.124805,2272.837941,411.593423,383.87096,1238.658581,10020.006994,9002.40879,1575.824722,4809.179743,3930.437381,157.433559,902.178519,1471.656998,1033.746098,874.195265,1051.108887,2641.8804,150.033695,11049.292672,3424.128936,2527.859401,534.205421,847.653631,212.697699,352.802879,158.582726,108.802514,86.709701,403.089733,227.235148,96.496406,76.533667,52.637713,751.658398,458.862095,1432.257307,72.741868,613.473681,415.077051,245.074229,1598.960558,935.987191,290.270275,225.581083,603.345665,856.83661,670.747769,151.344192,246.698419,1082.226831,906.164427,373.188263,179.876313,2801.542113,866.237,170.5982,1382.168033,1347.640313,208.86354,1233.632288,701.579133,1836.578691,1455.647583,406.859784,360.552993,954.593303,597.307267,77.137441,497.376622,941.693618,3724.412643,2114.884268,876.415986,1636.452589,249.778868,1945.518702,8723.316637,679.453627,686.206314,469.394427,329.565881,7722.201818,289.404396,1291.502099,567.986961,1018.181229,1069.963643,60317.776856,1.624437
min,2.0,1.0,1.0,1.0,1.0,35.578947,1.647059,1.823529,0.823529,0.5,1.666667,0.0,0.769231,0.0,0.461538,1.23518,0.461538,7025.1776,2048.4985,220.3885,220.3885,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,295.08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1011.8672,0.0,4695.6392,2536.177368,2350.776316,217.0236,0.0,0.0,0.0,0.0,0.0,0.0,23.444706,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.03125,0.0,0.0,0.0,2.902609,262.213182,262.213182,0.0,0.0,0.0,0.0,0.0,0.0,60.941579,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,216.839545,216.839545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,596.0,2018.0
25%,199.0,2.0,1.0,2.516317,1.0,48.70524,4.317911,3.274568,1.55544,1.662957,2.642101,0.553403,2.263158,0.242709,1.517938,2.127926,1.459978,29889.115003,18309.341903,12378.825616,11072.997178,0.0,0.0,254.390377,0.0,18.631791,117.596704,2607.497615,1640.168641,135.239157,299.38593,278.24515,58.22265,0.0,0.0,0.0,0.0,489.940704,33.089443,0.0,0.0,5258.913105,354.136115,0.0,524.537711,0.0,913.734682,702.612945,106.035562,3047.424902,0.0,19930.566513,8467.494607,7415.360074,1467.966364,1400.911154,53.783281,566.324939,292.161705,105.219149,99.397628,875.613971,202.036786,71.306652,49.629522,50.750658,463.588681,435.558929,605.231759,0.0,673.740754,388.667558,254.205029,1228.604431,0.0,11.246043,60.800604,963.778083,1216.694076,948.304753,56.428613,97.353886,474.079375,314.354991,0.0,61.907056,3174.615883,482.369617,30.184493,0.0,931.085995,53.65583,827.607753,686.007408,1096.973883,691.06425,241.458912,0.0,1469.541017,1234.771067,28.305885,84.401607,222.672356,1658.39046,176.992429,19.426037,3.269956,0.0,771.909047,1914.533984,0.0,46.078569,26.493325,0.0,556.193607,0.0,0.0,0.0,59.941534,0.0,7035.75,2018.0
50%,299.0,2.0,1.0,3.298769,2.0,51.568747,5.03311,3.567275,1.727273,1.841641,2.850307,0.713246,2.512498,0.341979,1.70365,2.333333,1.640999,39869.456376,25822.063305,19296.596791,17100.243792,38.408246,88.657252,679.474123,0.0,197.241986,427.0986,4079.346108,2878.009426,577.544267,813.918873,921.155182,583.50728,172.150524,116.41754,0.0,0.0,953.984703,723.200522,413.322825,95.01625,7512.118099,1731.967372,16.429932,929.998177,250.751114,1542.821228,1164.594194,275.595066,3929.271023,0.0,25549.982164,10372.444365,9040.695648,1781.743083,1919.664698,125.585412,797.670121,377.515579,155.110305,147.409917,1129.385507,327.783708,107.632078,83.322976,77.78425,823.610429,701.263688,1180.660302,12.928432,991.937261,591.09785,383.660923,1826.589343,177.345808,43.5625,177.5,1254.396328,1605.797474,1208.304826,117.449471,209.771111,834.467063,577.86723,15.14107,129.732027,4533.068571,965.510262,86.330372,170.914101,1649.844215,141.823022,1486.880278,1093.465833,1871.105318,1290.694348,407.026774,44.933181,1954.620217,1589.027351,55.219715,254.782758,523.730182,2738.933253,507.548775,195.652,104.097826,0.0,1245.280821,3695.403654,0.0,174.538422,110.093782,37.408616,1623.270476,14.26625,101.574398,196.410853,215.5177,85.736678,13650.0,2020.0
75%,433.0,2.0,1.0,4.0,2.0,54.665891,5.887943,3.930711,1.928207,2.048896,3.090699,0.9,2.764706,0.45,1.937673,2.565217,1.85,51979.013228,34557.327656,27780.953793,24155.541584,173.152685,518.174376,1317.720547,37.420984,517.649674,918.850418,6197.604471,4602.488804,1254.863713,1571.958105,1717.450784,1807.15946,779.243506,579.18145,0.0,0.0,1525.109045,2392.000718,1861.381553,417.75019,10432.399989,4123.632094,79.134144,1467.921669,1054.334789,2267.545107,1702.184372,611.378844,5228.351652,10.460019,32365.405007,12636.303887,10773.701152,2122.136031,2466.701461,257.142232,1033.432579,486.214317,228.173664,201.349359,1410.714451,490.002623,162.626603,131.388522,110.655061,1386.760253,1045.150099,1977.874937,59.590384,1387.516859,843.723982,545.374897,2684.149462,636.671948,110.329848,344.777561,1577.770019,2095.839645,1541.562586,204.361144,363.639966,1342.229632,982.265733,92.627692,225.810786,6356.25861,1603.51335,186.759146,809.768713,2626.332944,258.371423,2364.476373,1565.653336,2996.003392,2224.12916,619.744899,184.787935,2570.163068,1981.441259,92.875114,532.505918,1026.92875,4167.929956,1030.580097,532.864064,489.13,3.161105,1887.595572,6413.763542,255.850618,437.189863,269.091982,146.049614,3292.116281,82.922562,489.557663,524.609937,519.523964,344.537772,28826.75,2022.0
max,560.0,3.0,2.0,4.0,4.0,72.368421,9.344718,6.777778,3.5625,3.421053,5.05,2.578947,4.55,1.6,3.8125,4.5625,3.0,572613.797646,191517.9725,141050.832806,140536.798588,5366.189524,9456.521667,33697.789019,10961.427185,12808.664,49188.752981,174002.7075,43576.716522,33887.6745,43411.645652,22387.291304,174002.7075,151022.995,72633.545833,20192.037143,12165.959545,32688.403687,374277.694588,301291.03504,72986.659548,80796.708462,80440.880769,3124.470497,15119.818552,16958.1505,10200.305556,11380.839483,23116.2405,40926.941033,3409.628571,198564.803559,53320.775816,23501.893873,5018.0915,8206.536555,1818.936842,2869.491789,1433.564167,925.709545,1158.525699,3975.332,1647.132222,1029.046957,610.709,605.48134,5422.739256,3433.241818,29682.632442,676.392174,8465.745762,5639.592,3146.540559,31567.734613,15105.459945,11225.096908,2583.217391,5974.432267,13679.22329,11005.484394,2678.53188,4009.788333,16597.893553,12904.743478,11957.522178,2827.603889,53276.999366,6400.39625,2040.118824,36905.93249,12453.499177,3799.247273,11114.038562,7120.0,32959.123901,23815.482807,8998.59896,5576.086,14088.718844,7771.963133,1888.085714,8161.349412,15232.414611,93552.134937,82188.836819,23521.026087,27214.896111,7072.6105,89973.999679,383153.34156,9495.0,11683.630909,10083.959091,9356.410909,369157.307305,10824.630678,38326.31105,12466.388882,20326.75,40983.606111,586990.0,2022.0


## S3: Explicit Nulls

In [12]:
# generate explicit nulls for ubica_geo and year
index_cols = ["ubica_geo", "year"]
new_index = pd.MultiIndex.from_product(
    [df_concentradohogar_all["ubica_geo"].unique(), df_concentradohogar_all["year"].unique()],
    names=index_cols
)

# reindex
df_concentradohogar_all = (
    df_concentradohogar_all
    .set_index(index_cols)
    .reindex(new_index)
    .reset_index()
    )

In [13]:
# count # of ubica_geo
df_concentradohogar_all["ubica_geo"].value_counts().value_counts()

count
3    1528
Name: count, dtype: int64

In [14]:
# see ubica geo 01004
df_concentradohogar_all.query("ubica_geo == '01004'")

Unnamed: 0,ubica_geo,year,est_dis,clase_hog,sexo_jefe,tam_loc,est_socio,edad_jefe,educa_jefe,tot_integ,hombres,mujeres,mayores,menores,p12_64,p65mas,ocupados,percep_ing,perc_ocupa,ing_cor,ingtrab,trabajo,sueldos,horas_extr,comisiones,aguinaldo,indemtrab,otra_rem,remu_espec,negocio,noagrop,industria,comercio,servicios,agrope,agricolas,pecuarios,reproducc,pesca,otros_trab,rentas,utilidad,arrenda,transfer,jubilacion,becas,donativos,remesas,bene_gob,transf_hog,trans_inst,estim_alqu,otros_ing,gasto_mon,alimentos,ali_dentro,cereales,carnes,pescado,leche,huevo,aceites,tuberculo,verduras,frutas,azucar,cafe,especias,otros_alim,bebidas,ali_fuera,tabaco,vesti_calz,vestido,calzado,vivienda,alquiler,pred_cons,agua,energia,limpieza,cuidados,utensilios,enseres,salud,atenc_ambu,hospital,medicinas,transporte,publico,foraneo,adqui_vehi,mantenim,refaccion,combus,comunica,educa_espa,educacion,esparci,paq_turist,personales,cuida_pers,acces_pers,otros_gas,transf_gas,percep_tot,retiro_inv,prestamos,otras_perc,ero_nm_viv,ero_nm_hog,erogac_tot,cuota_viv,mater_serv,material,servicio,deposito,prest_terc,pago_tarje,deudas,balance,otras_erog,total_hogares
9,1004,2018,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
10,1004,2020,11.0,2.0,1.0,4.0,2.0,53.095238,4.952381,4.666667,2.095238,2.571429,3.52381,1.142857,2.952381,0.571429,1.666667,3.095238,1.619048,50108.64,29205.945714,20554.909048,16689.658571,16.304286,0.0,1425.112381,0.0,863.847619,1559.98619,6453.678571,3989.44,0.0,58.229524,3931.210476,2464.238571,52.406667,2411.831905,0.0,0.0,2197.358095,0.0,0.0,0.0,17151.685238,6183.724286,58.229524,579.967619,3232.916667,2820.648571,2437.044762,1839.15381,3442.391429,308.617619,29510.594286,13492.470952,11801.460952,2150.163333,1509.784286,0.0,1468.144762,726.722857,230.506667,200.199524,1711.491905,512.132381,245.506667,91.223333,83.874762,1893.665238,978.045238,1678.765238,12.244762,815.899524,440.392381,375.507143,2110.618571,69.124286,229.52381,320.0,1491.970476,2548.0,2275.535238,253.831429,18.633333,405.97,376.81,10.993333,18.166667,5659.382381,745.705238,10.946667,1271.738571,2076.111429,71.505238,2004.60619,1554.880476,1202.63,493.482857,476.228095,232.919048,2283.779524,1708.030476,31.21,544.539048,991.843333,2058.328571,638.197619,698.756667,15.139048,0.0,706.235238,6272.042857,0.0,1411.489524,1276.396667,135.092857,4227.48,451.396667,0.0,0.0,0.0,181.676667,1155.0
11,1004,2022,11.0,2.0,1.0,4.0,2.0,55.555556,4.166667,4.5,2.111111,2.388889,3.333333,1.166667,2.833333,0.5,1.777778,2.666667,1.722222,42181.936667,32637.283333,26569.804444,23682.403333,288.609444,0.0,1494.448333,0.0,179.971111,924.372222,4776.420556,1205.441667,0.0,41.331111,1164.110556,3570.978889,972.763889,2598.215,0.0,0.0,1291.058333,0.0,0.0,0.0,6420.463333,0.0,132.596667,1272.037222,379.229444,2604.846111,1673.525,358.228889,3124.19,0.0,26342.328889,10408.444444,9682.019444,2974.238333,949.277778,117.856667,541.420556,427.138333,94.998889,210.711667,1291.761667,419.638889,47.142222,7.142778,14.285,1331.424444,1254.982222,610.711667,115.713333,847.595556,425.861111,421.734444,2041.126667,166.666667,29.375,423.333333,1421.751667,1961.920556,1568.127222,173.911667,219.881667,1284.121667,1086.679444,0.0,197.442222,4915.933889,1071.417778,70.765,0.0,2943.806667,208.862778,2734.943889,829.944444,1176.49,657.202778,329.07,190.217222,3333.295,2838.871111,2.825556,491.598333,373.401111,637.091111,0.0,0.0,0.0,0.0,637.091111,1127.683333,0.0,2.732222,2.732222,0.0,1124.951111,0.0,0.0,0.0,0.0,0.0,1494.0


In [15]:
# fill na with strategy ffill and then bfill by ubica_geo
df_concentradohogar_all = (
    df_concentradohogar_all
    .sort_values(["ubica_geo", "year"])
    .groupby("ubica_geo")
    .apply(lambda x: x.ffill().bfill())
    .reset_index(drop=True)
    )

  .apply(lambda x: x.ffill().bfill())


In [16]:
# see ubica geo 01004
df_concentradohogar_all.query("ubica_geo == '01004'")

Unnamed: 0,ubica_geo,year,est_dis,clase_hog,sexo_jefe,tam_loc,est_socio,edad_jefe,educa_jefe,tot_integ,hombres,mujeres,mayores,menores,p12_64,p65mas,ocupados,percep_ing,perc_ocupa,ing_cor,ingtrab,trabajo,sueldos,horas_extr,comisiones,aguinaldo,indemtrab,otra_rem,remu_espec,negocio,noagrop,industria,comercio,servicios,agrope,agricolas,pecuarios,reproducc,pesca,otros_trab,rentas,utilidad,arrenda,transfer,jubilacion,becas,donativos,remesas,bene_gob,transf_hog,trans_inst,estim_alqu,otros_ing,gasto_mon,alimentos,ali_dentro,cereales,carnes,pescado,leche,huevo,aceites,tuberculo,verduras,frutas,azucar,cafe,especias,otros_alim,bebidas,ali_fuera,tabaco,vesti_calz,vestido,calzado,vivienda,alquiler,pred_cons,agua,energia,limpieza,cuidados,utensilios,enseres,salud,atenc_ambu,hospital,medicinas,transporte,publico,foraneo,adqui_vehi,mantenim,refaccion,combus,comunica,educa_espa,educacion,esparci,paq_turist,personales,cuida_pers,acces_pers,otros_gas,transf_gas,percep_tot,retiro_inv,prestamos,otras_perc,ero_nm_viv,ero_nm_hog,erogac_tot,cuota_viv,mater_serv,material,servicio,deposito,prest_terc,pago_tarje,deudas,balance,otras_erog,total_hogares
9,1004,2018,11.0,2.0,1.0,4.0,2.0,53.095238,4.952381,4.666667,2.095238,2.571429,3.52381,1.142857,2.952381,0.571429,1.666667,3.095238,1.619048,50108.64,29205.945714,20554.909048,16689.658571,16.304286,0.0,1425.112381,0.0,863.847619,1559.98619,6453.678571,3989.44,0.0,58.229524,3931.210476,2464.238571,52.406667,2411.831905,0.0,0.0,2197.358095,0.0,0.0,0.0,17151.685238,6183.724286,58.229524,579.967619,3232.916667,2820.648571,2437.044762,1839.15381,3442.391429,308.617619,29510.594286,13492.470952,11801.460952,2150.163333,1509.784286,0.0,1468.144762,726.722857,230.506667,200.199524,1711.491905,512.132381,245.506667,91.223333,83.874762,1893.665238,978.045238,1678.765238,12.244762,815.899524,440.392381,375.507143,2110.618571,69.124286,229.52381,320.0,1491.970476,2548.0,2275.535238,253.831429,18.633333,405.97,376.81,10.993333,18.166667,5659.382381,745.705238,10.946667,1271.738571,2076.111429,71.505238,2004.60619,1554.880476,1202.63,493.482857,476.228095,232.919048,2283.779524,1708.030476,31.21,544.539048,991.843333,2058.328571,638.197619,698.756667,15.139048,0.0,706.235238,6272.042857,0.0,1411.489524,1276.396667,135.092857,4227.48,451.396667,0.0,0.0,0.0,181.676667,1155.0
10,1004,2020,11.0,2.0,1.0,4.0,2.0,53.095238,4.952381,4.666667,2.095238,2.571429,3.52381,1.142857,2.952381,0.571429,1.666667,3.095238,1.619048,50108.64,29205.945714,20554.909048,16689.658571,16.304286,0.0,1425.112381,0.0,863.847619,1559.98619,6453.678571,3989.44,0.0,58.229524,3931.210476,2464.238571,52.406667,2411.831905,0.0,0.0,2197.358095,0.0,0.0,0.0,17151.685238,6183.724286,58.229524,579.967619,3232.916667,2820.648571,2437.044762,1839.15381,3442.391429,308.617619,29510.594286,13492.470952,11801.460952,2150.163333,1509.784286,0.0,1468.144762,726.722857,230.506667,200.199524,1711.491905,512.132381,245.506667,91.223333,83.874762,1893.665238,978.045238,1678.765238,12.244762,815.899524,440.392381,375.507143,2110.618571,69.124286,229.52381,320.0,1491.970476,2548.0,2275.535238,253.831429,18.633333,405.97,376.81,10.993333,18.166667,5659.382381,745.705238,10.946667,1271.738571,2076.111429,71.505238,2004.60619,1554.880476,1202.63,493.482857,476.228095,232.919048,2283.779524,1708.030476,31.21,544.539048,991.843333,2058.328571,638.197619,698.756667,15.139048,0.0,706.235238,6272.042857,0.0,1411.489524,1276.396667,135.092857,4227.48,451.396667,0.0,0.0,0.0,181.676667,1155.0
11,1004,2022,11.0,2.0,1.0,4.0,2.0,55.555556,4.166667,4.5,2.111111,2.388889,3.333333,1.166667,2.833333,0.5,1.777778,2.666667,1.722222,42181.936667,32637.283333,26569.804444,23682.403333,288.609444,0.0,1494.448333,0.0,179.971111,924.372222,4776.420556,1205.441667,0.0,41.331111,1164.110556,3570.978889,972.763889,2598.215,0.0,0.0,1291.058333,0.0,0.0,0.0,6420.463333,0.0,132.596667,1272.037222,379.229444,2604.846111,1673.525,358.228889,3124.19,0.0,26342.328889,10408.444444,9682.019444,2974.238333,949.277778,117.856667,541.420556,427.138333,94.998889,210.711667,1291.761667,419.638889,47.142222,7.142778,14.285,1331.424444,1254.982222,610.711667,115.713333,847.595556,425.861111,421.734444,2041.126667,166.666667,29.375,423.333333,1421.751667,1961.920556,1568.127222,173.911667,219.881667,1284.121667,1086.679444,0.0,197.442222,4915.933889,1071.417778,70.765,0.0,2943.806667,208.862778,2734.943889,829.944444,1176.49,657.202778,329.07,190.217222,3333.295,2838.871111,2.825556,491.598333,373.401111,637.091111,0.0,0.0,0.0,0.0,637.091111,1127.683333,0.0,2.732222,2.732222,0.0,1124.951111,0.0,0.0,0.0,0.0,0.0,1494.0


## S4: Save

In [17]:
# see shape
df_concentradohogar_all.shape

(4584, 123)

In [18]:
# save
dir_save = "../../data/interim/"
df_concentradohogar_all.to_csv(dir_save + "concentrado_hogar_enigh.csv")

---
# Vivienda

INEGI's data diccionary is found [here](https://www.inegi.org.mx/rnm/index.php/catalog/685/data-dictionary/F13?file_name=viviendas)

Notes:
- A Hogar can have multiple viviendas

## S1: Extract & Transform

In [191]:
# function to extract, transform and load each enigh file
def weighted_mode(df, columns_to_mode, weighting_column):
    # Initialize a DataFrame to store the mode values
    mode_values = pd.DataFrame(columns=['Column', 'Weighted_Mode'])

    # Calculate weighted mode for each column in columns_to_mode
    for col in columns_to_mode:
        # Group by the column values and calculate weighted counts
        weighted_counts = df.groupby(col)[weighting_column].sum()

        # Find the index of maximum weighted count
        mode_index = weighted_counts.idxmax()

        # Get the mode value
        mode_value = df.loc[df[col] == mode_index, col].iloc[0]

        # Append mode value to mode_values DataFrame
        mode_values.loc[len(mode_values)] = [col, mode_value]

    # return a Series
    return mode_values.set_index('Column')['Weighted_Mode']


def weighted_mean(df, cols, weight_col):
    return (df[cols].mul(df[weight_col], axis=0).sum() / df[weight_col].sum())


def get_statistics(df, cols_mode, cols_mean, weight_col):
    # s1: get weighted mode
    mode_values = weighted_mode(df, cols_mode, weight_col)

    # s2: get weighted mean
    mean_values = weighted_mean(df, cols_mean, weight_col)

    # s3: get total viviendas
    total_viviendas = df[weight_col].sum()

    # return all the values in a pd series
    melt_values = pd.concat([mode_values, mean_values])
    melt_values['total_viviendas'] = total_viviendas

    return melt_values


def get_enigh_vivienda(file, cols_mode, cols_mean):
    # s1:read file
    df = pd.read_csv(file, na_values=[' '])

    # s2: wrangle
    yes_no_columns = [
        'cocina',
        'cocina_dor',
        'excusado',
        'uso_compar',
        'biodigest',
        'combustible',
        'estufa_chi',
        'lavadero',
        'fregadero',
        'regadera',
        'tinaco_azo',
        'cisterna',
        'pileta',
        'calent_sol',
        'calent_gas',
        'medidor_luz',
        'bomba_agua',
        'tanque_gas',
        'aire_acond',
        'calefacc',
        'pago_mesp',
        'viv_usada'
    ]
    # map 1 for 1 and 2 for 0 (yes and no)
    df[yes_no_columns] = df[yes_no_columns].replace({2: 0})
    # ubica_geo
    df["ubica_geo"] = df["ubica_geo"].astype(str).str.zfill(5)

    # s3: get statistics
    table = (
        df
        .groupby("ubica_geo", as_index=False)
        .apply(
            lambda x: get_statistics(x, cols_mode, cols_mean, "factor"),
            include_groups=False
        )
    )

    return table


In [192]:
# get params
years_enigh = [2018, 2020, 2022]
cols_mode = [
    'mat_pared', 'mat_techos', 'mat_pisos', 'disp_agua',
    'combustible', 'eli_basura', 'tenencia', 'tipo_adqui',
    'tipo_adqui', 'tipo_finan', 'escrituras', 'disp_elect',
    'tipo_viv'
]
cols_mean = [
    'regadera', 'pago_mesp', 'tinaco_azo',
    'lavadero', 'procaptar', 'tot_resid', 'antiguedad',
    'bano_comp', 'calent_sol', 'cocina',
    'aire_acond', 'tot_hom', 'cocina_dor', 'renta',
    'fregadero', 'focos_inca', 'sanit_agua',
    'uso_compar', 'medidor_luz', 'est_socio',
    'tot_muj', 'dotac_agua', 'bano_regad', 'estufa_chi',
    'tam_loc', 'tanque_gas', 'focos_ahor',
    'cisterna', 'cuart_dorm', 'drenaje', 'excusado', 'pileta',
    'num_cuarto', 'calent_gas', 'calefacc',
    'bano_excus', 'pago_viv',
    'bomba_agua', 'viv_usada', 'biodigest', 'tot_hog',
    'estim_pago'
]

FILE_TEMPLATE = 'enigh{enigh_year}_ns_viviendas_csv.zip'
ENIGH_ROOT = "../../data/catalogues/adamuz_data/enigh_{enigh_year}/"

In [193]:
# get all tables
list_tables = []
for year in tqdm(years_enigh):
    # get file
    dir_file = ENIGH_ROOT.format(enigh_year=year)
    file = dir_file + FILE_TEMPLATE.format(enigh_year=year)

    # unzip
    os.system(f"unzip -o {file} -d {dir_file}")

    # get table
    table = get_enigh_vivienda(dir_file + 'viviendas.csv', cols_mode, cols_mean)

    # add year
    table["year"] = year

    # append
    list_tables.append(table)

  0%|          | 0/3 [00:00<?, ?it/s]

Archive:  ../../data/catalogues/adamuz_data/enigh_2018/enigh2018_ns_viviendas_csv.zip
  inflating: ../../data/catalogues/adamuz_data/enigh_2018/viviendas.csv  
  inflating: ../../data/catalogues/adamuz_data/enigh_2018/nota_bases_datos_enigh2018_ns.txt  


  df = pd.read_csv(file, na_values=[' '])
 33%|███▎      | 1/3 [00:14<00:28, 14.37s/it]

Archive:  ../../data/catalogues/adamuz_data/enigh_2020/enigh2020_ns_viviendas_csv.zip
  inflating: ../../data/catalogues/adamuz_data/enigh_2020/viviendas.csv  


  df = pd.read_csv(file, na_values=[' '])
 67%|██████▋   | 2/3 [00:29<00:14, 14.83s/it]

Archive:  ../../data/catalogues/adamuz_data/enigh_2022/enigh2022_ns_viviendas_csv.zip
  inflating: ../../data/catalogues/adamuz_data/enigh_2022/viviendas.csv  


  df = pd.read_csv(file, na_values=[' '])
100%|██████████| 3/3 [00:45<00:00, 15.22s/it]


In [194]:
# concat
df_viviendas_all = (
    pd.concat(list_tables)
    .sort_values(["ubica_geo", "year"], ignore_index=True)
    )
df_viviendas_all

Unnamed: 0,ubica_geo,mat_pared,mat_techos,mat_pisos,disp_agua,combustible,eli_basura,tenencia,tipo_adqui,tipo_adqui.1,tipo_finan,escrituras,disp_elect,tipo_viv,regadera,pago_mesp,tinaco_azo,lavadero,procaptar,tot_resid,antiguedad,bano_comp,calent_sol,cocina,aire_acond,tot_hom,cocina_dor,renta,fregadero,focos_inca,sanit_agua,uso_compar,medidor_luz,est_socio,tot_muj,dotac_agua,bano_regad,estufa_chi,tam_loc,tanque_gas,focos_ahor,cisterna,cuart_dorm,drenaje,excusado,pileta,num_cuarto,calent_gas,calefacc,bano_excus,pago_viv,bomba_agua,viv_usada,biodigest,tot_hog,estim_pago,total_viviendas,year
0,01001,8.0,10.0,3.0,1.0,3.0,3.0,4.0,1.0,1.0,1,1.0,1,1,0.945602,0.362238,0.782853,0.895975,0.0,3.781875,19.145343,1.265395,0.263397,0.979702,0.015126,1.806226,0.023304,385.384175,0.882548,1.506320,1.069981,0.029129,0.972374,2.830593,1.975650,1.091257,0.010228,0.001665,1.313242,0.146555,7.696495,0.493936,2.460400,1.044960,0.996631,0.133018,4.529099,0.515284,0.010715,0.266219,442.032669,0.416913,0.166781,0.014665,1.019717,2165.050435,234164,2018
1,01001,8.0,10.0,3,1.0,3.0,3.0,4.0,1.0,1.0,1,1.0,1,1,0.962908,0.385572,0.805143,0.903710,0.0,3.555936,18.578944,1.301716,0.287558,0.988414,0.017228,1.693341,0.020858,484.117574,0.898580,1.067147,1.066227,0.022514,0.982148,2.848673,1.862595,1.177386,0.007960,0.003885,1.191967,0.152516,7.850508,0.475506,2.355871,1.009949,0.996949,0.098937,4.408034,0.492554,0.012072,0.198125,670.729401,0.391329,0.161022,0.010222,1.016633,2361.336658,267473,2020
2,01001,8.0,10.0,3,1.0,3,3.0,4.0,1.0,1.0,1,1.0,1,1,0.970420,0.386612,0.784273,0.905320,0.0,3.514820,20.415628,1.364741,0.350969,0.986751,0.027798,1.645770,0.018158,594.349359,0.916743,0.690979,1.054902,0.015830,0.972788,2.843422,1.869050,1.180679,0.005229,0.004247,1.260126,0.167579,8.915832,0.438922,2.393433,1.033536,0.998091,0.120005,4.433587,0.439448,0.017268,0.239865,711.650391,0.403757,0.179857,0.029685,1.010339,3013.785845,267713,2022
3,01002,8.0,10.0,2.0,1.0,3.0,3.0,4.0,3.0,3.0,5,1.0,1,1,0.858142,0.033360,0.825390,0.900527,0.0,3.604238,19.959440,0.925979,0.559116,0.966640,0.000000,1.983979,0.110221,35.956195,0.604847,2.070980,1.075745,0.064997,0.978503,2.000000,1.620260,1.121476,0.032752,0.000000,3.502535,0.193673,3.612148,0.108497,2.040256,1.031637,0.989252,0.097141,3.856520,0.383695,0.000000,0.225918,0.000000,0.066112,0.044109,0.010748,1.000000,1048.428311,9862,2018
4,01002,8.0,10.0,2,1.0,3.0,3.0,4.0,3.0,3.0,5,1.0,1,1,0.835095,0.026823,0.828621,0.833510,0.0,4.468816,22.767310,0.884514,0.607426,0.990618,0.008060,2.112183,0.018763,20.989693,0.569371,2.511364,1.166226,0.051401,0.951506,2.000000,2.356633,2.066332,0.021934,0.021010,4.000000,0.085359,4.049154,0.215777,2.379889,1.227669,0.979651,0.229783,4.209567,0.219212,0.008060,0.231501,0.000000,0.203356,0.029070,0.000000,1.027484,1159.791226,7568,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3213,32056,8.0,10.0,3.0,1.0,3.0,1.0,4.0,1.0,1.0,5,1.0,1,1,0.988622,0.307692,0.976436,0.669133,0.0,3.634024,20.698178,1.414077,0.417638,0.993963,0.000000,1.681708,0.000000,435.273334,0.899652,1.926805,1.035972,0.006065,0.993434,3.121463,1.952316,2.024788,0.011963,0.000000,1.000000,0.240840,7.018640,0.221616,2.378717,1.000000,1.000000,0.054389,4.806148,0.810238,0.000000,0.247016,363.107525,0.197357,0.084880,0.005648,1.010433,2008.997079,35945,2018
3214,32056,8.0,10.0,3,1.0,3.0,1.0,4.0,2.0,2.0,5,1.0,1,1,0.914939,0.214437,0.927101,0.696612,0.0,3.529814,21.250086,1.260574,0.583608,0.980378,0.014230,1.642572,0.019228,358.733813,0.821385,1.129967,1.119824,0.019942,0.968511,2.869344,1.887242,1.902826,0.018415,0.005072,1.304299,0.267098,7.814270,0.217613,2.318750,1.071003,0.994928,0.135900,4.486508,0.682087,0.014058,0.343468,295.051455,0.215742,0.058841,0.004481,1.021641,2243.818012,40618,2020
3215,32056,8.0,10.0,3.0,1.0,3.0,1.0,4.0,3.0,3.0,5,1.0,1,1,0.967891,0.236054,0.983913,0.725864,0.0,3.566169,19.698376,1.344131,0.664632,0.972469,0.018398,1.656065,0.007913,389.496458,0.895782,1.107357,1.050638,0.033569,0.981232,2.884752,1.910104,2.172752,0.012425,0.004752,1.270866,0.261253,8.239760,0.203749,2.521766,1.008153,0.990692,0.064763,4.555946,0.677144,0.018332,0.282747,249.263433,0.196403,0.070627,0.017068,1.032698,2658.267030,45875,2022
3216,32057,8.0,10.0,2,1.0,3.0,1.0,4.0,3.0,3.0,5,1.0,1,1,0.719911,0.000000,0.875070,0.812605,0.0,4.152593,23.318238,0.889905,0.402454,0.960067,0.000000,1.934969,0.065031,0.000000,0.387619,2.494590,1.280089,0.022532,0.957501,2.000000,2.217624,3.779810,0.000000,0.000000,3.000000,0.087563,3.455772,0.382487,2.254992,1.079866,0.957501,0.260123,4.019967,0.410151,0.000000,0.300056,0.000000,0.424986,0.042499,0.000000,1.062465,1562.699387,8965,2020


## S2: EDA

In [195]:
# see sample
df_viviendas_all.head()

Unnamed: 0,ubica_geo,mat_pared,mat_techos,mat_pisos,disp_agua,combustible,eli_basura,tenencia,tipo_adqui,tipo_adqui.1,tipo_finan,escrituras,disp_elect,tipo_viv,regadera,pago_mesp,tinaco_azo,lavadero,procaptar,tot_resid,antiguedad,bano_comp,calent_sol,cocina,aire_acond,tot_hom,cocina_dor,renta,fregadero,focos_inca,sanit_agua,uso_compar,medidor_luz,est_socio,tot_muj,dotac_agua,bano_regad,estufa_chi,tam_loc,tanque_gas,focos_ahor,cisterna,cuart_dorm,drenaje,excusado,pileta,num_cuarto,calent_gas,calefacc,bano_excus,pago_viv,bomba_agua,viv_usada,biodigest,tot_hog,estim_pago,total_viviendas,year
0,1001,8.0,10.0,3.0,1.0,3.0,3.0,4.0,1.0,1.0,1,1.0,1,1,0.945602,0.362238,0.782853,0.895975,0.0,3.781875,19.145343,1.265395,0.263397,0.979702,0.015126,1.806226,0.023304,385.384175,0.882548,1.50632,1.069981,0.029129,0.972374,2.830593,1.97565,1.091257,0.010228,0.001665,1.313242,0.146555,7.696495,0.493936,2.4604,1.04496,0.996631,0.133018,4.529099,0.515284,0.010715,0.266219,442.032669,0.416913,0.166781,0.014665,1.019717,2165.050435,234164,2018
1,1001,8.0,10.0,3.0,1.0,3.0,3.0,4.0,1.0,1.0,1,1.0,1,1,0.962908,0.385572,0.805143,0.90371,0.0,3.555936,18.578944,1.301716,0.287558,0.988414,0.017228,1.693341,0.020858,484.117574,0.89858,1.067147,1.066227,0.022514,0.982148,2.848673,1.862595,1.177386,0.00796,0.003885,1.191967,0.152516,7.850508,0.475506,2.355871,1.009949,0.996949,0.098937,4.408034,0.492554,0.012072,0.198125,670.729401,0.391329,0.161022,0.010222,1.016633,2361.336658,267473,2020
2,1001,8.0,10.0,3.0,1.0,3.0,3.0,4.0,1.0,1.0,1,1.0,1,1,0.97042,0.386612,0.784273,0.90532,0.0,3.51482,20.415628,1.364741,0.350969,0.986751,0.027798,1.64577,0.018158,594.349359,0.916743,0.690979,1.054902,0.01583,0.972788,2.843422,1.86905,1.180679,0.005229,0.004247,1.260126,0.167579,8.915832,0.438922,2.393433,1.033536,0.998091,0.120005,4.433587,0.439448,0.017268,0.239865,711.650391,0.403757,0.179857,0.029685,1.010339,3013.785845,267713,2022
3,1002,8.0,10.0,2.0,1.0,3.0,3.0,4.0,3.0,3.0,5,1.0,1,1,0.858142,0.03336,0.82539,0.900527,0.0,3.604238,19.95944,0.925979,0.559116,0.96664,0.0,1.983979,0.110221,35.956195,0.604847,2.07098,1.075745,0.064997,0.978503,2.0,1.62026,1.121476,0.032752,0.0,3.502535,0.193673,3.612148,0.108497,2.040256,1.031637,0.989252,0.097141,3.85652,0.383695,0.0,0.225918,0.0,0.066112,0.044109,0.010748,1.0,1048.428311,9862,2018
4,1002,8.0,10.0,2.0,1.0,3.0,3.0,4.0,3.0,3.0,5,1.0,1,1,0.835095,0.026823,0.828621,0.83351,0.0,4.468816,22.76731,0.884514,0.607426,0.990618,0.00806,2.112183,0.018763,20.989693,0.569371,2.511364,1.166226,0.051401,0.951506,2.0,2.356633,2.066332,0.021934,0.02101,4.0,0.085359,4.049154,0.215777,2.379889,1.227669,0.979651,0.229783,4.209567,0.219212,0.00806,0.231501,0.0,0.203356,0.02907,0.0,1.027484,1159.791226,7568,2020


In [196]:
# shape
df_viviendas_all.shape

(3218, 58)

In [197]:
# see empty values
df_viviendas_all.isnull().sum()[df_viviendas_all.isnull().sum() > 0]

Series([], dtype: int64)

In [198]:
# count # of ubica_geo
df_viviendas_all["ubica_geo"].value_counts().value_counts()

count
3    664
1    502
2    362
Name: count, dtype: int64

In [199]:
# describe
df_viviendas_all.describe()

Unnamed: 0,mat_pared,mat_techos,disp_agua,eli_basura,tenencia,tipo_adqui,tipo_adqui.1,escrituras,disp_elect,regadera,pago_mesp,tinaco_azo,lavadero,procaptar,tot_resid,antiguedad,bano_comp,calent_sol,cocina,aire_acond,tot_hom,cocina_dor,renta,fregadero,focos_inca,sanit_agua,uso_compar,medidor_luz,est_socio,tot_muj,dotac_agua,bano_regad,estufa_chi,tam_loc,tanque_gas,focos_ahor,cisterna,cuart_dorm,drenaje,excusado,pileta,num_cuarto,calent_gas,calefacc,bano_excus,pago_viv,bomba_agua,viv_usada,biodigest,tot_hog,estim_pago,total_viviendas,year
count,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0
mean,7.872281,8.279366,1.60317,1.633002,3.943443,2.169049,2.169049,1.353014,1.006837,0.632663,0.116859,0.539838,0.833896,0.072518,3.680122,19.966795,0.717635,0.131332,0.907696,0.107184,1.78254,0.060955,168.136975,0.468555,1.124025,1.461249,0.073366,0.919543,1.738642,1.897582,1.527458,0.043776,0.085863,3.126627,0.078632,5.008371,0.152714,2.017363,1.704186,0.967612,0.446672,3.619212,0.27053,0.020671,0.479583,105.376335,0.223503,0.057935,0.0239,1.016293,1546.849737,32945.253263,2020.084525
std,0.428437,2.956521,1.409176,1.206857,0.373886,0.65259,0.65259,0.760988,0.15355,0.290137,0.137805,0.281555,0.204577,0.250589,0.578878,5.926552,0.388612,0.191552,0.110453,0.204979,0.319315,0.058519,321.772576,0.283027,0.73889,0.411315,0.071763,0.112973,0.599341,0.332447,0.764325,0.066133,0.157747,0.94244,0.103955,1.836917,0.174639,0.275016,0.771586,0.066174,0.261569,0.583695,0.249719,0.065202,0.275174,243.101552,0.20783,0.067028,0.047719,0.030076,899.843595,60317.776856,1.624437
min,3.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.823529,3.733333,0.0,0.0,0.1875,0.0,0.823529,0.0,0.0,0.0,0.0,0.285714,0.0,0.0,1.0,0.5,0.0,0.0,0.0,1.0,0.0,0.032258,0.0,1.055556,1.0,0.142857,0.0,1.684211,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,346.0,596.0,2018.0
25%,8.0,8.0,1.0,1.0,4.0,2.0,2.0,1.0,1.0,0.421113,0.0,0.3,0.773755,0.0,3.315789,15.98703,0.425251,0.0,0.875,0.0,1.578024,0.020557,0.0,0.227002,0.565231,1.15,0.023877,0.898095,1.0,1.686838,1.000501,0.0,0.0,2.516317,0.0,3.782784,0.029041,1.833333,1.086957,0.956431,0.225509,3.244407,0.045455,0.0,0.252286,0.0,0.052632,0.0,0.0,1.0,1039.152261,7035.75,2018.0
50%,8.0,10.0,1.0,1.0,4.0,2.0,2.0,1.0,1.0,0.697116,0.063959,0.580633,0.918696,0.0,3.619048,19.370338,0.765071,0.029813,0.947368,0.0,1.750978,0.048585,60.0,0.462269,0.992645,1.352941,0.052632,0.954164,2.0,1.869565,1.262893,0.021841,0.023276,3.298769,0.045455,4.947368,0.092385,2.017716,1.423005,0.997983,0.437706,3.631579,0.210526,0.0,0.445595,0.0,0.166667,0.043478,0.0,1.0,1340.976532,13650.0,2020.0
75%,8.0,10.0,2.0,1.0,4.0,3.0,3.0,1.0,1.0,0.878542,0.190476,0.786549,0.970587,0.0,3.991653,23.232315,1.018641,0.203478,0.984959,0.095625,1.954545,0.086957,221.486058,0.7,1.565217,1.666667,0.10095,0.992186,2.0,2.076226,1.904762,0.055767,0.094195,4.0,0.111713,6.126788,0.217391,2.19597,2.110828,1.0,0.652174,4.0,0.450649,0.003781,0.69367,99.310637,0.339218,0.090909,0.027487,1.023641,1782.648405,28826.75,2022.0
max,8.0,10.0,7.0,7.0,5.0,4.0,4.0,3.0,5.0,1.0,1.0,1.0,1.0,1.0,7.0,50.809524,2.375818,0.95,1.0,1.0,3.6875,0.535294,5180.798358,1.0,4.608696,3.0,0.6,1.0,4.0,3.5,4.809524,1.102098,1.0,4.0,0.8,21.994528,1.0,3.111687,5.0,1.0,1.0,6.217034,1.0,0.785714,1.4,3415.0,1.0,0.478261,0.545455,1.5,13853.746046,586990.0,2022.0


## S3: Explicit Nulls

In [200]:
# generate explicit nulls for ubica_geo and year
index_cols = ["ubica_geo", "year"]
new_index = pd.MultiIndex.from_product(
    [df_viviendas_all["ubica_geo"].unique(), df_viviendas_all["year"].unique()],
    names=index_cols
)

# reindex
df_viviendas_all = (
    df_viviendas_all
    .set_index(index_cols)
    .reindex(new_index)
    .reset_index()
    )

In [201]:
# count # of ubica_geo
df_viviendas_all["ubica_geo"].value_counts().value_counts()

count
3    1528
Name: count, dtype: int64

In [202]:
# see ubica geo 01004
df_viviendas_all.query("ubica_geo == '01004'")

Unnamed: 0,ubica_geo,year,mat_pared,mat_techos,mat_pisos,disp_agua,combustible,eli_basura,tenencia,tipo_adqui,tipo_adqui.1,tipo_finan,escrituras,disp_elect,tipo_viv,regadera,pago_mesp,tinaco_azo,lavadero,procaptar,tot_resid,antiguedad,bano_comp,calent_sol,cocina,aire_acond,tot_hom,cocina_dor,renta,fregadero,focos_inca,sanit_agua,uso_compar,medidor_luz,est_socio,tot_muj,dotac_agua,bano_regad,estufa_chi,tam_loc,tanque_gas,focos_ahor,cisterna,cuart_dorm,drenaje,excusado,pileta,num_cuarto,calent_gas,calefacc,bano_excus,pago_viv,bomba_agua,viv_usada,biodigest,tot_hog,estim_pago,total_viviendas
9,1004,2018,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
10,1004,2020,8.0,10.0,2.0,1.0,3.0,3.0,4.0,3.0,3.0,5.0,1.0,1.0,1.0,0.904762,0.047619,0.809524,0.952381,0.0,4.666667,28.666667,1.0,0.333333,0.904762,0.0,2.095238,0.0,23.809524,0.666667,1.571429,1.190476,0.047619,0.952381,2.0,2.571429,1.142857,0.047619,0.0,4.0,0.047619,5.047619,0.333333,2.666667,1.0,1.0,0.238095,4.428571,0.285714,0.0,0.142857,0.0,0.619048,0.0,0.0,1.0,1185.714286,1155.0
11,1004,2022,8.0,10.0,2.0,1.0,3.0,3.0,4.0,3.0,3.0,5.0,1.0,1.0,1.0,0.833333,0.055556,0.666667,0.666667,0.0,4.5,15.277778,0.833333,0.222222,0.944444,0.0,2.111111,0.055556,55.555556,0.611111,0.555556,1.277778,0.111111,0.888889,2.0,2.388889,1.0,0.0,0.0,4.0,0.0,4.333333,0.111111,2.111111,1.277778,1.0,0.222222,3.611111,0.166667,0.055556,0.333333,0.0,0.111111,0.0,0.0,1.0,1072.222222,1494.0


In [203]:
# fill na with strategy ffill and then bfill by ubica_geo
df_viviendas_all = (
    df_viviendas_all
    .sort_values(["ubica_geo", "year"])
    .groupby("ubica_geo")
    .apply(lambda x: x.ffill().bfill())
    .reset_index(drop=True)
    )

  .apply(lambda x: x.ffill().bfill())
  .apply(lambda x: x.ffill().bfill())


In [204]:
# see ubica geo 01004
df_viviendas_all.query("ubica_geo == '01004'")

Unnamed: 0,ubica_geo,year,mat_pared,mat_techos,mat_pisos,disp_agua,combustible,eli_basura,tenencia,tipo_adqui,tipo_adqui.1,tipo_finan,escrituras,disp_elect,tipo_viv,regadera,pago_mesp,tinaco_azo,lavadero,procaptar,tot_resid,antiguedad,bano_comp,calent_sol,cocina,aire_acond,tot_hom,cocina_dor,renta,fregadero,focos_inca,sanit_agua,uso_compar,medidor_luz,est_socio,tot_muj,dotac_agua,bano_regad,estufa_chi,tam_loc,tanque_gas,focos_ahor,cisterna,cuart_dorm,drenaje,excusado,pileta,num_cuarto,calent_gas,calefacc,bano_excus,pago_viv,bomba_agua,viv_usada,biodigest,tot_hog,estim_pago,total_viviendas
9,1004,2018,8.0,10.0,2,1.0,3.0,3.0,4.0,3.0,3.0,5,1.0,1.0,1,0.904762,0.047619,0.809524,0.952381,0.0,4.666667,28.666667,1.0,0.333333,0.904762,0.0,2.095238,0.0,23.809524,0.666667,1.571429,1.190476,0.047619,0.952381,2.0,2.571429,1.142857,0.047619,0.0,4.0,0.047619,5.047619,0.333333,2.666667,1.0,1.0,0.238095,4.428571,0.285714,0.0,0.142857,0.0,0.619048,0.0,0.0,1.0,1185.714286,1155.0
10,1004,2020,8.0,10.0,2,1.0,3.0,3.0,4.0,3.0,3.0,5,1.0,1.0,1,0.904762,0.047619,0.809524,0.952381,0.0,4.666667,28.666667,1.0,0.333333,0.904762,0.0,2.095238,0.0,23.809524,0.666667,1.571429,1.190476,0.047619,0.952381,2.0,2.571429,1.142857,0.047619,0.0,4.0,0.047619,5.047619,0.333333,2.666667,1.0,1.0,0.238095,4.428571,0.285714,0.0,0.142857,0.0,0.619048,0.0,0.0,1.0,1185.714286,1155.0
11,1004,2022,8.0,10.0,2,1.0,3.0,3.0,4.0,3.0,3.0,5,1.0,1.0,1,0.833333,0.055556,0.666667,0.666667,0.0,4.5,15.277778,0.833333,0.222222,0.944444,0.0,2.111111,0.055556,55.555556,0.611111,0.555556,1.277778,0.111111,0.888889,2.0,2.388889,1.0,0.0,0.0,4.0,0.0,4.333333,0.111111,2.111111,1.277778,1.0,0.222222,3.611111,0.166667,0.055556,0.333333,0.0,0.111111,0.0,0.0,1.0,1072.222222,1494.0


## S4: Save

In [205]:
# see shape
df_viviendas_all.shape

(4584, 58)

In [206]:
# save
dir_save = "../../data/interim/"
df_viviendas_all.to_csv(dir_save + "viviendas_enigh.csv")

---
# Join with Properties

## S1: Load & Transform

In [207]:
# read parquet
gdf_properties = pd.read_parquet("../../data/interim/cleaned_data_s4.parquet")

# get original columns
original_columns = gdf_properties.columns

# see num of rows
gdf_properties.shape

(852931, 41)

In [208]:
# see columns cve ent & cve mun
gdf_properties[["id_entidad_f", "id_municipio"]]

Unnamed: 0,id_entidad_f,id_municipio
0,09,003
1,15,058
2,15,121
3,09,003
4,15,058
...,...,...
853028,31,041
853029,31,050
853030,31,050
853031,31,050


In [209]:
# create 'ubica_geo' column
gdf_properties["ubica_geo"] = (
    gdf_properties["id_entidad_f"].astype(str).str.zfill(2)
    + gdf_properties["id_municipio"].astype(str).str.zfill(3)
    )

# see some values
gdf_properties["ubica_geo"]

0         09003
1         15058
2         15121
3         09003
4         15058
          ...  
853028    31041
853029    31050
853030    31050
853031    31050
853032    31050
Name: ubica_geo, Length: 852931, dtype: object

In [210]:
# get year of fecha avaluo and floor every 2 years
gdf_properties["year_enigh"] = (
    gdf_properties["fecha_avaluo"].dt.year
    // 2 * 2
    )

# describe
gdf_properties["year_enigh"].describe()

count    852931.000000
mean       2020.196398
std           1.485187
min        2018.000000
25%        2020.000000
50%        2020.000000
75%        2022.000000
max        2022.000000
Name: year_enigh, dtype: float64

## S2: Join with Concentrado Hogar

In [211]:
# join with df_concentradohogar_all
gdf_properties = (
    gdf_properties
    .merge(
        df_concentradohogar_all,
        left_on=["ubica_geo", "year_enigh"],
        right_on=["ubica_geo", "year"],
        how="left"
        )
    )

# see columns
gdf_properties.shape

(852931, 166)

In [212]:
# see size
gdf_properties.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 852931 entries, 0 to 852930
Columns: 166 entries, property_id to cve_ent
dtypes: datetime64[ns](1), float64(129), int32(2), int64(24), object(10)
memory usage: 1.0+ GB


In [213]:
# see if there are columns with "_y"
gdf_properties.filter(like="_x").columns

Index([], dtype='object')

In [214]:
# see how many nan values 
gdf_properties.loc[:, df_concentradohogar_all.columns].isnull().sum()

ubica_geo        0
year             0
est_dis          0
clase_hog        0
sexo_jefe        0
                ..
deudas           0
balance          0
otras_erog       0
total_hogares    0
cve_ent          0
Length: 124, dtype: int64

In [215]:
# see how many nan values 
gdf_properties.loc[:, df_concentradohogar_all.columns].isnull().mean()

ubica_geo        0.0
year             0.0
est_dis          0.0
clase_hog        0.0
sexo_jefe        0.0
                ... 
deudas           0.0
balance          0.0
otras_erog       0.0
total_hogares    0.0
cve_ent          0.0
Length: 124, dtype: float64

## S3: Join with Vivienda

In [216]:
# join with df_viviendas_all
gdf_properties = (
    gdf_properties
    .drop(columns=["year", "tam_loc", "est_socio"])
    .merge(
        df_viviendas_all,
        left_on=["ubica_geo", "year_enigh"],
        right_on=["ubica_geo", "year"],
        how="left"
        )
    )

# see columns
gdf_properties.shape

(852931, 220)

In [217]:
# see size
gdf_properties.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 852931 entries, 0 to 852930
Columns: 220 entries, property_id to total_viviendas
dtypes: datetime64[ns](1), float64(180), int32(1), int64(24), object(14)
memory usage: 1.4+ GB


In [218]:
# see if there are columns with "_y"
gdf_properties.filter(like="_x").columns

Index([], dtype='object')

In [219]:
# see how many nan values 
gdf_properties.loc[:, df_viviendas_all.columns].isnull().sum()

ubica_geo             0
year               2592
mat_pared          2592
mat_techos         2592
mat_pisos          2592
disp_agua          2592
combustible        2592
eli_basura         2592
tenencia           2592
tipo_adqui         2592
tipo_adqui         2592
tipo_adqui         2592
tipo_adqui         2592
tipo_finan         2592
escrituras         2592
disp_elect         2592
tipo_viv           2592
regadera           2592
pago_mesp          2592
tinaco_azo         2592
lavadero           2592
procaptar          2592
tot_resid          2592
antiguedad         2592
bano_comp          2592
calent_sol         2592
cocina             2592
aire_acond         2592
tot_hom            2592
cocina_dor         2592
renta              2592
fregadero          2592
focos_inca         2592
sanit_agua         2592
uso_compar         2592
medidor_luz        2592
est_socio          2592
tot_muj            2592
dotac_agua         2592
bano_regad         2592
estufa_chi         2592
tam_loc         

In [220]:
# see how many nan values 
gdf_properties.loc[:, df_viviendas_all.columns].isnull().mean()

ubica_geo          0.000000
year               0.003039
mat_pared          0.003039
mat_techos         0.003039
mat_pisos          0.003039
disp_agua          0.003039
combustible        0.003039
eli_basura         0.003039
tenencia           0.003039
tipo_adqui         0.003039
tipo_adqui         0.003039
tipo_adqui         0.003039
tipo_adqui         0.003039
tipo_finan         0.003039
escrituras         0.003039
disp_elect         0.003039
tipo_viv           0.003039
regadera           0.003039
pago_mesp          0.003039
tinaco_azo         0.003039
lavadero           0.003039
procaptar          0.003039
tot_resid          0.003039
antiguedad         0.003039
bano_comp          0.003039
calent_sol         0.003039
cocina             0.003039
aire_acond         0.003039
tot_hom            0.003039
cocina_dor         0.003039
renta              0.003039
fregadero          0.003039
focos_inca         0.003039
sanit_agua         0.003039
uso_compar         0.003039
medidor_luz        0

## S4: See which ubica_geo have nan values

In [60]:
# see how many nan values in ubica_geo
nan_ubica_geo = (
    gdf_properties
    .loc[:, df_viviendas_all.columns]
    .groupby("ubica_geo")
    .apply(lambda x: x['tipo_viv'].isnull().sum())
    .to_frame()
    .rename(columns={0: "nan_values"})
    .query("nan_values > 0")
    .reset_index()
)
nan_ubica_geo

  .apply(lambda x: x['tipo_viv'].isnull().sum())


Unnamed: 0,ubica_geo,nan_values
0,7012,891
1,14051,424
2,15012,1
3,15022,3
4,15073,489
5,15125,187
6,21128,189
7,21175,260
8,30074,2
9,30081,140


ideas to input missing values:
1. remove rows with missing values
2. fill with mean/median
3. fill with the most common value in the state
4. knn imputer using centroids of the municipalities

## S5: NaN Imputation

### Concentrado Hogar

In [72]:
# to new_index add nan_ubica_geo ubica_geo's
index_cols = ["ubica_geo", "year"]
new_index = pd.MultiIndex.from_product(
    [gdf_properties["ubica_geo"].unique(), gdf_properties["year_enigh"].unique()],
    names=index_cols
)

# reindex
df_concentradohogar_all = (
    df_concentradohogar_all
    .set_index(index_cols)
    .reindex(new_index)
    .reset_index()
    )

In [73]:
# see 31100
df_concentradohogar_all.query("ubica_geo == '31100'")

Unnamed: 0,ubica_geo,year,est_dis,clase_hog,sexo_jefe,tam_loc,est_socio,edad_jefe,educa_jefe,tot_integ,hombres,mujeres,mayores,menores,p12_64,p65mas,ocupados,percep_ing,perc_ocupa,ing_cor,ingtrab,trabajo,sueldos,horas_extr,comisiones,aguinaldo,indemtrab,otra_rem,remu_espec,negocio,noagrop,industria,comercio,servicios,agrope,agricolas,pecuarios,reproducc,pesca,otros_trab,rentas,utilidad,arrenda,transfer,jubilacion,becas,donativos,remesas,bene_gob,transf_hog,trans_inst,estim_alqu,otros_ing,gasto_mon,alimentos,ali_dentro,cereales,carnes,pescado,leche,huevo,aceites,tuberculo,verduras,frutas,azucar,cafe,especias,otros_alim,bebidas,ali_fuera,tabaco,vesti_calz,vestido,calzado,vivienda,alquiler,pred_cons,agua,energia,limpieza,cuidados,utensilios,enseres,salud,atenc_ambu,hospital,medicinas,transporte,publico,foraneo,adqui_vehi,mantenim,refaccion,combus,comunica,educa_espa,educacion,esparci,paq_turist,personales,cuida_pers,acces_pers,otros_gas,transf_gas,percep_tot,retiro_inv,prestamos,otras_perc,ero_nm_viv,ero_nm_hog,erogac_tot,cuota_viv,mater_serv,material,servicio,deposito,prest_terc,pago_tarje,deudas,balance,otras_erog,total_hogares
1011,31100,2020,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1012,31100,2018,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1013,31100,2022,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [77]:
# get cve_ent from ubica_geo
df_concentradohogar_all["cve_ent"] = df_concentradohogar_all["ubica_geo"].str[:2]

In [79]:
# get table of median values by cve_ent
table_concentradohogar_medians = (
    df_concentradohogar_all
    .drop(columns=["ubica_geo"])
    .groupby(["cve_ent", "year"], as_index=False)
    .mean()
)
table_concentradohogar_medians

Unnamed: 0,cve_ent,year,est_dis,clase_hog,sexo_jefe,tam_loc,est_socio,edad_jefe,educa_jefe,tot_integ,hombres,mujeres,mayores,menores,p12_64,p65mas,ocupados,percep_ing,perc_ocupa,ing_cor,ingtrab,trabajo,sueldos,horas_extr,comisiones,aguinaldo,indemtrab,otra_rem,remu_espec,negocio,noagrop,industria,comercio,servicios,agrope,agricolas,pecuarios,reproducc,pesca,otros_trab,rentas,utilidad,arrenda,transfer,jubilacion,becas,donativos,remesas,bene_gob,transf_hog,trans_inst,estim_alqu,otros_ing,gasto_mon,alimentos,ali_dentro,cereales,carnes,pescado,leche,huevo,aceites,tuberculo,verduras,frutas,azucar,cafe,especias,otros_alim,bebidas,ali_fuera,tabaco,vesti_calz,vestido,calzado,vivienda,alquiler,pred_cons,agua,energia,limpieza,cuidados,utensilios,enseres,salud,atenc_ambu,hospital,medicinas,transporte,publico,foraneo,adqui_vehi,mantenim,refaccion,combus,comunica,educa_espa,educacion,esparci,paq_turist,personales,cuida_pers,acces_pers,otros_gas,transf_gas,percep_tot,retiro_inv,prestamos,otras_perc,ero_nm_viv,ero_nm_hog,erogac_tot,cuota_viv,mater_serv,material,servicio,deposito,prest_terc,pago_tarje,deudas,balance,otras_erog,total_hogares
0,01,2018,7.20,2.0,1.0,2.586135,2.331342,48.783703,5.743871,4.257246,2.121937,2.135309,3.254090,1.003157,2.980434,0.273656,1.955393,2.664982,1.915714,57003.504263,41681.567996,35902.188331,29509.240219,575.049637,470.237026,2178.863150,132.325245,1169.721151,1866.751904,4448.840622,3298.103376,601.307112,1302.303967,1394.492297,1150.737247,35.635823,1102.249488,12.851936,0.000000,1330.539042,2816.245228,2188.106441,628.138787,7807.205212,2908.670513,196.637780,989.725055,1089.355676,484.013078,1465.024028,673.779083,4682.537762,15.948064,35308.415772,11490.114078,8956.255259,1721.458645,1659.782878,93.022588,911.190521,302.462996,100.764355,145.429789,961.271191,348.633584,67.029367,41.581514,76.493094,1646.774376,880.360359,2452.662451,81.196368,1917.673989,1131.084290,786.589699,2456.622166,667.749256,121.500645,504.434768,1162.937496,2217.131515,1492.606818,223.306113,501.218584,1040.114815,771.435826,96.896386,171.782603,7875.721124,986.020634,205.237185,2373.235377,2984.130192,216.979411,2767.150781,1327.097736,4460.088873,2791.198760,949.075946,719.814167,2929.358896,2154.403129,129.779439,645.176328,921.590317,6502.857419,1816.592079,885.618371,2307.716148,9.054218,1483.876603,7707.718195,916.327180,790.878558,658.848444,132.030114,2062.212054,142.976718,898.248205,904.162830,424.560458,1568.352193,60702.20
1,01,2020,8.40,2.0,1.0,2.552323,2.338150,49.024817,5.993201,3.942444,1.911562,2.030882,3.122261,0.820183,2.878134,0.244128,1.879896,2.520096,1.816300,55607.465192,39529.635623,34917.052727,28762.862252,497.320507,638.114735,1912.699579,614.789694,939.792060,1551.473900,3269.108659,2694.511966,387.668448,1119.926874,1186.916644,574.596693,5.247301,569.337478,0.011915,0.000000,1343.474236,2140.055736,1832.356876,307.698860,9060.285523,3425.700574,77.654076,856.939056,1507.916854,863.449957,1628.962203,699.662803,4824.084565,53.403745,32658.931937,11966.251378,10399.257381,1846.769751,1897.050996,124.432911,972.562330,366.999857,107.685977,158.466069,1212.596605,500.884351,83.508337,59.233375,87.181438,1972.604194,1009.281189,1482.483549,84.510447,1271.090991,773.997413,497.093578,2637.636644,794.343337,153.137369,553.095445,1137.060492,2123.512545,1510.490290,197.044821,415.977434,1473.007048,1063.957994,151.258384,257.790671,6658.716677,612.850366,74.691228,1511.348658,2565.142486,267.545540,2297.596946,1894.683940,2955.457738,2043.645572,619.367061,292.445105,2648.634318,2098.746676,84.852098,465.035544,924.624598,5233.021834,2120.481024,453.993038,1049.556618,14.818160,1594.172994,9077.830528,851.922920,667.920633,267.581332,400.339301,4873.159292,131.430039,784.317997,690.069029,601.207445,477.803173,71725.00
2,01,2022,7.60,2.0,1.0,2.631701,2.327185,48.063382,6.178175,3.838164,1.866993,1.971171,2.964643,0.873521,2.744548,0.220094,1.739571,2.467944,1.726860,70876.362012,49432.641472,42941.309720,35180.076979,601.221747,797.360286,2696.329917,878.681694,1357.010404,1430.628694,4346.761979,4023.252786,612.071229,1743.125228,1668.056328,323.509193,57.091123,266.418070,0.000000,0.000000,2144.569773,3699.178161,2909.545855,789.632307,11590.128313,4746.875812,125.176490,1284.077994,1620.560021,1299.739910,1959.611630,554.086456,6104.295979,50.118087,42050.415428,14891.496001,12367.305218,2263.039403,2419.289394,148.340392,1157.545918,514.619115,153.939107,182.689189,1292.427559,529.997503,82.536125,69.457187,106.541103,2284.411434,1162.471790,2418.267983,105.922800,1815.311482,1108.123787,707.187696,3190.532164,988.032569,190.487898,653.135455,1358.876243,2802.620678,2047.271585,331.334971,424.014123,1563.229699,912.165840,418.598007,232.465852,8742.766424,942.337818,241.214223,2071.970327,3475.513593,353.799216,3121.714376,2011.730463,4229.182575,2619.806721,993.948281,615.427573,3472.719450,2762.887286,131.140721,578.691443,1342.556954,4713.717005,1321.913239,628.193628,1458.376651,81.722689,1223.510798,10023.020615,1648.086511,675.100559,397.667858,277.432701,4509.183242,174.837080,1262.517453,488.737181,720.939663,543.618927,72441.00
3,02,2018,17.50,2.0,1.0,1.786824,2.450202,48.275740,6.029622,3.357513,1.675624,1.681889,2.683144,0.674369,2.423732,0.259412,1.524581,2.034405,1.512097,56915.238306,40919.095225,37804.598595,34016.058648,322.359775,1193.046342,1156.710523,60.947327,343.486988,711.988991,2220.240730,2196.625641,836.240270,514.032360,846.353011,23.615089,16.021613,3.193825,0.000000,4.399650,894.255900,1808.597256,1022.502522,786.094734,7854.593944,4939.687213,34.689122,1389.941265,218.811523,202.472223,835.822501,233.170098,6286.549538,46.402343,35603.841750,11196.029500,7462.355140,1247.997974,1535.836232,184.211693,834.909768,287.775401,104.802455,110.718043,685.124646,392.189067,51.171443,58.091097,111.140383,876.472038,981.914901,3674.280820,59.393540,1332.281462,895.828206,436.453255,4403.212035,1452.330263,120.837559,565.867133,2264.177080,1985.978527,1417.999618,137.659829,430.319080,606.164094,461.028165,50.490846,94.645083,8775.436810,1150.179869,286.686791,683.244173,4824.136791,275.695714,4548.441077,1831.189187,3401.200265,1940.186122,1223.059953,237.954190,2988.034747,2346.281936,68.685699,573.067112,915.504310,1638.962794,306.998297,222.102263,272.325872,16.482340,821.054022,5835.221543,1005.534322,270.599035,195.549560,75.049475,3465.772226,49.952377,450.041092,76.439212,84.064264,432.819015,266591.00
4,02,2020,19.00,2.0,1.0,1.784814,2.485101,50.364883,6.013806,3.292556,1.607149,1.685407,2.766915,0.525641,2.450593,0.316322,1.503161,2.044177,1.473485,63294.979636,42338.628255,38639.018149,34256.229522,392.927556,762.441270,1590.122527,268.804832,647.472736,721.019706,2684.228334,2631.456297,723.995932,678.958014,1228.502351,52.772037,10.515240,33.519705,0.000000,8.737092,1015.381773,3096.739783,2321.807545,774.932237,9279.724872,5272.059012,85.857624,1300.099336,389.430085,647.436339,1163.135963,421.706514,8546.212531,33.674195,36065.912649,11967.181916,9938.924112,1463.704726,2048.908996,298.431649,969.877252,372.672169,143.174907,138.866564,862.781140,484.165491,83.676698,89.423820,142.574890,1575.471469,1265.194340,1957.703177,70.554628,968.607592,675.388276,293.219316,5021.729671,1501.968346,273.738694,765.665469,2480.357162,2420.109653,1925.147237,167.587360,327.375056,1029.625673,705.151834,137.728193,186.745646,7939.797097,753.598608,94.450401,943.264989,3995.990392,253.380619,3742.609773,2152.492707,2615.620763,2119.038481,444.433401,52.148880,3139.629518,2457.785624,54.596843,627.247050,963.610767,2714.744714,629.291253,142.523321,597.003405,14.598804,1331.327931,6377.658354,1008.415648,326.996259,229.963159,97.033099,3587.609357,164.992464,476.431958,144.373221,219.433915,449.405532,272823.25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,31,2020,538.20,2.0,1.0,2.366885,2.009724,48.066265,5.787704,3.652757,1.717995,1.934761,2.905382,0.747375,2.596984,0.308398,1.869483,2.482407,1.801681,40421.281691,27073.997257,22677.279845,18746.035760,240.811887,934.557073,1100.439085,189.314094,435.802725,1030.319221,2841.770013,2716.014952,532.017045,776.691897,1407.306010,125.755060,45.739422,49.120453,30.895185,0.000000,1554.947400,1060.804380,659.193126,401.611254,7586.348131,3071.862462,59.693005,869.390254,22.717685,1161.888456,1201.896411,1198.899858,4682.265630,17.866292,26623.707247,11969.166748,10995.247926,1955.234344,2057.072077,70.886264,611.225621,372.877715,68.513312,109.320337,722.070243,313.985029,58.899916,103.206421,122.059310,3111.246486,1318.650851,952.610015,21.308807,397.633798,274.224499,123.409300,2006.511640,583.547567,104.844940,109.385843,1208.733290,1785.897601,1383.812674,82.993564,319.091363,1038.869924,713.176820,110.919327,214.773777,5096.703074,858.442957,37.081385,665.810543,1956.666767,224.987169,1731.679598,1578.701422,1668.108397,1073.126983,492.400075,102.581339,1805.032497,1434.010774,71.279100,299.742623,855.783568,4251.168373,1448.033354,688.736178,639.845547,13.979376,1460.573918,7786.162019,748.048888,492.414048,213.012353,279.401695,2960.324571,77.938185,1132.691689,1078.244246,781.183222,515.317170,73190.20
92,31,2022,542.00,2.0,1.0,2.428342,2.061257,48.833977,5.898544,3.519161,1.803041,1.716120,2.915355,0.603806,2.588247,0.327108,1.798277,2.394296,1.788719,57073.211817,39081.343509,33736.874887,29681.421936,303.480831,650.720050,1230.872729,34.136135,556.214716,1280.028490,4242.094859,4201.533828,1527.346085,895.050128,1779.137615,40.561030,11.617623,15.028774,13.914633,0.000000,1102.373763,2323.356149,1971.135966,352.220183,9673.661713,3951.247369,55.399599,964.336502,8.070297,2217.390757,1847.563126,629.654062,5986.858927,7.991519,35254.945375,15078.019025,12657.497222,1980.316409,2871.929638,75.665750,560.500014,417.541059,92.947084,107.394137,895.607883,238.852509,56.187116,85.021205,112.372462,3720.187942,1442.974013,2403.341234,17.180569,1039.560203,691.923726,347.636477,2196.272020,526.181327,125.459371,137.312384,1407.318939,2115.842771,1717.823493,64.765761,333.253517,1022.981109,786.869159,58.685023,177.426927,7105.615675,1599.086956,102.898686,1210.833103,2302.515108,195.019233,2107.495875,1890.281821,3189.630927,2343.165965,632.930843,213.534119,2447.844684,1980.841815,99.421333,367.581536,1059.178961,3686.488246,1046.786579,379.392365,666.576372,17.727108,1576.005822,8164.248698,1312.943875,684.908130,294.638798,390.269332,3764.217385,118.615560,1045.758775,729.821633,215.393367,292.589973,75934.20
93,32,2018,539.50,2.0,1.0,1.913677,2.437413,48.454606,5.936320,3.704873,1.765760,1.939113,2.870024,0.834849,2.573343,0.296682,1.640334,2.398005,1.586638,46160.884950,30859.269887,27024.346769,23087.305735,271.154277,514.948352,1627.896378,244.479638,792.967135,485.595254,2573.604738,2173.405494,293.056389,1140.690407,739.658698,400.199244,275.889231,123.628729,0.681285,0.000000,1261.318379,2986.493795,2692.987549,293.506246,7985.590956,2986.580958,106.128163,1157.391271,688.324623,516.378422,1590.949046,939.838473,4282.450337,47.079975,31613.315570,11154.997102,9228.403131,1846.659305,2073.130377,112.447666,1100.273617,371.905381,147.832199,210.110930,1065.421241,496.628147,98.204689,101.910238,96.687639,806.997604,700.194099,1866.972551,59.621421,1663.170079,979.827475,683.342603,2636.822526,697.768357,137.512367,359.751546,1441.790256,2014.875073,1489.286358,193.051243,332.537472,691.215826,535.584789,40.309764,115.321273,6451.625175,891.126892,221.247991,861.645763,3099.717147,269.890615,2829.826533,1377.887382,3436.936392,2165.215426,865.929618,405.791348,2856.722346,2050.747935,132.506590,673.467820,706.951051,2642.128027,594.744785,269.372195,458.625273,1.997480,1317.388296,3849.238518,866.502284,472.021213,303.605936,168.415278,1131.565585,53.676606,267.661117,307.790792,285.999413,464.021508,44659.75
94,32,2020,550.75,2.0,1.0,1.713133,2.518761,49.687160,6.178598,3.576835,1.749196,1.827639,2.833881,0.742954,2.556689,0.277192,1.677805,2.255929,1.658489,55976.157620,36335.525309,33275.126645,29073.311703,142.803765,360.770491,2147.991728,128.268612,784.909869,637.070477,2303.988467,2051.951912,449.221496,490.556447,1112.173969,252.036554,208.703796,43.332758,0.000000,0.000000,756.410197,5040.329184,4261.129162,779.200022,9172.998011,4141.702591,151.100265,636.983164,506.724446,914.410232,1849.018650,973.058663,5416.986685,10.318430,33300.180567,11723.941306,10150.909762,1871.213924,2163.520875,227.831949,1059.186179,415.185827,143.402732,165.477873,1184.039948,546.871954,87.013339,73.255752,92.553453,1344.053388,777.302571,1504.565762,68.465781,1232.268882,783.289257,448.979625,2918.430636,818.123713,160.532272,487.779703,1451.994947,2365.117447,1739.023669,269.518837,356.574941,1836.115882,1550.627979,88.977650,196.510253,6456.523155,334.468180,129.179580,1186.242518,2919.925944,298.856330,2621.069614,1886.706933,2231.504491,1502.079609,572.335152,157.089730,3208.533200,2059.103462,61.211865,1088.217873,1327.745568,4095.912236,1554.273247,484.079575,675.197921,30.605807,1351.755687,11012.969074,665.502043,608.281123,377.854297,230.426825,7261.444602,65.475680,869.949935,403.834531,230.584527,907.896633,40712.50


In [105]:
# append to df_concentradohogar_all for nan values
mask = (
    df_concentradohogar_all['clase_hog'].isnull()
)
columns_to_impute = df_concentradohogar_all.drop(columns=["ubica_geo", "year", "cve_ent"]).columns.copy()

# append to df_concentradohogar_all for nan values
df_concentradohogar_all.loc[mask, columns_to_impute] = (
    df_concentradohogar_all
    .loc[mask, :]
    .merge(
        table_concentradohogar_medians,
        on=["cve_ent", "year"],
        how="left"
        )
    .filter(like="_y")
    .values
)

# sort by ubica_geo and year
df_concentradohogar_all = (
    df_concentradohogar_all
    .sort_values(["ubica_geo", "year"], ignore_index=True)
    )

# see how many nan values
df_concentradohogar_all.isnull().sum()[df_concentradohogar_all.isnull().sum() > 0]

Series([], dtype: int64)

In [106]:
# see 31100
df_concentradohogar_all.query("ubica_geo == '31100'")

Unnamed: 0,ubica_geo,year,est_dis,clase_hog,sexo_jefe,tam_loc,est_socio,edad_jefe,educa_jefe,tot_integ,hombres,mujeres,mayores,menores,p12_64,p65mas,ocupados,percep_ing,perc_ocupa,ing_cor,ingtrab,trabajo,sueldos,horas_extr,comisiones,aguinaldo,indemtrab,otra_rem,remu_espec,negocio,noagrop,industria,comercio,servicios,agrope,agricolas,pecuarios,reproducc,pesca,otros_trab,rentas,utilidad,arrenda,transfer,jubilacion,becas,donativos,remesas,bene_gob,transf_hog,trans_inst,estim_alqu,otros_ing,gasto_mon,alimentos,ali_dentro,cereales,carnes,pescado,leche,huevo,aceites,tuberculo,verduras,frutas,azucar,cafe,especias,otros_alim,bebidas,ali_fuera,tabaco,vesti_calz,vestido,calzado,vivienda,alquiler,pred_cons,agua,energia,limpieza,cuidados,utensilios,enseres,salud,atenc_ambu,hospital,medicinas,transporte,publico,foraneo,adqui_vehi,mantenim,refaccion,combus,comunica,educa_espa,educacion,esparci,paq_turist,personales,cuida_pers,acces_pers,otros_gas,transf_gas,percep_tot,retiro_inv,prestamos,otras_perc,ero_nm_viv,ero_nm_hog,erogac_tot,cuota_viv,mater_serv,material,servicio,deposito,prest_terc,pago_tarje,deudas,balance,otras_erog,total_hogares,cve_ent
1002,31100,2018,524.0,2.0,1.0,2.425461,2.049813,47.532147,5.445398,3.839052,1.949956,1.889096,3.063356,0.775696,2.806116,0.25724,2.074376,2.622248,2.054416,47166.909216,33223.618704,28166.396221,23392.79815,255.884311,1383.336781,1217.078569,98.929386,417.411389,1400.957635,3619.342411,3541.411794,1127.999913,617.14737,1796.264511,77.930617,20.246164,25.267042,32.417411,0.0,1437.880071,3242.639994,2549.669459,692.970535,6668.996102,3177.021043,104.121556,1061.758079,18.372205,550.59628,1262.778869,494.34807,4008.765455,22.88896,30384.066068,11966.938045,10115.217337,1758.916783,2124.604219,74.465749,562.026725,321.081756,82.670991,95.77521,754.264073,223.6671,62.299581,87.795633,129.175719,2461.038767,1377.43503,1817.934786,33.785922,1068.692916,695.211072,373.481844,1856.746948,481.563559,54.309289,105.661346,1215.212754,1637.332664,1246.258383,98.390473,292.683808,1044.632368,784.320884,151.438181,108.873303,6570.661913,2099.763625,143.172816,938.252991,1900.918374,173.944348,1726.974025,1488.554108,3335.165337,2346.79574,820.341927,168.027671,1832.580439,1515.684307,99.381235,217.514898,1071.315437,3583.556207,1046.028682,630.052661,556.207632,24.715178,1326.552054,7332.809878,930.68068,208.73831,137.14371,71.5946,3246.158424,308.053374,918.910819,730.714674,178.170154,811.383443,72186.4,31
1003,31100,2020,538.2,2.0,1.0,2.366885,2.009724,48.066265,5.787704,3.652757,1.717995,1.934761,2.905382,0.747375,2.596984,0.308398,1.869483,2.482407,1.801681,40421.281691,27073.997257,22677.279845,18746.03576,240.811887,934.557073,1100.439085,189.314094,435.802725,1030.319221,2841.770013,2716.014952,532.017045,776.691897,1407.30601,125.75506,45.739422,49.120453,30.895185,0.0,1554.9474,1060.80438,659.193126,401.611254,7586.348131,3071.862462,59.693005,869.390254,22.717685,1161.888456,1201.896411,1198.899858,4682.26563,17.866292,26623.707247,11969.166748,10995.247926,1955.234344,2057.072077,70.886264,611.225621,372.877715,68.513312,109.320337,722.070243,313.985029,58.899916,103.206421,122.05931,3111.246486,1318.650851,952.610015,21.308807,397.633798,274.224499,123.4093,2006.51164,583.547567,104.84494,109.385843,1208.73329,1785.897601,1383.812674,82.993564,319.091363,1038.869924,713.17682,110.919327,214.773777,5096.703074,858.442957,37.081385,665.810543,1956.666767,224.987169,1731.679598,1578.701422,1668.108397,1073.126983,492.400075,102.581339,1805.032497,1434.010774,71.2791,299.742623,855.783568,4251.168373,1448.033354,688.736178,639.845547,13.979376,1460.573918,7786.162019,748.048888,492.414048,213.012353,279.401695,2960.324571,77.938185,1132.691689,1078.244246,781.183222,515.31717,73190.2,31
1004,31100,2022,542.0,2.0,1.0,2.428342,2.061257,48.833977,5.898544,3.519161,1.803041,1.71612,2.915355,0.603806,2.588247,0.327108,1.798277,2.394296,1.788719,57073.211817,39081.343509,33736.874887,29681.421936,303.480831,650.72005,1230.872729,34.136135,556.214716,1280.02849,4242.094859,4201.533828,1527.346085,895.050128,1779.137615,40.56103,11.617623,15.028774,13.914633,0.0,1102.373763,2323.356149,1971.135966,352.220183,9673.661713,3951.247369,55.399599,964.336502,8.070297,2217.390757,1847.563126,629.654062,5986.858927,7.991519,35254.945375,15078.019025,12657.497222,1980.316409,2871.929638,75.66575,560.500014,417.541059,92.947084,107.394137,895.607883,238.852509,56.187116,85.021205,112.372462,3720.187942,1442.974013,2403.341234,17.180569,1039.560203,691.923726,347.636477,2196.27202,526.181327,125.459371,137.312384,1407.318939,2115.842771,1717.823493,64.765761,333.253517,1022.981109,786.869159,58.685023,177.426927,7105.615675,1599.086956,102.898686,1210.833103,2302.515108,195.019233,2107.495875,1890.281821,3189.630927,2343.165965,632.930843,213.534119,2447.844684,1980.841815,99.421333,367.581536,1059.178961,3686.488246,1046.786579,379.392365,666.576372,17.727108,1576.005822,8164.248698,1312.943875,684.90813,294.638798,390.269332,3764.217385,118.61556,1045.758775,729.821633,215.393367,292.589973,75934.2,31


In [104]:
# see 31 in table
table_concentradohogar_medians.query("cve_ent == '31'")

Unnamed: 0,cve_ent,year,est_dis,clase_hog,sexo_jefe,tam_loc,est_socio,edad_jefe,educa_jefe,tot_integ,hombres,mujeres,mayores,menores,p12_64,p65mas,ocupados,percep_ing,perc_ocupa,ing_cor,ingtrab,trabajo,sueldos,horas_extr,comisiones,aguinaldo,indemtrab,otra_rem,remu_espec,negocio,noagrop,industria,comercio,servicios,agrope,agricolas,pecuarios,reproducc,pesca,otros_trab,rentas,utilidad,arrenda,transfer,jubilacion,becas,donativos,remesas,bene_gob,transf_hog,trans_inst,estim_alqu,otros_ing,gasto_mon,alimentos,ali_dentro,cereales,carnes,pescado,leche,huevo,aceites,tuberculo,verduras,frutas,azucar,cafe,especias,otros_alim,bebidas,ali_fuera,tabaco,vesti_calz,vestido,calzado,vivienda,alquiler,pred_cons,agua,energia,limpieza,cuidados,utensilios,enseres,salud,atenc_ambu,hospital,medicinas,transporte,publico,foraneo,adqui_vehi,mantenim,refaccion,combus,comunica,educa_espa,educacion,esparci,paq_turist,personales,cuida_pers,acces_pers,otros_gas,transf_gas,percep_tot,retiro_inv,prestamos,otras_perc,ero_nm_viv,ero_nm_hog,erogac_tot,cuota_viv,mater_serv,material,servicio,deposito,prest_terc,pago_tarje,deudas,balance,otras_erog,total_hogares
90,31,2018,524.0,2.0,1.0,2.425461,2.049813,47.532147,5.445398,3.839052,1.949956,1.889096,3.063356,0.775696,2.806116,0.25724,2.074376,2.622248,2.054416,47166.909216,33223.618704,28166.396221,23392.79815,255.884311,1383.336781,1217.078569,98.929386,417.411389,1400.957635,3619.342411,3541.411794,1127.999913,617.14737,1796.264511,77.930617,20.246164,25.267042,32.417411,0.0,1437.880071,3242.639994,2549.669459,692.970535,6668.996102,3177.021043,104.121556,1061.758079,18.372205,550.59628,1262.778869,494.34807,4008.765455,22.88896,30384.066068,11966.938045,10115.217337,1758.916783,2124.604219,74.465749,562.026725,321.081756,82.670991,95.77521,754.264073,223.6671,62.299581,87.795633,129.175719,2461.038767,1377.43503,1817.934786,33.785922,1068.692916,695.211072,373.481844,1856.746948,481.563559,54.309289,105.661346,1215.212754,1637.332664,1246.258383,98.390473,292.683808,1044.632368,784.320884,151.438181,108.873303,6570.661913,2099.763625,143.172816,938.252991,1900.918374,173.944348,1726.974025,1488.554108,3335.165337,2346.79574,820.341927,168.027671,1832.580439,1515.684307,99.381235,217.514898,1071.315437,3583.556207,1046.028682,630.052661,556.207632,24.715178,1326.552054,7332.809878,930.68068,208.73831,137.14371,71.5946,3246.158424,308.053374,918.910819,730.714674,178.170154,811.383443,72186.4
91,31,2020,538.2,2.0,1.0,2.366885,2.009724,48.066265,5.787704,3.652757,1.717995,1.934761,2.905382,0.747375,2.596984,0.308398,1.869483,2.482407,1.801681,40421.281691,27073.997257,22677.279845,18746.03576,240.811887,934.557073,1100.439085,189.314094,435.802725,1030.319221,2841.770013,2716.014952,532.017045,776.691897,1407.30601,125.75506,45.739422,49.120453,30.895185,0.0,1554.9474,1060.80438,659.193126,401.611254,7586.348131,3071.862462,59.693005,869.390254,22.717685,1161.888456,1201.896411,1198.899858,4682.26563,17.866292,26623.707247,11969.166748,10995.247926,1955.234344,2057.072077,70.886264,611.225621,372.877715,68.513312,109.320337,722.070243,313.985029,58.899916,103.206421,122.05931,3111.246486,1318.650851,952.610015,21.308807,397.633798,274.224499,123.4093,2006.51164,583.547567,104.84494,109.385843,1208.73329,1785.897601,1383.812674,82.993564,319.091363,1038.869924,713.17682,110.919327,214.773777,5096.703074,858.442957,37.081385,665.810543,1956.666767,224.987169,1731.679598,1578.701422,1668.108397,1073.126983,492.400075,102.581339,1805.032497,1434.010774,71.2791,299.742623,855.783568,4251.168373,1448.033354,688.736178,639.845547,13.979376,1460.573918,7786.162019,748.048888,492.414048,213.012353,279.401695,2960.324571,77.938185,1132.691689,1078.244246,781.183222,515.31717,73190.2
92,31,2022,542.0,2.0,1.0,2.428342,2.061257,48.833977,5.898544,3.519161,1.803041,1.71612,2.915355,0.603806,2.588247,0.327108,1.798277,2.394296,1.788719,57073.211817,39081.343509,33736.874887,29681.421936,303.480831,650.72005,1230.872729,34.136135,556.214716,1280.02849,4242.094859,4201.533828,1527.346085,895.050128,1779.137615,40.56103,11.617623,15.028774,13.914633,0.0,1102.373763,2323.356149,1971.135966,352.220183,9673.661713,3951.247369,55.399599,964.336502,8.070297,2217.390757,1847.563126,629.654062,5986.858927,7.991519,35254.945375,15078.019025,12657.497222,1980.316409,2871.929638,75.66575,560.500014,417.541059,92.947084,107.394137,895.607883,238.852509,56.187116,85.021205,112.372462,3720.187942,1442.974013,2403.341234,17.180569,1039.560203,691.923726,347.636477,2196.27202,526.181327,125.459371,137.312384,1407.318939,2115.842771,1717.823493,64.765761,333.253517,1022.981109,786.869159,58.685023,177.426927,7105.615675,1599.086956,102.898686,1210.833103,2302.515108,195.019233,2107.495875,1890.281821,3189.630927,2343.165965,632.930843,213.534119,2447.844684,1980.841815,99.421333,367.581536,1059.178961,3686.488246,1046.786579,379.392365,666.576372,17.727108,1576.005822,8164.248698,1312.943875,684.90813,294.638798,390.269332,3764.217385,118.61556,1045.758775,729.821633,215.393367,292.589973,75934.2


### Vivendas

In [221]:
# to new_index add nan_ubica_geo ubica_geo's
index_cols = ["ubica_geo", "year"]
new_index = pd.MultiIndex.from_product(
    [gdf_properties["ubica_geo"].unique(), gdf_properties["year_enigh"].unique()],
    names=index_cols
)

# reindex
df_viviendas_all = (
    df_viviendas_all
    .set_index(index_cols)
    .reindex(new_index)
    .reset_index()
    )

In [222]:
# subset columns that are not duplicated
df_viviendas_all = (
    df_viviendas_all
    .loc[:, ~df_viviendas_all.columns.duplicated()]
)

In [223]:
# see 31100
df_viviendas_all.query("ubica_geo == '31100'")

Unnamed: 0,ubica_geo,year,mat_pared,mat_techos,mat_pisos,disp_agua,combustible,eli_basura,tenencia,tipo_adqui,tipo_finan,escrituras,disp_elect,tipo_viv,regadera,pago_mesp,tinaco_azo,lavadero,procaptar,tot_resid,antiguedad,bano_comp,calent_sol,cocina,aire_acond,tot_hom,cocina_dor,renta,fregadero,focos_inca,sanit_agua,uso_compar,medidor_luz,est_socio,tot_muj,dotac_agua,bano_regad,estufa_chi,tam_loc,tanque_gas,focos_ahor,cisterna,cuart_dorm,drenaje,excusado,pileta,num_cuarto,calent_gas,calefacc,bano_excus,pago_viv,bomba_agua,viv_usada,biodigest,tot_hog,estim_pago,total_viviendas
1011,31100,2020,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1012,31100,2018,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1013,31100,2022,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [224]:
# get cve_ent from ubica_geo
df_viviendas_all["cve_ent"] = df_viviendas_all["ubica_geo"].str[:2]

In [225]:
# convert to float
cols_to_change_type = ["combustible", "tipo_finan", "tipo_viv", "mat_pisos"]
df_viviendas_all[cols_to_change_type] = df_viviendas_all[cols_to_change_type].astype(float)

In [226]:
# get table of median values by cve_ent
table_vivienda_medians = (
    df_viviendas_all
    .drop(columns=["ubica_geo"])
    .groupby(["cve_ent", "year"], as_index=False)
    .mean()
)
table_vivienda_medians

Unnamed: 0,cve_ent,year,mat_pared,mat_techos,mat_pisos,disp_agua,combustible,eli_basura,tenencia,tipo_adqui,tipo_finan,escrituras,disp_elect,tipo_viv,regadera,pago_mesp,tinaco_azo,lavadero,procaptar,tot_resid,antiguedad,bano_comp,calent_sol,cocina,aire_acond,tot_hom,cocina_dor,renta,fregadero,focos_inca,sanit_agua,uso_compar,medidor_luz,est_socio,tot_muj,dotac_agua,bano_regad,estufa_chi,tam_loc,tanque_gas,focos_ahor,cisterna,cuart_dorm,drenaje,excusado,pileta,num_cuarto,calent_gas,calefacc,bano_excus,pago_viv,bomba_agua,viv_usada,biodigest,tot_hog,estim_pago,total_viviendas
0,01,2018,8.0,10.0,2.80,1.0,3.0,3.0,4.0,2.20,4.2,1.0,1.0,1.0,0.912137,0.271362,0.800511,0.909509,0.0,4.420127,20.250340,1.112884,0.399239,0.967455,0.003587,2.187402,0.039379,230.706894,0.766226,1.670811,1.083465,0.042528,0.964255,2.331342,2.232725,1.146075,0.021342,0.005698,2.586135,0.082991,5.998591,0.301207,2.436640,1.079774,0.993295,0.102691,4.231534,0.407684,0.004955,0.243605,320.586506,0.264143,0.089692,0.013002,1.046560,1595.553006,60702.20
1,01,2020,8.0,10.0,3.00,1.0,3.0,3.0,4.0,1.80,4.2,1.0,1.0,1.0,0.943074,0.263799,0.795834,0.907967,0.0,3.981893,17.911069,1.152286,0.437911,0.984080,0.005726,1.926812,0.043178,268.739821,0.793554,1.201682,1.087265,0.031128,0.966095,2.338150,2.055080,1.132671,0.014753,0.007867,2.552323,0.113611,6.247882,0.318703,2.321090,1.063589,0.991903,0.107332,4.278451,0.365571,0.007148,0.193672,316.025198,0.294593,0.099538,0.008199,1.015828,1645.189596,71725.00
2,01,2022,8.0,10.0,3.00,1.0,3.0,2.6,3.8,1.80,3.4,1.0,1.0,1.0,0.955338,0.364700,0.843292,0.890058,0.0,3.914585,17.248517,1.176136,0.515727,0.970398,0.015353,1.903469,0.022253,339.712427,0.850006,0.795252,1.086765,0.036597,0.967262,2.327185,2.011116,1.232014,0.008596,0.006710,2.631701,0.107142,7.207891,0.313931,2.331273,1.083152,0.989237,0.104247,4.185339,0.413389,0.009488,0.145486,588.326450,0.361911,0.102420,0.021380,1.018819,2080.635767,72441.00
3,02,2018,8.0,8.5,2.25,1.0,3.0,1.0,4.0,1.50,5.0,1.0,1.0,1.0,0.852444,0.313906,0.101366,0.603249,0.0,3.370514,14.778483,0.976271,0.013631,0.974302,0.271139,1.682316,0.068284,495.536871,0.822640,1.190902,1.249026,0.059280,0.934396,2.450202,1.688198,0.994116,0.034741,0.002962,1.786824,0.050373,5.619687,0.042568,2.115245,1.394415,0.995097,0.127318,3.651680,0.542618,0.030859,0.331060,355.509805,0.063110,0.130529,0.006312,1.003140,2137.880791,266591.00
4,02,2020,8.0,8.5,2.50,1.0,3.0,1.0,4.0,2.00,5.0,1.0,1.0,1.0,0.913127,0.279764,0.108240,0.577239,0.0,3.309996,18.097982,1.099012,0.025186,0.979650,0.272621,1.613549,0.072284,519.522103,0.860794,0.819700,1.161200,0.088035,0.964287,2.485101,1.696447,1.115911,0.031589,0.001947,1.784814,0.062003,6.346981,0.043219,2.161368,1.252475,0.994902,0.170105,3.842881,0.588295,0.044019,0.279955,386.555622,0.059437,0.128789,0.021931,1.005444,2910.766735,272823.25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,31,2020,8.0,10.0,3.00,1.0,2.6,1.0,4.0,1.40,3.4,1.0,1.0,1.0,0.861660,0.289199,0.820661,0.594545,0.0,3.731933,16.879545,1.001989,0.002678,0.905991,0.250149,1.754900,0.093957,204.002737,0.744071,0.348565,1.190140,0.049395,0.953110,2.009724,1.977033,0.991318,0.005589,0.012016,2.366885,0.022467,7.058724,0.092987,1.850504,2.133554,0.960583,0.073838,3.508086,0.105279,0.001947,0.136500,359.305775,0.229901,0.090638,0.074315,1.026010,1593.574772,73190.20
92,31,2022,8.0,10.0,2.80,1.0,2.6,1.0,4.0,1.40,2.6,1.0,1.0,1.0,0.862909,0.265821,0.831832,0.628512,0.0,3.580125,19.657799,0.988764,0.006678,0.908434,0.316408,1.840188,0.113051,180.334891,0.715371,0.178975,1.136088,0.022505,0.951691,2.061257,1.739937,1.008961,0.003557,0.008526,2.428342,0.038166,7.284024,0.093629,1.815024,2.019622,0.968428,0.113511,3.357695,0.097785,0.003806,0.144380,477.986526,0.134927,0.143175,0.067798,1.019311,2040.262618,75934.20
93,32,2018,8.0,10.0,2.75,1.0,3.0,1.0,4.0,2.00,5.0,1.0,1.0,1.0,0.908752,0.272821,0.891276,0.716853,0.0,3.725596,18.892552,1.087911,0.402190,0.984807,0.002498,1.777258,0.034656,245.178736,0.748566,2.062146,1.112747,0.038558,0.965419,2.437413,1.948339,1.804461,0.018773,0.008392,1.913677,0.167371,5.424019,0.230324,2.272309,1.146149,0.984123,0.104604,4.298982,0.620104,0.003442,0.246279,298.976532,0.239029,0.097143,0.009323,1.007008,1457.800094,44659.75
94,32,2020,8.0,10.0,3.00,1.0,3.0,1.0,4.0,2.25,4.0,1.0,1.0,1.0,0.905209,0.245524,0.907330,0.739196,0.0,3.602131,19.213552,1.135389,0.481329,0.981950,0.007961,1.762254,0.037586,272.649519,0.775875,1.710288,1.111275,0.046909,0.973919,2.518761,1.839877,1.802442,0.024400,0.007756,1.713133,0.203805,6.143193,0.249493,2.266584,1.085417,0.988560,0.126049,4.138004,0.631162,0.008370,0.298459,233.218299,0.257501,0.071726,0.009894,1.009633,1844.846296,40712.50


In [227]:
mask.sum()

36

In [232]:
# append to df_viviendas_all for nan values
mask = (
    df_viviendas_all['mat_pared'].isnull()
)
columns_to_impute = df_viviendas_all.drop(columns=["ubica_geo", "year", "cve_ent"]).columns.copy()

# append to df_viviendas_all for nan values
df_viviendas_all.loc[mask, columns_to_impute] = (
    df_viviendas_all
    .loc[mask, :]
    .merge(
        table_vivienda_medians,
        on=["cve_ent", "year"],
        how="left"
        )
    .filter(like="_y")
    .values
)

# sort by ubica_geo and year
df_viviendas_all = (
    df_viviendas_all
    .sort_values(["ubica_geo", "year"], ignore_index=True)
    )

# see how many nan values
df_viviendas_all.isnull().sum()[df_viviendas_all.isnull().sum() > 0]

Series([], dtype: int64)

In [233]:
# see 31100
df_viviendas_all.query("ubica_geo == '31100'")

Unnamed: 0,ubica_geo,year,mat_pared,mat_techos,mat_pisos,disp_agua,combustible,eli_basura,tenencia,tipo_adqui,tipo_finan,escrituras,disp_elect,tipo_viv,regadera,pago_mesp,tinaco_azo,lavadero,procaptar,tot_resid,antiguedad,bano_comp,calent_sol,cocina,aire_acond,tot_hom,cocina_dor,renta,fregadero,focos_inca,sanit_agua,uso_compar,medidor_luz,est_socio,tot_muj,dotac_agua,bano_regad,estufa_chi,tam_loc,tanque_gas,focos_ahor,cisterna,cuart_dorm,drenaje,excusado,pileta,num_cuarto,calent_gas,calefacc,bano_excus,pago_viv,bomba_agua,viv_usada,biodigest,tot_hog,estim_pago,total_viviendas,cve_ent
1002,31100,2018,8.0,10.0,2.8,1.0,2.6,1.0,3.8,1.2,3.4,1.0,1.0,1.0,0.885791,0.272248,0.841223,0.502584,0.0,3.985898,18.885491,0.996769,0.002285,0.825177,0.213154,2.029412,0.0909,162.986406,0.703972,0.605953,1.137121,0.026766,0.980615,2.049813,1.956486,0.98303,0.00196,0.02182,2.425461,0.03856,6.56762,0.093586,1.909671,2.056778,0.969455,0.046804,3.437131,0.136885,0.000373,0.131111,340.937094,0.148716,0.131848,0.103535,1.045526,1366.720974,72186.4,31
1003,31100,2020,8.0,10.0,3.0,1.0,2.6,1.0,4.0,1.4,3.4,1.0,1.0,1.0,0.86166,0.289199,0.820661,0.594545,0.0,3.731933,16.879545,1.001989,0.002678,0.905991,0.250149,1.7549,0.093957,204.002737,0.744071,0.348565,1.19014,0.049395,0.95311,2.009724,1.977033,0.991318,0.005589,0.012016,2.366885,0.022467,7.058724,0.092987,1.850504,2.133554,0.960583,0.073838,3.508086,0.105279,0.001947,0.1365,359.305775,0.229901,0.090638,0.074315,1.02601,1593.574772,73190.2,31
1004,31100,2022,8.0,10.0,2.8,1.0,2.6,1.0,4.0,1.4,2.6,1.0,1.0,1.0,0.862909,0.265821,0.831832,0.628512,0.0,3.580125,19.657799,0.988764,0.006678,0.908434,0.316408,1.840188,0.113051,180.334891,0.715371,0.178975,1.136088,0.022505,0.951691,2.061257,1.739937,1.008961,0.003557,0.008526,2.428342,0.038166,7.284024,0.093629,1.815024,2.019622,0.968428,0.113511,3.357695,0.097785,0.003806,0.14438,477.986526,0.134927,0.143175,0.067798,1.019311,2040.262618,75934.2,31


In [234]:
# see 31 in table
table_vivienda_medians.query("cve_ent == '31'")

Unnamed: 0,cve_ent,year,mat_pared,mat_techos,mat_pisos,disp_agua,combustible,eli_basura,tenencia,tipo_adqui,tipo_finan,escrituras,disp_elect,tipo_viv,regadera,pago_mesp,tinaco_azo,lavadero,procaptar,tot_resid,antiguedad,bano_comp,calent_sol,cocina,aire_acond,tot_hom,cocina_dor,renta,fregadero,focos_inca,sanit_agua,uso_compar,medidor_luz,est_socio,tot_muj,dotac_agua,bano_regad,estufa_chi,tam_loc,tanque_gas,focos_ahor,cisterna,cuart_dorm,drenaje,excusado,pileta,num_cuarto,calent_gas,calefacc,bano_excus,pago_viv,bomba_agua,viv_usada,biodigest,tot_hog,estim_pago,total_viviendas
90,31,2018,8.0,10.0,2.8,1.0,2.6,1.0,3.8,1.2,3.4,1.0,1.0,1.0,0.885791,0.272248,0.841223,0.502584,0.0,3.985898,18.885491,0.996769,0.002285,0.825177,0.213154,2.029412,0.0909,162.986406,0.703972,0.605953,1.137121,0.026766,0.980615,2.049813,1.956486,0.98303,0.00196,0.02182,2.425461,0.03856,6.56762,0.093586,1.909671,2.056778,0.969455,0.046804,3.437131,0.136885,0.000373,0.131111,340.937094,0.148716,0.131848,0.103535,1.045526,1366.720974,72186.4
91,31,2020,8.0,10.0,3.0,1.0,2.6,1.0,4.0,1.4,3.4,1.0,1.0,1.0,0.86166,0.289199,0.820661,0.594545,0.0,3.731933,16.879545,1.001989,0.002678,0.905991,0.250149,1.7549,0.093957,204.002737,0.744071,0.348565,1.19014,0.049395,0.95311,2.009724,1.977033,0.991318,0.005589,0.012016,2.366885,0.022467,7.058724,0.092987,1.850504,2.133554,0.960583,0.073838,3.508086,0.105279,0.001947,0.1365,359.305775,0.229901,0.090638,0.074315,1.02601,1593.574772,73190.2
92,31,2022,8.0,10.0,2.8,1.0,2.6,1.0,4.0,1.4,2.6,1.0,1.0,1.0,0.862909,0.265821,0.831832,0.628512,0.0,3.580125,19.657799,0.988764,0.006678,0.908434,0.316408,1.840188,0.113051,180.334891,0.715371,0.178975,1.136088,0.022505,0.951691,2.061257,1.739937,1.008961,0.003557,0.008526,2.428342,0.038166,7.284024,0.093629,1.815024,2.019622,0.968428,0.113511,3.357695,0.097785,0.003806,0.14438,477.986526,0.134927,0.143175,0.067798,1.019311,2040.262618,75934.2


## S6: Re-Join with gdf_properties

### Load & Transform

In [235]:
# read parquet
gdf_properties = pd.read_parquet("../../data/interim/cleaned_data_s4.parquet")

# get original columns
original_columns = gdf_properties.columns

# see num of rows
gdf_properties.shape

(852931, 41)

In [236]:
# see columns cve ent & cve mun
gdf_properties[["id_entidad_f", "id_municipio"]]

Unnamed: 0,id_entidad_f,id_municipio
0,09,003
1,15,058
2,15,121
3,09,003
4,15,058
...,...,...
853028,31,041
853029,31,050
853030,31,050
853031,31,050


In [237]:
# create 'ubica_geo' column
gdf_properties["ubica_geo"] = (
    gdf_properties["id_entidad_f"].astype(str).str.zfill(2)
    + gdf_properties["id_municipio"].astype(str).str.zfill(3)
    )

# see some values
gdf_properties["ubica_geo"]

0         09003
1         15058
2         15121
3         09003
4         15058
          ...  
853028    31041
853029    31050
853030    31050
853031    31050
853032    31050
Name: ubica_geo, Length: 852931, dtype: object

In [238]:
# get year of fecha avaluo and floor every 2 years
gdf_properties["year_enigh"] = (
    gdf_properties["fecha_avaluo"].dt.year
    // 2 * 2
    )

# describe
gdf_properties["year_enigh"].describe()

count    852931.000000
mean       2020.196398
std           1.485187
min        2018.000000
25%        2020.000000
50%        2020.000000
75%        2022.000000
max        2022.000000
Name: year_enigh, dtype: float64

### Join with Concentrado Hogar

In [239]:
# join with df_concentradohogar_all
gdf_properties = (
    gdf_properties
    .merge(
        df_concentradohogar_all,
        left_on=["ubica_geo", "year_enigh"],
        right_on=["ubica_geo", "year"],
        how="left"
        )
    )

# see columns
gdf_properties.shape

(852931, 166)

In [240]:
# see size
gdf_properties.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 852931 entries, 0 to 852930
Columns: 166 entries, property_id to cve_ent
dtypes: datetime64[ns](1), float64(129), int32(2), int64(24), object(10)
memory usage: 1.0+ GB


In [241]:
# see if there are columns with "_y"
gdf_properties.filter(like="_x").columns

Index([], dtype='object')

In [244]:
# see how many nan values 
gdf_properties.isnull().sum()[gdf_properties.isnull().sum() > 0]

elevador           28371
cve_vigilancia    316447
tipo_vialidad     316447
dtype: int64

### Join with Vivienda

In [245]:
# join with df_viviendas_all
gdf_properties = (
    gdf_properties
    .drop(columns=["year", "tam_loc", "est_socio"])
    .merge(
        df_viviendas_all,
        left_on=["ubica_geo", "year_enigh"],
        right_on=["ubica_geo", "year"],
        how="left"
        )
    )

# see columns
gdf_properties.shape

(852931, 220)

In [246]:
# see size
gdf_properties.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 852931 entries, 0 to 852930
Columns: 220 entries, property_id to cve_ent_y
dtypes: datetime64[ns](1), float64(182), int32(2), int64(24), object(11)
memory usage: 1.4+ GB


In [247]:
# see if there are columns with "_y"
gdf_properties.filter(like="_x").columns

Index(['cve_ent_x'], dtype='object')

In [249]:
# drop cve_ent_x and cve_ent_y
gdf_properties = (
    gdf_properties
    .drop(columns=["cve_ent_x", "cve_ent_y"])
    )

In [251]:
# see how many nan values 
gdf_properties.isnull().sum()[gdf_properties.isnull().sum() > 0]

elevador           28371
cve_vigilancia    316447
tipo_vialidad     316447
dtype: int64

---
# Save

In [252]:
# save
dir_save = "../../data/interim/"
df_viviendas_all.to_csv(dir_save + "viviendas_enigh.csv")
df_concentradohogar_all.to_csv(dir_save + "concentrado_hogar_enigh.csv")

In [253]:
# save parquet
gdf_properties.to_parquet("../../data/interim/cleaned_data_s5_adamuz.parquet")