In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)

In [2]:
#lectura de archivos
enigh = pd.read_csv(r'data\conjunto_de_datos_enigh_ns_2022_csv\conjunto_de_datos_concentradohogar_enigh2022_ns\conjunto_de_datos\conjunto_de_datos_concentradohogar_enigh2022_ns.csv', dtype=object)

In [3]:
#variables de interés
enigh = enigh[['medicinas', 'cuida_pers','ubica_geo', 'est_socio','ing_cor']]

In [4]:
# columnas numéricas
float_columns_2022 = ['medicinas', 'cuida_pers','est_socio','ing_cor']

In [5]:
enigh[float_columns_2022] = enigh[float_columns_2022].astype(float)

# columnas categóricas

enigh['ubica_geo'] = enigh['ubica_geo'].astype(str)

In [6]:
# extraccion entidad y municipio
enigh.loc[:,'entidad'] = enigh['ubica_geo'].apply(lambda x: x[0] if len(x) == 8 else x[0:2])
enigh.loc[:,'municipio'] = enigh['ubica_geo'].apply(lambda x: x[1:4] if len(x) == 8 else x[2:5])

In [7]:
#agrupar municipio entidad
enigh_mun = enigh.groupby(['entidad', 'municipio']).agg({ 'medicinas':'sum', 'cuida_pers':'sum', 'est_socio':'mean','ing_cor':'sum'}).reset_index()

In [8]:
import geopandas as gpd

#poligonos municipales
mun_grouped = gpd.read_file(r'data\out\mun_grouped\mun_grouped.shp')

In [9]:
# juntar poligonos y enigh
enigh_mun_pol=pd.merge(enigh_mun, mun_grouped, left_on=['entidad', 'municipio'], right_on=['CVE_ENT', 'CVE_MUN'], how='inner')

In [10]:
import time

# identificar poligonos faltantes en enigh (municipios faltantes)
start_time = time.time()
missing_polygons = mun_grouped[~mun_grouped[['CVE_ENT', 'CVE_MUN']].apply(tuple, 1).isin(enigh_mun_pol[['CVE_ENT', 'CVE_MUN']].apply(tuple, 1))]
print(f"Step 1 (Identifying missing polygons) took: {time.time() - start_time:.2f} seconds")


Step 1 (Identifying missing polygons) took: 0.02 seconds


In [11]:
from tqdm import tqdm
results = []

# calcular vecinos mas cercanos de la misma entidad
start_time = time.time()
for _, missing_polygon in tqdm(missing_polygons.iterrows(), total=len(missing_polygons), desc="Processing missing polygons"):
    
    missing_geom = missing_polygon['geometry']
    missing_ent = missing_polygon['CVE_ENT']
    
    # filtrar por entidad
    same_ent_polygons = enigh_mun_pol[enigh_mun_pol['CVE_ENT'] == missing_ent]
    
    if len(same_ent_polygons) > 0:
        # sacar distancias
        same_ent_polygons = same_ent_polygons.copy()  
        same_ent_polygons.loc[:, 'distance'] = same_ent_polygons['geometry'].apply(lambda geom: missing_geom.distance(geom))
        
        # 3 poligonos mas cercanos
        nearest_polygons = same_ent_polygons.nsmallest(3, 'distance')
        
        # sacar promedios
        average_values = nearest_polygons[['medicinas', 'cuida_pers','est_socio','ing_cor']].mean()
        
        # añadir valor calculado
        new_row = pd.concat([missing_polygon[['CVE_ENT', 'CVE_MUN', 'geometry']], average_values])
        
        results.append(new_row)
    else:
        print(f"No neighboring polygons found for CVE_ENT: {missing_ent}")
print(f"Step 2 (Processing missing polygons) took: {time.time() - start_time:.2f} seconds")

Processing missing polygons: 100%|██████████| 1343/1343 [08:18<00:00,  2.70it/s]

Step 2 (Processing missing polygons) took: 498.06 seconds





In [12]:
# 4combinar resultados y juntarlos al dataframe original
start_time = time.time()
new_polygons_df = gpd.GeoDataFrame(results)
enigh_mun_pol = pd.concat([enigh_mun_pol,new_polygons_df])
print(f"Step 4 (Appending new polygons) took: {time.time() - start_time:.2f} seconds")

Step 4 (Appending new polygons) took: 0.07 seconds


  return GeometryArray(data, crs=_get_common_crs(to_concat))


In [13]:
enigh_mun_pol=enigh_mun_pol.drop(columns=['index','entidad','municipio','CVEGEO','CVE_LOC','CVE_AGEB','AMBITO','geometry'])

In [14]:
enigh_mun_pol.rename(columns={'CVE_ENT':'entidad','CVE_MUN':'municipio'}, inplace=True)

In [15]:
enigh_mun_pol

Unnamed: 0,medicinas,cuida_pers,est_socio,ing_cor,entidad,municipio
0,370513.390000,3.901444e+06,2.714378,1.238243e+08,01,001
1,32014.210000,3.579400e+05,2.000000,9.133303e+06,01,002
2,46294.480000,3.280608e+05,2.000000,1.030639e+07,01,003
3,3553.960000,5.109968e+04,2.000000,7.592749e+05,01,004
4,78173.010000,7.372831e+05,2.268199,1.968148e+07,01,005
...,...,...,...,...,...,...
1338,46597.673333,4.896170e+05,2.632047,1.238122e+07,32,037
1339,4442.536667,7.189031e+04,1.667105,1.728795e+06,32,041
1340,16680.560000,1.022269e+05,2.200647,2.529844e+06,32,043
1341,3445.706667,4.207420e+04,1.666667,9.802428e+05,32,047


In [16]:
enigh_mun_pol=enigh_mun_pol.dropna()

In [17]:
enigh_mun_pol.shape

(2475, 6)

In [18]:
# exportar 
enigh_mun_pol.to_csv('data/out/enigh_2022_mun_pol.csv', index=False)