## Preprocessing for ENSANUT 2012 - Componente de NUTRICION (Bases trabajadas)

https://ensanut.insp.mx/encuestas/ensanut2012/descargas.php


- `actividad_fisica_menores.csv`  
  → Información sobre actividad física en menores de edad

- `antropometria_escolares.csv`  
  → Mediciones antropométricas de niños en edad escolar

- `antropometria_preesc.csv`  
  → Mediciones antropométricas de niños en edad preescolar

- `frecuencia_escolares_colapsada.csv`  
  → Frecuencia alimentaria de escolares (tabla colapsada)

- `frecuencia_escolares_nutrimentos.csv`  
  → Frecuencia de consumo de nutrimentos en escolares

- `frecuencia_preescolares_colapsada.csv`  
  → Frecuencia alimentaria de preescolares (tabla colapsada)

- `frecuencia_preescolares_nutrimentos.csv`  
  → Frecuencia de consumo de nutrimentos en preescolares

- `Frecuencia-dias-Escolares1.csv`  
  → Registro detallado de días de consumo para escolares (parte 1)

- `Frecuencia-dias-Escolares2.csv`  
  → Registro detallado de días de consumo para escolares (parte 2)

- `Frecuencia-dias-Preescolares1.csv`  
  → Registro detallado de días de consumo para preescolares (parte 1)

- `Frecuencia-dias-Preescolares2.csv`  
  → Registro detallado de días de consumo para preescolares (parte 2)

- `practicas_alim_inf_2012_F.csv`  
  → Prácticas de alimentación infantil

- `recordatorio_de_24_horas_1.csv`  
  → Recordatorio de 24 horas de consumo (parte 1)

- `recordatorio_de_24_horas_2.csv`  
  → Recordatorio de 24 horas de consumo (parte 2)

- `sangre_anemia_escolar.csv`  
  → Resultados de hemoglobina en escolares

- `sangre_anemia_preescol.csv`  
  → Resultados de hemoglobina en preescolares


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import sys
import os

---

### actividad_fisica_menores

In [2]:
actividad_fisica = pd.read_csv(
    "../data/raw_nutricion/actividad_fisica_menores.csv",
    encoding="latin1",
    na_values=["", " ", "NA"],
    low_memory=False
)

actividad_fisica.columns = actividad_fisica.columns.str.replace("ï»¿", "")

In [3]:
actividad_fisica

Unnamed: 0,folio,intp,entidad,desc_ent,munici,desc_mun,locali,desc_loc,manzana,ageb,...,an405cat,ur_ru,catmon2,imc_cat,nsef,nse5f,indicef,nse10f,escolari,region
0,10001,,,,,,,,,,...,,,,,,5.0,3.310654,10.0,,
1,10003,,,,,,,,,,...,,,,,,5.0,3.310654,10.0,,
2,10005,,,,,,,,,,...,,,,,,5.0,2.397098,10.0,,
3,10006,,,,,,,,,,...,,,,,,5.0,2.397098,10.0,,
4,10007,,,,,,,,,,...,,,,,,4.0,1.644975,8.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50549,321855,,,,,,,,,,...,,,,,,2.0,-1.379055,3.0,,
50550,321857,,,,,,,,,,...,,,,,,4.0,1.027074,7.0,,
50551,321858,,,,,,,,,,...,,,,,,2.0,-1.376276,3.0,,
50552,321860,,,,,,,,,,...,,,,,,3.0,0.363410,6.0,,


In [11]:
actividad_fisica.isna().sum().sort_values(ascending=False)

manzana     50554
an405esp    50512
an408esp    50462
an408j      49127
an408i      49127
            ...  
an405com    47134
nse5f          26
indicef        26
nse10f         26
folio           0
Length: 93, dtype: int64

In [4]:
actividad_fisica.columns

Index(['folio', 'intp', 'entidad', 'desc_ent', 'munici', 'desc_mun', 'locali',
       'desc_loc', 'manzana', 'ageb', 'maq', 'completa', 'sexo', 'edad_i',
       'edad', 'intsel', 'intsel2', 'an401a', 'an401b', 'an402a', 'an402b',
       'an403', 'an404', 'an405', 'an405esp', 'an406a', 'an406b', 'an407',
       'an408a', 'an408b', 'an408c', 'an408d', 'an408e', 'an408f', 'an408g',
       'an408h', 'an408i', 'an408j', 'an408k', 'an408l', 'an408m', 'an408n',
       'an408o', 'an408p', 'an408q', 'an408r', 'an408s', 'an408t', 'an408esp',
       'code_upm', 'est_dis', 'est_urb', 'est_marg', 'pondef', 'est_var',
       'an405com', 'an408u', 'an408v', 'an408w', 'an408x', 'an408y', 'an408z',
       'an408aa', 'an408ab', 'an408ac', 'an408ad', 'an408ae', 'an408af',
       'an408ag', 'an408ah', 'hrsuesem', 'hrsuefnsem', 'hrsuesemh',
       'hrsuefnsemh', 'an403m', 'an403ms', 'an404m', 'an404ms', 'an403mst',
       'an406amt', 'an406mtc', 'an407cat1', 'an403cat2', 'an405cat', 'ur_ru',
       'catmon

In [5]:
# Seleccionar y renombrar columnas clave
cols_utiles = {
    'folio': 'folio',
    'intp': 'intp',
    'entidad': 'entidad',
    'munici': 'munici',
    'locali': 'locali',
    'sexo': 'sexo',
    'edad': 'edad',

    # Tiempo frente a pantallas
    'an403': 'minutos_tv_dia_semana',
    'an404': 'minutos_tv_dia_finsemana',
    'an405': 'tipo_contenido_visto',

    # Actividad física
    'an406a': 'dias_act_fisica_semana',
    'an406b': 'minutos_act_fisica_dia',
    'an407': 'tipo_actividad',

    # Sueño
    'hrsuesem': 'horas_sueno_semana',
    'hrsuefnsem': 'horas_sueno_finsemana',

    # Contexto y expansión
    'est_urb': 'zona_urbana',
    'est_marg': 'nivel_marginacion',
    'pondef': 'ponderador',

    # Variables sociodemográficas
    'nse5f': 'nivel_socioeconomico',
    'escolari': 'escolaridad_madre',
    'region': 'region'
}

# 3. Crear dataframe con variables renombradas
actividad_fisica_clean = actividad_fisica[list(cols_utiles.keys())].rename(columns=cols_utiles).copy()


In [6]:
actividad_fisica_clean

Unnamed: 0,folio,intp,entidad,munici,locali,sexo,edad,minutos_tv_dia_semana,minutos_tv_dia_finsemana,tipo_contenido_visto,...,minutos_act_fisica_dia,tipo_actividad,horas_sueno_semana,horas_sueno_finsemana,zona_urbana,nivel_marginacion,ponderador,nivel_socioeconomico,escolaridad_madre,region
0,10001,,,,,,,,,,...,,,,,,,,5.0,,
1,10003,,,,,,,,,,...,,,,,,,,5.0,,
2,10005,,,,,,,,,,...,,,,,,,,5.0,,
3,10006,,,,,,,,,,...,,,,,,,,5.0,,
4,10007,,,,,,,,,,...,,,,,,,,4.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50549,321855,,,,,,,,,,...,,,,,,,,2.0,,
50550,321857,,,,,,,,,,...,,,,,,,,4.0,,
50551,321858,,,,,,,,,,...,,,,,,,,2.0,,
50552,321860,,,,,,,,,,...,,,,,,,,3.0,,


In [7]:
actividad_fisica_clean.isna().sum().sort_values(ascending=False)

dias_act_fisica_semana      47271
minutos_act_fisica_dia      47271
tipo_contenido_visto        47155
escolaridad_madre           47155
ponderador                  47155
nivel_marginacion           47155
zona_urbana                 47155
tipo_actividad              47155
intp                        47155
region                      47155
minutos_tv_dia_finsemana    47155
minutos_tv_dia_semana       47155
edad                        47155
sexo                        47155
locali                      47155
munici                      47155
entidad                     47155
horas_sueno_semana          47154
horas_sueno_finsemana       47153
nivel_socioeconomico           26
folio                           0
dtype: int64

In [9]:
# Filas donde todo está vacío (puede haber artefactos)
actividad_fisica_clean = actividad_fisica_clean.dropna(how='all')


In [8]:
# Tipos de variables
actividad_fisica_clean.dtypes


folio                         int64
intp                        float64
entidad                     float64
munici                      float64
locali                      float64
sexo                        float64
edad                        float64
minutos_tv_dia_semana       float64
minutos_tv_dia_finsemana    float64
tipo_contenido_visto        float64
dias_act_fisica_semana      float64
minutos_act_fisica_dia      float64
tipo_actividad              float64
horas_sueno_semana          float64
horas_sueno_finsemana       float64
zona_urbana                 float64
nivel_marginacion           float64
ponderador                  float64
nivel_socioeconomico        float64
escolaridad_madre           float64
region                      float64
dtype: object

In [10]:
actividad_fisica_clean.shape

(50554, 21)

In [13]:
print(actividad_fisica_clean.columns.tolist())


['folio', 'intp', 'entidad', 'munici', 'locali', 'sexo', 'edad', 'minutos_tv_dia_semana', 'minutos_tv_dia_finsemana', 'tipo_contenido_visto', 'dias_act_fisica_semana', 'minutos_act_fisica_dia', 'tipo_actividad', 'horas_sueno_semana', 'horas_sueno_finsemana', 'zona_urbana', 'nivel_marginacion', 'ponderador', 'nivel_socioeconomico', 'escolaridad_madre', 'region']


In [14]:
# Definir variables clave con los nombres correctos
vars_clave = ['dias_act_fisica_semana', 'minutos_act_fisica_dia', 'edad', 'sexo']

# Filtrar registros completos
actividad_fisica_valid = actividad_fisica_clean.dropna(subset=vars_clave)

# Verificar dimensiones resultantes
print(actividad_fisica_valid.shape)


(3283, 21)


In [15]:
actividad_fisica_clean[vars_clave].isna().sum()


dias_act_fisica_semana    47271
minutos_act_fisica_dia    47271
edad                      47155
sexo                      47155
dtype: int64

In [16]:
actividad_fisica_clean[vars_clave].notna().sum()


dias_act_fisica_semana    3283
minutos_act_fisica_dia    3283
edad                      3399
sexo                      3399
dtype: int64

#### ESTA TABLA NO SIRVE DE NADA

✅ Tiene 50,554 filas, pero

❌ Sólo ~3,283 filas (~6.5%) tienen datos completos en las variables clave (edad, sexo, días y minutos de actividad física).

❌ La mayoría de las variables específicas de actividad (an408a a an408t, etc.) están vacías o casi vacías (>97% nulos).

❌ Campos geográficos como manzana y ageb están totalmente vacíos.

🧺 En la práctica, es una tabla con buena estructura pero mal capturada o muy incompleta.

---

### antropometria_escolares

In [17]:
antropometria_escolares = pd.read_csv(
    "../data/raw_nutricion/antropometria_escolares.csv",
    encoding="latin1",
    na_values=["", " ", "NA"],
    low_memory=False
)

antropometria_escolares.columns = antropometria_escolares.columns.str.replace("ï»¿", "")

In [18]:
antropometria_escolares

Unnamed: 0,folio,intp,entidad,desc_ent,munici,desc_mun,locali,desc_loc,code_upm,est_dis,...,peso,talla,BAZ_valid,BAZ_CLAS_NUM_SP_OB,indiceF,nseF,nse5F,nse10F,NindiceF,afilia_1ra_x
0,10001,5,1,01 AGUASCALIENTES,1,001 AGUASCALIENTES,1,0001 AGUASCALIENTES,M0101,5,...,28.400,133.55,-0.14,4,3.310654,3,5,10,4,1
1,10008,4,1,01 AGUASCALIENTES,1,001 AGUASCALIENTES,1,0001 AGUASCALIENTES,M0101,5,...,27.775,129.05,0.26,4,-0.281082,2,3,5,2,1
2,10011,4,1,01 AGUASCALIENTES,1,001 AGUASCALIENTES,1,0001 AGUASCALIENTES,M0101,5,...,46.550,146.70,1.63,6,2.565220,3,5,10,4,1
3,10011,5,1,01 AGUASCALIENTES,1,001 AGUASCALIENTES,1,0001 AGUASCALIENTES,M0101,5,...,33.350,128.30,2.24,5,2.565220,3,5,10,4,1
4,10015,6,1,01 AGUASCALIENTES,1,001 AGUASCALIENTES,1,0001 AGUASCALIENTES,M0101,5,...,40.700,148.50,0.91,4,2.666162,3,5,10,4,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16346,321845,3,32,32 ZACATECAS,57,057 TRANCOSO,1,0001 TRANCOSO,M3248,3,...,16.900,107.90,-0.53,4,-3.078561,1,1,1,1,6
16347,321849,7,32,32 ZACATECAS,57,057 TRANCOSO,1,0001 TRANCOSO,M3248,3,...,27.000,119.90,1.55,6,-1.455542,1,1,2,1,6
16348,321857,5,32,32 ZACATECAS,57,057 TRANCOSO,1,0001 TRANCOSO,M3248,3,...,26.450,122.20,1.54,6,1.027074,3,4,7,3,6
16349,321858,6,32,32 ZACATECAS,57,057 TRANCOSO,1,0001 TRANCOSO,M3248,3,...,28.100,135.60,-0.78,4,-1.376276,1,2,3,1,6


In [19]:
antropometria_escolares.columns

Index(['folio', 'intp', 'entidad', 'desc_ent', 'munici', 'desc_mun', 'locali',
       'desc_loc', 'code_upm', 'est_dis', 'est_urb', 'est_marg', 'est_var',
       'pondef', 'sexo', 'Fecha_NAC', 'fech_ter', 'Edad_Meses_calc',
       'edad_anios', 'area', 'region', 'peso', 'talla', 'BAZ_valid',
       'BAZ_CLAS_NUM_SP_OB', 'indiceF', 'nseF', 'nse5F', 'nse10F', 'NindiceF',
       'afilia_1ra_x'],
      dtype='object')

In [30]:
# Definir columnas útiles y nombres más descriptivos
cols_utiles = {
    'folio': 'folio',  # ID del hogar, folio consecutivo
    'intp': 'intp',  # ID de la persona dentro del hogar (nominal- util para id)
    'entidad': 'entidad',  # Clave del estado
    'munici': 'municipio',  # Clave del municipio
    'locali': 'localidad',  # Clave de la localidad
    'sexo': 'sexo',  # 1 = Hombre, 2 = Mujer
    'Edad_Meses_calc': 'edad_meses',  # Edad calculada en meses
    'edad_anios': 'edad_anios',  # Edad en años completos
    'peso': 'peso_kg',  # Peso corporal en kilogramos
    'talla': 'talla_cm',  # Talla en centímetros
    'BAZ_valid': 'zscore_imc_para_edad',  # BAZ: Z-score de IMC para la edad según estándares OMS
    'BAZ_CLAS_NUM_SP_OB': 'clasificacion_imc',  # Clasificación del IMC: bajo peso, normal, sobrepeso, obesidad
    'pondef': 'ponderador',  # Ponderador muestral
    'region': 'region'  # Región geográfica
}

# Crear DataFrame limpio con columnas renombradas
antropometria_escolares_clean = (
    antropometria_escolares[list(cols_utiles.keys())]
    .rename(columns=cols_utiles)
    .copy()
)

# Verifica dimensiones y muestra primeras filas
print(antropometria_escolares_clean.shape)
antropometria_escolares_clean.head()


(16351, 14)


Unnamed: 0,folio,intp,entidad,municipio,localidad,sexo,edad_meses,edad_anios,peso_kg,talla_cm,zscore_imc_para_edad,clasificacion_imc,ponderador,region
0,10001,5,1,1,1,1,111.507187,108119,28.4,133.55,-0.14,4,643.3107,2
1,10008,4,1,1,1,2,109.601643,108119,27.775,129.05,0.26,4,726.0302,2
2,10011,4,1,1,1,1,138.940452,132143,46.55,146.7,1.63,6,666.4679,2
3,10011,5,1,1,1,1,95.080082,8495,33.35,128.3,2.24,5,602.1836,2
4,10015,6,1,1,1,2,114.398357,108119,40.7,148.5,0.91,4,726.0302,2


In [21]:
antropometria_escolares_clean.isna().sum().sort_values(ascending=False)

folio                   0
id_persona              0
entidad                 0
municipio               0
localidad               0
sexo                    0
edad_meses              0
edad_anios              0
peso_kg                 0
talla_cm                0
zscore_imc_para_edad    0
clasificacion_imc       0
ponderador              0
region                  0
dtype: int64

In [22]:
antropometria_escolares.shape

(16351, 31)

In [31]:
# Crear ID único para cada persona
antropometria_escolares_clean["id_persona"] = (
    antropometria_escolares_clean["folio"].astype(str) + "_" +
    antropometria_escolares_clean["intp"].astype(str)
)


In [33]:
antropometria_escolares_clean["id_persona"].nunique()

16351

---

### antropometria_preesc

In [23]:
antropometria_preesc = pd.read_csv(
    "../data/raw_nutricion/antropometria_preesc.csv",
    encoding="latin1",
    na_values=["", " ", "NA"],
    low_memory=False
)

antropometria_preesc.columns = antropometria_preesc.columns.str.replace("ï»¿", "")

In [24]:
antropometria_preesc

Unnamed: 0,folio,intp,entidad,desc_ent,munici,desc_mun,locali,desc_loc,code_upm,est_var,...,BAZ_valid,WHZ_CLAS_NUM,HAZ_CLAS_NUM,WAZ_CLAS_NUM,BAZ_CLAS_PREESC_NUM,indiceF,nseF,nse5F,nse10F,NindiceF
0,10008,5,1,01 AGUASCALIENTES,1,001 AGUASCALIENTES,1,0001 AGUASCALIENTES,M0101,15,...,1.21,3,3,3,3,-0.281082,2,3,5,2
1,10011,6,1,01 AGUASCALIENTES,1,001 AGUASCALIENTES,1,0001 AGUASCALIENTES,M0101,15,...,2.00,2,3,3,3,2.565220,3,5,10,4
2,10015,8,1,01 AGUASCALIENTES,1,001 AGUASCALIENTES,1,0001 AGUASCALIENTES,M0101,15,...,0.11,3,3,3,3,2.666162,3,5,10,4
3,10033,7,1,01 AGUASCALIENTES,1,001 AGUASCALIENTES,1,0001 AGUASCALIENTES,M0101,15,...,0.88,3,2,3,3,1.671566,3,5,9,3
4,10035,6,1,01 AGUASCALIENTES,1,001 AGUASCALIENTES,1,0001 AGUASCALIENTES,M0101,15,...,0.42,3,3,3,3,2.371507,3,5,10,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10653,321837,4,32,32 ZACATECAS,57,057 TRANCOSO,1,0001 TRANCOSO,M3248,323,...,1.71,3,1,3,3,-0.549147,2,2,4,2
10654,321838,6,32,32 ZACATECAS,57,057 TRANCOSO,1,0001 TRANCOSO,M3248,323,...,-0.63,3,3,3,3,-0.202596,2,3,5,2
10655,321843,4,32,32 ZACATECAS,57,057 TRANCOSO,1,0001 TRANCOSO,M3248,323,...,1.29,3,3,3,3,1.061353,3,4,8,3
10656,321860,4,32,32 ZACATECAS,57,057 TRANCOSO,1,0001 TRANCOSO,M3248,323,...,2.70,2,3,3,5,0.363410,2,3,6,2


In [25]:
antropometria_preesc.columns

Index(['folio', 'intp', 'entidad', 'desc_ent', 'munici', 'desc_mun', 'locali',
       'desc_loc', 'code_upm', 'est_var', 'est_dis', 'est_urb', 'est_marg',
       'pondef', 'sexo', 'Fecha_NAC', 'fech_ter', 'EDAD_ANIOS',
       'Edad_Meses_calc', 'area', 'region', 'Peso', 'Talla', 'WHZ_Valid',
       'HAZ_valid', 'WAZ_valid', 'BAZ_valid', 'WHZ_CLAS_NUM', 'HAZ_CLAS_NUM',
       'WAZ_CLAS_NUM', 'BAZ_CLAS_PREESC_NUM', 'indiceF', 'nseF', 'nse5F',
       'nse10F', 'NindiceF'],
      dtype='object')

In [26]:
# Definir columnas útiles y renombrarlas con nombres descriptivos
cols_utiles_preesc = {
    'folio': 'folio',
    'intp': 'intp',
    'entidad': 'entidad',
    'munici': 'municipio',
    'locali': 'localidad',
    'sexo': 'sexo',
    'Edad_Meses_calc': 'edad_meses',
    'EDAD_ANIOS': 'edad_anios',
    'Peso': 'peso_kg',
    'Talla': 'talla_cm',
    'BAZ_valid': 'baz',  # Índice de masa corporal para edad (Z-score)
    'BAZ_CLAS_PREESC_NUM': 'baz_categoria',  # Clasificación nutricional según BAZ
    'pondef': 'ponderador',
    'region': 'region'
}

# Crear DataFrame limpio con las columnas seleccionadas y renombradas
antropometria_preesc_clean = (
    antropometria_preesc[list(cols_utiles_preesc.keys())]
    .rename(columns=cols_utiles_preesc)
    .copy()
)

# Verificar dimensiones y primeras filas
print(antropometria_preesc_clean.shape)
antropometria_preesc_clean.head()


(10658, 14)


Unnamed: 0,folio,intp,entidad,municipio,localidad,sexo,edad_meses,edad_anios,peso_kg,talla_cm,baz,baz_categoria,ponderador,region
0,10008,5,1,1,1,2,51.876797,4859,18.95,105.1,1.21,3,600.3404,2
1,10011,6,1,1,1,1,32.459959,2435,17.275,96.7,2.0,3,580.4296,2
2,10015,8,1,1,1,1,44.057495,3647,17.75,106.9,0.11,3,1160.8591,2
3,10033,7,1,1,1,2,21.125257,1223,13.975,91.3,0.88,3,559.014,2
4,10035,6,1,1,1,2,53.519507,4859,18.55,108.1,0.42,3,600.3404,2


In [27]:
antropometria_preesc_clean.isna().sum().sort_values(ascending=False)

folio            0
intp             0
entidad          0
municipio        0
localidad        0
sexo             0
edad_meses       0
edad_anios       0
peso_kg          0
talla_cm         0
baz              0
baz_categoria    0
ponderador       0
region           0
dtype: int64

In [28]:
antropometria_preesc_clean.shape

(10658, 14)

In [34]:
antropometria_preesc_clean["id_persona"] = (
    antropometria_preesc_clean["folio"].astype(str) + "_" +
    antropometria_preesc_clean["intp"].astype(str)
)


In [35]:
antropometria_preesc_clean["id_persona"].nunique()

10658

---

### frecuencia_escolares_colapsada

In [36]:
frecuencia_escolares_colapsada = pd.read_csv(
    "../data/raw_nutricion/frecuencia_escolares_colapsada.csv",
    encoding="latin1",
    na_values=["", " ", "NA"],
    low_memory=False
)

frecuencia_escolares_colapsada.columns = frecuencia_escolares_colapsada.columns.str.replace("ï»¿", "")

In [37]:
frecuencia_escolares_colapsada

Unnamed: 0,folio,intp,entidad,munici,locali,code_upm,est_dis,est_urb,est_marg,pondef,...,f13d0,f15d1,f18d3cn3,f20d3n3,f20d3n6,f20d4n6,f18d3i,f21d5,f22d4,f18d1tn7
0,10127,4,1,1,1,M0104,5,3,1,1794.136108,...,0.002023,0.000423,0.137555,0.005048,0.013418,0.010885,0.0,0.0,0.003579,0.0
1,10143,6,1,1,1,M0104,5,3,1,2111.720459,...,0.008896,0.000840,0.166200,0.029779,0.013075,0.024812,0.0,0.0,0.000532,0.0
2,10189,3,1,1,1,M0105,5,3,1,2057.106934,...,0.004117,0.000371,0.308087,0.003343,0.011945,0.004619,0.0,0.0,0.004943,0.0
3,10292,4,1,1,1,M0108,5,3,1,3640.897217,...,0.005021,0.000600,0.235928,0.009500,0.014508,0.016614,0.0,0.0,0.008672,0.0
4,10474,4,1,1,1,M0114,5,3,1,3968.578125,...,0.004410,0.000600,0.237561,0.011736,0.024242,0.016369,0.0,0.0,0.005129,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1387,321470,5,32,42,47,M3237,2,1,2,6974.594727,...,0.005357,0.000043,0.066575,0.004143,0.007507,0.002543,0.0,0.0,0.001758,0.0
1388,321621,6,32,51,38,M3242,2,1,2,3746.528076,...,0.003829,0.000401,0.089200,0.004737,0.003470,0.001786,0.0,0.0,0.000429,0.0
1389,321633,3,32,51,38,M3242,2,1,2,1933.675171,...,0.014321,0.000700,0.305839,0.031223,0.015856,0.022436,0.0,0.0,0.004179,0.0
1390,321782,3,32,56,1,M3247,5,3,1,5132.021484,...,0.010890,0.000926,0.495070,0.012315,0.026907,0.038228,0.0,0.0,0.006729,0.0


In [38]:
frecuencia_escolares_colapsada.columns

Index(['folio', 'intp', 'entidad', 'munici', 'locali', 'code_upm', 'est_dis',
       'est_urb', 'est_marg', 'pondef',
       ...
       'f13d0', 'f15d1', 'f18d3cn3', 'f20d3n3', 'f20d3n6', 'f20d4n6', 'f18d3i',
       'f21d5', 'f22d4', 'f18d1tn7'],
      dtype='object', length=149)

In [39]:
# Definir columnas útiles y renombrarlas con nombres descriptivos
cols_utiles_freq_escolares = {
    'folio': 'folio',
    'intp': 'intp',
    'entidad': 'entidad',
    'munici': 'municipio',
    'locali': 'localidad',
    'sexo': 'sexo',
    'edadmeses': 'edad_meses',
    'region': 'region',
    'pondef': 'ponderador',
    'energ_kcal': 'energia_kcal',
    'protein': 'proteina_g',
    'lipid_tot': 'grasas_totales_g',
    'carbohydrt': 'carbohidratos_totales_g',
    'fiber_td': 'fibra_total_g',
    'sugar_tot': 'azucares_totales_g',
    'calcium': 'calcio_mg',
    'iron': 'hierro_mg',
    'zinc': 'zinc_mg',
    'vit_c': 'vitamina_c_mg',
    'thiamin': 'tiamina_mg',
    'riboflavin': 'riboflavina_mg',
    'niacin': 'niacina_mg',
    'panto_acid': 'acido_pantotenico_mg'
}

# Crear DataFrame limpio con las columnas seleccionadas y renombradas
frecuencia_escolares_clean = (
    frecuencia_escolares_colapsada[list(cols_utiles_freq_escolares.keys())]
    .rename(columns=cols_utiles_freq_escolares)
    .copy()
)

# Verificar dimensiones y primeras filas
print(frecuencia_escolares_clean.shape)
frecuencia_escolares_clean.head()


(1392, 23)


Unnamed: 0,folio,intp,entidad,municipio,localidad,sexo,edad_meses,region,ponderador,energia_kcal,...,fibra_total_g,azucares_totales_g,calcio_mg,hierro_mg,zinc_mg,vitamina_c_mg,tiamina_mg,riboflavina_mg,niacina_mg,acido_pantotenico_mg
0,10127,4,1,1,1,1,131.285421,2,1794.136108,1632.982527,...,19.095582,122.918898,638.057048,13.143687,8.687673,144.44281,1.363653,1.214944,12.001293,3.658216
1,10143,6,1,1,1,1,127.835729,2,2111.720459,1194.73301,...,12.280141,110.207163,521.028155,6.928346,4.569165,27.455991,0.463637,0.79141,7.755948,2.146731
2,10189,3,1,1,1,1,143.671458,2,2057.106934,2117.378044,...,14.65766,104.193229,687.48352,16.448869,12.194654,80.242372,1.573255,1.515297,16.716897,3.604494
3,10292,4,1,1,1,1,116.665298,2,3640.897217,1732.517491,...,10.175481,140.08479,407.05804,12.994388,9.347291,66.982943,1.353584,1.171402,12.173067,2.692164
4,10474,4,1,1,1,1,71.26078,2,3968.578125,1430.804934,...,9.670636,86.213455,519.027989,7.341689,7.94135,48.903939,1.054932,0.957707,9.811865,2.840694


In [40]:
# Crear ID único por persona
frecuencia_escolares_clean["id_persona"] = (
    frecuencia_escolares_clean["folio"].astype(str) + "_" +
    frecuencia_escolares_clean["intp"].astype(str)
)

# Contar cuántos ID únicos hay
frecuencia_escolares_clean["id_persona"].nunique()


1392

---

### frecuencia_escolares_nutrimentos

In [41]:
frecuencia_escolares_nutrimentos = pd.read_csv(
    "../data/raw_nutricion/frecuencia_escolares_nutrimentos.csv",
    encoding="latin1",
    na_values=["", " ", "NA"],
    low_memory=False
)

frecuencia_escolares_nutrimentos.columns = frecuencia_escolares_nutrimentos.columns.str.replace("ï»¿", "")

In [42]:
frecuencia_escolares_nutrimentos

Unnamed: 0,folio,intp,entidad,munici,locali,code_upm,est_dis,est_urb,est_marg,pondef,...,f13d0,f15d1,f18d3cn3,f20d3n3,f20d3n6,f20d4n6,f18d3i,f21d5,f22d4,f18d1tn7
0,10127,4,1,1,1,M0104,5,3,1,1794.136108,...,,,,,,,,,,
1,10127,4,1,1,1,M0104,5,3,1,1794.136108,...,,,,,,,,,,
2,10127,4,1,1,1,M0104,5,3,1,1794.136108,...,,,,,,,,,,
3,10127,4,1,1,1,M0104,5,3,1,1794.136108,...,,0.0,0.002773,,,,,,,
4,10127,4,1,1,1,M0104,5,3,1,1794.136108,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44237,321797,3,32,56,1,M3247,5,3,1,5297.524902,...,,,,,,,,,,
44238,321797,3,32,56,1,M3247,5,3,1,5297.524902,...,,,,,,,,,,
44239,321797,3,32,56,1,M3247,5,3,1,5297.524902,...,0.0,0.0,0.009878,0.0,0.0,0.0,,,,
44240,321797,3,32,56,1,M3247,5,3,1,5297.524902,...,,,,,,,,,,


In [44]:
frecuencia_escolares_nutrimentos.columns.tolist()

['folio',
 'intp',
 'entidad',
 'munici',
 'locali',
 'code_upm',
 'est_dis',
 'est_urb',
 'est_marg',
 'pondef',
 'est_var',
 'indiceF',
 'nse3F',
 'nse4F',
 'nse5F',
 'nse10F',
 'area',
 'region',
 'sexo',
 'fecha_nac',
 'fecha_vis',
 'edadanos',
 'edadmeses',
 'edaddias',
 'CodigoMex',
 'alimento',
 'esp_alim',
 'consumo',
 'water',
 'alcohol',
 'energ_kcal',
 'carbohydrt',
 'lipid_tot',
 'protein',
 'ash',
 'fiber_td',
 'sugar_tot',
 'calcium',
 'iron',
 'h_heme',
 'h_noheme',
 'magnesium',
 'phosphorus',
 'potassium',
 'sodium',
 'zinc',
 'copper',
 'manganese',
 'selenium',
 'vit_c',
 'thiamin',
 'riboflavin',
 'niacin',
 'panto_acid',
 'vit_b6',
 'folate_tot',
 'folic_acid',
 'food_folate',
 'folate_dfe',
 'choline_tot',
 'vit_b12',
 'vit_a_iu',
 'vit_a_rae',
 'retinol',
 'alpha_carot',
 'beta_carot',
 'beta_crypt',
 'lycopene',
 'lut_zea',
 'vit_e',
 'vit_d_mcg',
 'vit_d_iu',
 'vit_k',
 'caffeine',
 'vit_b12_add',
 'betaine',
 'tryptophan',
 'threonine',
 'isoleucine',
 'leucin

In [45]:
# Definir columnas útiles y renombrarlas con nombres descriptivos
cols_utiles_freq_nutrimentos = {
    'folio': 'folio',
    'intp': 'intp',
    'entidad': 'entidad',
    'munici': 'municipio',
    'locali': 'localidad',
    'sexo': 'sexo',
    'edadmeses': 'edad_meses',
    'region': 'region',
    'pondef': 'ponderador',
    'energ_kcal': 'energia_kcal',
    'protein': 'proteina_g',
    'lipid_tot': 'grasas_totales_g',
    'carbohydrt': 'carbohidratos_totales_g',
    'fiber_td': 'fibra_total_g',
    'sugar_tot': 'azucares_totales_g',
    'calcium': 'calcio_mg',
    'iron': 'hierro_mg',
    'zinc': 'zinc_mg',
    'vit_c': 'vitamina_c_mg',
    'thiamin': 'tiamina_mg',
    'riboflavin': 'riboflavina_mg',
    'niacin': 'niacina_mg',
    'panto_acid': 'acido_pantotenico_mg',
    'folate_tot': 'folato_total_ug',
    'vit_b12': 'vitamina_b12_ug',
    'vit_a_rae': 'vitamina_a_rae_ug',
    'vit_d_mcg': 'vitamina_d_mcg',
    'vit_e': 'vitamina_e_mg',
    'vit_k': 'vitamina_k_ug'
}

# Crear DataFrame limpio con las columnas seleccionadas y renombradas
frecuencia_escolares_nutrimentos_clean = (
    frecuencia_escolares_nutrimentos[list(cols_utiles_freq_nutrimentos.keys())]
    .rename(columns=cols_utiles_freq_nutrimentos)
    .copy()
)

# Verificar dimensiones y primeras filas
print(frecuencia_escolares_nutrimentos_clean.shape)
frecuencia_escolares_nutrimentos_clean.head()


(44242, 29)


Unnamed: 0,folio,intp,entidad,municipio,localidad,sexo,edad_meses,region,ponderador,energia_kcal,...,tiamina_mg,riboflavina_mg,niacina_mg,acido_pantotenico_mg,folato_total_ug,vitamina_b12_ug,vitamina_a_rae_ug,vitamina_d_mcg,vitamina_e_mg,vitamina_k_ug
0,10127,4,1,1,1,1,131.285421,2,1794.136108,0.212571,...,0.000943,0.000976,0.00457,0.001833,1.155857,0.0,5.354143,0.0,0.002325,1.519886
1,10127,4,1,1,1,1,131.285421,2,1794.136108,0.424286,...,0.001929,0.004243,0.007714,0.004821,0.289286,0.0,6.094285,0.0,0.028929,7.714285
2,10127,4,1,1,1,1,131.285421,2,1794.136108,1.461482,...,0.002196,0.001622,0.038877,0.00817,0.533687,0.0,3.879495,0.0,0.040232,0.219633
3,10127,4,1,1,1,1,131.285421,2,1794.136108,3.997715,...,0.001674,0.003248,0.043425,0.034705,2.023843,0.0,0.1749,0.0,0.05172,0.5247
4,10127,4,1,1,1,1,131.285421,2,1794.136108,2.110629,...,0.000782,0.001173,0.004299,0.002345,0.169241,0.0,0.09107,0.0,0.006254,0.116085


In [46]:
# Crear ID único por persona
frecuencia_escolares_nutrimentos_clean["id_persona"] = (
    frecuencia_escolares_nutrimentos_clean["folio"].astype(str) + "_" +
    frecuencia_escolares_nutrimentos_clean["intp"].astype(str)
)

# Contar cuántos ID únicos hay
frecuencia_escolares_nutrimentos_clean["id_persona"].nunique()


1392

In [47]:
frecuencia_escolares_nutrimentos_clean.isna().sum().sort_values(ascending=False)

vitamina_k_ug              6224
vitamina_d_mcg             5318
vitamina_e_mg              4926
acido_pantotenico_mg       4059
vitamina_b12_ug            3528
vitamina_c_mg              2092
folato_total_ug            1754
tiamina_mg                 1233
niacina_mg                 1233
vitamina_a_rae_ug          1204
riboflavina_mg             1011
hierro_mg                   994
fibra_total_g               907
calcio_mg                   763
zinc_mg                     651
azucares_totales_g          477
energia_kcal                443
proteina_g                  442
carbohidratos_totales_g     437
grasas_totales_g            344
folio                         0
intp                          0
ponderador                    0
region                        0
edad_meses                    0
sexo                          0
localidad                     0
municipio                     0
entidad                       0
id_persona                    0
dtype: int64

In [49]:
# Calcular el porcentaje de valores faltantes por columna
missing_percent = (frecuencia_escolares_nutrimentos_clean.isna().sum() / len(frecuencia_escolares_nutrimentos_clean)) * 100

# Ordenar de mayor a menor
missing_percent.sort_values(ascending=False)


vitamina_k_ug              14.068080
vitamina_d_mcg             12.020252
vitamina_e_mg              11.134216
acido_pantotenico_mg        9.174540
vitamina_b12_ug             7.974323
vitamina_c_mg               4.728538
folato_total_ug             3.964559
tiamina_mg                  2.786945
niacina_mg                  2.786945
vitamina_a_rae_ug           2.721396
riboflavina_mg              2.285159
hierro_mg                   2.246734
fibra_total_g               2.050088
calcio_mg                   1.724606
zinc_mg                     1.471452
azucares_totales_g          1.078161
energia_kcal                1.001311
proteina_g                  0.999051
carbohidratos_totales_g     0.987749
grasas_totales_g            0.777542
folio                       0.000000
intp                        0.000000
ponderador                  0.000000
region                      0.000000
edad_meses                  0.000000
sexo                        0.000000
localidad                   0.000000
m

---

### frecuencia_preescolares_colapsada

In [50]:
frecuencia_preescolares_colapsada = pd.read_csv(
    "../data/raw_nutricion/frecuencia_preescolares_colapsada.csv",
    encoding="latin1",
    na_values=["", " ", "NA"],
    low_memory=False
)

frecuencia_preescolares_colapsada.columns = frecuencia_preescolares_colapsada.columns.str.replace("ï»¿", "")

In [51]:
frecuencia_preescolares_colapsada

Unnamed: 0,folio,intp,entidad,munici,locali,code_upm,est_dis,est_urb,est_marg,pondef,...,f13d0,f15d1,f18d3cn3,f20d3n3,f20d3n6,f20d4n6,f18d3i,f21d5,f22d4,f18d1tn7
0,10033,7,1,1,1,M0101,5,3,1,2060.228027,...,0.003207,0.001111,0.253070,0.005651,0.025291,0.027884,0.0,0.0,0.006145,0.0
1,10037,6,1,1,1,M0101,5,3,1,2627.877197,...,0.006178,0.000917,0.179474,0.003171,0.037175,0.022895,0.0,0.0,0.000214,0.0
2,10143,7,1,1,1,M0104,5,3,1,3854.620361,...,0.002690,0.000179,0.031644,0.002071,0.005812,0.011250,0.0,0.0,0.001487,0.0
3,10226,7,1,1,1,M0106,5,3,1,2394.759033,...,0.003078,0.000223,0.056688,0.003587,0.006785,0.008311,0.0,0.0,0.001928,0.0
4,10279,3,1,1,1,M0108,5,3,1,2119.255859,...,0.000964,0.000000,0.036702,0.014336,0.018307,0.008250,0.0,0.0,0.000071,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1333,321642,4,32,51,38,M3242,2,1,2,1810.640259,...,0.002757,0.000600,0.123030,0.008500,0.013217,0.006261,0.0,0.0,0.008475,0.0
1334,321661,5,32,53,1,M3243,3,2,1,2450.989502,...,0.005143,0.000000,0.014961,0.004329,0.007904,0.003514,0.0,0.0,0.000839,0.0
1335,321667,3,32,53,1,M3243,3,2,1,2450.989502,...,0.005489,0.000882,0.130836,0.012491,0.009698,0.006298,0.0,0.0,0.004646,0.0
1336,321716,4,32,54,29,M3244,2,1,2,1275.269165,...,0.005132,0.002036,0.136571,0.013180,0.027406,0.028383,0.0,0.0,0.005695,0.0


In [53]:
frecuencia_preescolares_colapsada.columns.tolist()

['folio',
 'intp',
 'entidad',
 'munici',
 'locali',
 'code_upm',
 'est_dis',
 'est_urb',
 'est_marg',
 'pondef',
 'est_var',
 'sexo',
 'indiceF',
 'nse3F',
 'nse4F',
 'nse5F',
 'nse10F',
 'area',
 'region',
 'fecha_nac',
 'fecha_vis',
 'edadanos',
 'edadmeses',
 'edaddias',
 'water',
 'alcohol',
 'energ_kcal',
 'carbohydrt',
 'lipid_tot',
 'protein',
 'ash',
 'fiber_td',
 'sugar_tot',
 'calcium',
 'iron',
 'h_heme',
 'h_noheme',
 'magnesium',
 'phosphorus',
 'potassium',
 'sodium',
 'zinc',
 'copper',
 'manganese',
 'selenium',
 'vit_c',
 'thiamin',
 'riboflavin',
 'niacin',
 'panto_acid',
 'vit_b6',
 'folate_tot',
 'folic_acid',
 'food_folate',
 'folate_dfe',
 'choline_tot',
 'vit_b12',
 'vit_a_iu',
 'vit_a_rae',
 'retinol',
 'alpha_carot',
 'beta_carot',
 'beta_crypt',
 'lycopene',
 'lut_zea',
 'vit_e',
 'vit_d_mcg',
 'vit_d_iu',
 'vit_k',
 'caffeine',
 'vit_b12_add',
 'betaine',
 'tryptophan',
 'threonine',
 'isoleucine',
 'leucine',
 'lysine',
 'methionine',
 'cystine',
 'phenylal

In [54]:
# Definir columnas útiles y renombrarlas con nombres descriptivos
cols_utiles_freq_preescolares = {
    'folio': 'folio',
    'intp': 'intp',
    'entidad': 'entidad',
    'munici': 'municipio',
    'locali': 'localidad',
    'sexo': 'sexo',
    'edadmeses': 'edad_meses',
    'region': 'region',
    'pondef': 'ponderador',
    'energ_kcal': 'energia_kcal',
    'protein': 'proteina_g',
    'lipid_tot': 'grasas_totales_g',
    'carbohydrt': 'carbohidratos_totales_g',
    'fiber_td': 'fibra_total_g',
    'sugar_tot': 'azucares_totales_g',
    'calcium': 'calcio_mg',
    'iron': 'hierro_mg',
    'zinc': 'zinc_mg',
    'vit_c': 'vitamina_c_mg',
    'thiamin': 'tiamina_mg',
    'riboflavin': 'riboflavina_mg',
    'niacin': 'niacina_mg',
    'panto_acid': 'acido_pantotenico_mg'
}

# Crear DataFrame limpio con las columnas seleccionadas y renombradas
frecuencia_preescolares_clean = (
    frecuencia_preescolares_colapsada[list(cols_utiles_freq_preescolares.keys())]
    .rename(columns=cols_utiles_freq_preescolares)
    .copy()
)

# Verificar dimensiones y primeras filas
print(frecuencia_preescolares_clean.shape)
frecuencia_preescolares_clean.head()


(1338, 23)


Unnamed: 0,folio,intp,entidad,municipio,localidad,sexo,edad_meses,region,ponderador,energia_kcal,...,fibra_total_g,azucares_totales_g,calcio_mg,hierro_mg,zinc_mg,vitamina_c_mg,tiamina_mg,riboflavina_mg,niacina_mg,acido_pantotenico_mg
0,10033,7,1,1,1,2,21.125257,2,2060.228027,1304.775939,...,10.567073,104.23677,732.553282,6.484451,5.374578,88.030725,1.00419,1.144311,9.765092,3.814638
1,10037,6,1,1,1,1,49.24846,2,2627.877197,1637.898967,...,11.746441,127.59911,1660.051578,11.060477,11.894907,69.822804,1.60551,2.298132,15.227305,3.51397
2,10143,7,1,1,1,2,31.605749,2,3854.620361,1053.811354,...,15.519947,81.012205,681.286833,5.587787,4.471658,48.049118,0.479147,0.642388,6.962112,1.761558
3,10226,7,1,1,1,1,28.188912,2,2394.759033,1189.705403,...,8.114364,150.916636,793.851377,6.764674,8.556872,77.093281,0.762373,1.395468,8.759597,2.825162
4,10279,3,1,1,1,1,41.36345,2,2119.255859,1024.050771,...,9.241501,119.82449,1124.239484,6.057855,4.238768,127.510012,0.648009,1.178456,5.934387,1.784436


In [55]:
frecuencia_preescolares_clean.isna().sum().sort_values(ascending=False)

folio                      0
carbohidratos_totales_g    0
niacina_mg                 0
riboflavina_mg             0
tiamina_mg                 0
vitamina_c_mg              0
zinc_mg                    0
hierro_mg                  0
calcio_mg                  0
azucares_totales_g         0
fibra_total_g              0
grasas_totales_g           0
intp                       0
proteina_g                 0
energia_kcal               0
ponderador                 0
region                     0
edad_meses                 0
sexo                       0
localidad                  0
municipio                  0
entidad                    0
acido_pantotenico_mg       0
dtype: int64

In [57]:
# Crear ID único por persona
frecuencia_preescolares_clean["id_persona"] = (
    frecuencia_preescolares_clean["folio"].astype(str) + "_" +
    frecuencia_preescolares_clean["intp"].astype(str)
)

# Contar cuántos ID únicos hay
frecuencia_preescolares_clean["id_persona"].nunique()


1338

In [59]:
print(frecuencia_preescolares_clean.shape)
# Verificar valores faltantes
frecuencia_preescolares_clean.isna().sum().sort_values(ascending=False)

(1338, 24)


folio                      0
intp                       0
acido_pantotenico_mg       0
niacina_mg                 0
riboflavina_mg             0
tiamina_mg                 0
vitamina_c_mg              0
zinc_mg                    0
hierro_mg                  0
calcio_mg                  0
azucares_totales_g         0
fibra_total_g              0
carbohidratos_totales_g    0
grasas_totales_g           0
proteina_g                 0
energia_kcal               0
ponderador                 0
region                     0
edad_meses                 0
sexo                       0
localidad                  0
municipio                  0
entidad                    0
id_persona                 0
dtype: int64

### frecuencia_preescolares_nutrimentos

In [60]:
frecuencia_preescolares_nutrimentos = pd.read_csv(
    "../data/raw_nutricion/frecuencia_preescolares_nutrimentos.csv",
    encoding="latin1",
    na_values=["", " ", "NA"],
    low_memory=False
)

frecuencia_preescolares_nutrimentos.columns = frecuencia_preescolares_nutrimentos.columns.str.replace("ï»¿", "")

In [None]:
frecuencia_preescolares_nutrimentos

Unnamed: 0,folio,intp,entidad,munici,locali,code_upm,est_dis,est_urb,est_marg,pondef,...,f13d0,f15d1,f18d3cn3,f20d3n3,f20d3n6,f20d4n6,f18d3i,f21d5,f22d4,f18d1tn7
0,10033,7,1,1,1,M0101,5,3,1,2060.228027,...,,,,,,,,,,
1,10033,7,1,1,1,M0101,5,3,1,2060.228027,...,,,,,,,,,,
2,10033,7,1,1,1,M0101,5,3,1,2060.228027,...,,0.0000,,,,,,,,
3,10033,7,1,1,1,M0101,5,3,1,2060.228027,...,,,,,,,,,,
4,10033,7,1,1,1,M0101,5,3,1,2060.228027,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40214,321725,3,32,54,43,M3244,2,1,2,1275.269165,...,,,,,,,,,,
40215,321725,3,32,54,43,M3244,2,1,2,1275.269165,...,,0.0000,0.0111,0.0000,0.0000,,,,0.00000,
40216,321725,3,32,54,43,M3244,2,1,2,1275.269165,...,,,,,,,,,,
40217,321725,3,32,54,43,M3244,2,1,2,1275.269165,...,0.0009,0.0003,0.0045,0.0042,0.0015,0.00165,,,0.00135,


In [62]:
frecuencia_preescolares_nutrimentos.columns.tolist()

['folio',
 'intp',
 'entidad',
 'munici',
 'locali',
 'code_upm',
 'est_dis',
 'est_urb',
 'est_marg',
 'pondef',
 'est_var',
 'indiceF',
 'nse3F',
 'nse4F',
 'nse5F',
 'nse10F',
 'area',
 'region',
 'sexo',
 'fecha_nac',
 'fecha_vis',
 'edadanos',
 'edadmeses',
 'edaddias',
 'CodigoMex',
 'alimento',
 'esp_alim',
 'consumo',
 'water',
 'alcohol',
 'energ_kcal',
 'carbohydrt',
 'lipid_tot',
 'protein',
 'ash',
 'fiber_td',
 'sugar_tot',
 'calcium',
 'iron',
 'h_heme',
 'h_noheme',
 'magnesium',
 'phosphorus',
 'potassium',
 'sodium',
 'zinc',
 'copper',
 'manganese',
 'selenium',
 'vit_c',
 'thiamin',
 'riboflavin',
 'niacin',
 'panto_acid',
 'vit_b6',
 'folate_tot',
 'folic_acid',
 'food_folate',
 'folate_dfe',
 'choline_tot',
 'vit_b12',
 'vit_a_iu',
 'vit_a_rae',
 'retinol',
 'alpha_carot',
 'beta_carot',
 'beta_crypt',
 'lycopene',
 'lut_zea',
 'vit_e',
 'vit_d_mcg',
 'vit_d_iu',
 'vit_k',
 'caffeine',
 'vit_b12_add',
 'betaine',
 'tryptophan',
 'threonine',
 'isoleucine',
 'leucin

In [63]:
# Definir columnas útiles y renombrarlas con nombres descriptivos
cols_utiles_freq_preesc = {
    'folio': 'folio',
    'intp': 'intp',
    'entidad': 'entidad',
    'munici': 'municipio',
    'locali': 'localidad',
    'sexo': 'sexo',
    'edadmeses': 'edad_meses',
    'region': 'region',
    'pondef': 'ponderador',
    'energ_kcal': 'energia_kcal',  # Energía total consumida
    'protein': 'proteina_g',
    'lipid_tot': 'grasas_totales_g',
    'carbohydrt': 'carbohidratos_totales_g',
    'fiber_td': 'fibra_total_g',
    'sugar_tot': 'azucares_totales_g',
    'calcium': 'calcio_mg',
    'iron': 'hierro_mg',
    'zinc': 'zinc_mg',
    'vit_c': 'vitamina_c_mg',
    'thiamin': 'tiamina_mg',
    'riboflavin': 'riboflavina_mg',
    'niacin': 'niacina_mg',
    'panto_acid': 'acido_pantotenico_mg'
}


In [64]:
# Crear DataFrame limpio con las columnas seleccionadas y renombradas
frecuencia_preescolares_nutrimentos_clean = (
    frecuencia_preescolares_nutrimentos[list(cols_utiles_freq_preesc.keys())]
    .rename(columns=cols_utiles_freq_preesc)
    .copy()
)

# Verificar dimensiones y primeras filas
print(frecuencia_preescolares_nutrimentos_clean.shape)
frecuencia_preescolares_nutrimentos_clean.head()


(40219, 23)


Unnamed: 0,folio,intp,entidad,municipio,localidad,sexo,edad_meses,region,ponderador,energia_kcal,...,fibra_total_g,azucares_totales_g,calcio_mg,hierro_mg,zinc_mg,vitamina_c_mg,tiamina_mg,riboflavina_mg,niacina_mg,acido_pantotenico_mg
0,10033,7,1,1,1,2,21.125257,2,2060.228027,0.416286,...,0.093886,0.036934,1.186857,0.007086,0.002214,0.696969,0.000531,0.000709,0.003543,0.002825
1,10033,7,1,1,1,2,21.125257,2,2060.228027,22.549999,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,10033,7,1,1,1,2,21.125257,2,2060.228027,1.22475,...,0.073312,0.109192,1.81125,0.030187,0.011126,0.482457,0.001941,0.002148,0.012109,0.005563
3,10033,7,1,1,1,2,21.125257,2,2060.228027,1.461482,...,0.063427,0.125006,0.595266,0.0507,0.0117,0.282156,0.002196,0.001622,0.038877,0.00817
4,10033,7,1,1,1,2,21.125257,2,2060.228027,18.206572,...,0.011571,0.056143,7.215,0.108857,0.111857,,0.008571,0.007286,0.085714,


In [65]:
# Crear ID único por persona concatenando folio + intp
frecuencia_preescolares_nutrimentos_clean["id_persona"] = (
    frecuencia_preescolares_nutrimentos_clean["folio"].astype(str) + "_" +
    frecuencia_preescolares_nutrimentos_clean["intp"].astype(str)
)

# Verificar número de IDs únicos
frecuencia_preescolares_nutrimentos_clean["id_persona"].nunique()


1338

In [67]:
frecuencia_preescolares_nutrimentos_clean.isna().sum().sort_values(ascending=False)

# En porcentaje
missing_percent_preesc = (frecuencia_preescolares_nutrimentos_clean.isna().sum() / len(frecuencia_preescolares_nutrimentos_clean)) * 100
# Ordenar de mayor a menor
missing_percent_preesc.sort_values(ascending=False)

acido_pantotenico_mg       10.460230
vitamina_c_mg               5.475024
niacina_mg                  3.794227
tiamina_mg                  3.704717
riboflavina_mg              3.267113
hierro_mg                   3.267113
fibra_total_g               2.376986
calcio_mg                   1.939382
zinc_mg                     1.593774
energia_kcal                1.273030
proteina_g                  1.245680
carbohidratos_totales_g     0.999528
azucares_totales_g          0.947313
grasas_totales_g            0.726025
folio                       0.000000
intp                        0.000000
ponderador                  0.000000
region                      0.000000
edad_meses                  0.000000
sexo                        0.000000
localidad                   0.000000
municipio                   0.000000
entidad                     0.000000
id_persona                  0.000000
dtype: float64

----

# FIRST MERGE

In [73]:
actividad_fisica_clean.columns.tolist()

['folio',
 'intp',
 'entidad',
 'munici',
 'locali',
 'sexo',
 'edad',
 'minutos_tv_dia_semana',
 'minutos_tv_dia_finsemana',
 'tipo_contenido_visto',
 'dias_act_fisica_semana',
 'minutos_act_fisica_dia',
 'tipo_actividad',
 'horas_sueno_semana',
 'horas_sueno_finsemana',
 'zona_urbana',
 'nivel_marginacion',
 'ponderador',
 'nivel_socioeconomico',
 'escolaridad_madre',
 'region']

In [74]:
antropometria_escolares_clean.columns.tolist()

['folio',
 'intp',
 'entidad',
 'municipio',
 'localidad',
 'sexo',
 'edad_meses',
 'edad_anios',
 'peso_kg',
 'talla_cm',
 'zscore_imc_para_edad',
 'clasificacion_imc',
 'ponderador',
 'region',
 'id_persona']

In [75]:
antropometria_preesc_clean.columns.tolist()

['folio',
 'intp',
 'entidad',
 'municipio',
 'localidad',
 'sexo',
 'edad_meses',
 'edad_anios',
 'peso_kg',
 'talla_cm',
 'baz',
 'baz_categoria',
 'ponderador',
 'region',
 'id_persona']

In [76]:
frecuencia_escolares_clean.columns.tolist()

['folio',
 'intp',
 'entidad',
 'municipio',
 'localidad',
 'sexo',
 'edad_meses',
 'region',
 'ponderador',
 'energia_kcal',
 'proteina_g',
 'grasas_totales_g',
 'carbohidratos_totales_g',
 'fibra_total_g',
 'azucares_totales_g',
 'calcio_mg',
 'hierro_mg',
 'zinc_mg',
 'vitamina_c_mg',
 'tiamina_mg',
 'riboflavina_mg',
 'niacina_mg',
 'acido_pantotenico_mg',
 'id_persona']

In [77]:
frecuencia_escolares_nutrimentos_clean.columns.tolist()

['folio',
 'intp',
 'entidad',
 'municipio',
 'localidad',
 'sexo',
 'edad_meses',
 'region',
 'ponderador',
 'energia_kcal',
 'proteina_g',
 'grasas_totales_g',
 'carbohidratos_totales_g',
 'fibra_total_g',
 'azucares_totales_g',
 'calcio_mg',
 'hierro_mg',
 'zinc_mg',
 'vitamina_c_mg',
 'tiamina_mg',
 'riboflavina_mg',
 'niacina_mg',
 'acido_pantotenico_mg',
 'folato_total_ug',
 'vitamina_b12_ug',
 'vitamina_a_rae_ug',
 'vitamina_d_mcg',
 'vitamina_e_mg',
 'vitamina_k_ug',
 'id_persona']

In [78]:
frecuencia_preescolares_clean.columns.tolist()

['folio',
 'intp',
 'entidad',
 'municipio',
 'localidad',
 'sexo',
 'edad_meses',
 'region',
 'ponderador',
 'energia_kcal',
 'proteina_g',
 'grasas_totales_g',
 'carbohidratos_totales_g',
 'fibra_total_g',
 'azucares_totales_g',
 'calcio_mg',
 'hierro_mg',
 'zinc_mg',
 'vitamina_c_mg',
 'tiamina_mg',
 'riboflavina_mg',
 'niacina_mg',
 'acido_pantotenico_mg',
 'id_persona']

In [79]:
frecuencia_preescolares_nutrimentos_clean.columns.tolist()

['folio',
 'intp',
 'entidad',
 'municipio',
 'localidad',
 'sexo',
 'edad_meses',
 'region',
 'ponderador',
 'energia_kcal',
 'proteina_g',
 'grasas_totales_g',
 'carbohidratos_totales_g',
 'fibra_total_g',
 'azucares_totales_g',
 'calcio_mg',
 'hierro_mg',
 'zinc_mg',
 'vitamina_c_mg',
 'tiamina_mg',
 'riboflavina_mg',
 'niacina_mg',
 'acido_pantotenico_mg',
 'id_persona']