In [1]:
import json

import pandas as pd

import sys
sys.path.append("../../../../..")
from ydnpd.utils import metadata_to_pandera_schema

# Seed and row count used by the sampling step of the per-dataset loop below.
RANDOM_STATE = 42
SAMPLE_SIZE = 1_469


# Survey columns retained in the output; commented-out names are excluded.
cols_to_keep = [
    # Demographics
    'CCAA', 'SEXO', 'EDAD', 'ECIVIL', 'NACIO', 'ESTUDIO',  # 'RELACT',
    # 'CERTIG', 'DEPENG',

    # Health status
    'D_1',

    # Functional assessment
    'VISI_2_1',  # Vision
    # 'AUDI_7_1',  # Hearing
    # 'COMU_8_1', 'COMU_9_1',  # Communication
    'MOVI_21_1', 'MOVI_22_1', 'MOVI_25_1',  # Mobility
    # 'AUTO_29_1', 'AUTO_31_1', 'AUTO_33_1',  # Self-care
    # 'RELA_39_1', 'RELA_41_1'  # Interpersonal
]

# Load the dataset metadata that sits one directory above this notebook.
with open("../metadata.json") as metadata_file:
    metadata = json.load(metadata_file)

# Restrict the metadata schema to the retained columns only.
schema = {
    column: spec
    for column, spec in metadata["schema"].items()
    if column in cols_to_keep
}

# Pandera schema built from the filtered metadata (validation is currently
# disabled further down in the processing loop).
pa_scheme = metadata_to_pandera_schema(schema)


def encode_categorical(series):
    """Map each distinct value of ``series`` to an integer code starting at 1.

    Codes are those assigned by ``pd.Categorical`` shifted by one, so a
    missing value (code -1) would map to 0. Callers drop nulls beforehand.
    """
    return pd.Categorical(series).codes + 1


# Build one cleaned, integer-encoded, fixed-size sample per survey wave and
# write it next to the metadata file as ``../<name>.csv``.
for name in ["2020", "2023"]:

    print(f"### {name} ###")

    if name == "2023":
        df = pd.read_csv(f"{name}.tsv", sep='\t', usecols=cols_to_keep)

    elif name == "2020":
        df_h = pd.read_csv("2020_hogar.tsv", sep="\t")
        df_l = pd.read_csv("2020_limitaciones.tsv", sep="\t")
        df_d = pd.read_csv("2020_discapacidad.tsv", sep="\t")

        # Create a merged dataset using left joins from the household file.
        # This ensures we keep all household members and add
        # disability/limitation info where it exists.

        # 1. Merge with Discapacidad file
        df_merged = df_h.merge(
            df_d,
            how='left',
            on=['IDENTHOGAR', 'NORDEN'],
            suffixes=('', '_disc')
        )

        # 2. Merge with Limitaciones file
        df = df_merged.merge(
            df_l,
            how='left',
            on=['IDENTHOGAR', 'NORDEN'],
            suffixes=('', '_lim')
        )

        # Align the 2020 column names with the names used in cols_to_keep.
        df = df.rename({
            "RELACT_CUID": "RELACT",
            "CERTI_D": "CERTIG",
            "DEPEN_D": "DEPENG"
            },
            axis=1)

    else:
        # The original built the exception without raising it, so an
        # unsupported name silently fell through with a stale/undefined df.
        raise ValueError(f"Unsupported dataset name: {name!r}")

    df = df[cols_to_keep]

    # Rows in which any string-typed column is blank / whitespace-only.
    mask = (
        df.select_dtypes(include=["object"])
        .apply(lambda x: x.str.strip() == "", axis=0)
        .any(axis=1)
    )

    df = df[~mask]

    # Drop rows where any column has value 9 or 99 (numeric or string form).
    # NOTE(review): this is applied to every column, including CCAA and EDAD,
    # where 9 / 99 may be legitimate values rather than "no answer" codes —
    # confirm against the survey codebook.
    df = df[(df != 9).all(axis=1)
            & (df != 99).all(axis=1)
            & (df != "9").all(axis=1)
            & (df != "09").all(axis=1)
            & (df != "99").all(axis=1)]

    # Drop any remaining null values; copy so the column assignments below
    # operate on an independent frame rather than a filtered selection.
    df = df.dropna().copy()

    # Encode every column EXCEPT EDAD as 1-based category codes. EDAD is
    # already numerical and must stay in years so that the age bins below are
    # applied to real ages (previously it was encoded first, so the bins were
    # applied to ordinal codes instead of ages).
    for col in [c for c in df.columns if c != "EDAD"]:
        df[col] = encode_categorical(df[col])

    # Bucket age in years into six groups labelled 1..6. include_lowest=True
    # keeps age 0 in the first bucket instead of turning it into NaN, which
    # would break the int64 cast below.
    df["EDAD"] = df["EDAD"].astype(int)

    df["EDAD"] = pd.cut(df["EDAD"],
                        bins=[0, 16, 35, 50, 65, 80, 120],
                        labels=[1, 2, 3, 4, 5, 6],
                        include_lowest=True)

    df = df.astype("int64")

    print(f"Remaining rows after cleaning: {len(df)}")
    print(f"Loaded dataframe with {len(df)} rows and {len(df.columns)} columns")
    print("\nColumns:", list(df.columns))
    df.info()

    # Verify results
    for col in df.columns:
        print(f"\n{col}:")
        print(df[col].value_counts().sort_index())

    # Reproducible sample without replacement; pandas raises ValueError if
    # fewer than SAMPLE_SIZE rows survived the cleaning.
    df = (
        df
        .sample(SAMPLE_SIZE, replace=False, random_state=RANDOM_STATE)
        .reset_index(drop=True)
    )

    # Schema validation is intentionally left disabled, as in the original.
    # pa_scheme.validate(df)

    df.to_csv(f"../{name}.csv", index=False)

    print()



### 2020 ###


  df_d = pd.read_csv("2020_discapacidad.tsv", sep="\t")


Remaining rows after cleaning: 1469
Loaded dataframe with 1469 rows and 11 columns

Columns: ['CCAA', 'SEXO', 'EDAD', 'ECIVIL', 'NACIO', 'ESTUDIO', 'D_1', 'VISI_2_1', 'MOVI_21_1', 'MOVI_22_1', 'MOVI_25_1']
<class 'pandas.core.frame.DataFrame'>
Index: 1469 entries, 424 to 164245
Data columns (total 11 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   CCAA       1469 non-null   int64
 1   SEXO       1469 non-null   int64
 2   EDAD       1469 non-null   int64
 3   ECIVIL     1469 non-null   int64
 4   NACIO      1469 non-null   int64
 5   ESTUDIO    1469 non-null   int64
 6   D_1        1469 non-null   int64
 7   VISI_2_1   1469 non-null   int64
 8   MOVI_21_1  1469 non-null   int64
 9   MOVI_22_1  1469 non-null   int64
 10  MOVI_25_1  1469 non-null   int64
dtypes: int64(11)
memory usage: 137.7 KB

CCAA:
CCAA
1     289
2      69
3      65
4      37
5      82
6      48
7      90
8      88
9     115
10    105
11    125
12    123
13     73
14     39
1