In [13]:
import os
import sys
import pandas as pd


In [21]:
# Locate the project root. Under GitHub Actions (GITHUB_ACTIONS is set in the
# environment) the working directory is used as-is; otherwise the root is
# taken to be three directory levels above the working directory.
project_path = (
    os.getcwd()
    if 'GITHUB_ACTIONS' in os.environ
    else os.path.abspath(os.path.join(os.getcwd(), '../../..'))
)

# Register the project root on sys.path once, so re-running this cell does
# not append duplicate entries.
if project_path not in sys.path:
    sys.path.append(project_path)

# Import the project's helpers now that the project root is importable.
from data_utils.data_processing import download_file, process_zip_file


In [23]:
# Column names to apply, in order, to CSV files that ship without a header row.
DEFAULT_HEADERS = [
    # Leading columns
    "AÑO",
    "FECHA_DEF",
    "SEXO_NOMBRE",
    "EDAD_TIPO",
    "EDAD_CANT",
    "COD_COMUNA",
    "COMUNA",
    "NOMBRE_REGION",
    # DIAG1 and its chapter / group / category / subcategory columns
    "DIAG1",
    "CAPITULO_DIAG1",
    "GLOSA_CAPITULO_DIAG1",
    "CODIGO_GRUPO_DIAG1",
    "GLOSA_GRUPO_DIAG1",
    "CODIGO_CATEGORIA_DIAG1",
    "GLOSA_CATEGORIA_DIAG1",
    "CODIGO_SUBCATEGORIA_DIAG1",
    "GLOSA_SUBCATEGORIA_DIAG1",
    # DIAG2 and its chapter / group / category / subcategory columns
    "DIAG2",
    "CAPITULO_DIAG2",
    "GLOSA_CAPITULO_DIAG2",
    "CODIGO_GRUPO_DIAG2",
    "GLOSA_GRUPO_DIAG2",
    "CODIGO_CATEGORIA_DIAG2",
    "GLOSA_CATEGORIA_DIAG2",
    "CODIGO_SUBCATEGORIA_DIAG2",
    "GLOSA_SUBCATEGORIA_DIAG2",
    # Trailing column
    "LUGAR_DEFUNCION",
]


In [24]:
def get_data_paths():
    """Return the ``(source_path, processed_path)`` data directories.

    The base ``data`` directory depends on where the code runs:

    * when ``GITHUB_ACTIONS`` is present in the environment, it is
      ``<cwd>/data``;
    * otherwise it is ``<cwd>/../../../data``, resolved to an absolute path.

    Returns:
        tuple[str, str]: the ``source/salud/defunciones`` and
        ``processed/salud/defunciones`` folders under the base directory,
        in that order. Nothing is created on disk here.
    """
    cwd = os.getcwd()

    if 'GITHUB_ACTIONS' not in os.environ:
        # Local run: the data folder sits three levels above the working directory.
        base_path = os.path.abspath(os.path.join(cwd, '../../../data'))
    else:
        # CI run: the data folder sits directly under the working directory.
        base_path = os.path.join(cwd, 'data')

    return tuple(
        os.path.join(base_path, subfolder)
        for subfolder in ("source/salud/defunciones", "processed/salud/defunciones")
    )

In [25]:
# Resolve the source / processed folders and make sure both exist on disk.
# exist_ok=True keeps this cell safe to re-run.
source_dir, processed_dir = get_data_paths()
for data_dir in (source_dir, processed_dir):
    os.makedirs(data_dir, exist_ok=True)



In [26]:
# (zip URL, name of the CSV to extract from that zip) pairs, one per dataset.
file_info = [
    (
        "https://repositoriodeis.minsal.cl/DatosAbiertos/VITALES/DEFUNCIONES_FUENTE_DEIS_2022_2024_25062024.zip",
        "DEFUNCIONES_FUENTE_DEIS_2022_2024_25062024.csv",
    ),
    (
        "https://repositoriodeis.minsal.cl/DatosAbiertos/VITALES/DEFUNCIONES_FUENTE_DEIS_1990_2021_CIFRAS_OFICIALES.zip",
        "DEFUNCIONES_FUENTE_DEIS_1990_2021_CIFRAS_OFICIALES.csv",
    ),
]



In [28]:
# Run every (url, csv name) pair through process_zip_file.
# Files whose name contains '2022_2024' are read without a header row and get
# DEFAULT_HEADERS as column names; every other file has its header inferred.
for url, extract_filename in file_info:
    if '2022_2024' in extract_filename:
        header_option, names_option = None, DEFAULT_HEADERS
    else:
        header_option, names_option = 'infer', None

    process_zip_file(
        url,
        extract_filename,
        source_dir,
        processed_dir,
        header=header_option,
        names=names_option,
    )



Downloaded DEFUNCIONES_FUENTE_DEIS_2022_2024_25062024.zip
Detected encoding for /tmp/DEFUNCIONES_FUENTE_DEIS_2022_2024_25062024.csv: ISO-8859-1
Successfully read file with encoding ISO-8859-1
Processed DEFUNCIONES_FUENTE_DEIS_2022_2024_25062024.csv and saved to /Users/ernestolaval/Documents/Github Repositories/data_chile/data/processed/salud/defunciones/DEFUNCIONES_FUENTE_DEIS_2022_2024_25062024.parquet
DEFUNCIONES_FUENTE_DEIS_1990_2021_CIFRAS_OFICIALES.zip already exists. Skipping download.
Detected encoding for /tmp/DEFUNCIONES_FUENTE_DEIS_1990_2021_CIFRAS_OFICIALES.csv: ISO-8859-1
Successfully read file with encoding ISO-8859-1
Processed DEFUNCIONES_FUENTE_DEIS_1990_2021_CIFRAS_OFICIALES.csv and saved to /Users/ernestolaval/Documents/Github Repositories/data_chile/data/processed/salud/defunciones/DEFUNCIONES_FUENTE_DEIS_1990_2021_CIFRAS_OFICIALES.parquet


In [29]:
# Load the two per-period parquet files from the processed folder.
df1 = pd.read_parquet(os.path.join(processed_dir, "DEFUNCIONES_FUENTE_DEIS_2022_2024_25062024.parquet"))
df2 = pd.read_parquet(os.path.join(processed_dir, "DEFUNCIONES_FUENTE_DEIS_1990_2021_CIFRAS_OFICIALES.parquet"))

# Stack both periods into one frame; ignore_index gives a fresh 0..n-1 index.
df_combined = pd.concat([df1, df2], ignore_index=True)

# Keep only records with AÑO >= 2003.
df_filtered = df_combined[df_combined['AÑO'] >= 2003]

# Save the combined and filtered frame.
# The boolean filter above leaves gapped row labels on df_filtered. With the
# default index handling, to_parquet would persist those labels as an extra
# `__index_level_0__` column, so the index is explicitly left out of the file.
combined_parquet_path = os.path.join(processed_dir, "DEFUNCIONES_2003_2024.parquet")
df_filtered.to_parquet(combined_parquet_path, index=False)
print(f"Combined and filtered DataFrame saved to {combined_parquet_path}")

Combined and filtered DataFrame saved to /Users/ernestolaval/Documents/Github Repositories/data_chile/data/processed/salud/defunciones/DEFUNCIONES_2003_2024.parquet
