In [70]:
import os
import sys
import pandas as pd
import requests
from collections import defaultdict

In [51]:
# Check if running in the GitHub Actions environment
if 'GITHUB_ACTIONS' in os.environ:
    project_path = os.getcwd()
else:
    # Assuming your script is in the 'scripts' directory
    project_path = os.path.abspath(os.path.join(os.getcwd(), '../../..'))

# Add the project directory to the PYTHONPATH if it's not already there
if project_path not in sys.path:
    sys.path.append(project_path)

# Now you can import your custom module
from data_utils.data_processing import download_file, process_zip_file


In [52]:
from data_utils.data_processing import download_file, read_csv_file


In [53]:
def get_data_paths():
    # Check if running in the GitHub Actions environment
    if 'GITHUB_ACTIONS' in os.environ:
        base_path = os.path.join(os.getcwd(), 'data')
    else:
        # Assuming your script is in the 'scripts' directory
        base_path = os.path.abspath(os.path.join(os.getcwd(), '../../../data'))

    source_path = os.path.join(base_path, "source/ine/empleo")
    processed_path = os.path.join(base_path, "processed/ine/empleo")
    
    return source_path, processed_path


In [155]:
source_dir, processed_dir = get_data_paths()

print(f"Source dir: {source_dir}")
print(f"Processed dir: {processed_dir}")


Source dir: /Users/ernestolaval/Documents/Github Repositories/data_chile/data/source/ine/empleo
Processed dir: /Users/ernestolaval/Documents/Github Repositories/data_chile/data/processed/ine/empleo


In [55]:
os.makedirs(source_dir, exist_ok=True)
os.makedirs(processed_dir, exist_ok=True)


In [56]:
# List of file URLs
file_urls = [
    "https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2024/csv/ene-2024-04-mam.csv",
    "https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2024/csv/ene-2024-03-fma.csv",
    "https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2024/csv/ene-2024-02-efm.csv",
    "https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2024/csv/ene-2024-01-def.csv",

    "https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2023/csv/ene-2023-12-nde.csv",
    "https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2023/csv/ene-2023-11-ond.csv",
    "https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2023/csv/ene-2023-10-son.csv",
    "https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2023/csv/ene-2023-09-aso.csv",
    "https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2023/csv/ene-2023-08-jas.csv",
    "https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2023/csv/ene-2023-07-jja.csv",
    "https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2023/csv/ene-2023-06-mjj.csv",
    "https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2023/csv/ene-2023-05-amj.csv",
    "https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2023/csv/ene-2023-04-mam.csv",
    "https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2023/csv/ene-2023-03-fma.csv",
    "https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2023/csv/ene-2023-02-efm.csv",
    "https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2023/csv/ene-2023-01-def.csv",

    "https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2022/csv/ene-2022-12-nde.csv",
    "https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2022/csv/ene-2022-11-ond.csv",
    "https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2022/csv/ene-2022-10-son.csv",
    "https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2022/csv/ene-2022-09-aso.csv",
    "https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2022/csv/ene-2022-08-jas.csv",
    "https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2022/csv/ene-2022-07-jja.csv",
    "https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2022/csv/ene-2022-06-mjj.csv",
    "https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2022/csv/ene-2022-05-amj.csv",
    "https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2022/csv/ene-2022-04-mam.csv",
    "https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2022/csv/ene-2022-03-fma.csv",
    "https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2022/csv/ene-2022-02-efm.csv",
    "https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2022/csv/ene-2022-01-def.csv",

    "https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2021/csv/ene-2021-12-nde.csv",
    "https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2021/csv/ene-2021-11-ond.csv",
    "https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2021/csv/ene-2021-10-son.csv",
    "https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2021/csv/ene-2021-09-aso.csv",
    "https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2021/csv/ene-2021-08-jas.csv",
    "https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2021/csv/ene-2021-07-jja.csv",
    "https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2021/csv/ene-2021-06-mjj.csv",
    "https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2021/csv/ene-2021-05-amj.csv",
    "https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2021/csv/ene-2021-04-mam.csv",
    "https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2021/csv/ene-2021-03-fma.csv",
    "https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2021/csv/ene-2021-02-efm.csv",
    "https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2021/csv/ene-2021-01-def.csv",

    "https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2020/csv/ene-2020-12-nde.csv",
    "https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2020/csv/ene-2020-11-ond.csv",
    "https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2020/csv/ene-2020-10-son.csv",
    "https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2020/csv/ene-2020-09-aso.csv",
    "https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2020/csv/ene-2020-08-jas.csv",
    "https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2020/csv/ene-2020-07-jja.csv",
    "https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2020/csv/ene-2020-06-mjj.csv",
    "https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2020/csv/ene-2020-05-amj.csv",
    "https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2020/csv/ene-2020-04-mam.csv",
    "https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2020/csv/ene-2020-03-fma.csv",
    "https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2020/csv/ene-2020-02-efm.csv",
    "https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2020/csv/ene-2020-01-def.csv",

    "https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2019/csv/ene-2019-12-nde.csv",
    "https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2019/csv/ene-2019-11-ond.csv",
    "https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2019/csv/ene-2019-10-son.csv",
    "https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2019/csv/ene-2019-09-aso.csv",
    "https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2019/csv/ene-2019-08-jas.csv",
    "https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2019/csv/ene-2019-07-jja.csv",
    "https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2019/csv/ene-2019-06-mjj.csv",
    "https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2019/csv/ene-2019-05-amj.csv",
    "https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2019/csv/ene-2019-04-mam.csv",
    "https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2019/csv/ene-2019-03-fma.csv",
    "https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2019/csv/ene-2019-02-efm.csv",
    "https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2019/csv/ene-2019-01-def.csv",
    # Add more URLs as needed
]


In [57]:
delimiter = ";"


In [58]:
# Function to preprocess the "fact_cal" field
def preprocess_fact_cal(df):
    if 'fact_cal' in df.columns:
        df['fact_cal'] = df['fact_cal'].str.replace(',', '.').astype(float)
    return df



In [59]:
# Dictionary to store DataFrames for each trimester
trimester_data = defaultdict(pd.DataFrame)


In [60]:
# Download files and group by trimester
for url in file_urls:
    # Extract filename from URL
    filename = url.split('/')[-1]
    csv_path = os.path.join(source_dir, filename)
    
    print(f"Next {url}")
    # Check if the file already exists
    if not os.path.exists(csv_path):
        # Download the CSV file if it doesn't exist
        download_file(url, csv_path)
        print(f"Downloaded {filename}")
    else:
        print(f"{filename} already exists. Skipping download.")
    
    # Read the CSV file
    df = read_csv_file(csv_path, delimiter=delimiter)
    
    # Preprocess the "fact_cal" field
    df = preprocess_fact_cal(df)
    
    # Extract trimester info (e.g., "04-mam" from "ene-2024-04-mam.csv")
    trimester = '-'.join(filename.split('-')[2:4]).split('.')[0]
    
    # Append data to the corresponding trimester DataFrame
    trimester_data[trimester] = pd.concat([trimester_data[trimester], df], ignore_index=True)
    
    print(f"Added {filename} to the trimester {trimester} DataFrame.")

Next https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2024/csv/ene-2024-04-mam.csv
ene-2024-04-mam.csv already exists. Skipping download.
Detected encoding for /Users/ernestolaval/Documents/Github Repositories/data_chile/data/source/ine/empleo/ene-2024-04-mam.csv: utf-8
Successfully read file with encoding utf-8
Added ene-2024-04-mam.csv to the trimester 04-mam DataFrame.
Next https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2024/csv/ene-2024-03-fma.csv
ene-2024-03-fma.csv already exists. Skipping download.
Detected encoding for /Users/ernestolaval/Documents/Github Repositories/data_chile/data/source/ine/empleo/ene-2024-03-fma.csv: utf-8
Successfully read file with encoding utf-8
Added ene-2024-03-fma.csv to the trimester 03-fma DataFrame.
Next https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2024/csv/ene-2024-02-efm.csv
ene-2024-02-efm.csv already exists. Skipping download.
Detected encoding for /Users/ernestolava

Successfully read file with encoding utf-8
Added ene-2022-08-jas.csv to the trimester 08-jas DataFrame.
Next https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2022/csv/ene-2022-07-jja.csv
ene-2022-07-jja.csv already exists. Skipping download.
Detected encoding for /Users/ernestolaval/Documents/Github Repositories/data_chile/data/source/ine/empleo/ene-2022-07-jja.csv: utf-8
Successfully read file with encoding utf-8
Added ene-2022-07-jja.csv to the trimester 07-jja DataFrame.
Next https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2022/csv/ene-2022-06-mjj.csv
ene-2022-06-mjj.csv already exists. Skipping download.
Detected encoding for /Users/ernestolaval/Documents/Github Repositories/data_chile/data/source/ine/empleo/ene-2022-06-mjj.csv: utf-8
Successfully read file with encoding utf-8
Added ene-2022-06-mjj.csv to the trimester 06-mjj DataFrame.
Next https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2022/csv/ene-2022-05

Successfully read file with encoding ISO-8859-1
Added ene-2021-03-fma.csv to the trimester 03-fma DataFrame.
Next https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2021/csv/ene-2021-02-efm.csv
ene-2021-02-efm.csv already exists. Skipping download.
Detected encoding for /Users/ernestolaval/Documents/Github Repositories/data_chile/data/source/ine/empleo/ene-2021-02-efm.csv: utf-8
Error reading CSV file /Users/ernestolaval/Documents/Github Repositories/data_chile/data/source/ine/empleo/ene-2021-02-efm.csv with encoding utf-8, trying ISO-8859-1
Successfully read file with encoding ISO-8859-1
Added ene-2021-02-efm.csv to the trimester 02-efm DataFrame.
Next https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2021/csv/ene-2021-01-def.csv
ene-2021-01-def.csv already exists. Skipping download.
Detected encoding for /Users/ernestolaval/Documents/Github Repositories/data_chile/data/source/ine/empleo/ene-2021-01-def.csv: utf-8
Error reading CSV file /Us

Error reading CSV file /Users/ernestolaval/Documents/Github Repositories/data_chile/data/source/ine/empleo/ene-2019-09-aso.csv with encoding utf-8, trying ISO-8859-1
Successfully read file with encoding ISO-8859-1
Added ene-2019-09-aso.csv to the trimester 09-aso DataFrame.
Next https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2019/csv/ene-2019-08-jas.csv
ene-2019-08-jas.csv already exists. Skipping download.
Detected encoding for /Users/ernestolaval/Documents/Github Repositories/data_chile/data/source/ine/empleo/ene-2019-08-jas.csv: utf-8
Error reading CSV file /Users/ernestolaval/Documents/Github Repositories/data_chile/data/source/ine/empleo/ene-2019-08-jas.csv with encoding utf-8, trying ISO-8859-1
Successfully read file with encoding ISO-8859-1
Added ene-2019-08-jas.csv to the trimester 08-jas DataFrame.
Next https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2019/csv/ene-2019-07-jja.csv
ene-2019-07-jja.csv already exists. Skipping dow

In [157]:
# List of columns to include
columns_to_include = [
    'ano_trimestre', 
    'mes_central', 
    'ano_encuesta',
    'mes_encuesta',
    
    'id_identificacion',
    'idrph',
    
    'region',
    'edad',
    'tramo_edad',
    'sexo',
    'nivel',
    'termino_nivel',
    'cine',
    'nacionalidad',
    
    'b1',
    'b13_rev4cl_caenes', 
    'b14_rev4cl_caenes',
    'r_p_rev4cl_caenes',
    
    'habituales',
    'c10',
    'c11',
    
    'e4',
    
    'efectivas',
    
    'activ',
    'cae_general', 
    'cae_especifico', 
    'categoria_ocupacion',
    'ocup_form',
    'sector',
    'obe',
    'tpi',
    'id',
    'ftp',
    
    'fact_cal'
    
]  # Add other columns as needed



In [158]:
# Save each trimester DataFrame as a Parquet file
for trimester, df in trimester_data.items():
    # Select only the specified columns
    df = df[columns_to_include]
    
    processed_path = os.path.join(processed_dir, f"ene-{trimester}.parquet")
    df.to_parquet(processed_path)
    print(f"Merged DataFrame for trimester {trimester} saved to {processed_path}")

Merged DataFrame for trimester 04-mam saved to /Users/ernestolaval/Documents/Github Repositories/data_chile/data/processed/ine/empleo/ene-04-mam.parquet
Merged DataFrame for trimester 03-fma saved to /Users/ernestolaval/Documents/Github Repositories/data_chile/data/processed/ine/empleo/ene-03-fma.parquet
Merged DataFrame for trimester 02-efm saved to /Users/ernestolaval/Documents/Github Repositories/data_chile/data/processed/ine/empleo/ene-02-efm.parquet
Merged DataFrame for trimester 01-def saved to /Users/ernestolaval/Documents/Github Repositories/data_chile/data/processed/ine/empleo/ene-01-def.parquet
Merged DataFrame for trimester 12-nde saved to /Users/ernestolaval/Documents/Github Repositories/data_chile/data/processed/ine/empleo/ene-12-nde.parquet
Merged DataFrame for trimester 11-ond saved to /Users/ernestolaval/Documents/Github Repositories/data_chile/data/processed/ine/empleo/ene-11-ond.parquet
Merged DataFrame for trimester 10-son saved to /Users/ernestolaval/Documents/Githu

## Regenerar archivo para todos los meses / años con campos claves

In [156]:
import pandas as pd
import numpy as np
import glob

# Define the pattern to match all relevant Parquet files
file_pattern = processed_dir + '/ene-*-*.parquet'
print(file_pattern)

# Use glob to find all files matching the pattern
parquet_files = glob.glob(file_pattern)

# Initialize an empty list to hold the DataFrames
dfs = []

# Iterate over the list of files and read each one into a DataFrame
for file in parquet_files:
    df = pd.read_parquet(file)
    dfs.append(df)

# Concatenate all DataFrames into a single DataFrame
combined_df = pd.concat(dfs, ignore_index=True)

# Display the first few rows of the combined DataFrame to check the data
print("Combined DataFrame:")
print(combined_df.head())

# Inspect the distribution of 'cae_especifico' values
print("Distribution of 'cae_especifico' values:")
print(combined_df['cae_especifico'].value_counts())

# Categorize 'Ocupacion' based on 'cae_especifico'
combined_df['Ocupacion'] = combined_df['cae_especifico'].apply(lambda x: 'Ocupados' if 1 <= x <= 7 else ('Desocupados' if 8 <= x <= 9 else 'Unknown'))

# Inspect the distribution of the new 'Ocupacion' column
print("Distribution of 'Ocupacion' values:")
print(combined_df['Ocupacion'].value_counts())

# Categorize 'Nacionalidad'
combined_df['Nacionalidad'] = combined_df['nacionalidad'].apply(lambda x: 'Chilena' if x == 152 else 'Extranjeros')

# Add 'edad_de_trabajar' column
combined_df['edad_de_trabajar'] = combined_df['edad'].apply(lambda x: 1 if x >= 15 else 0)

# Fill null values in the 'sexo' column
combined_df['sexo'] = combined_df['sexo'].fillna('Unknown').astype(str)

# Specify the key dimension fields to group by, including 'edad_de_trabajar' and 'sexo'
key_dimension_fields = [
    'ano_trimestre', 'mes_central', 'Ocupacion', 'Nacionalidad', 'categoria_ocupacion', 'b14_rev4cl_caenes', 'b1', 'ocup_form', 'sector', 'edad_de_trabajar', 'sexo'
]

# Fill null values in the key dimension fields with appropriate placeholders
combined_df['categoria_ocupacion'] = combined_df['categoria_ocupacion'].fillna(-1).astype(int)
combined_df['b14_rev4cl_caenes'] = combined_df['b14_rev4cl_caenes'].fillna(-1).astype(int)
combined_df['b1'] = combined_df['b1'].fillna(-1).astype(int)
combined_df['ocup_form'] = combined_df['ocup_form'].fillna(-1).astype(int)
combined_df['sector'] = combined_df['sector'].fillna(-1).astype(int)

# Aggregate the data by summing the 'fact_cal' metric
aggregated_df = combined_df.groupby(key_dimension_fields)['fact_cal'].sum().reset_index()

# Replace placeholders with np.nan after aggregation for integer columns
aggregated_df['categoria_ocupacion'] = aggregated_df['categoria_ocupacion'].replace(-1, np.nan).astype('Int64')
aggregated_df['b14_rev4cl_caenes'] = aggregated_df['b14_rev4cl_caenes'].replace(-1, np.nan).astype('Int64')
aggregated_df['b1'] = aggregated_df['b1'].replace(-1, np.nan).astype('Int64')
aggregated_df['ocup_form'] = aggregated_df['ocup_form'].replace(-1, np.nan).astype('Int64')
aggregated_df['sector'] = aggregated_df['sector'].replace(-1, np.nan).astype('Int64')

# Display the first few rows of the aggregated DataFrame to check the results
print("Aggregated DataFrame:")
print(aggregated_df.head())

# Save the aggregated DataFrame to a new Parquet file
output_parquet_file = processed_dir + '/ene_sintetica.parquet'
aggregated_df.to_parquet(output_parquet_file, index=False)

print(f"Aggregated data saved to {output_parquet_file}")


/Users/ernestolaval/Documents/Github Repositories/data_chile/data/processed/ine/empleo/ene-*-*.parquet
Combined DataFrame:
   ano_trimestre  mes_central  ano_encuesta  mes_encuesta  region  edad  \
0           2023            6          2023             5      13     9   
1           2023            6          2023             5      13     4   
2           2023            6          2023             5      13    38   
3           2023            6          2023             5      13    43   
4           2023            6          2023             5      13    29   

   tramo_edad  sexo  nivel  termino_nivel  ...  cae_general  cae_especifico  \
0         NaN     2      3              2  ...            0               0   
1         NaN     2      1              1  ...            0               0   
2         5.0     1      8              1  ...            1               1   
3         6.0     2      8              1  ...            9              13   
4         3.0     1      9     