In [29]:
import os
import pandas as pd
import requests
from collections import defaultdict


In [3]:
# Working locations: CSV files are read from / downloaded into source_dir,
# and the merged Parquet files are written under parquet_dir.
# The directories themselves are created by the os.makedirs calls that follow.
source_dir = "."
parquet_dir = "../../../processed/ine/empleo/"


In [4]:
# Make sure both working directories exist; exist_ok keeps re-runs harmless.
for directory in (source_dir, parquet_dir):
    os.makedirs(directory, exist_ok=True)


In [38]:
# List of file URLs
# Every file lives under the same path; only the year and the release
# suffix (e.g. "04-mam") change, so the URLs are built from a template.
_ENE_CSV_URL = (
    "https://www.ine.gob.cl/docs/default-source/"
    "ocupacion-y-desocupacion/bbdd/{year}/csv/ene-{year}-{release}.csv"
)

_ENE_RELEASES = [
    (2024, "04-mam"),
    (2024, "03-fma"),
    (2023, "04-mam"),
    (2023, "03-fma"),
    (2022, "04-mam"),
    (2022, "03-fma"),
    (2021, "04-mam"),
    (2020, "04-mam"),
    (2019, "04-mam"),
    # Add more (year, release) pairs as needed
]

file_urls = [
    _ENE_CSV_URL.format(year=year, release=release)
    for year, release in _ENE_RELEASES
]


In [39]:
# Function to download a file from a URL
def download_file(url, save_path, timeout=60):
    """Download ``url`` and store the response body at ``save_path``.

    Args:
        url: Address of the file to fetch with an HTTP GET request.
        save_path: Destination path for the downloaded bytes.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled server.

    Returns:
        True when the file was saved, False when the server answered with a
        status other than 200 (a message is printed and nothing is written).

    Raises:
        requests.RequestException: On connection problems or timeouts.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        print(f"Failed to download file from {url}")
        return False

    # Write to a sibling temporary file and rename it afterwards, so an
    # interrupted write never leaves a truncated file at save_path that a
    # later "file already exists" check would mistake for a good download.
    partial_path = f"{save_path}.part"
    with open(partial_path, 'wb') as f:
        f.write(response.content)
    os.replace(partial_path, save_path)
    return True



In [40]:
# Field separator handed to pandas.read_csv by read_csv_file.
delimiter = ";"

In [41]:
# Function to read a file with a specified encoding and handle errors
def read_csv_file(file_path, encoding='utf-8'):
    """Load a delimited text file into a DataFrame.

    The file is first decoded with ``encoding``. If that raises
    ``UnicodeDecodeError`` a notice is printed and the file is read again
    as ISO-8859-1.

    Args:
        file_path: Path of the CSV file to read.
        encoding: Encoding tried first (UTF-8 by default).

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    # Options shared by both attempts; only the encoding differs.
    shared_options = dict(sep=delimiter, index_col=False, low_memory=False)
    try:
        frame = pd.read_csv(file_path, encoding=encoding, **shared_options)
    except UnicodeDecodeError:
        print(f"Encoding error with {file_path}, trying ISO-8859-1")
        frame = pd.read_csv(file_path, encoding='ISO-8859-1', **shared_options)
    return frame


In [42]:
# Function to convert CSV to Parquet
def convert_csv_to_parquet(csv_path, parquet_path):
    """Read the CSV at ``csv_path`` and write it to ``parquet_path`` as Parquet.

    Reading is delegated to ``read_csv_file``; the frame is written as-is
    with ``DataFrame.to_parquet``.
    """
    read_csv_file(csv_path).to_parquet(parquet_path)


In [47]:
# Function to preprocess the "fact_cal" field
def preprocess_fact_cal(df):
    """Convert the ``fact_cal`` column to float, accepting decimal commas.

    Text values such as ``"1234,56"`` have the comma replaced by a dot before
    the conversion. A column that pandas already parsed as numeric is only
    cast to float; calling ``.str`` on it would raise ``AttributeError``.
    Frames without a ``fact_cal`` column are returned untouched.

    Args:
        df: DataFrame to process. The column is replaced on this object
            (the frame is modified in place).

    Returns:
        The same DataFrame object, for convenient reassignment.

    Raises:
        ValueError: If a text value cannot be parsed as a number.
    """
    if 'fact_cal' not in df.columns:
        return df

    values = df['fact_cal']
    if not pd.api.types.is_numeric_dtype(values):
        # regex=False: the comma is a literal character, and this keeps the
        # behavior identical across pandas versions with different defaults.
        values = values.str.replace(',', '.', regex=False)
    df['fact_cal'] = values.astype(float)
    return df


In [48]:
# Dictionary to store DataFrames for each trimester.
# Keys are the trimester codes taken from the file names (e.g. "04-mam");
# a missing key is created on first access as an empty DataFrame.
trimester_data = defaultdict(pd.DataFrame)


In [49]:
# Download files and group by trimester
# Frames are first collected per trimester and concatenated once at the end:
# concatenating inside the loop copies the accumulated data on every file,
# and appending to trimester_data directly would duplicate rows whenever
# this cell is run again.
frames_by_trimester = defaultdict(list)

for url in file_urls:
    # Extract filename from URL
    filename = url.split('/')[-1]
    csv_path = os.path.join(source_dir, filename)

    # Check if the file already exists
    if not os.path.exists(csv_path):
        # Download the CSV file if it doesn't exist
        download_file(url, csv_path)
        # download_file only reports failures by printing, so confirm the
        # file is really there before claiming success and reading it.
        if not os.path.exists(csv_path):
            print(f"Skipping {filename}: download did not produce a file.")
            continue
        print(f"Downloaded {filename}")
    else:
        print(f"{filename} already exists. Skipping download.")

    # Read the CSV file
    df = read_csv_file(csv_path)

    # Preprocess the "fact_cal" field
    df = preprocess_fact_cal(df)

    # Extract trimester info (e.g., "04-mam" from "ene-2024-04-mam.csv")
    trimester = '-'.join(filename.split('-')[2:4]).split('.')[0]

    # Queue the data for the corresponding trimester DataFrame
    frames_by_trimester[trimester].append(df)

    print(f"Added {filename} to the trimester {trimester} DataFrame.")

# Build (or rebuild) one merged DataFrame per trimester.
for trimester, frames in frames_by_trimester.items():
    trimester_data[trimester] = pd.concat(frames, ignore_index=True)


ene-2024-04-mam.csv already exists. Skipping download.
Added ene-2024-04-mam.csv to the trimester 04-mam DataFrame.
ene-2024-03-fma.csv already exists. Skipping download.
Added ene-2024-03-fma.csv to the trimester 03-fma DataFrame.
ene-2023-04-mam.csv already exists. Skipping download.
Added ene-2023-04-mam.csv to the trimester 04-mam DataFrame.
ene-2023-03-fma.csv already exists. Skipping download.
Added ene-2023-03-fma.csv to the trimester 03-fma DataFrame.
ene-2022-04-mam.csv already exists. Skipping download.
Added ene-2022-04-mam.csv to the trimester 04-mam DataFrame.
ene-2022-03-fma.csv already exists. Skipping download.
Added ene-2022-03-fma.csv to the trimester 03-fma DataFrame.
ene-2021-04-mam.csv already exists. Skipping download.
Encoding error with ./ene-2021-04-mam.csv, trying ISO-8859-1
Added ene-2021-04-mam.csv to the trimester 04-mam DataFrame.
ene-2020-04-mam.csv already exists. Skipping download.
Added ene-2020-04-mam.csv to the trimester 04-mam DataFrame.
ene-2019-04

In [50]:
# Save each trimester DataFrame as a Parquet file
# One output file per trimester code, named "ene-<trimester>.parquet".
for trimester in list(trimester_data):
    df = trimester_data[trimester]
    parquet_path = os.path.join(parquet_dir, f"ene-{trimester}.parquet")
    df.to_parquet(parquet_path)
    print(f"Merged DataFrame for trimester {trimester} saved to {parquet_path}")


Merged DataFrame for trimester 04-mam saved to ../../../processed/ine/empleo/ene-04-mam.parquet
Merged DataFrame for trimester 03-fma saved to ../../../processed/ine/empleo/ene-03-fma.parquet
