In [1]:
import pandas as pd
import hashlib
from unidecode import unidecode
import os

In [2]:
# Sample input workbook (a single-file path kept for ad-hoc runs).
file = "data/Dengue 03-2024.xls"

# Pipeline settings, keyed by option name.
config = {
    # Columns whose values are combined and hashed into the subject id.
    "hashing_cols": ["tipo_documento", "nro_documento"],
    # Extra columns removed from the output besides the hashed ones.
    "additional_cols_to_drop": ["paciente"],
    # Flag intended to control whether the hashed source columns are dropped.
    "drop_id": True,
}

In [3]:
def read_excel(file, marker="Paciente", scan_rows=50):
    """Read an Excel sheet whose real header row is preceded by preamble rows.

    The first ``scan_rows`` rows are read to locate the first row containing a
    cell exactly equal to ``marker``; the sheet is then re-read with that row
    as the header.

    Parameters
    ----------
    file : path or file-like accepted by ``pandas.read_excel``.
    marker : cell value identifying the header row (exact, case-sensitive).
    scan_rows : number of leading rows inspected when looking for the marker.

    Returns
    -------
    pandas.DataFrame read with the located row as column header.

    Raises
    ------
    ValueError
        If the marker is found neither in the scanned cells nor in the
        first-row header.
    """
    preview = pd.read_excel(file, nrows=scan_rows)

    # One boolean per preview row: does any cell equal the marker?
    hits = (preview == marker).any(axis=1)
    if hits.any():
        # Position of the first matching row. The preview consumed the first
        # sheet row as its own header, hence the +1 when skipping.
        header_pos = int(hits.to_numpy().argmax())
        return pd.read_excel(file, skiprows=header_pos + 1)

    if marker in preview.columns:
        # The header is already the first sheet row; nothing to skip.
        return pd.read_excel(file)

    # Without this check an all-False mask made idxmax() return the first
    # label, silently picking an arbitrary row as header.
    raise ValueError(
        f"Header marker {marker!r} not found in the first {scan_rows} rows of {file!r}"
    )

def colnames_norm(df):
    """Normalize the column labels of ``df`` in place (returns None).

    Labels are lower-cased, stripped of surrounding whitespace, have dots
    removed and spaces turned into underscores, then are transliterated with
    ``unidecode``; any space introduced by the transliteration is also
    replaced by an underscore.
    """
    labels = df.columns.str.lower()
    labels = labels.str.strip()
    labels = labels.str.replace('.', '', regex = False)
    labels = labels.str.replace(' ', '_', regex = False)

    def _transliterate(label):
        # unidecode may emit spaces for some characters, so replace again.
        return unidecode(label).replace(' ', '_')

    df.columns = labels.map(_transliterate)

# Deterministic MD5 digest built from selected fields of a row
def compute_hash(row, cols):
    """Return the MD5 hex digest of the values ``row[col]`` for ``col`` in ``cols``.

    Each value is converted with ``str`` and the pieces are concatenated in
    the order given by ``cols`` with no separator before hashing.

    NOTE(review): because no separator is used, different value splits that
    concatenate to the same text produce the same digest. The digest is also
    unsalted MD5; confirm this is acceptable for the data being pseudonymized.
    """
    digest = hashlib.md5()
    for col in cols:
        # Feeding the pieces one by one hashes exactly their concatenation.
        digest.update(str(row[col]).encode())
    return digest.hexdigest()

def hash_id(df, hashing_cols = config["hashing_cols"], new_id_name = "id_subject", drop_id = True):
    """Add a hashed identifier column to ``df`` in place (returns None).

    Parameters
    ----------
    df : pandas.DataFrame, modified in place.
    hashing_cols : columns whose values are passed to ``compute_hash``.
    new_id_name : name of the column that receives the hash.
    drop_id : when true, ``hashing_cols`` are dropped from ``df`` afterwards.

    Raises
    ------
    KeyError
        If a hashing column is missing from ``df``.
    """
    if len(df.index) == 0:
        # With no rows, apply(axis=1) can hand back an empty DataFrame instead
        # of a Series, which cannot be assigned to a single column.
        df[new_id_name] = pd.Series([], index=df.index, dtype=object)
    else:
        # Honor new_id_name; the column name used to be hard-coded.
        df[new_id_name] = df.apply(compute_hash, cols = hashing_cols, axis=1)
    if drop_id:
        df.drop(columns = hashing_cols, inplace= True)
        
def anonymize(file):
    """Load one Excel file and return its anonymized DataFrame.

    Steps: read the sheet with ``read_excel``, normalize the column names with
    ``colnames_norm``, add the hashed id with ``hash_id``, then drop the
    columns listed in ``config["additional_cols_to_drop"]``.

    The hashing columns and the ``drop_id`` flag are read from ``config`` at
    call time, so edits to the config cell take effect without redefining the
    helper functions.

    Raises
    ------
    KeyError
        If a configured column is absent after normalization. This is
        deliberate: a missing identifying column should fail loudly rather
        than be skipped.
    """
    df = read_excel(file)
    colnames_norm(df)
    hash_id(
        df,
        hashing_cols=config["hashing_cols"],
        # Previously the config flag was declared but never consulted.
        drop_id=config.get("drop_id", True),
    )
    return df.drop(columns = config["additional_cols_to_drop"])

In [5]:
# Batch run: anonymize every file in `root` and write one CSV per input to `dest`.
root = "raw_data/"
dest = "anonym_data/"

# Create the output folder up front so writing cannot fail on a missing directory.
os.makedirs(dest, exist_ok=True)

# Regular files only (sub-folders are skipped), sorted for a reproducible order.
files = sorted(f for f in os.listdir(root) if os.path.isfile(os.path.join(root, f)))

for file in files:
    try:
        df = anonymize(os.path.join(root, file))
        # splitext removes only the final extension, so names containing dots
        # are not truncated and do not overwrite each other.
        out_path = os.path.join(dest, os.path.splitext(file)[0] + ".csv")
        df.to_csv(out_path, index=False)
        print(f"{file} with shape {df.shape} done")
    except Exception as e:
        # Do not touch `df` here: it is undefined if the first file fails and
        # would otherwise hold the previous file's frame.
        print(f"Failed! ({file}). {type(e).__name__}: {str(e)[:100]}")