In [33]:
import pandas as pd
import os
from unidecode import unidecode
import numpy as np

In [34]:
# Directory prefix of the raw input CSVs. It is joined to file names by plain
# string concatenation (root + file), so the trailing slash is required.
root = "anonym_data/"
# Directory prefix where the normalised CSVs are written, under the same file names.
new_root = "norm_data/"

In [35]:
def pcr_dengue(raw_df):
    """Extract dengue PCR rows and normalise their result labels.

    Keeps the rows whose ``determinacion`` is exactly ``"Dengue - Resultado:"``
    and whose ``resultado`` is not null, restricts them to the relevant
    columns, and rewrites ``resultado`` through a fixed lookup table.

    Parameters
    ----------
    raw_df : pandas.DataFrame
        Must expose ``determinacion`` and ``resultado`` columns.

    Returns
    -------
    pandas.DataFrame
        A copy of the selected rows. Raw values absent from the lookup table
        become NaN, in which case a warning is printed.
    """
    # Raw spellings grouped by the normalised label they collapse to.
    raw_values_by_label = {
        "Positivo-DEN1": ('Detectable - Serotipo 1\r\n\xa0', "Detectable Serotipo 1", "DEN1"),
        "Positivo-DEN2": ("Detectable - Serotipo 2", "Detectable Serotipo 2", "DEN2"),
        "Positivo": ("Detectable",),
        "Negativo": ("No detectable", "No Detectable"),
        "Indeterminado": ("XX",),
    }
    label_by_raw_value = {
        raw_value: label
        for label, raw_values in raw_values_by_label.items()
        for raw_value in raw_values
    }

    wanted_columns = ["id_subject", "fecha_nacimiento", 'fecha_muestra', "analisis", 'resultado']

    is_pcr_row = raw_df.determinacion == "Dengue - Resultado:"
    has_result = raw_df.resultado.notna()
    selected = raw_df[is_pcr_row & has_result].filter(items=wanted_columns).copy()

    selected["resultado"] = selected.resultado.map(label_by_raw_value)

    unmapped_count = selected.resultado.isna().sum()
    if unmapped_count:
        print("Warning! Some mapping might be missing. NA result generated")
    return selected


def classify_result(value, thresholds):
    """Turn a numeric result into a threshold label; pass anything else through.

    Parameters
    ----------
    value : object
        Raw result. Anything ``float()`` accepts is treated as a measurement.
    thresholds : dict
        Maps label -> ``(lower, upper)`` with both bounds inclusive. Entries
        are tested in insertion order and the first match wins, so a value
        lying on a boundary shared by two ranges gets the label listed first.

    Returns
    -------
    object
        The matching label; ``"Indeterminado"`` when the value is numeric but
        no range contains it (this includes NaN, which fails every
        comparison); the original ``value`` when it cannot be converted.
    """
    try:
        float_value = float(value)
    except (TypeError, ValueError):
        # ValueError: non-numeric text (e.g. an already-written label).
        # TypeError: objects float() rejects outright, such as None.
        return value

    # Kept outside the try block so that errors caused by malformed
    # thresholds are not mistaken for a non-numeric value.
    for label, (lower, upper) in thresholds.items():
        if lower <= float_value <= upper:
            return label
    return "Indeterminado"

def convert_fecha_muestra(df, column_name='fecha_muestra'):
    """Parse a ``'dd/mm/yy ...'`` text column into datetimes, in place.

    Everything after the first space of each value is dropped and the
    remainder is parsed with the format ``'%d/%m/%y'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column; it is modified in place.
    column_name : str, default 'fecha_muestra'
        Name of the string column to convert.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned so calls can be nested.
    """
    # Keep only the text before the first space.
    date_text = df[column_name].str.split(" ").str[0]
    df[column_name] = date_text

    # Day first, two-digit year.
    parsed = pd.to_datetime(df[column_name], format='%d/%m/%y')
    df[column_name] = parsed

    return df


def igm_dengue(raw_df):
    """Extract dengue IgM rows and normalise their results to three labels.

    Keeps the rows whose ``determinacion`` is exactly
    ``"Anticuerpos anti Dengue IgM"`` and whose ``resultado`` is not null.
    Numeric results are first turned into labels by ``classify_result``
    using the cut-offs below; every value is then passed through a lookup
    table so that textual variants collapse to the same labels.

    Parameters
    ----------
    raw_df : pandas.DataFrame
        Must expose ``determinacion`` and ``resultado`` columns.

    Returns
    -------
    pandas.DataFrame
        The selected rows restricted to the relevant columns. Values absent
        from the lookup table become NaN, in which case a warning with the
        count is printed.
    """
    wanted_columns = ["id_subject", "fecha_nacimiento", 'fecha_muestra', "analisis", 'resultado']

    # Textual spellings grouped by the label they collapse to.
    spellings_by_label = {
        'Positivo': ('Positivo',),
        'Negativo': ('NEGATIVO', 'Negativo'),
        'Indeterminado': ('XX', 'Indeterminado', 'INDET...', 'BEG'),
    }
    label_by_spelling = {
        spelling: label
        for label, spellings in spellings_by_label.items()
        for spelling in spellings
    }

    # Order matters: classify_result returns the first range that contains
    # the value, so 1.1 is "Positivo" and 0.9 is "Indeterminado".
    cutoffs = {
        "Positivo": (1.1, float('inf')),
        "Indeterminado": (0.9, 1.1),
        "Negativo": (float('-inf'), 0.9)
    }

    def label_numeric(raw_value):
        return classify_result(raw_value, cutoffs)

    is_igm_row = raw_df.determinacion == "Anticuerpos anti Dengue IgM"
    has_result = raw_df.resultado.notna()
    selected = raw_df[is_igm_row & has_result].copy()

    selected["resultado"] = selected.resultado.apply(label_numeric)
    selected.loc[:, "resultado"] = selected.resultado.map(label_by_spelling)

    n = selected.resultado.isna().sum()
    if n:
        print(f"Warning! Some mapping might be missing. NA result generated n={n}")
    return selected.filter(items=wanted_columns)


def glucosa(df):
    """Build a per-sample table holding the numeric result of one determination.

    Rows whose ``resultado`` equals ``'XX'`` are discarded. The remaining
    results are cast to float and stored in a new column named after the
    lower-cased first distinct value of ``determinacion``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must expose ``id_subject``, ``fecha_muestra``, ``determinacion``,
        ``resultado`` and ``unidad_medida`` columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``id_subject``, ``fecha_muestra`` and the value column. A
        warning is printed when more than one unit of measure is present.
    """
    valid = df[df.resultado != 'XX'].copy()

    # The output column takes its name from the data itself.
    value_column = valid.determinacion.unique()[0].lower()
    valid[value_column] = valid.resultado.astype(float)

    distinct_units = valid.unidad_medida.nunique()
    if distinct_units > 1:
        print("Warning! Diferentes unidades de medida")

    return valid[["id_subject", "fecha_muestra", value_column]]



def multistudy_df(df):
    """Pivot a long table of determinations into one numeric row per sample.

    Rows repeating an ``(id_subject, determinacion, fecha_muestra)``
    combination are dropped (first occurrence kept), the rest is pivoted so
    each ``determinacion`` becomes a column, and every value column is
    converted with ``pd.to_numeric(errors='coerce')``, so anything that is
    not a number ends up as NaN. Column names are lower-cased, ``%`` becomes
    ``perc``, spaces become ``_``, dots are removed and the result is passed
    through ``unidecode``.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format table with ``id_subject``, ``fecha_muestra``,
        ``determinacion`` and ``resultado`` columns.

    Returns
    -------
    pandas.DataFrame
        Wide table with ``id_subject`` and ``fecha_muestra`` as regular
        columns. The number of dropped duplicate rows is printed.
    """
    key_columns = ["id_subject", "determinacion", "fecha_muestra"]
    is_repeat = df[key_columns].duplicated(keep='first')

    wide = df[~is_repeat].pivot(
        index=['id_subject', 'fecha_muestra'],
        columns='determinacion',
        values='resultado',
    )
    wide = wide.apply(pd.to_numeric, errors='coerce')
    wide = wide.reset_index()

    normalised_names = []
    for original_name in wide.columns:
        cleaned = original_name.lower().replace("%", "perc").replace(" ", "_").replace(".", "")
        normalised_names.append(unidecode(cleaned))
    wide.columns = normalised_names

    # Runs after the numeric coercion, so it can only still matter for the
    # two key columns restored by reset_index.
    wide = wide.replace('XX', np.nan)

    print(f"Deleting {is_repeat.sum()} dup results")
    return wide


def plaquetas(df):
    """Pivot platelet determinations into one row per sample and clean them.

    Rows repeating an ``(id_subject, determinacion, fecha_muestra)``
    combination are dropped (first occurrence kept) and the rest is pivoted
    so each ``determinacion`` becomes a column, with names normalised
    (lower case, ``%`` -> ``perc``, spaces -> ``_``, dots removed,
    ``unidecode``). A fixed set of placeholder strings is replaced by NaN in
    every column. ``no_plaq`` flags the samples whose raw count is the
    literal ``'<2'``; the count column is then converted to numeric with
    unparseable values coerced to NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format table with ``id_subject``, ``fecha_muestra``,
        ``determinacion`` and ``resultado`` columns. After renaming, the
        pivot must yield a ``recuento_de_plaquetas`` column.

    Returns
    -------
    pandas.DataFrame
        Wide table with ``id_subject`` and ``fecha_muestra`` as regular
        columns. The number of dropped duplicate rows is printed.
    """
    key_columns = ["id_subject", "determinacion", "fecha_muestra"]
    is_repeat = df[key_columns].duplicated(keep='first')

    wide = df[~is_repeat].pivot(
        index=['id_subject', 'fecha_muestra'],
        columns='determinacion',
        values='resultado',
    )

    normalised_names = []
    for original_name in wide.columns:
        cleaned = original_name.lower().replace("%", "perc").replace(" ", "_").replace(".", "")
        normalised_names.append(unidecode(cleaned))
    wide.columns = normalised_names

    # Placeholders are blanked one at a time, in this order, across all columns.
    for placeholder in ('XX', 'XX.', 'xx', 'X', '.', '0'):
        wide = wide.replace(placeholder, np.nan)

    # The flag must be taken before the numeric conversion turns '<2' into NaN.
    wide["no_plaq"] = wide.recuento_de_plaquetas == '<2'
    wide["recuento_de_plaquetas"] = wide["recuento_de_plaquetas"].apply(pd.to_numeric, errors='coerce')

    print(f"Deleting {is_repeat.sum()} dup results")
    return wide.reset_index()

# Dengue PCR

In [36]:
# Read the raw dengue PCR export, normalise result labels and sample dates,
# and write the cleaned table under new_root with the same file name.
file = 'Dengue 03-2024.csv'
df = pd.read_csv(root + file)
df_dengue = convert_fecha_muestra(pcr_dengue(df))
# to_csv documents `index` as a bool; None only suppressed the index because it is falsy.
df_dengue.to_csv(new_root + file, index=False)
df_dengue.head(2)


Unnamed: 0,id_subject,fecha_nacimiento,fecha_muestra,analisis,resultado
1,4332f4ec7a65d99c5127ff172d811dd7,1992-03-17 00:00:00,2024-03-01,PCR PARA DENGUE.,Positivo-DEN1
3,7afd11d1e498c6e48c8c80da9a52938f,1983-10-20 00:00:00,2024-03-01,PCR PARA DENGUE.,Negativo


In [37]:
# Inspect the birth-date column of the raw dengue PCR frame loaded in the previous cell.
# The displayed values mix 'YYYY-MM-DD 00:00:00' and 'YYYY-MM-DD' strings (dtype object).
df.fecha_nacimiento

0       1992-03-17 00:00:00
1       1992-03-17 00:00:00
2       1983-10-20 00:00:00
3       1983-10-20 00:00:00
4       1994-07-01 00:00:00
               ...         
5337             1967-04-26
5338             1991-10-14
5339             1991-10-14
5340             1973-02-08
5341             1973-02-08
Name: fecha_nacimiento, Length: 5342, dtype: object

# Dengue IgM

In [38]:
# Read the raw dengue IgM export, normalise results to labels and parse sample
# dates, and write the cleaned table under new_root with the same file name.
file = 'Dengue IgM- 03-2024.csv'
df = pd.read_csv(root + file)
df_dengue_igm = convert_fecha_muestra(igm_dengue(df))
# to_csv documents `index` as a bool; None only suppressed the index because it is falsy.
df_dengue_igm.to_csv(new_root + file, index=False)
df_dengue_igm.head(2)

Unnamed: 0,id_subject,fecha_nacimiento,fecha_muestra,analisis,resultado
0,7afd11d1e498c6e48c8c80da9a52938f,1983-10-20 00:00:00,2024-03-01,SEROLOGIA PARA DENGUE IGM,Positivo
2,3a05ef1ae3b846c860b9cab74493cead,1986-04-12 00:00:00,2024-03-01,SEROLOGIA PARA DENGUE IGM,Positivo


# Glucosa

In [39]:
# Read the raw glucose export, keep the numeric result per sample, parse sample
# dates, and write the cleaned table under new_root with the same file name.
file = 'Glucosa 03-2024.csv'
df = pd.read_csv(root + file)
df_glucosa = convert_fecha_muestra(glucosa(df))
# to_csv documents `index` as a bool; None only suppressed the index because it is falsy.
df_glucosa.to_csv(new_root + file, index=False)
df_glucosa.head(2)

Unnamed: 0,id_subject,fecha_muestra,glucosa
0,91e1a31c165dbff618695827e68c677f,2024-03-01,107.5
1,f530055b06393ba6f588a6671e5b8abd,2024-03-01,77.2


# Hemograma

In [40]:
# Read the raw blood-count export, pivot it to one numeric row per sample, parse
# sample dates, and write the result under new_root with the same file name.
file = "Hemograma 03-2024.csv"
df = pd.read_csv(root + file)
df_hemograma = convert_fecha_muestra(multistudy_df(df))
# to_csv documents `index` as a bool; None only suppressed the index because it is falsy.
df_hemograma.to_csv(new_root + file, index=False)
df_hemograma.head(2)

Deleting 57 dup results


Unnamed: 0,id_subject,fecha_muestra,perc_basofilos,perc_eosinofilos,perc_linfocitos,perc_monocitos,perc_neutrofilos_cayados,perc_neutrofilos_polisegmentados,basofilos_abs,concentracion_de_hb_corp_media,...,hemoglobina_corpuscular_media,linfocitos_abs,monocitos_abs,neutrofilos_cayados_abs,neutrofilos_polisegmentados_abs,observaciones,rdw,recuento_de_globulos_blancos,recuento_de_globulos_rojos,volumen_corpuscular_medio
0,000bd3f4aa39981d82c5b2cd4fe41bc7,2024-03-11,0.0,2.0,20.0,17.0,0.0,61.0,0.0,31.7,...,29.0,774.0,657.9,0.0,2360.7,,12.5,3870.0,4100.0,91.5
1,00201509ff6946619043c1e8f92db9d8,2024-03-14,0.2,0.6,14.2,10.5,0.0,74.5,10.82,32.8,...,28.9,768.22,568.05,0.0,4030.45,,13.2,5410.0,4570.0,88.2


# Hepatograma

In [41]:
# Read the raw liver-panel export, pivot it to one numeric row per sample, parse
# sample dates, and write the result under new_root with the same file name.
file = "Hepatograma 03-2024.csv"
# low_memory=False: parse the file in one pass instead of chunks, which avoids
# per-chunk type inference producing mixed-dtype columns.
df = pd.read_csv(root + file, low_memory=False)
df_hepatograma = convert_fecha_muestra(multistudy_df(df))
# to_csv documents `index` as a bool; None only suppressed the index because it is falsy.
df_hepatograma.to_csv(new_root + file, index=False)
df_hepatograma.head(2)

Deleting 16 dup results


Unnamed: 0,id_subject,fecha_muestra,albumina,bilirrubina_directa,bilirrubina_total,colesterol,fosfatasa_alcalina_(fal),proteinas_totales,transaminasa_glutamico_oxalacetica_(got),transaminasa_glutamico_piruvica_(gpt)
0,000bd3f4aa39981d82c5b2cd4fe41bc7,2024-03-11,4.42,0.1,0.24,132.28,76.0,6.51,17.2,16.0
1,001ca0efb436ca49bed63fcc1aa8e79a,2024-03-09,4.22,0.2,0.46,156.83,76.0,7.05,16.1,20.7


# Plaquetas

In [42]:
# Read the raw platelet export, pivot and clean it to one row per sample, parse
# sample dates, and write the result under new_root with the same file name.
file = 'Plaquetas 03-2024.csv'
df = pd.read_csv(root + file)
df_plaquetas = convert_fecha_muestra(plaquetas(df))
# to_csv documents `index` as a bool; None only suppressed the index because it is falsy.
df_plaquetas.to_csv(new_root + file, index=False)
df_plaquetas.head(2)

Deleting 0 dup results


Unnamed: 0,id_subject,fecha_muestra,observaciones_de_plaquetas,recuento_de_plaquetas,no_plaq
0,000bd3f4aa39981d82c5b2cd4fe41bc7,2024-03-11,,206.0,False
1,00201509ff6946619043c1e8f92db9d8,2024-03-14,Morfológicamente normales,130.0,False
