In [92]:
import polars as pl
import pandas as pd
from unidecode import unidecode
import re
import operator


In [93]:
import pandas as pd
from pandas.api import types as pdt

def clean_field_names_pandas(df: pd.DataFrame, case: str = "lower") -> pd.DataFrame:
    """
    Renombra columnas:
      - strip + collapse espacios
      - espacios → "_"
      - quita tildes
      - pasa a lower/upper según case
    """
    cols = (
        df.columns
          .str.strip()
          .str.replace(r"\s+", " ", regex=True)
          .str.replace(" ", "_")
          .str.normalize("NFKD")
          .str.encode("ascii", "ignore").str.decode("ascii")
    )
    cols = cols.str.upper() if case.lower() == "upper" else cols.str.lower()
    df = df.copy()
    df.columns = cols
    return df

def _clean_str_series(ser: pd.Series, case: str) -> pd.Series:
    """
    Limpia una Serie de texto:
      - strip + collapse espacios
      - quita tildes
      - pasa a lower/upper según case
    Usa StringDtype para preservar nulos.
    """
    s = ser.astype("string")
    s = (
        s.str.strip()
         .str.replace(r"\s+", " ", regex=True)
         .str.normalize("NFKD")
         .str.encode("ascii", "ignore").str.decode("ascii")
    )
    s = s.str.upper() if case.lower() == "upper" else s.str.lower()
    return s

def clean_text_values_pandas(df: pd.DataFrame, case: str = "lower") -> pd.DataFrame:
    """
    Normaliza valores de texto en columnas object, string o category.
    """
    df = df.copy()
    for col in df.columns:
        dtype = df[col].dtype

        if isinstance(dtype, pd.CategoricalDtype):
            # limpia categorías y valores
            ser = df[col]
            cats = pd.Series(ser.cat.categories, dtype="string")
            cats_clean = _clean_str_series(cats, case)
            ser_new = ser.cat.set_categories(cats_clean).astype("string")
            ser_new = _clean_str_series(ser_new, case).astype("category")

        elif pdt.is_object_dtype(dtype) or pdt.is_string_dtype(dtype):
            ser_new = _clean_str_series(df[col], case)

        else:
            continue

        df[col] = ser_new
    return df

def clean_datetime_columns_pandas(
    df: pd.DataFrame,
    datetime_params: dict = None
) -> pd.DataFrame:
    """
    Convierte a datetime SOLO las columnas listadas en datetime_params:
      datetime_params = { col_name: format_str, ... }
    No hay inferencia automática.
    """
    df = df.copy()
    datetime_params = datetime_params or {}

    for col, fmt in datetime_params.items():
        if col in df.columns:
            df[col] = pd.to_datetime(df[col], format=fmt, errors="coerce")
            print(f"*** Columna {col!r}: convertida con formato {fmt} ***")
    return df

def fill_nulls(df: pd.DataFrame, fill_map: dict) -> pd.DataFrame:
    """
    Rellena nulos según fill_map = { col_name: fill_value, ... }
    """
    df = df.copy()
    for col, val in fill_map.items():
        if col in df.columns:
            before = df[col].isna().sum()
            df[col] = df[col].fillna(val)
            after = df[col].isna().sum()
            print(f"*** Columna {col!r} con {before} nulls → rellenado con {val!r} ({after} nulls actualmente) *** ")
    return df

def check_duplicates(df: pd.DataFrame, subset: list = None, max_display: int = 5) -> None:
    """
    Muestra registros duplicados en el DataFrame.
    
    Parámetros:
    - subset: columnas a considerar para detectar duplicados (por defecto, todas).
    - max_display: cuántos duplicados mostrar (por defecto, 5).
    """
    dup_mask = df.duplicated(subset=subset, keep=False)
    dup_count = dup_mask.sum()
    
    if dup_count == 0:
        print("*** No hay duplicados encontrados ***")
    else:
        cols_info = f"en columnas: {subset}" if subset else "en todas las columnas"
        print(f"*** Se encontraron {dup_count} registros duplicados {cols_info}. Mostrando primeros {max_display}: ***")
        display(df[dup_mask].head(max_display))

import operator

def replace_invalid_numeric_values(df: pd.DataFrame, numeric_check_conf: dict) -> pd.DataFrame:
    df = df.copy()

    ops = {
        "<": operator.lt,
        "<=": operator.le,
        "==": operator.eq,
        "!=": operator.ne,
        ">=": operator.ge,
        ">": operator.gt,
    }

    fields = numeric_check_conf.get("fields_name", [])
    op_str = numeric_check_conf.get("op", "<=")
    threshold = numeric_check_conf.get("value", 0)
    method = numeric_check_conf.get("method", "mean").lower()

    if op_str not in ops:
        print(f"***  Operador {op_str!r} no soportado. ***")
        return df

    op_func = ops[op_str]

    for col in fields:
        if col not in df.columns:
            print(f"*** Columna {col!r} no existe en el DataFrame. ***")
            continue

        mask = op_func(df[col], threshold)
        count = mask.sum()

        if count == 0:
            print(f"*** Columna {col!r}: ningún valor cumple la condición '{col} {op_str} {threshold}' — no se reemplaza nada. ***")
            continue

        if method == "mean":
            replacement = df.loc[~mask, col].mean()
        elif method == "median":
            replacement = df.loc[~mask, col].median()
        elif method == "mode":
            mode = df.loc[~mask, col].mode()
            replacement = mode[0] if not mode.empty else None
        else:
            print(f"*** Método {method!r} no soportado. Se omite columna {col!r}. ***")
            continue

        print(f"*** Columna {col!r}: {count} valores reemplazados con {method} ({replacement:.2f}) usando condición '{col} {op_str} {threshold}'***")
        df.loc[mask, col] = replacement

    return df


def clean_pandas_df(
    df: pd.DataFrame,
    fields_name_params: dict,
    values_params: dict,
) -> pd.DataFrame:
    """
    Pipeline:
      1) Renombra columnas (fields_name_params['case'])
      2) Convierte fechas (fields_name_params['datetime_fields'])
      3) Rellena nulos (values_params['change_nulls_maps'])
      4) Normaliza texto (values_params['case'])
    """
    # 1) Renombrar columnas
    case_names = fields_name_params.get("case", "lower")
    df2 = clean_field_names_pandas(df, case_names)

    # 2) Convertir datetime
    datetime_fields = fields_name_params.get("datetime_fields", {})
    if datetime_fields:
        df2 = clean_datetime_columns_pandas(df2, datetime_fields)

    # 3) Rellenar nulos
    null_fill_map = values_params.get("change_nulls_maps", {})
    if null_fill_map:
        df2 = fill_nulls(df2, null_fill_map)

    # 4) Normalizar texto
    case_values = values_params.get("case", "lower")
    df2 = clean_text_values_pandas(df2, case_values)

    # 5) Comprobar duplicados
    check_duplicates(df2)

    #6) Validación datos numéricos
    numeric_check_params = values_params.get("numeric_checks", {})
    if numeric_check_params:
        df2 = replace_invalid_numeric_values(df2, numeric_check_params)

    return df2




In [94]:
df = pd.read_csv('./../datasets/olist_order_reviews_dataset.csv')

fields_name_params = {
    "case": "lower",
    "datetime_fields": {
        "review_creation_date": "%Y-%m-%d %H:%M:%S",
        "review_answer_timestamp": "%Y-%m-%d %H:%M:%S"
    }
}

values_params = {
    "case": "lower",
    "change_nulls_maps": {
        "review_comment_title": "no title",
        "review_comment_message": "no comment"
    }
}

df_clean = clean_pandas_df(df, fields_name_params, values_params)

print("=== Después de limpiar ===")
print(df_clean.info())
print("Nulls restantes:\n", df_clean.isna().sum())
df_clean

*** Columna 'review_creation_date': convertida con formato %Y-%m-%d %H:%M:%S ***
*** Columna 'review_answer_timestamp': convertida con formato %Y-%m-%d %H:%M:%S ***
*** Columna 'review_comment_title' con 87656 nulls → rellenado con 'no title' (0 nulls actualmente) *** 
*** Columna 'review_comment_message' con 58247 nulls → rellenado con 'no comment' (0 nulls actualmente) *** 
*** No hay duplicados encontrados ***
=== Después de limpiar ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99224 entries, 0 to 99223
Data columns (total 7 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   review_id                99224 non-null  object        
 1   order_id                 99224 non-null  object        
 2   review_score             99224 non-null  int64         
 3   review_comment_title     99224 non-null  object        
 4   review_comment_message   99224 non-null  object        
 5   review_creation_

Unnamed: 0,review_id,order_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp
0,7bc2406110b926393aa56f80a40eba40,73fc7af87114b39712e6da79b0a377eb,4,no title,no comment,2018-01-18,2018-01-18 21:46:59
1,80e641a11e56f04c1ad469d5645fdfde,a548910a1c6147796b98fdf73dbeba33,5,no title,no comment,2018-03-10,2018-03-11 03:05:13
2,228ce5500dc1d8e020d8d1322874b6f0,f9e4b658b201a9f2ecdecbb34bed034b,5,no title,no comment,2018-02-17,2018-02-18 14:36:24
3,e64fb393e7b32834bb789ff8bb30750e,658677c97b385a9be170737859d3511b,5,no title,recebi bem antes do prazo estipulado.,2017-04-21,2017-04-21 22:02:06
4,f7c4243c7fe1938f181bec41a392bdeb,8e6bfb81e283fa7e4f11123a3fb894f1,5,no title,parabens lojas lannister adorei comprar pela i...,2018-03-01,2018-03-02 10:26:53
...,...,...,...,...,...,...,...
99219,574ed12dd733e5fa530cfd4bbf39d7c9,2a8c23fee101d4d5662fa670396eb8da,5,no title,no comment,2018-07-07,2018-07-14 17:18:30
99220,f3897127253a9592a73be9bdfdf4ed7a,22ec9f0669f784db00fa86d035cf8602,5,no title,no comment,2017-12-09,2017-12-11 20:06:42
99221,b3de70c89b1510c4cd3d0649fd302472,55d4004744368f5571d1f590031933e4,5,no title,"excelente mochila, entrega super rapida. super...",2018-03-22,2018-03-23 09:10:43
99222,1adeb9d84d72fe4e337617733eb85149,7725825d039fc1f0ceb7635e3f7d9206,4,no title,no comment,2018-07-01,2018-07-02 12:59:13


In [95]:
#Guardo el dataset limpio y normalizado
df_clean.to_csv("./output/olist_order_reviews_dataset_clean.csv", index=False)


In [96]:
#Hago lo mismo para el dataset de payments
df = pd.read_csv('./../datasets/olist_order_payments_dataset.csv')
print("Nulls:", df.isna().sum())

df.info()

fields_name_params = {
    "case": "lower",
}

values_params = {
    "case": "lower",
    "numeric_checks": {
        "fields_name": ["payment_sequential", "payment_installments", "payment_value"],  # columnas a validar
        "op": "<",
        "value": 0,
        "method": "median"  # puede ser "mean", "median", o "mode"
    }
}

df_clean = clean_pandas_df(df, fields_name_params, values_params)

print("=== Después de limpiar ===")
print(df_clean.info())
print("Nulls restantes:\n", df_clean.isna().sum())

# Guardo el dataset limpio y normalizado
df_clean.to_csv("./output/olist_order_payments_dataset_clean.csv", index=False)

df_clean


Nulls: order_id                0
payment_sequential      0
payment_type            0
payment_installments    0
payment_value           0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103886 entries, 0 to 103885
Data columns (total 5 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   order_id              103886 non-null  object 
 1   payment_sequential    103886 non-null  int64  
 2   payment_type          103886 non-null  object 
 3   payment_installments  103886 non-null  int64  
 4   payment_value         103886 non-null  float64
dtypes: float64(1), int64(2), object(2)
memory usage: 4.0+ MB
*** No hay duplicados encontrados ***
*** Columna 'payment_sequential': ningún valor cumple la condición 'payment_sequential < 0' — no se reemplaza nada. ***
*** Columna 'payment_installments': ningún valor cumple la condición 'payment_installments < 0' — no se reemplaza nada. ***
*** Columna 'payment_value': nin

Unnamed: 0,order_id,payment_sequential,payment_type,payment_installments,payment_value
0,b81ef226f3fe1789b1e8b2acac839d17,1,credit_card,8,99.33
1,a9810da82917af2d9aefd1278f1dcfa0,1,credit_card,1,24.39
2,25e8ea4e93396b6fa0d3dd708e76c1bd,1,credit_card,1,65.71
3,ba78997921bbcdc1373bb41e913ab953,1,credit_card,8,107.78
4,42fdf880ba16b47b59251dd489d4441a,1,credit_card,2,128.45
...,...,...,...,...,...
103881,0406037ad97740d563a178ecc7a2075c,1,boleto,1,363.31
103882,7b905861d7c825891d6347454ea7863f,1,credit_card,2,96.80
103883,32609bbb3dd69b3c066a6860554a77bf,1,credit_card,1,47.77
103884,b8b61059626efa996a60be9bb9320e10,1,credit_card,5,369.54


In [97]:
datetimes_params = {
    "filters":{
        "field_name_filter": "deliveried",
    },
    "field_start": "field_name1",
    "field_end": "field_name2",
    "fill_null": "mean", #puede ser también la mediana
}

In [98]:
def handle_datetime_diff_with_null_imputation(df: pd.DataFrame, params: dict) -> pd.DataFrame:
    df = df.copy()
    
    start_col = params["field_start"]
    end_col = params["field_end"]
    
    # Asegurar tipos datetime
    df[start_col] = pd.to_datetime(df[start_col], errors="coerce")
    df[end_col] = pd.to_datetime(df[end_col], errors="coerce")
    
    # Calcular diferencia en días para registros válidos
    mask_valid = df[start_col].notna() & df[end_col].notna()
    df["__days_diff__"] = (df[end_col] - df[start_col]).dt.days
    valid_diffs = df.loc[mask_valid, "__days_diff__"]

    # Calcular métrica
    method = params.get("fill_null", {}).get("diff", "mean")
    if method == "mean":
        diff_value = valid_diffs.mean()
    elif method == "median":
        diff_value = valid_diffs.median()
    else:
        raise ValueError("Método no soportado para diff: usa 'mean' o 'median'")
    
    print(f"📐 Diferencia en días calculada por {method}: {diff_value:.2f}")

    # Filtros para aplicar imputación de nulos
    filter_conditions = params.get("fill_null", {}).get("filters", {})
    if filter_conditions:
        filter_mask = pd.Series(True, index=df.index)
        for col, val in filter_conditions.items():
            filter_mask &= df[col] == val
    else:
        filter_mask = pd.Series(True, index=df.index)

    # Relleno de nulos
    mask_start_na = df[start_col].isna() & df[end_col].notna() & filter_mask
    df.loc[mask_start_na, start_col] = df.loc[mask_start_na, end_col] - pd.to_timedelta(diff_value, unit='D')
    print(f"🛠️ {mask_start_na.sum()} registros con {start_col} nulo rellenados usando {end_col} - {diff_value:.2f} días")

    mask_end_na = df[end_col].isna() & df[start_col].notna() & filter_mask
    df.loc[mask_end_na, end_col] = df.loc[mask_end_na, start_col] + pd.to_timedelta(diff_value, unit='D')
    print(f"🛠️ {mask_end_na.sum()} registros con {end_col} nulo rellenados usando {start_col} + {diff_value:.2f} días")

    # Crear columna de diferencia si se solicita
    if params.get("create_diff_column"):
        diff_col = f"days_diff_{start_col}:{end_col}"
        df[diff_col] = (df[end_col] - df[start_col]).dt.days
        print(f"📄 Columna creada: {diff_col}")

    df.drop(columns="__days_diff__", inplace=True)
    return df


In [99]:


df = pd.read_csv('./../datasets/olist_orders_dataset.csv')
print(df.head(5))
df.info()




                           order_id                       customer_id  \
0  e481f51cbdc54678b7cc49136f2d6af7  9ef432eb6251297304e76186b10a928d   
1  53cdb2fc8bc7dce0b6741e2150273451  b0830fb4747a6c6d20dea0b8c802d7ef   
2  47770eb9100c2d0c44946d9cf07ec65d  41ce2a54c0b03bf3443c3d931a367089   
3  949d5b44dbf5de918fe9c16f97b45f8a  f88197465ea7920adcdbec7375364d82   
4  ad21c59c0840e6cb83a9ceb5573f8159  8ab97904e6daea8866dbdbc4fb7aad2c   

  order_status order_purchase_timestamp    order_approved_at  \
0    delivered      2017-10-02 10:56:33  2017-10-02 11:07:15   
1    delivered      2018-07-24 20:41:37  2018-07-26 03:24:27   
2    delivered      2018-08-08 08:38:49  2018-08-08 08:55:23   
3    delivered      2017-11-18 19:28:06  2017-11-18 19:45:59   
4    delivered      2018-02-13 21:18:39  2018-02-13 22:20:29   

  order_delivered_carrier_date order_delivered_customer_date  \
0          2017-10-04 19:55:00           2017-10-10 21:25:13   
1          2018-07-26 14:31:00           2018-08

In [100]:

fields_name_params = {
    "case": "lower",
    "datetime_fields": {
        "order_purchase_timestamp": "%Y-%m-%d %H:%M:%S",
        "order_approved_at": "%Y-%m-%d %H:%M:%S",
        "order_delivered_carrier_date": "%Y-%m-%d %H:%M:%S",
        "order_delivered_customer_date": "%Y-%m-%d %H:%M:%S",
        "order_estimated_delivery_date": "%Y-%m-%d %H:%M:%S",
    }
}

values_params = {
    "case": "lower",
}

df_clean = clean_pandas_df(df, fields_name_params, values_params)

*** Columna 'order_purchase_timestamp': convertida con formato %Y-%m-%d %H:%M:%S ***
*** Columna 'order_approved_at': convertida con formato %Y-%m-%d %H:%M:%S ***
*** Columna 'order_delivered_carrier_date': convertida con formato %Y-%m-%d %H:%M:%S ***
*** Columna 'order_delivered_customer_date': convertida con formato %Y-%m-%d %H:%M:%S ***
*** Columna 'order_estimated_delivery_date': convertida con formato %Y-%m-%d %H:%M:%S ***
*** No hay duplicados encontrados ***


In [102]:
a =df[(df['order_status'] == 'delivered') & (df['order_delivered_carrier_date'].isna())]
a

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
73222,2aa91108853cecb43c84a5dc5b277475,afeb16c7f46396c0ed54acb45ccaaa40,delivered,2017-09-29 08:52:58,2017-09-29 09:07:16,,2017-11-20 19:44:47,2017-11-14 00:00:00
92643,2d858f451373b04fb5c984a1cc2defaf,e08caf668d499a6d643dafd7c5cc498a,delivered,2017-05-25 23:22:43,2017-05-25 23:30:16,,,2017-06-23 00:00:00


In [126]:
print(df[(df['order_purchase_timestamp'].isna())].shape)

print(df[(df['order_status'] != 'canceled') & (df['order_approved_at'].isna())].shape)
df[(df['order_status'] != 'canceled') & (df['order_status'] != 'created') & (df['order_approved_at'].isna())]



(0, 8)
(19, 8)


Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
5323,e04abd8149ef81b95221e88f6ed9ab6a,2127dc6603ac33544953ef05ec155771,delivered,2017-02-18 14:40:00,,2017-02-23 12:04:47,2017-03-01 13:25:33,2017-03-17 00:00:00
16567,8a9adc69528e1001fc68dd0aaebbb54a,4c1ccc74e00993733742a3c786dc3c1f,delivered,2017-02-18 12:45:31,,2017-02-23 09:01:52,2017-03-02 10:05:06,2017-03-21 00:00:00
19031,7013bcfc1c97fe719a7b5e05e61c12db,2941af76d38100e0f8740a374f1a5dc3,delivered,2017-02-18 13:29:47,,2017-02-22 16:25:25,2017-03-01 08:07:38,2017-03-17 00:00:00
22663,5cf925b116421afa85ee25e99b4c34fb,29c35fc91fc13fb5073c8f30505d860d,delivered,2017-02-18 16:48:35,,2017-02-22 11:23:10,2017-03-09 07:28:47,2017-03-31 00:00:00
23156,12a95a3c06dbaec84bcfb0e2da5d228a,1e101e0daffaddce8159d25a8e53f2b2,delivered,2017-02-17 13:05:55,,2017-02-22 11:23:11,2017-03-02 11:09:19,2017-03-20 00:00:00
26800,c1d4211b3dae76144deccd6c74144a88,684cb238dc5b5d6366244e0e0776b450,delivered,2017-01-19 12:48:08,,2017-01-25 14:56:50,2017-01-30 18:16:01,2017-03-01 00:00:00
38290,d69e5d356402adc8cf17e08b5033acfb,68d081753ad4fe22fc4d410a9eb1ca01,delivered,2017-02-19 01:28:47,,2017-02-23 03:11:48,2017-03-02 03:41:58,2017-03-27 00:00:00
39334,d77031d6a3c8a52f019764e68f211c69,0bf35cac6cc7327065da879e2d90fae8,delivered,2017-02-18 11:04:19,,2017-02-23 07:23:36,2017-03-02 16:15:23,2017-03-22 00:00:00
48401,7002a78c79c519ac54022d4f8a65e6e8,d5de688c321096d15508faae67a27051,delivered,2017-01-19 22:26:59,,2017-01-27 11:08:05,2017-02-06 14:22:19,2017-03-16 00:00:00
61743,2eecb0d85f281280f79fa00f9cec1a95,a3d3c38e58b9d2dfb9207cab690b6310,delivered,2017-02-17 17:21:55,,2017-02-22 11:42:51,2017-03-03 12:16:03,2017-03-20 00:00:00


In [139]:
a = df[
    (df['order_status'] != 'canceled')  & 
    (df['order_status'] != 'created')  & 
    (df['order_status'] != 'invoiced')  & 
    (df['order_status'] != 'invoicedprocessing')  & 
    (df['order_status'] != 'unavailable')  & 
    (df['order_status'] != 'processing')  & 
    (df['order_delivered_carrier_date'].isna())]

a['order_status'].value_counts()



order_status
approved     2
delivered    2
Name: count, dtype: int64

In [140]:
a = df[
    (df['order_delivered_customer_date'].isna())]

a['order_status'].value_counts()


order_status
shipped        1107
canceled        619
unavailable     609
invoiced        314
processing      301
delivered         8
created           5
approved          2
Name: count, dtype: int64

In [101]:
datetimes_params = {

    "field_start": "review_creation_date",
    "field_end": "review_answer_timestamp",
    "fill_null":  {
        "diff": "mean",
        "filters":{
            "field_name_filter": "deliveried",
            "field_name_filter2": "3",
        },
    },
    "create_diff_column": True 
}

df_clean = handle_datetime_diff_with_null_imputation(df_clean, datetimes_params)


KeyError: 'review_creation_date'