In [16]:
import polars as pl
import pandas as pd
from unidecode import unidecode
import re

In [17]:

def clean_field_names_pandas(df: pd.DataFrame, case: str = "lower") -> pd.DataFrame:
    """
    - strip() + collapse múltiples espacios
    - sustituye espacios por '_'
    - quita tildes (NFKD → ascii)
    - pone en upper o lower según `case`
    """
    cols = (
        df.columns
          .str.strip()                                       # quita espacios al inicio/fin
          .str.replace(r"\s+", " ", regex=True)              # colapsa espacios múltiples
          .str.replace(" ", "_")                             # espacios → underscore
          .str.normalize("NFKD")                             # descompone acentos
          .str.encode("ascii", "ignore").str.decode("ascii") # elimina diacríticos
    )
    if case.lower() == "upper":
        cols = cols.str.upper()
    else:
        cols = cols.str.lower()
    df = df.copy()
    df.columns = cols
    return df

def clean_text_values_pandas(df: pd.DataFrame, case: str = "lower") -> pd.DataFrame:
    """
    Aplica a todas las columnas de tipo object (strings):
    - strip() + collapse espacios
    - quita tildes
    - pone en upper o lower según `case`
    """
    df = df.copy()
    for col in df.select_dtypes(include="object").columns:
        ser = df[col].astype("string")  # trabajar con StringDtype para no perder nulos
        ser = (
            ser.str.strip()                                     # quita espacios al inicio/fin
               .str.replace(r"\s+", " ", regex=True)            # colapsa espacios múltiples
               .str.normalize("NFKD")                           # descompone acentos
               .str.encode("ascii", "ignore").str.decode("ascii") # elimina diacríticos
        )
        if case.lower() == "upper":
            ser = ser.str.upper()
        else:
            ser = ser.str.lower()
        df[col] = ser
    return df

def clean_pandas_df(
    df: pd.DataFrame,
    fields_name_params: dict,
    values_params: dict
) -> pd.DataFrame:
    """
    Orquesta la limpieza:
      1) Limpia nombres de columnas según fields_name_params['case']
      2) Limpia valores de texto según values_params['case']
    """
    df2 = clean_field_names_pandas(df, fields_name_params.get("case", "lower"))
    df2 = clean_text_values_pandas(df2, values_params.get("case", "lower"))
    return df2

# ----------------------------
# Ejemplo de uso
# ----------------------------
if __name__ == "__main__":
    df = pd.DataFrame({
        " review id ": [1, 2],
        "Nombre Cliente": ["José   Pérez", "María López"],
        " Comentario final": ["¡Muy bien! ", None],
        "score": [5, 4]
    })

    fields_name_params = {"case": "lower"}   # 'lower' o 'upper' para nombres
    values_params     = {"case": "upper"}   # 'lower' o 'upper' para valores

    df_clean = clean_pandas_df(df, fields_name_params, values_params)
    print(df_clean)
    print(df_clean.dtypes)


   review_id nombre_cliente comentario_final  score
0          1     JOSE PEREZ        MUY BIEN!      5
1          2    MARIA LOPEZ             <NA>      4
review_id            int64
nombre_cliente      object
comentario_final    object
score                int64
dtype: object


In [18]:
import pandas as pd
from pandas.api import types as pdt

def clean_field_names_pandas(df: pd.DataFrame, case: str = "lower") -> pd.DataFrame:
    cols = (
        df.columns
          .str.strip()
          .str.replace(r"\s+", " ", regex=True)
          .str.replace(" ", "_")
          .str.normalize("NFKD")
          .str.encode("ascii", "ignore").str.decode("ascii")
    )
    cols = cols.str.upper() if case.lower()=="upper" else cols.str.lower()
    df = df.copy()
    df.columns = cols
    return df

def _clean_str_series(ser: pd.Series, case: str) -> pd.Series:
    s = ser.astype("string")
    s = (
        s.str.strip()
         .str.replace(r"\s+", " ", regex=True)
         .str.normalize("NFKD")
         .str.encode("ascii", "ignore").str.decode("ascii")
    )
    s = s.str.upper() if case.lower()=="upper" else s.str.lower()
    return s

def clean_text_values_pandas(df: pd.DataFrame, case: str = "lower") -> pd.DataFrame:
    df = df.copy()
    for col in df.columns:
        dtype = df[col].dtype

        if pdt.is_categorical_dtype(dtype):
            ser = df[col]
            cats = pd.Series(ser.cat.categories, dtype="string")
            cats_clean = _clean_str_series(cats, case)
            ser_new = ser.cat.set_categories(cats_clean).astype("string")
            ser_new = _clean_str_series(ser_new, case).astype("category")

        elif pdt.is_object_dtype(dtype) or pdt.is_string_dtype(dtype):
            ser_new = _clean_str_series(df[col], case)

        else:
            continue

        df[col] = ser_new
    return df

def clean_datetime_columns_pandas(
    df: pd.DataFrame,
    datetime_params: dict = None,
    infer: bool = True
) -> pd.DataFrame:
    df = df.copy()
    datetime_params = datetime_params or {}
    for col in df.columns:
        if pdt.is_datetime64_any_dtype(df[col].dtype):
            continue
        if col in datetime_params:
            fmt = datetime_params[col]
            df[col] = pd.to_datetime(df[col], format=fmt, errors="coerce")
            continue
        if infer and (pdt.is_object_dtype(df[col].dtype) or pdt.is_string_dtype(df[col].dtype)):
            sample = df[col].dropna().astype(str).head(5)
            parsed = pd.to_datetime(sample, errors="coerce", infer_datetime_format=True)
            if parsed.notna().any():
                df[col] = pd.to_datetime(df[col], errors="coerce", infer_datetime_format=True)
    return df

def fill_nulls(df: pd.DataFrame, fill_map: dict) -> pd.DataFrame:
    """
    fill_map: { column_name: fill_value, ... }
    """
    df = df.copy()
    for col, val in fill_map.items():
        if col in df.columns:
            df[col] = df[col].fillna(val)
            print(f"🔢 Columna {col!r}: filled {df[col].isna().sum()} → 0 nulls with {val!r}")
    return df

def clean_pandas_df(
    df: pd.DataFrame,
    fields_name_params: dict,
    values_params: dict,
    datetime_params: dict = None,
    infer_datetime: bool = True,
    null_fill_map: dict = None
) -> pd.DataFrame:
    """
    1) Renombra columnas
    2) Convierte datetime
    3) Rellena nulls
    4) Limpia texto
    """
    print("=== INICIO LIMPIEZA DataFrame ===")
    df2 = clean_field_names_pandas(df, fields_name_params.get("case", "lower"))

    print("→ Parseando datetime…")
    df2 = clean_datetime_columns_pandas(df2, datetime_params, infer=infer_datetime)

    if null_fill_map:
        print("→ Rellenando nulls…")
        df2 = fill_nulls(df2, null_fill_map)

    print("→ Limpiando texto…")
    df2 = clean_text_values_pandas(df2, values_params.get("case", "lower"))
    print("=== FIN LIMPIEZA DataFrame ===")
    return df2

# ----------------------------
# Ejemplo de uso
# ----------------------------
# if __name__ == "__main__":
#     # df ya cargado...
#     fields_name_params = {"case": "lower"}
#     values_params     = {"case": "lower"}
#     datetime_params   = {
#         "review_creation_date": "%Y-%m-%d %H:%M:%S",
#         "review_answer_timestamp": "%Y-%m-%d %H:%M:%S"
#     }
#     null_fill_map = {
#         "review_comment_title": "no title",
#         "review_comment_message": "no comment"
#     }

#     df_clean = clean_pandas_df(
#         df,
#         fields_name_params,
#         values_params,
#         datetime_params=datetime_params,
#         infer_datetime=False,
#         null_fill_map=null_fill_map
#     )

#     print("\n=== Después de limpiar ===")
#     print(df_clean.info())
#     print("Nulls restantes:\n", df_clean.isna().sum())


In [19]:


df = pd.read_csv('./../datasets/olist_order_reviews_dataset.csv',
                #  sep=',',              # separador
                #  encoding='utf-8',      # codificación
                #  parse_dates=['fecha'], # columnas a convertir a datetime
                #  na_values=['NA','?'],  # cómo interpretar nulos
                #  chunksize=100000       # leer por partes si es muy grande
                )

# ----------------------------
# Ejemplo de uso
# ----------------------------
print(df.info())

df_nulls = df.isnull().sum()
print(f"Nulls \n {df_nulls[df_nulls > 0]}")

fields_name_params = {"case": "lower"}
values_params     = {"case": "lower"}
# indicamos formatos distintos por columna
datetime_params   = {
    "review_creation_date": "%Y-%m-%d %H:%M:%S",
    "review_answer_timestamp": "%Y-%m-%d %H:%M:%S"
}
null_fill_map = {
    "review_comment_title": "no title",
    "review_comment_message": "no comment"
}

df_clean = clean_pandas_df(
    df,
    fields_name_params,
    values_params,
    datetime_params=datetime_params,
    infer_datetime=False,
    null_fill_map=null_fill_map
)

print("\n=== Después de limpiar ===")
print(df_clean.info())
print("Nulls restantes:\n", df_clean.isna().sum())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99224 entries, 0 to 99223
Data columns (total 7 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   review_id                99224 non-null  object
 1   order_id                 99224 non-null  object
 2   review_score             99224 non-null  int64 
 3   review_comment_title     11568 non-null  object
 4   review_comment_message   40977 non-null  object
 5   review_creation_date     99224 non-null  object
 6   review_answer_timestamp  99224 non-null  object
dtypes: int64(1), object(6)
memory usage: 5.3+ MB
None
Nulls 
 review_comment_title      87656
review_comment_message    58247
dtype: int64
=== INICIO LIMPIEZA DataFrame ===
→ Parseando datetime…
→ Rellenando nulls…
🔢 Columna 'review_comment_title': filled 0 → 0 nulls with 'no title'
🔢 Columna 'review_comment_message': filled 0 → 0 nulls with 'no comment'
→ Limpiando texto…


  if pdt.is_categorical_dtype(dtype):


=== FIN LIMPIEZA DataFrame ===

=== Después de limpiar ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99224 entries, 0 to 99223
Data columns (total 7 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   review_id                99224 non-null  object        
 1   order_id                 99224 non-null  object        
 2   review_score             99224 non-null  int64         
 3   review_comment_title     99224 non-null  object        
 4   review_comment_message   99224 non-null  object        
 5   review_creation_date     99224 non-null  datetime64[ns]
 6   review_answer_timestamp  99224 non-null  datetime64[ns]
dtypes: datetime64[ns](2), int64(1), object(4)
memory usage: 5.3+ MB
None
Nulls restantes:
 review_id                  0
order_id                   0
review_score               0
review_comment_title       0
review_comment_message     0
review_creation_date       0
review_answer_time