# 01 — Data Cleaning

**Purpose:** standardize columns, clean key fields, and write staged outputs to `data/interim/`.

_Last updated: 2025-12-19_

In [None]:
from pathlib import Path
import pandas as pd

from src.config import PATHS, COLS
from src.io import read_any, write_parquet
from src.cleaning import standardize_columns, trim_strings, coerce_int, drop_duplicate_keys


## Load raw extracts

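Place your raw files in `data/raw/` (CSV or parquet). The cell below automatically picks a `*.csv` file from that folder, falling back to a `*.parquet` file if no CSV is present; if the folder holds more than one extract, edit the cell to point at the specific file you want.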

In [None]:
# Pick the raw extract to load: prefer a CSV, fall back to parquet.
# Candidates are sorted so the same file is chosen on every run when several
# extracts are present (directory listing order is not guaranteed).
candidates = sorted(PATHS.raw.glob('*.csv')) or sorted(PATHS.raw.glob('*.parquet'))
if not candidates:
    # Fail here with an actionable message rather than passing None to read_any.
    raise FileNotFoundError(f'No *.csv or *.parquet files found in {PATHS.raw}')
raw_file = candidates[0]
print('Using:', raw_file)
df = read_any(raw_file)
df.head()

## Clean + standardize

In [None]:
# Cleaning pass: standardize the columns first, then tidy the key fields.
df = standardize_columns(df)

# Example cleaning: the id-like columns (including month_code) go through
# trim_strings, and the BC column goes through coerce_int.
id_cols = [
    COLS.account_id,
    COLS.customer_id,
    COLS.month_code,
]
df = trim_strings(df, id_cols)
bc_cols = [COLS.bc]
df = coerce_int(df, bc_cols)

# Optional: if your expected grain is one row per (account_id, month_code),
# uncomment the next line to enforce it.
# df = drop_duplicate_keys(df, [COLS.account_id, COLS.month_code])

df.head()

## Save interim output

In [None]:
# Write the cleaned frame to the interim directory as a staged parquet file,
# then report where it went and how many rows it holds.
out_path = PATHS.interim / 'cleaned_base.parquet'
write_parquet(df, out_path)
n_rows = len(df)
print('Saved:', out_path, 'rows:', n_rows)