In [None]:
import commons
import pandas as pd
import functions as func

# Notebook display settings: never truncate columns, cap printed rows at 40
pd.options.display.max_columns = None
pd.options.display.max_rows = 40

# User-tunable settings
SAMPLE_ROWS = 1000  # how many leading rows each sample dataset keeps

In [None]:
# Load the three raw source tables from the CSV paths defined in `commons`
products = pd.read_csv(commons.PRODUCTS_CSV_PATH)
users = pd.read_csv(commons.USERS_CSV_PATH)
transactions = pd.read_csv(commons.TRANSACTIONS_CSV_PATH)

# Save the first SAMPLE_ROWS rows of each table as a small CSV for quick review
for raw_table, sample_path in (
    (products, commons.PRODUCTS_SAMPLE_CSV_PATH),
    (transactions, commons.TRANSACTIONS_SAMPLE_CSV_PATH),
    (users, commons.USERS_SAMPLE_CSV_PATH),
):
    raw_table.head(SAMPLE_ROWS).to_csv(sample_path, index=False)

# Data Cleaning
## Transactions Table
Standardizes data types and handles missing values in the transactions DataFrame:
1. Converts `BARCODE` to pandas' nullable `Int64` dtype so rows with a missing barcode are kept as missing values
2. Converts date fields to pandas datetime using `standardize_datetime()`:
    - `SCAN_DATE`: "%Y-%m-%d %H:%M:%S.%f Z" format
    - `PURCHASE_DATE`: "%Y-%m-%d" format  
3. Fixes `FINAL_QUANTITY`:
    - Replaces the literal string "zero" with "0" so it can be parsed as a number
    - Converts to numeric, coercing errors to NaN
4. Fixes `FINAL_SALE`:
    - Converts to numeric, coercing errors to NaN


In [None]:
# BARCODE: cast to the nullable integer dtype
transactions["BARCODE"] = transactions["BARCODE"].astype(pd.Int64Dtype())

# Date columns: parse each one with the format string its raw values use
for date_column, date_format in (
    ("SCAN_DATE", "%Y-%m-%d %H:%M:%S.%f Z"),
    ("PURCHASE_DATE", "%Y-%m-%d"),
):
    transactions[date_column] = func.standardize_datetime(
        transactions, date_column, date_format
    )

# FINAL_QUANTITY: swap the literal "zero" for "0", then parse as numbers;
# anything that still cannot be parsed becomes NaN
transactions["FINAL_QUANTITY"] = pd.to_numeric(
    transactions["FINAL_QUANTITY"].replace("zero", "0"),
    errors="coerce",
)

# FINAL_SALE: parse as numbers; anything that cannot be parsed becomes NaN
transactions["FINAL_SALE"] = transactions["FINAL_SALE"].pipe(
    pd.to_numeric, errors="coerce"
)

## Users Table
Converts user date fields to pandas datetime using `standardize_datetime()`:
  - `BIRTH_DATE`: "%Y-%m-%d %H:%M:%S.%f Z" format
  - `CREATED_DATE`: "%Y-%m-%d %H:%M:%S.%f Z" format

In [None]:
# Both user date columns share one timestamp layout, so parse them in a loop
for date_column in ("BIRTH_DATE", "CREATED_DATE"):
    users[date_column] = func.standardize_datetime(
        users, date_column, "%Y-%m-%d %H:%M:%S.%f Z"
    )

## Products Table
1. Converts `BARCODE` to pandas' nullable `Int64` dtype so rows with a missing barcode are kept as missing values


In [None]:
# BARCODE: cast to the nullable integer dtype
products["BARCODE"] = products["BARCODE"].astype(pd.Int64Dtype())

# Writing DataFrames

We use Parquet files (.parquet) because they:
- Preserve pandas data types accurately
- Provide efficient compression
- Enable fast read/write operations
- Use column-oriented storage for optimized querying

In [None]:
# Persist each cleaned table to its Parquet path defined in `commons`
for clean_table, parquet_path in (
    (users, commons.USERS_CLEAN_PARQUET_PATH),
    (transactions, commons.TRANSACTIONS_CLEAN_PARQUET_PATH),
    (products, commons.PRODUCTS_CLEAN_PARQUET_PATH),
):
    clean_table.to_parquet(parquet_path)