In [1]:
import pandas as pd
import os

# ---------------------------
# Paths
# ---------------------------
PROCESSED_PATH = r"C:\Users\User\OneDrive\Documents\data engineering assignment\processed"
WAREHOUSE_PATH = r"C:\Users\User\OneDrive\Documents\data engineering assignment\warehouse"

os.makedirs(WAREHOUSE_PATH, exist_ok=True)

# ---------------------------
# Load staging data
# ---------------------------
def load_staging_data():
    users = pd.read_csv(f"{PROCESSED_PATH}/users_staging.csv")
    orders = pd.read_csv(f"{PROCESSED_PATH}/orders_staging.csv")
    return users, orders

# ---------------------------
# Clean users data
# ---------------------------
def clean_users(users):
    # Trim string columns
    users['name'] = users['name'].str.strip()
    users['email'] = users['email'].str.lower().str.strip()
    users['city'] = users['city'].str.strip()

    # Parse signup_date
    users['signup_date'] = pd.to_datetime(
        users['signup_date'],
        errors='coerce'
    )

    # Remove users with null user_id
    users = users.dropna(subset=['user_id'])

    return users

# ---------------------------
# Clean orders data
# ---------------------------
def clean_orders(orders):
    # Parse order_date
    orders['order_date'] = pd.to_datetime(
        orders['order_date'],
        errors='coerce'
    )

    # Remove invalid prices and null user_id
    orders = orders[
        (orders['price'] > 0) &
        (orders['user_id'].notna())
    ]

    # Add order_month
    orders['order_month'] = orders['order_date'].dt.to_period('M').astype(str)

    return orders

# ---------------------------
# Transformations
# ---------------------------
def apply_transformations(users, orders):
    # Account age in days
    users['account_age_days'] = (
        pd.Timestamp.today() - users['signup_date']
    ).dt.days

    # Lifetime Value (LTV)
    ltv = (
        orders
        .groupby('user_id')['price']
        .sum()
        .reset_index()
        .rename(columns={'price': 'ltv'})
    )

    users = users.merge(ltv, on='user_id', how='left')
    users['ltv'] = users['ltv'].fillna(0)

    return users, orders

# ---------------------------
# Save outputs
# ---------------------------
def save_outputs(users, orders):
    users.to_csv(f"{WAREHOUSE_PATH}/users_clean.csv", index=False)
    orders.to_csv(f"{WAREHOUSE_PATH}/orders_clean.csv", index=False)

# ---------------------------
# Main execution
# ---------------------------
if __name__ == "__main__":
    print("Starting data cleaning & transformations...")

    users_df, orders_df = load_staging_data()

    users_clean = clean_users(users_df)
    orders_clean = clean_orders(orders_df)

    users_final, orders_final = apply_transformations(
        users_clean,
        orders_clean
    )

    save_outputs(users_final, orders_final)

    print("Data cleaning & transformation completed successfully.")


Starting data cleaning & transformations...
Data cleaning & transformation completed successfully.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  orders['order_month'] = orders['order_date'].dt.to_period('M').astype(str)
