In [1]:
import pandas as pd
import os

# --- PART 1: LOAD DATA ---
print("1. Loading Data...")
# sep=None asks pandas to sniff the delimiter, which requires the 'python'
# engine; all three raw files are decoded as latin1.
read_options = {'encoding': 'latin1', 'sep': None, 'engine': 'python'}

# Read 'Postal Code' as text so codes with leading zeros (e.g. 05408) are not
# inferred as numbers and shortened before the cleaning step ever sees them.
# Missing values still load as NaN and are handled during cleaning.
df_orders = pd.read_csv('../data/raw_orders.csv', dtype={'Postal Code': str}, **read_options)
df_returns = pd.read_csv('../data/raw_returns.csv', **read_options)
df_people = pd.read_csv('../data/raw_people.csv', **read_options)

# --- PART 2: CLEANING (The Fixes) ---
print("2. Cleaning Orders...")
df_orders_clean = df_orders.copy()

# Fix Postal Code: missing values become the '00000' placeholder, and codes
# that were parsed as floats (e.g. 10024.0) lose their '.0' suffix.
# The pattern is anchored to the end of the string so that a '.0' appearing
# anywhere else inside a code is left untouched.
df_orders_clean['Postal Code'] = (
    df_orders_clean['Postal Code']
    .fillna('00000')
    .astype(str)
    .str.replace(r'\.0$', '', regex=True)
)

# --- NEW FIX: Standardize Dates for SQLite ---
# SQLite needs YYYY-MM-DD strings to use date functions
print("   - Fixing Date Formats...")
date_columns = ['Order Date', 'Ship Date']
for date_column in date_columns:
    # errors='coerce' turns unparseable values into NaT, which strftime then
    # renders as a missing value so the dropna below can remove the row.
    parsed_dates = pd.to_datetime(df_orders_clean[date_column], dayfirst=True, errors='coerce')
    df_orders_clean[date_column] = parsed_dates.dt.strftime('%Y-%m-%d')

# Drop any rows where dates failed to parse (bad data), and say so when it
# happens: coerced parse failures are otherwise invisible data loss.
# NOTE(review): if the raw files mix date formats, a large drop count here
# likely means the parser settled on one format - verify against the raw data.
rows_before_drop = len(df_orders_clean)
df_orders_clean = df_orders_clean.dropna(subset=date_columns)
rows_dropped = rows_before_drop - len(df_orders_clean)
if rows_dropped:
    print(f"   - WARNING: dropped {rows_dropped} of {rows_before_drop} rows with unparseable dates")

# Fix Region: Create 'Market_Group'
# Rows whose Region is one of the four listed values are tagged 'USCA';
# everything else (including a missing Region) is tagged 'Global'.
us_regions = ['East', 'West', 'Central', 'South']
df_orders_clean['Market_Group'] = (
    df_orders_clean['Region'].isin(us_regions).map({True: 'USCA', False: 'Global'})
)

# Prepare other tables (copied unchanged so the raw frames stay untouched)
df_returns_clean = df_returns.copy()
df_people_clean = df_people.copy()

# --- PART 3: SAVING ---
print("3. Saving to 'processed/' folder...")
processed_path = '../data/processed'
# Create the output folder if needed; an existing folder is not an error.
os.makedirs(processed_path, exist_ok=True)

# Write each cleaned table to its own CSV, omitting the DataFrame index.
cleaned_tables = (
    ('orders_clean.csv', df_orders_clean),
    ('returns_clean.csv', df_returns_clean),
    ('people_clean.csv', df_people_clean),
)
for csv_name, frame in cleaned_tables:
    frame.to_csv(f'{processed_path}/{csv_name}', index=False)

print(f"✅ SUCCESS: Clean data saved to {os.path.abspath(processed_path)}")

1. Loading Data...
2. Cleaning Orders...
   - Fixing Date Formats...
3. Saving to 'processed/' folder...
✅ SUCCESS: Clean data saved to c:\Projects\p2-global-sales-dashboard\data\processed
