In [1]:
import json
import datetime

### *Data Preprocessing or Data Cleaning Methods* 

In [None]:
class Order:
    """
    Represents a single e-commerce order with data cleaning and validation.

    This class takes a raw order dictionary, applies cleaning/validation
    rules for all fields, and stores them as attributes. This ensures 
    consistent, reliable data for further analysis.

    Attributes:
        order_id (str or int): Unique identifier for the order.
        customer_id (str or int): Unique identifier for the customer.
        customer_name (str): Cleaned customer name, defaults to "Unknown".
        product_id (str or int): Unique identifier for the product.
        product_name (str): Cleaned product name, defaults to "Unknown".
        category (str): Product category, mapped and cleaned.
        quantity (int): Validated order quantity (always positive, defaults to 1).
        price (float): Validated product price (min threshold applied).
        discount (float): Validated discount (capped at price).
        payment_mode (str): Normalized payment method (e.g., "COD", "Card").
        status (str): Order status, defaults to "Cancelled" if invalid.
        order_date (datetime.date or None): Parsed order date, None if invalid.
    """
    def __init__(self, raw_order_dict):
        """
        Initialize the Order with raw data and perform cleaning/validation.

        Args:
            raw_order_dict (dict): Raw order data with fields like
                                   'order_id', 'customer_id', 'product_name',
                                   'category', 'quantity', 'price', 'discount',
                                   'payment_mode', 'status', 'order_date'.
        """
        self.raw_order_dict = raw_order_dict

        # Extract fields
        self.order_id = raw_order_dict.get("order_id")
        self.customer_id = raw_order_dict.get("customer_id")
        self.customer_name = self.clean_customer_name(raw_order_dict.get("customer_name"))
        self.product_id = raw_order_dict.get("product_id")
        self.product_name = self.clean_product_name(raw_order_dict.get("product_name"))
        self.category = self.clean_category(raw_order_dict.get("category"))
        self.quantity = self.validate_quantity(raw_order_dict.get("quantity"))
        self.price = self.validate_price(raw_order_dict.get("price"))
        self.discount = self.validate_discount(raw_order_dict.get("discount"), self.price)
        self.payment_mode = self.clean_payment(raw_order_dict.get("payment_mode"))
        self.status = self.validate_status(raw_order_dict.get("status"))
        self.order_date = self.validate_date(raw_order_dict.get("order_date"))

    # ---------------- Cleaning Functions ----------------
    def clean_product_name(self, name):
        """
        Clean product name.
        Returns original name if valid, otherwise "Unknown".
        """
        return name if name and name.strip() else "Unknown"
    
    def clean_customer_name(self, name):
        """
        Clean customer name.
        Returns original name if valid, otherwise "Unknown".
        """
        return name if name and name.strip() else "UnKnown"

    def validate_status(self, status):
        """
        Validate order status.
        Returns original status if valid, otherwise "Cancelled".
        """
        return status if status and status.strip() else "Cancelled"
    
    def clean_category(self, cat):
        """
        Clean category name.
        Maps short/invalid values to standard ones.
        Returns "Unknown" if category is missing.
        """

        mapping = {"Electro": "Electronics"}
        if cat is None or not str(cat).strip():
            return "Unknown"
        return mapping.get(cat, cat)

    def clean_payment(self, mode):
        """
        Clean payment mode.
        Maps incorrect modes (e.g., 'Cashh') to standard values.
        Defaults to "COD" if missing/invalid.
        """
        mapping = {"Cashh": "COD", "Cardd": "Card"}
        # Handle None, empty string, or only spaces
        if mode is None or not str(mode).strip():
            return "COD"  
        
        return mapping.get(mode, mode)

    def validate_quantity(self, qty):
        """
        Validate quantity.
        Ensures integer ≥ 0.
        Defaults to 1 if invalid.
        """

        try:
            if qty is None or not isinstance(qty, int) :
                return 1
            if qty < 0:
                return abs(qty)
            return qty
        except Exception:
            return 1
        
    def validate_price(self, price, min_price=100):
        """
        Validate price.
        Ensures non-negative and above minimum threshold.
        Defaults to min_price if invalid.
        """
        try:
            if price is None or not isinstance(price, (int, float)):
                return min_price
            if price < 0:
                return abs(price)
            if price < min_price:
                return min_price
            return price
        except Exception:
            return min_price
        

    def validate_discount(self, disc, price):
        """
        Validate discount.
        Ensures discount is non-negative and ≤ price.
        Defaults to 0 if invalid.
        """
        try:
            if disc is None or not isinstance(disc, (int, float)):
                return 0
            if disc < 0:
                return 0
            if disc > price:
                return price
            return disc
        except Exception:
            return 0

    def validate_date(self, date_str):
        """
        Validate order date.
        Returns datetime.date if valid, otherwise None.
        """
        try:
            return datetime.datetime.strptime(date_str, "%Y-%m-%d").date()
        except Exception:
            return None

In [None]:
class OrderManager:
    """
    Manage a collection of orders and return cleaned data.

    This class takes raw order dictionaries, converts them into `Order` objects,
    and provides a method to extract cleaned, standardized order data 
    for analysis.
    """
    def __init__(self, orders_data):
        """
        Initialize OrderManager with raw orders.

        Args:
            orders_data (list of dict): Raw orders containing fields like
                                        order_id, customer_name, product_name, etc.
        """
        self.orders = [Order(o) for o in orders_data]

    def get_cleaned_data(self):
        """
        Return cleaned data from all orders.

        Converts each `Order` object into a dictionary with cleaned values
        for fields such as product_name, category, price, discount, status, and date.

        Returns:
            list of dict: Cleaned order data, ready for analysis.
        """
        cleaned = []
        for order in self.orders:
            cleaned.append({
                "order_id": order.order_id,
                "customer_id": order.customer_id,
                "customer_name": order.customer_name,
                "product_id": order.product_id,
                "product_name": order.product_name,
                "category": order.category,
                "quantity": order.quantity,
                "price": order.price,
                "discount": order.discount,
                "payment_mode": order.payment_mode,
                "status": order.status,
                "order_date": str(order.order_date) if order.order_date else "Invalid"
            })
        return cleaned

### *Exploratory Data Analysis Using pure python*

In [None]:
class EDA:
    """
    Perform Exploratory Data Analysis (EDA) on cleaned e-commerce order data.

    This class takes a list of cleaned order dictionaries (from OrderManager)
    and provides methods to calculate metrics and insights such as:
        - Total revenue
        - Top customers
        - Revenue by category
        - Revenue by month
        - Most active order day

    Attributes:
        data (list of dict): Cleaned order data with fields like
                             'customer_name', 'price', 'discount', 
                             'quantity', 'status', 'category', 'order_date'.
    """

    def __init__(self, data):
        """
        Initialize the EDA class with cleaned data.

        Args:
            data (list of dict): Cleaned dataset from OrderManager.get_cleaned_data().
        """
        self.data = data

    def total_revenue(self):
        """
        Calculate total revenue from all valid orders.
        Excludes cancelled orders.
        
        Returns:
            int/float: Total revenue amount.
        """
        revenue = 0
        for order in self.data:
            if order['status'] != 'Cancelled':
                revenue += (order['price'] - order['discount']) * order['quantity']
        return revenue
    
    def top_customers(self, n=5):
        """
        Find top N customers by total spend.
        Excludes cancelled orders.
        
        Args:
            n (int): Number of top customers to return. Default is 5.
        
        Returns:
            list of tuples: [(customer_name, total_spend), ...] sorted by spend.
        """
        customer_spend = {}
        for order in self.data:
            cust = order["customer_name"]
            if order['status'] != 'Cancelled':
                spend = (order["price"] - order["discount"]) * order["quantity"]
                customer_spend[cust] = customer_spend.get(cust, 0) + spend
        return sorted(customer_spend.items(), key=lambda x: x[1], reverse=True)[:n]
    
    def revenue_by_category(self):
        """
        Calculate total revenue for each product category.
        Excludes cancelled orders.
        
        Returns:
            dict: {category: total_revenue}
        """
        category_revenue = {}
        for order in self.data:
            if order['status'] != 'Cancelled':
                cat = order["category"]
                spend = (order['price'] - order['discount']) * order['quantity']
                category_revenue[cat] = category_revenue.get(cat, 0) + spend
        return category_revenue
    
    def revenue_by_month(self):
        """
        Calculate monthly revenue from valid orders.
        Skips invalid dates and excludes cancelled orders.
        
        Returns:
            dict: {YYYY-MM: total_revenue}
        """
        monthly_revenue = {}

        for order in self.data:
            date_str = order['order_date']
            if date_str == 'Invalid':
                continue
            try:
                date = datetime.datetime.strptime(date_str, "%Y-%m-%d")
                month = date.strftime("%Y-%m")
                if order['status'] != 'Cancelled':
                    spend = (order['price'] - order['discount']) * order['quantity']
                    monthly_revenue[month] = monthly_revenue.get(month, 0) + spend
            except:
                print("Invalid date")
        return monthly_revenue
        
    def most_active_day(self):
        """
        Find the day with the highest number of orders.
        Skips invalid dates.
        
        Returns:
            tuple: (day, order_count) for the most active day.
            None if no valid data exists.
        """
        day_count = {}
        for order in self.data:
            date_str = order["order_date"]
            if date_str == "Invalid":
                continue
            try:
                date = datetime.datetime.strptime(date_str, "%Y-%m-%d")
                day = date.strftime("%Y-%m-%d")
                day_count[day] = day_count.get(day, 0) + 1
            except:
                pass
        if not day_count:
            return None
        return max(day_count.items(), key=lambda x: x[1])


### *Main menu*

In [None]:
"""
Main execution script for the E-Commerce Order Data Cleaning and Analysis project.

Steps performed:
1. Loads raw order data from `ecomm_orders_dirty.json`.
2. Cleans the data using the Order and OrderManager classes.
3. Saves the cleaned dataset into `ecomm_orders_clean.json`.
4. Provides an interactive CLI menu for EDA (exploratory data analysis), 
   including:
   - Total Revenue
   - Top 5 Customers
   - Revenue by Category
   - Revenue by Month
   - Most Active Day
   - Exit option
"""

if __name__ == "__main__":
    # Step 1: Load raw dirty data
    with open("ecomm_orders_dirty.json", "r") as f:
        raw_data = json.load(f)

    # Step 2: Clean it
    manager = OrderManager(raw_data)
    cleaned_data = manager.get_cleaned_data()

    # Step 3: Save cleaned data
    with open("ecomm_orders_clean.json", "w") as f:
        json.dump(cleaned_data, f, indent=4)

    print("✅ Clean dataset ready. Total orders:", len(cleaned_data))
    
    eda = EDA(cleaned_data)

    while True:
        print("\n📊 E-Commerce Data Analysis Menu 📊")
        print("1. Total Revenue")
        print("2. Top 5 Customers")
        print("3. Revenue by Category")
        print("4. Revenue by Month")
        print("5. Most Active Day")
        print("6. Exit")

        choice = input("Enter choice (1-6): ")

        if choice == "1":
            print(f"\n💰 Total Revenue: {eda.total_revenue()}")

        elif choice == "2":
            top5 = eda.top_customers(n=5)  # list of tuples

            print("\n👑 Top 5 Customers")
            print("-" * 30)
            print(f"{'Customer':<15}{'Spend':>10}")
            print("-" * 30)
            for cust, spend in top5:       # unpack each tuple individually
                print(f"{cust:<15}{spend:>10}")
                
        elif choice == "3":
            print("\n📂 Revenue by Category")
            print("-" * 30)
            print(f"{'Category':<15}{'Revenue':>10}")
            print("-" * 30)
            for cat, rev in eda.revenue_by_category().items():
                print(f"{cat:<15}{rev:>10}")

        elif choice == "4":
            print("\n📅 Revenue by Month")
            print("-" * 30)
            print(f"{'Month':<15}{'Revenue':>10}")
            print("-" * 30)
            for month, rev in eda.revenue_by_month().items():
                print(f"{month:<15}{rev:>10}")

        elif choice == "5":
            result = eda.most_active_day()
            if result:
                day, count = result
                print("\n🔥 Most Active Day")
                print("-" * 30)
                print(f"{'Date':<15}{'Orders':>10}")
                print("-" * 30)
                print(f"{day:<15}{count:>10}")
            else:
                print("No valid dates found.")

        elif choice == "6":
            print("✅ Exiting...")
            break

        else:
            print("❌ Invalid choice. Try again.")


✅ Clean dataset ready. Total orders: 100

📊 E-Commerce Data Analysis Menu 📊
1. Total Revenue
2. Top 5 Customers
3. Revenue by Category
4. Revenue by Month
5. Most Active Day
6. Exit



💰 Total Revenue: 100100

📊 E-Commerce Data Analysis Menu 📊
1. Total Revenue
2. Top 5 Customers
3. Revenue by Category
4. Revenue by Month
5. Most Active Day
6. Exit

👑 Top 5 Customers
------------------------------
Customer            Spend
------------------------------
John                28700
Sara                11400
Sneha               11250
Priya               10350
Ravi                10100

📊 E-Commerce Data Analysis Menu 📊
1. Total Revenue
2. Top 5 Customers
3. Revenue by Category
4. Revenue by Month
5. Most Active Day
6. Exit

📂 Revenue by Category
------------------------------
Category          Revenue
------------------------------
Home                22150
Electronics         34650
Clothing            13200
electronics         17800
Unknown              6700
Beauty               5600

📊 E-Commerce Data Analysis Menu 📊
1. Total Revenue
2. Top 5 Customers
3. Revenue by Category
4. Revenue by Month
5. Most Active Day
6. Exit

📅 Revenue by Month
----------------------------

## *📝 Project Summary: E-Commerce Order Insights*

**Objective:**  
Analyze and clean messy e-commerce order data to extract meaningful insights about revenue, customer behavior, and sales trends.

**Key Features Implemented:**

1. **Data Cleaning & Validation**
   - Handled missing or invalid values for `quantity`, `price`, `discount`, `payment_mode`, `category`, and `order_date`.
   - Used **Object-Oriented Programming (OOP)** with `Order` and `OrderManager` classes for structured and reusable code.
   - Applied error handling to manage unexpected or inconsistent data.

2. **Exploratory Data Analysis (EDA)**
   - Calculated **total revenue**.
   - Identified **top customers**.
   - Computed **revenue by category** and **revenue by month**.
   - Determined the **most active day** of orders..

3. **Learnings**
   - Structuring projects with **OOP** improves readability and maintainability.
   - Error handling is critical for real-world datasets.
   - Pure Python can handle small to medium datasets efficiently before moving to **Pandas/NumPy** for larger data.

**📈Next Steps:**
- Implement **V3** using Pandas, NumPy, and visualization libraries to scale analysis and create interactive insights.
