### Hello, Data!	

In [4]:
import pandas as pd

df = pd.read_csv("D:\College Second Program\PROG8245-Machine Learning Programming\Lab2\Lab2_PROG8245\orders.csv") 
df.head(3)  

  df = pd.read_csv("D:\College Second Program\PROG8245-Machine Learning Programming\Lab2\Lab2_PROG8245\orders.csv")


Unnamed: 0,date,customer_id,product,price,quantity,coupon_code,shipping_city
0,56:34.9,CUST1000,Headphones,725.04,1,DEAL20,Houston
1,56:34.9,CUST1001,Keyboard,65.62,2,WELCOME,Chicago
2,56:34.9,CUST1002,Keyboard,610.04,3,DEAL20,Los Angeles


### Pick the Right Container ###
To represent each transaction, we need a structure that:

Holds multiple attributes (like date, customer_id, product, etc.)

Supports methods for cleaning or calculations (e.g., .clean() or .total())

Justification:

A Python class is the best fit here because it allows bundling data (attributes) and behavior (methods) together.

A dictionary would store data but not behavior.

A namedtuple is immutable and doesn't support custom methods easily.

#### Transaction Class and OO data structure	


In [5]:
from typing import Optional

class Transaction:
    def __init__(self, date: str, customer_id: str, product: str, price: float, quantity: int, coupon_code: Optional[str], shipping_city: str):
        self.date = date
        self.customer_id = customer_id
        self.product = product
        self.price = price
        self.quantity = quantity
        self.coupon_code = coupon_code
        self.shipping_city = shipping_city
        self.discount_percent = 0.0 

    def total(self) -> float:
        return self.price * self.quantity


    def parse_discount(self) -> float:
        if self.coupon_code == "NONE":
            return 0.0
        elif self.coupon_code.startswith("DEAL"):
            try:
                return float(self.coupon_code.replace("DEAL", ""))
            except:
                return 0.0
        elif self.coupon_code == "WELCOME":
            return 10.0
        return 0.0

    def clean(self):
        self.customer_id = self.customer_id.strip().upper()
        self.shipping_city = self.shipping_city.strip().title()


        code = str(self.coupon_code).strip().upper()
        if code in {"", "NONE", "NULL", "NAN"}:
            self.coupon_code = "NONE"
        else:
            self.coupon_code = code


        self.discount_percent = self.parse_discount()


### Bulk Loader	
load_transactions() returning list ↦ type-hinted

In [6]:
from typing import List

def load_transactions(df: pd.DataFrame) -> List[Transaction]:
    transactions = []
    for _, row in df.iterrows():
        transaction = Transaction(
            date=row['date'],
            customer_id=row['customer_id'],
            product=row['product'],
            price=row['price'],
            quantity=row['quantity'],
            coupon_code=row.get('coupon_code'),
            shipping_city=row['shipping_city']
        )
        transactions.append(transaction)
    return transactions


all_transactions = load_transactions(df)


len(all_transactions)


500

### Quick Profiling	
Min/mean/max price, unique city count (set)

In [7]:
prices = [t.price for t in all_transactions]
cities = {t.shipping_city for t in all_transactions}


min_price = min(prices)
mean_price = sum(prices) / len(prices)
max_price = max(prices)
unique_city_count = len(cities)


print(f"Min: {min_price}, Mean: {mean_price}, Max: {max_price}")
print(f"Unique Cities: {unique_city_count}")


Min: 20.31, Mean: 742.1563000000001, Max: 1492.66
Unique Cities: 5


### Spot the Grime

Three dirty data examples:

1. Inconsistent city names: `"new york"`, `"NEW YORK"`, and `"New York"` should be standardized.
2. Some coupon codes are missing, written as `"none"` or `"null"`.
3. The `date` field appears to be improperly formatted (`"56:34.9"`) and not usable as a timestamp.


### Cleaning Rules	
Execute fixes inside clean(); show “before/after” counts

In [8]:

dirty_coupon_codes = [t.coupon_code for t in all_transactions if not t.coupon_code or str(t.coupon_code).lower() in {"none", "null", ""}]
dirty_city_names = [t.shipping_city for t in all_transactions if t.shipping_city != t.shipping_city.title()]

print(f"Before Cleaning:")
print(f" - Dirty coupons: {len(dirty_coupon_codes)}")
print(f" - Unformatted cities: {len(dirty_city_names)}")


for t in all_transactions:
    t.clean()


dirty_coupon_codes_after = [t.coupon_code for t in all_transactions if t.coupon_code == "NONE"]
unique_cities_after = {t.shipping_city for t in all_transactions}

print(f"\nAfter Cleaning:")
print(f" - Standardized 'NONE' coupons: {len(dirty_coupon_codes_after)}")
print(f" - Unique cities: {len(unique_cities_after)}")


Before Cleaning:
 - Dirty coupons: 0
 - Unformatted cities: 0

After Cleaning:
 - Standardized 'NONE' coupons: 97
 - Unique cities: 5


### Transformations	
For example: Parse coupon_code ➞ numeric discount (others apply)

In [9]:
for t in all_transactions:
    t.clean() 

for t in all_transactions[:5]:
    print(f"Coupon: {t.coupon_code}, Discount: {t.discount_percent}%")


Coupon: DEAL20, Discount: 20.0%
Coupon: WELCOME, Discount: 10.0%
Coupon: DEAL20, Discount: 20.0%
Coupon: WELCOME, Discount: 10.0%
Coupon: FREESHIP, Discount: 0.0%


# Feature Engineering


In [11]:
# Add a dummy feature_engineer method to Transaction if not already present
def feature_engineer(self):
    # Example: Add a placeholder attribute or calculation
    self.has_discount = self.discount_percent > 0

Transaction.feature_engineer = feature_engineer

for t in all_transactions:
    t.feature_engineer()


# Mini-Aggregation


In [12]:
from collections import defaultdict

city_revenue = defaultdict(float)
for t in all_transactions:
    gross = t.total()
    discount = gross * (t.discount_percent / 100)
    net_revenue = gross - discount
    city_revenue[t.shipping_city] += net_revenue

sorted(city_revenue.items(), key=lambda x: x[1], reverse=True)[:5]

[('Houston', 236216.94699999993),
 ('New York', 230309.32499999995),
 ('Los Angeles', 218596.09000000003),
 ('Chicago', 192229.56899999996),
 ('Phoenix', 182967.706)]

# Serialization Checkpoint


In [14]:

cleaned_data = []
for t in all_transactions:
    cleaned_data.append({
        "date": t.date,
        "customer_id": t.customer_id,
        "product": t.product,
        "price": t.price,
        "quantity": t.quantity,
        "coupon_code": t.coupon_code,
        "shipping_city": t.shipping_city,
        "discount_percent": t.discount_percent,
        "total_price": t.total(),
        "net_revenue": t.total() * (1 - t.discount_percent / 100),
        "days_since": None  # or use a default value or calculation if available
    })

df_cleaned = pd.DataFrame(cleaned_data)
df_cleaned.to_json("cleaned_orders.json", orient="records", indent=2)
df_cleaned.to_parquet("cleaned_orders.parquet", index=False)


# Soft Interview Reflection

### Reflection
Using OOP helped encapsulate data cleaning, transformation, and feature engineering logic within the `Transaction` class.
This made the overall pipeline modular, reusable, and easier to test or extend. Instead of having fragmented logic,
we could centralize business rules directly where they belong.


# Data Dictionary Section
# Markdown Table

### Data Dictionary

| Field                | Type     | Description                              | Source        |
|----------------------|----------|------------------------------------------|----------------|
| date                | datetime | Date of transaction                      | orders_500.csv |
| customer_id         | string   | Unique customer identifier               | orders_500.csv |
| product             | string   | Name of the product                      | orders_500.csv |
| price               | float    | Price per unit                           | orders_500.csv |
| quantity            | int      | Number of units purchased                | orders_500.csv |
| coupon_code         | string   | Promotional code used                    | orders_500.csv |
| shipping_city       | string   | City where order was shipped             | orders_500.csv |
| total               | float    | Computed total price = price * quantity  | derived         |
| discount            | int      | Numeric discount derived from code       | derived         |
| days_since_purchase | int      | Days between purchase and today          | derived         |
