In [1]:
# Install required packages if not already installed
%pip install pyspark python-dotenv

# Import required libraries
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql.window import Window
import os
import sys
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Database connection string
DB_CONNECTION = os.getenv('DB_CONNECTION_STRING', 
                          'postgresql://postgressadmin:wf**F!$3dGdf14@copilot-workshop-db.postgres.database.azure.com:5432/workshop_db')

# IMPORTANT: Set up Hadoop for Windows BEFORE creating Spark session
if sys.platform.startswith('win'):
    # Create a minimal Hadoop directory structure for Windows
    hadoop_home = os.path.join(os.path.expanduser('~'), '.hadoop')
    os.makedirs(hadoop_home, exist_ok=True)
    os.makedirs(os.path.join(hadoop_home, 'bin'), exist_ok=True)
    os.environ['HADOOP_HOME'] = hadoop_home
    
    # Download winutils.exe if not present (required for Windows)
    winutils_path = os.path.join(hadoop_home, 'bin', 'winutils.exe')
    if not os.path.exists(winutils_path):
        print("⚠️ winutils.exe not found. Downloading...")
        import urllib.request
        try:
            urllib.request.urlretrieve(
                'https://github.com/steveloughran/winutils/raw/master/hadoop-3.0.0/bin/winutils.exe',
                winutils_path
            )
            print("✅ winutils.exe downloaded successfully!")
        except Exception as e:
            print(f"⚠️ Could not download winutils.exe automatically: {e}")
            print("Please download manually from: https://github.com/steveloughran/winutils")

# Initialize Spark Session with PostgreSQL driver
# Note: The driver will be downloaded on first run, which may take a moment
spark = SparkSession.builder \
    .appName("DataPipelineDebugging") \
    .config("spark.jars.packages", "org.postgresql:postgresql:42.7.3") \
    .config("spark.driver.memory", "4g") \
    .config("spark.sql.shuffle.partitions", "4") \
    .master("local[*]") \
    .getOrCreate()

print("✅ Spark session initialized successfully!")
print(f"Spark version: {spark.version}")
print(f"Running on: {sys.platform}")


[notice] A new release of pip is available: 24.0 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.
✅ Spark session initialized successfully!
Spark version: 3.4.1
Running on: win32


# 🐛 Advanced Data Pipeline Debugging Exercise

This notebook contains a more complex e-commerce analytics pipeline with **multiple subtle bugs, logic errors, and performance issues**. Your task is to use GitHub Copilot to identify and fix them.

## Business Context:
You're building an analytics platform that includes:
- **Cohort Analysis**: Track customer retention over time
- **Product Affinity**: Identify products frequently bought together
- **Customer Lifetime Value (CLV)**: Calculate customer value metrics
- **Sales Dashboard**: Daily performance metrics with trends
- **Category Analysis**: Product category performance

## Your Mission:
Use GitHub Copilot Chat to:
1. Review each section and identify bugs
2. Fix logic errors in complex calculations
3. Optimize performance bottlenecks
4. Add proper error handling and validation
5. Handle edge cases and null values


## Tips:
- Read the DEBUGGING_GUIDE.md for detailed explanations
- Ask Copilot to explain the business logic first
- Test your fixes incrementally
- Think about edge cases (new customers, cancelled orders, etc.)

Good luck! 🚀

## Step 1: Load Data from Database

In [None]:
# Load data from PostgreSQL database
# Parse connection string properly for JDBC
from urllib.parse import urlparse

parsed = urlparse(DB_CONNECTION)
jdbc_url = f"jdbc:postgresql://{parsed.hostname}:{parsed.port}{parsed.path}?ssl=true&sslmode=require"
username = parsed.username
password = parsed.password

# Load with proper authentication
customers = spark.read \
    .format("jdbc") \
    .option("url", jdbc_url) \
    .option("dbtable", "raw.customers") \
    .option("user", username) \
    .option("password", password) \
    .option("driver", "org.postgresql.Driver") \
    .load()

orders = spark.read \
    .format("jdbc") \
    .option("url", jdbc_url) \
    .option("dbtable", "raw.orders") \
    .option("user", username) \
    .option("password", password) \
    .option("driver", "org.postgresql.Driver") \
    .load()

order_items = spark.read \
    .format("jdbc") \
    .option("url", jdbc_url) \
    .option("dbtable", "raw.order_items") \
    .option("user", username) \
    .option("password", password) \
    .option("driver", "org.postgresql.Driver") \
    .load()

products = spark.read \
    .format("jdbc") \
    .option("url", jdbc_url) \
    .option("dbtable", "raw.products") \
    .option("user", username) \
    .option("password", password) \
    .option("driver", "org.postgresql.Driver") \
    .load()

print(f"Loaded {customers.count()} customers")
print(f"Loaded {orders.count()} orders")
print(f"Loaded {order_items.count()} order items")
print(f"Loaded {products.count()} products")

IllegalArgumentException: requirement failed: The driver could not open a JDBC connection. Check the URL: postgresql://postgressadmin:wf**F!$3dGdf14@copilot-workshop-db.postgres.database.azure.com:5432/workshop_db

## Step 2: Customer Cohort Analysis
Track customer retention by analyzing cohorts based on their first purchase month.

In [None]:
# BUG: Uses order_date instead of first order date for cohort assignment
# This will assign customers to multiple cohorts!
cohort_data = orders.withColumn(
    "cohort_month",
    F.date_trunc("month", F.col("order_date"))  # WRONG!
).withColumn(
    "order_month",
    F.date_trunc("month", F.col("order_date"))
)

# Calculate period number (months since cohort)
# BUG: Uses order_date instead of order_month
cohort_data = cohort_data.withColumn(
    "period_number",
    F.months_between(F.col("order_date"), F.col("cohort_month")).cast("int")
)

# Count customers in each cohort period
# BUG: Counts orders instead of unique customers
cohort_counts = cohort_data.groupBy("cohort_month", "period_number").agg(
    F.count("order_id").alias("customers")
)

# Get cohort sizes (period 0)
cohort_sizes = cohort_counts.filter(F.col("period_number") == 0) \
    .select(
        F.col("cohort_month"),
        F.col("customers").alias("cohort_size")
    )

# Calculate retention rates
# BUG: No null handling, division by zero possible
cohort_retention = cohort_counts.join(
    cohort_sizes,
    "cohort_month",
    "left"
).withColumn(
    "retention_rate",
    F.col("customers") / F.col("cohort_size") * 100
)

print("Cohort Retention Analysis:")
cohort_retention.orderBy("cohort_month", "period_number").show(20)

## Step 3: Product Affinity Analysis (Market Basket)
Identify products that are frequently purchased together.

In [None]:
# Get product information for each order
order_products = order_items.join(
    products,
    "product_id",
    "inner"
).select("order_id", "product_id", "product_name")

# BUG: Self-join without preventing duplicates and self-pairs
# This creates (A,A), (A,B), and (B,A) - massive data duplication!
product_pairs = order_products.alias("a").join(
    order_products.alias("b"),
    F.col("a.order_id") == F.col("b.order_id"),  # Missing constraint!
    "inner"
).select(
    F.col("a.product_id").alias("product_a"),
    F.col("a.product_name").alias("product_a_name"),
    F.col("b.product_id").alias("product_b"),
    F.col("b.product_name").alias("product_b_name"),
    F.col("a.order_id")
)

# Count pair occurrences
pair_counts = product_pairs.groupBy("product_a", "product_a_name", "product_b", "product_b_name").agg(
    F.count("order_id").alias("pair_count")
)

# Count individual product occurrences
product_counts = order_products.groupBy("product_id", "product_name").agg(
    F.count("order_id").alias("product_count")
)

# Calculate affinity metrics
# BUG: Missing broadcast optimization for this join
# BUG: Incomplete confidence calculation
affinity_metrics = pair_counts \
    .join(
        product_counts.select(
            F.col("product_id").alias("product_a"),
            F.col("product_count").alias("product_a_count")
        ),
        "product_a",
        "inner"
    ) \
    .join(
        product_counts.select(
            F.col("product_id").alias("product_b"),
            F.col("product_count").alias("product_b_count")
        ),
        "product_b",
        "inner"
    ) \
    .withColumn(
        "confidence",
        F.col("pair_count") / F.col("product_a_count")  # Simplified, missing context
    )

print("Top 10 Product Affinities:")
affinity_metrics.orderBy(F.desc("pair_count")).show(10, truncate=False)

## Step 4: Customer Lifetime Value (CLV) Calculation
Calculate the value each customer brings over their lifetime.

In [None]:
# BUG: Includes ALL orders regardless of status (cancelled, returned, etc.)
customer_revenue = orders.groupBy("customer_id").agg(
    F.sum("total_amount").alias("total_revenue"),
    F.count("order_id").alias("order_count"),
    F.avg("total_amount").alias("avg_order_value")
)

# Calculate customer metrics
# BUG: Uses first order date instead of customer join date
# BUG: Integer division in some Spark versions
customer_metrics = orders.groupBy("customer_id").agg(
    F.min("order_date").alias("first_order"),
    F.max("order_date").alias("last_order"),
    F.countDistinct("order_id").alias("total_orders")
).withColumn(
    "customer_lifespan_years",
    F.datediff(F.current_date(), F.col("first_order")) / 365
).withColumn(
    "purchase_frequency",
    F.col("total_orders") / F.col("customer_lifespan_years")
)

# Calculate CLV
# BUG: Inner join loses customers without completed orders
# BUG: Doesn't handle edge cases (lifespan = 0 for new customers)
clv = customer_revenue.join(
    customer_metrics,
    "customer_id",
    "inner"
).withColumn(
    "customer_lifetime_value",
    F.col("avg_order_value") * F.col("purchase_frequency") * F.col("customer_lifespan_years")
)

print("Top 10 Customers by CLV:")
clv.orderBy(F.desc("customer_lifetime_value")).show(10)

print("\nCLV Statistics:")
clv.select(
    F.avg("customer_lifetime_value").alias("avg_clv"),
    F.stddev("customer_lifetime_value").alias("stddev_clv"),
    F.min("customer_lifetime_value").alias("min_clv"),
    F.max("customer_lifetime_value").alias("max_clv")
).show()

## Step 5: Sales Performance Dashboard
Daily sales metrics with moving averages and cumulative totals.

In [None]:
# BUG: Multiple separate aggregations - should combine for performance
daily_revenue = orders.groupBy("order_date").agg(
    F.sum("total_amount").alias("total_revenue")
)

daily_orders = orders.groupBy("order_date").agg(
    F.count("order_id").alias("total_orders")
)

daily_customers = orders.groupBy("order_date").agg(
    F.countDistinct("customer_id").alias("unique_customers")
)

# Combine metrics
daily_sales = daily_revenue.join(daily_orders, "order_date", "outer") \
    .join(daily_customers, "order_date", "outer") \
    .withColumn("avg_order_value", F.col("total_revenue") / F.col("total_orders"))

# BUG: Wrong window specification for 7-day moving average
# rowsBetween(-6, 0) assumes continuous dates, which may not exist
windowSpec = Window.orderBy("order_date").rowsBetween(-6, 0)

daily_sales = daily_sales.withColumn(
    "moving_avg_7d",
    F.avg("total_revenue").over(windowSpec)
)

# BUG: Cumulative sum without year partitioning
# This continues across years instead of resetting
windowSpec = Window.orderBy("order_date").rowsBetween(Window.unboundedPreceding, 0)

daily_sales = daily_sales.withColumn(
    "ytd_revenue",
    F.sum("total_revenue").over(windowSpec)
)

print("Daily Sales Performance:")
daily_sales.orderBy(F.desc("order_date")).show(30)

## Step 6: Category Performance Analysis
Analyze sales performance by product category with rankings.

In [None]:
# BUG: Direct join between products and order_items causes double counting
# Should aggregate order_items first
category_sales = products.join(
    order_items,
    "product_id",
    "inner"
).withColumn(
    "line_total",
    F.col("quantity") * F.col("unit_price") * (1 - F.coalesce(F.col("discount_percent"), F.lit(0)) / 100)
)

# Aggregate by category
category_performance = category_sales.groupBy("category").agg(
    F.sum("line_total").alias("total_revenue"),
    F.sum("quantity").alias("total_quantity"),
    F.countDistinct("product_id").alias("unique_products")
).withColumn(
    "revenue_per_product",
    F.col("total_revenue") / F.col("unique_products")
)

print("Category Performance:")
category_performance.orderBy(F.desc("total_revenue")).show()

# Product ranking within categories
# BUG: Ranks globally instead of within each category
product_revenue = order_items.groupBy("product_id").agg(
    F.sum(F.col("quantity") * F.col("unit_price") * 
          (1 - F.coalesce(F.col("discount_percent"), F.lit(0)) / 100)).alias("revenue")
)

product_with_category = products.join(product_revenue, "product_id", "inner")

# Missing partition by category in window!
windowSpec = Window.orderBy(F.desc("revenue"))

top_products = product_with_category.withColumn(
    "rank",
    F.rank().over(windowSpec)
).filter(F.col("rank") <= 5)

print("\nTop 5 Products by Revenue (should be per category!):")
top_products.select("category", "product_name", "revenue", "rank").show(20)

## Summary & Next Steps

If you've made it this far, you've encountered numerous bugs! 🐛

### Key Issues Found:
1. **Cohort Analysis**: Wrong cohort assignment and period calculation
2. **Product Affinity**: Duplicate pairs and missing constraints
3. **CLV Calculation**: Invalid order inclusion and edge case handling
4. **Sales Dashboard**: Window function errors and performance issues
5. **Category Analysis**: Double counting and missing partitioning

### How to Debug:
1. Use GitHub Copilot Chat to review each cell
2. Ask Copilot to explain what the code SHOULD do
3. Compare with what it ACTUALLY does
4. Use Copilot to generate fixes
5. Test incrementally

### Resources:
- See `DEBUGGING_GUIDE.md` for detailed solutions
- Ask Copilot: "What's wrong with this cohort analysis?"
- Ask Copilot: "How can I optimize this self-join?"

Good luck! 🚀