In [1]:
# Install required packages if not already installed
%pip install pyspark python-dotenv

# Import required libraries
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql.window import Window
import os
import sys
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Database connection string
DB_CONNECTION = os.getenv('DB_CONNECTION_STRING', 
                          'postgresql://postgressadmin:wf**F!$3dGdf14@copilot-workshop-db.postgres.database.azure.com:5432/workshop_db')

# IMPORTANT: Set up Hadoop for Windows BEFORE creating Spark session
if sys.platform.startswith('win'):
    # Create a minimal Hadoop directory structure for Windows
    hadoop_home = os.path.join(os.path.expanduser('~'), '.hadoop')
    os.makedirs(hadoop_home, exist_ok=True)
    os.makedirs(os.path.join(hadoop_home, 'bin'), exist_ok=True)
    os.environ['HADOOP_HOME'] = hadoop_home
    
    # Download winutils.exe if not present (required for Windows)
    winutils_path = os.path.join(hadoop_home, 'bin', 'winutils.exe')
    if not os.path.exists(winutils_path):
        print("⚠️ winutils.exe not found. Downloading...")
        import urllib.request
        try:
            urllib.request.urlretrieve(
                'https://github.com/steveloughran/winutils/raw/master/hadoop-3.0.0/bin/winutils.exe',
                winutils_path
            )
            print("✅ winutils.exe downloaded successfully!")
        except Exception as e:
            print(f"⚠️ Could not download winutils.exe automatically: {e}")
            print("Please download manually from: https://github.com/steveloughran/winutils")

# Initialize Spark Session with PostgreSQL driver
# Note: The driver will be downloaded on first run, which may take a moment
spark = SparkSession.builder \
    .appName("DataPipelineDebugging") \
    .config("spark.jars.packages", "org.postgresql:postgresql:42.7.3") \
    .config("spark.driver.memory", "4g") \
    .config("spark.sql.shuffle.partitions", "4") \
    .master("local[*]") \
    .getOrCreate()

print("✅ Spark session initialized successfully!")
print(f"Spark version: {spark.version}")
print(f"Running on: {sys.platform}")


[notice] A new release of pip is available: 24.0 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.
✅ Spark session initialized successfully!
Spark version: 3.4.1
Running on: win32


# 🐛 Data Pipeline Debugging Exercise

This notebook contains a data pipeline with **several bugs and performance issues**. Your task is to use GitHub Copilot to identify and fix them.

## Your Mission:
Use GitHub Copilot Chat to:
1. Review the code and identify issues
2. Understand what each section is trying to accomplish
3. Fix bugs and optimize performance
4. Add proper error handling and validation

## Hints:
- Try asking Copilot to review specific cells
- Ask about performance optimization
- Request explanations for suspicious code patterns
- Use Copilot to suggest best practices

Good luck! 🚀

## Step 1: Load Data from Database

In [None]:
# Load data from PostgreSQL database
customers = spark.read \
    .format("jdbc") \
    .option("url", DB_CONNECTION) \
    .option("dbtable", "raw.customers") \
    .option("driver", "org.postgresql.Driver") \
    .load()

orders = spark.read \
    .format("jdbc") \
    .option("url", DB_CONNECTION) \
    .option("dbtable", "raw.orders") \
    .option("driver", "org.postgresql.Driver") \
    .load()

order_items = spark.read \
    .format("jdbc") \
    .option("url", DB_CONNECTION) \
    .option("dbtable", "raw.order_items") \
    .option("driver", "org.postgresql.Driver") \
    .load()

products = spark.read \
    .format("jdbc") \
    .option("url", DB_CONNECTION) \
    .option("dbtable", "raw.products") \
    .option("driver", "org.postgresql.Driver") \
    .load()

print(f"Loaded {customers.count()} customers")
print(f"Loaded {orders.count()} orders")
print(f"Loaded {order_items.count()} order items")
print(f"Loaded {products.count()} products")

IllegalArgumentException: requirement failed: The driver could not open a JDBC connection. Check the URL: postgresql://postgressadmin:wf**F!$3dGdf14@copilot-workshop-db.postgres.database.azure.com:5432/workshop_db

## Step 2: Calculate Product Revenue

In [None]:
# Join order items with product information
product_sales = order_items.join(
    products,
    order_items.order_id == products.product_id,
    "inner"
)

# Calculate line total with discount applied
product_sales = product_sales.withColumn(
    "line_total",
    F.col("quantity") * F.col("unit_price") * (1 - F.col("discount_percent"))
)

# Aggregate revenue by product
revenue_by_product = product_sales.groupBy("product_id", "product_name", "category") \
    .agg(
        F.sum("line_total").alias("total_revenue"),
        F.sum("quantity").alias("total_quantity"),
        F.count("order_item_id").alias("num_orders")
    )

print("Top 10 Products by Revenue:")
revenue_by_product.orderBy(F.desc("total_revenue")).show(10)

## Step 3: Customer Segmentation (RFM Analysis)

In [None]:
from datetime import datetime

# Calculate RFM metrics for customer segmentation
reference_date = datetime(2024, 1, 1)

# Join customers with their orders
customer_orders = customers.join(
    orders,
    customers.customer_id == orders.customer_id,
    "left"
)

# Calculate RFM metrics
rfm = customer_orders.groupBy("customer_id", "customer_name", "country") \
    .agg(
        F.datediff(F.lit(reference_date), F.max("order_date")).alias("recency"),
        F.count("order_id").alias("frequency"),
        F.sum("total_amount").alias("monetary")
    )

# Score recency (1-5 scale)
rfm = rfm.withColumn(
    "r_score",
    F.when(F.col("recency") < 30, 1)
     .when(F.col("recency") < 60, 2)
     .when(F.col("recency") < 90, 3)
     .when(F.col("recency") < 180, 4)
     .otherwise(5)
).withColumn(
    "f_score",
    F.when(F.col("frequency") >= 10, 5)
     .when(F.col("frequency") >= 5, 4)
     .when(F.col("frequency") >= 3, 3)
     .when(F.col("frequency") >= 2, 2)
     .otherwise(1)
).withColumn(
    "m_score",
    F.when(F.col("monetary") >= 10000, 5)
     .when(F.col("monetary") >= 5000, 4)
     .when(F.col("monetary") >= 2000, 3)
     .when(F.col("monetary") >= 1000, 2)
     .otherwise(1)
)

# Calculate overall RFM score
rfm = rfm.withColumn(
    "rfm_score",
    F.col("r_score") + F.col("f_score") + F.col("m_score")
)

print("Customer Segmentation Results:")
rfm.orderBy(F.desc("rfm_score")).show(10)

## Step 4: Sales Trend Analysis

In [None]:
# Calculate monthly sales trends
monthly_sales = orders.withColumn(
    "month",
    F.date_format("order_date", "yyyy-MM")
)

# Aggregate sales by month
monthly_sales = monthly_sales.groupBy("month") \
    .agg(
        F.count("order_id").alias("total_orders"),
        F.sum("order_id").alias("unique_customers"),
        F.sum("total_amount").alias("revenue"),
        F.sum("total_amount").alias("avg_order_value")
    )

print("Monthly Sales Trends:")
monthly_sales.orderBy("month").show(12)

# Calculate month-over-month growth rate
windowSpec = Window.orderBy("month")
monthly_sales = monthly_sales.withColumn(
    "prev_month_revenue",
    F.lead("revenue").over(windowSpec)
)

monthly_sales = monthly_sales.withColumn(
    "growth_rate",
    ((F.col("revenue") - F.col("prev_month_revenue")) / F.col("prev_month_revenue") * 100)
)

print("\nMonthly Growth Rates:")
monthly_sales.select("month", "revenue", "prev_month_revenue", "growth_rate").show()