# Phase 2: Northwind Traders - SQL Python Integration

Execute SQL queries in Python and build visualizations. 

- All analysis logic remains in SQL.
- Python runs queries and visualizes results.

## Setup

In [None]:
import sqlite3
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Apply a shared seaborn theme to every chart in this notebook.
sns.set_theme(style='whitegrid')

# Locate data/northwind.db relative to the current working directory first,
# then relative to its parent; the first existing candidate wins.
db_path = None
for base_dir in (Path.cwd(), Path.cwd().parent):
    candidate = base_dir / "data" / "northwind.db"
    if candidate.exists():
        db_path = candidate
        break

# Fail early with a hint about where the notebook is expected to be run from.
if db_path is None:
    raise FileNotFoundError("data/northwind.db not found (run from project root or notebooks/)")

# Open the SQLite connection used by the query cells.
conn = sqlite3.connect(db_path.resolve())

## Part 1: Employee Sales Performance - Bar Chart (Top 5)

Calculate total sales per employee, rank with `RANK()`, run in Python, and build a bar chart for the top 5.

In [None]:
# Part 1 query: total sales and sales rank for each employee.
query_employees = """
-- Part 1: Employee sales. Revenue = UnitPrice * Quantity * (1 - Discount). RANK() leaves gaps after ties.
SELECT
    e.EmployeeID,
    e.FirstName || ' ' || e.LastName AS EmployeeName,
    ROUND(SUM(od.UnitPrice * od.Quantity * (1 - od.Discount)), 2) AS TotalSales,
    RANK() OVER (ORDER BY SUM(od.UnitPrice * od.Quantity * (1 - od.Discount)) DESC) AS SalesRank
FROM Employees e
JOIN Orders o ON e.EmployeeID = o.EmployeeID
JOIN "Order Details" od ON o.OrderID = od.OrderID
GROUP BY e.EmployeeID, e.FirstName, e.LastName
ORDER BY SalesRank;
"""

# Run the query through the open connection and load the rows into a DataFrame.
df_emp = pd.read_sql_query(sql=query_employees, con=conn)

# The result is already ordered by SalesRank, so the first five rows are the top five.
df_emp_top5 = df_emp.iloc[:5]


In [None]:
# Show the top-5 employee table as this cell's output.
df_emp_top5

In [None]:
# Reverse the row order so the first row of df_emp_top5 is drawn as the top bar
# (barh places its first entry at the bottom of the axis).
top5_reversed = df_emp_top5.iloc[::-1]
sales_labels = [f"${x:,.0f}" for x in top5_reversed['TotalSales']]

fig, ax = plt.subplots(figsize=(10, 5))
bars = ax.barh(
    top5_reversed['EmployeeName'],
    top5_reversed['TotalSales'],
    color='steelblue',
    edgecolor='navy',
    alpha=0.85,
)
ax.set(
    xlabel='Total Sales ($)',
    ylabel='Employee',
    title='Top 5 Performing Employees by Total Sales',
)
# Print the whole-dollar total next to each bar.
ax.bar_label(bars, labels=sales_labels, padding=5)
fig.tight_layout()
plt.show()

## Part 2: Monthly Sales Trend - Visualization

Aggregate total sales by month, compute month-over-month growth in SQL, run in Python, and visualize sales trends.

In [None]:
# Part 2 query: total sales per calendar month plus month-over-month growth.
query_monthly = """
-- Part 2: Monthly trend + MoM. strftime buckets by YYYY-MM; LAG gives previous month; NULLIF avoids /0.
WITH MonthlySales AS (
    SELECT
        strftime('%Y-%m', o.OrderDate) AS Month,
        SUM(od.UnitPrice * od.Quantity * (1 - od.Discount)) AS TotalSales
    FROM Orders o
    JOIN "Order Details" od ON o.OrderID = od.OrderID
    GROUP BY strftime('%Y-%m', o.OrderDate)
)
SELECT
    Month,
    ROUND(TotalSales, 2) AS TotalSales,
    ROUND(LAG(TotalSales) OVER (ORDER BY Month), 2) AS PrevMonthSales,
    ROUND((TotalSales - LAG(TotalSales) OVER (ORDER BY Month)) /
          NULLIF(LAG(TotalSales) OVER (ORDER BY Month), 0) * 100, 2) AS MoM_Growth_Pct
FROM MonthlySales
ORDER BY Month;
"""

df_monthly = pd.read_sql_query(sql=query_monthly, con=conn)

# The query returns 'YYYY-MM' strings; append a day so each value parses as
# the first day of its month.
month_start = df_monthly['Month'] + '-01'
df_monthly['Month'] = pd.to_datetime(month_start)

# Preview the first ten months as the cell output.
df_monthly.head(10)

In [None]:
# Line chart of monthly sales with a shaded area under the curve.
months = df_monthly['Month']
monthly_sales = df_monthly['TotalSales']

fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(
    months,
    monthly_sales,
    marker='o',
    markersize=3,
    linewidth=1.5,
    color='darkgreen',
    label='Total Sales',
)
ax.fill_between(months, monthly_sales, alpha=0.3, color='green')
ax.set(xlabel='Month', ylabel='Total Sales ($)', title='Monthly Sales Trend')
ax.legend(loc='upper right')
# Tilt the date labels so adjacent months do not overlap.
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

## Part 3: Product Sales Ranking by Category

Calculate total sales per product, rank within category with `RANK()`, run in Python. Optional: grouped bar chart for top products by category.

In [None]:
# Part 3 query: total sales per product and its rank inside its own category.
query_products = """
-- Part 3: Top products per category. PARTITION BY c.CategoryID resets rank within each category.
SELECT
    c.CategoryName,
    p.ProductName,
    ROUND(SUM(od.UnitPrice * od.Quantity * (1 - od.Discount)), 2) AS TotalSales,
    RANK() OVER (PARTITION BY c.CategoryID ORDER BY SUM(od.UnitPrice * od.Quantity * (1 - od.Discount)) DESC) AS CategoryRank
FROM Categories c
JOIN Products p ON c.CategoryID = p.CategoryID
JOIN "Order Details" od ON p.ProductID = od.ProductID
GROUP BY c.CategoryID, c.CategoryName, p.ProductID, p.ProductName
ORDER BY c.CategoryName, CategoryRank;
"""

df_prod = pd.read_sql_query(sql=query_products, con=conn)

# Keep only the rows ranked first within their category and show them.
is_category_leader = df_prod['CategoryRank'].eq(1)
top_per_category = df_prod.loc[is_category_leader]
top_per_category

In [None]:
# Restrict the chart to ranks 1-3 of every category to keep it readable.
in_top_three = df_prod['CategoryRank'] <= 3
top3 = df_prod.loc[in_top_three]

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(
    data=top3,
    x='CategoryName',
    y='TotalSales',
    hue='ProductName',
    dodge=True,
    ax=ax,
)
ax.set(
    xlabel='Category',
    ylabel='Total Sales ($)',
    title='Top 3 Products by Sales in Each Category',
)
# Anchor the legend just outside the right edge of the axes.
ax.legend(title='Product', bbox_to_anchor=(1.02, 1), loc='upper left')
plt.xticks(rotation=45, ha='right')
fig.tight_layout()
plt.show()

## Part 4: Customer Purchase Behavior - AOV Ranks 2, 3, 5, 8, 12, 15, 17

Compute the average order value (AOV) per customer, rank with `RANK()`, run in Python, and show the customers at ranks 2, 3, 5, 8, 12, 15, 17 in a formatted table.

In [None]:
# Part 4 query: average order value per customer, limited to the requested ranks.
query_customers = """
-- Part 4: Customer AOV. OrderTotals CTE = revenue per order; then AVG per customer + RANK(); filter to given ranks.
WITH OrderTotals AS (
    SELECT OrderID, SUM(UnitPrice * Quantity * (1 - Discount)) AS OrderTotal
    FROM "Order Details"
    GROUP BY OrderID
),
CustomerAOV AS (
    SELECT
        c.CustomerID,
        c.CompanyName,
        ROUND(AVG(ot.OrderTotal), 2) AS AvgOrderValue,
        RANK() OVER (ORDER BY AVG(ot.OrderTotal) DESC) AS AOV_Rank
    FROM Customers c
    JOIN Orders o ON c.CustomerID = o.CustomerID
    JOIN OrderTotals ot ON o.OrderID = ot.OrderID
    GROUP BY c.CustomerID, c.CompanyName
)
SELECT * FROM CustomerAOV WHERE AOV_Rank IN (2, 3, 5, 8, 12, 15, 17)
ORDER BY AOV_Rank;
"""

df_cust = pd.read_sql_query(sql=query_customers, con=conn)

# Format the AOV column as currency and caption the table; the captioned
# Styler is the last expression so it becomes the cell output.
styled_cust = df_cust.style.format({'AvgOrderValue': '${:,.2f}'})
styled_cust.set_caption('Customers by AOV Rank (2, 3, 5, 8, 12, 15, 17)')

In [None]:
# Close the SQLite connection opened in the setup cell.
conn.close()