# In [None]:
# Data Engineering Pipeline: Ingestion → Processing → Reporting

# Step 1: Ingestion
# -----------------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Simulated sales data (in a real pipeline this could be read from CSV, DB, or API).
# A dedicated, seeded Generator keeps the simulated figures identical from run to
# run, so everything derived from them downstream is reproducible. Change
# RANDOM_SEED to draw a different sample.
RANDOM_SEED = 42
rng = np.random.default_rng(RANDOM_SEED)

# One row per day; every column is sized from n_days so they cannot drift apart.
n_days = 10
regions = ["North", "South", "East", "West"]

data = {
    "date": pd.date_range(start="2025-01-01", periods=n_days, freq="D"),
    # Cycle through the regions in a fixed order: North, South, East, West, North, ...
    "region": [regions[i % len(regions)] for i in range(n_days)],
    # Generator.integers excludes the upper bound by default:
    # sales fall in [100, 500) and returns in [0, 50).
    "sales": rng.integers(100, 500, size=n_days),
    "returns": rng.integers(0, 50, size=n_days),
}

df = pd.DataFrame(data)
print("Raw Data:")
# NOTE(review): `display` is not imported in this file; presumably the
# IPython/Jupyter built-in, so this cell is expected to run inside a notebook.
display(df)

# Step 2: Processing
# ------------------
# Row-level net sales: sales minus returns, stored as a new column on df.
df["net_sales"] = df["sales"].sub(df["returns"])

# Roll the rows up to one line per region (summing each measure), keeping
# "region" as an ordinary column, then order the regions from the highest
# net sales to the lowest.
region_summary = (
    df.groupby("region", as_index=False)
    .agg(
        total_sales=("sales", "sum"),
        total_returns=("returns", "sum"),
        net_sales=("net_sales", "sum"),
    )
    .sort_values(by="net_sales", ascending=False)
)

print("\nProcessed Summary:")
display(region_summary)

# Step 3: Reporting
# -----------------
# Descriptive statistics for the row-level frame.
print("\nSummary Statistics:")
display(df.describe())

# Bar chart of net sales per region, in the order region_summary lists them.
# Drawn through an explicit Axes object instead of the implicit pyplot state.
fig, ax = plt.subplots(figsize=(6, 4))
ax.bar(region_summary["region"], region_summary["net_sales"], color="skyblue")
ax.set_title("Net Sales by Region")
ax.set_xlabel("Region")
ax.set_ylabel("Net Sales")
plt.show()
