In [None]:
# Hands-on lab
"""You are a Data Engineer tasked with building a Q1 2024 sales analytics pipeline. You receive partitioned monthly 
data that must be integrated with dimension tables, analyzed for KPIs, and optimized for production BI consumption.
Lab Objectives:
- Concatenate partitioned datasets with hierarchical indexing
- Perform complex multi-table joins with overlapping columns
- Execute advanced GroupBy operations (filter, transform, apply)
- Generate executive pivot tables with margins
- Implement time series resampling and rolling windows
- Optimize memory using categorical data types and proper indexing
"""
import pandas as pd
from glob import glob

# Input / output locations used by the pipeline driver at the bottom.
SALES_GLOB = "data/sales_2024_*.csv"
# NOTE(review): the original script referenced `products` and `customers`
# without ever loading them (Step 3 was missing). These two paths are an
# assumption -- point them at the real dimension tables before running.
PRODUCTS_PATH = "data/products.csv"
CUSTOMERS_PATH = "data/customers.csv"
OUTPUT_PATH = "data/optimized_sales_data.csv"


def load_sales(pattern: str = SALES_GLOB) -> pd.DataFrame:
    """Step 1: load every monthly partition matching *pattern* into one frame.

    Args:
        pattern: Glob pattern selecting the partition CSV files.

    Returns:
        All partitions concatenated row-wise with a fresh RangeIndex.

    Raises:
        FileNotFoundError: If no file matches *pattern*.
    """
    # glob() returns files in arbitrary order; sort for a reproducible result.
    file_paths = sorted(glob(pattern))
    if not file_paths:
        # pd.concat([]) would otherwise fail with an opaque
        # "No objects to concatenate" ValueError.
        raise FileNotFoundError(f"No sales partitions match {pattern!r}")
    return pd.concat([pd.read_csv(f) for f in file_paths], ignore_index=True)


def add_date_parts(sales: pd.DataFrame) -> pd.DataFrame:
    """Step 2: parse ``Date`` and derive ``Year`` / ``Month`` columns.

    ``Year`` and ``Month`` are kept as ordinary columns rather than moved into
    the index: a column-based ``merge`` discards the left frame's index, so an
    index set here would be lost at the dimension joins.

    Args:
        sales: Frame with a ``Date`` column parseable by ``pd.to_datetime``.

    Returns:
        A copy of *sales* with ``Date`` as datetime plus ``Year`` and ``Month``.
    """
    out = sales.copy()
    out['Date'] = pd.to_datetime(out['Date'])
    out['Year'] = out['Date'].dt.year
    out['Month'] = out['Date'].dt.month
    return out


def merge_dimensions(
    sales: pd.DataFrame,
    products: pd.DataFrame,
    customers: pd.DataFrame,
) -> pd.DataFrame:
    """Steps 4-5: left-join the dimension tables and tidy overlapping names.

    Args:
        sales: Fact rows carrying ``ProductID`` and ``CustomerID``.
        products: Product dimension keyed by ``ProductID``.
        customers: Customer dimension keyed by ``CustomerID``.

    Returns:
        *sales* enriched with the dimension attributes; every fact row is kept.

    Raises:
        pandas.errors.MergeError: If a dimension table has duplicate keys.
    """
    # validate='many_to_one' makes duplicate dimension keys fail loudly
    # instead of silently multiplying fact rows and inflating every total.
    merged = sales.merge(
        products, on='ProductID', how='left', validate='many_to_one'
    )
    merged = merged.merge(
        customers, on='CustomerID', how='left', validate='many_to_one'
    )
    # When both sides of a join carry a SalesAmount column pandas suffixes
    # them _x / _y; the rename is a no-op when no such overlap occurred.
    return merged.rename(columns={
        'SalesAmount_x': 'SalesAmount',
        'SalesAmount_y': 'RefundAmount',
    })


def compute_kpis(sales: pd.DataFrame) -> pd.DataFrame:
    """Step 6: total sales and mean discount per (ProductID, CustomerID)."""
    return sales.groupby(['ProductID', 'CustomerID']).agg(
        TotalSales=('SalesAmount', 'sum'),
        AvgDiscount=('Discount', 'mean'),
    ).reset_index()


def build_pivot(sales: pd.DataFrame) -> pd.DataFrame:
    """Step 7: ``SalesAmount`` summed per ``ProductCategory`` with a 'Total' row."""
    return sales.pivot_table(
        index=['ProductCategory'],
        values='SalesAmount',
        aggfunc='sum',
        margins=True,
        margins_name='Total',
    )


def build_daily_sales(sales: pd.DataFrame, window: int = 7) -> pd.DataFrame:
    """Steps 8-9: daily ``SalesAmount`` totals plus a rolling mean.

    Only ``SalesAmount`` is aggregated; summing every column would also "sum"
    identifiers and string attributes. *sales* itself is not modified.

    Args:
        sales: Frame with a datetime ``Date`` column and ``SalesAmount``.
        window: Number of days in the rolling mean.

    Returns:
        Frame indexed by calendar day with columns ``SalesAmount`` and
        ``RollingAvg``. The first ``window - 1`` rolling values are NaN.
    """
    daily = sales.set_index('Date')[['SalesAmount']].resample('D').sum()
    daily['RollingAvg'] = daily['SalesAmount'].rolling(window=window).mean()
    return daily


def optimize_for_bi(sales: pd.DataFrame) -> pd.DataFrame:
    """Step 10: categorical key columns, indexed by (ProductID, CustomerID).

    The frame passed in is expected to still hold ``Date`` as a regular
    column, so moving the keys into the index does not discard it.
    """
    out = sales.copy()
    for key in ('ProductID', 'CustomerID'):
        out[key] = out[key].astype('category')
    return out.set_index(['ProductID', 'CustomerID'])


def export_for_bi(frame: pd.DataFrame, path_or_buf=OUTPUT_PATH) -> None:
    """Step 11: write *frame* as CSV, including its index levels.

    ``index=True`` is required because the key columns live in the index;
    writing with ``index=False`` would drop ProductID and CustomerID.
    """
    frame.to_csv(path_or_buf, index=True)


if __name__ == "__main__":
    # Steps 1-2: load partitions and derive date parts.
    sales_data = add_date_parts(load_sales())

    # Step 3: load dimension tables (paths are assumptions, see constants).
    products = pd.read_csv(PRODUCTS_PATH)
    customers = pd.read_csv(CUSTOMERS_PATH)

    # Steps 4-5: enrich facts with dimension attributes.
    sales_data = merge_dimensions(sales_data, products, customers)

    # Steps 6-9: analytics computed from the flat, merged frame.
    kpis = compute_kpis(sales_data)
    pivot_table = build_pivot(sales_data)
    daily_sales = build_daily_sales(sales_data)

    # Steps 10-11: optimise dtypes/index and export for BI consumption.
    sales_data = optimize_for_bi(sales_data)
    export_for_bi(sales_data)


In [None]:
#assignment 1
#Analyze sales data with advanced grouping and pivot tables.

In [None]:
#assignment 2
#Build a time series analysis for stock market data.