# Pandas — GroupBy & Aggregations
## Data Engineering Perspective


In [12]:
# Initialize: imports, display options, and raw input tables

# Imports
from pathlib import Path

import pandas as pd

# Show every column when a wide DataFrame is displayed (None = no column limit)
pd.set_option("display.max_columns", None)

# Single place to repoint the notebook at a different data folder.
# The path is relative, so it resolves against the kernel's working directory.
DATA_DIR = Path("../data")

# Load Data
employees = pd.read_csv(DATA_DIR / "employees.csv")
departments = pd.read_csv(DATA_DIR / "departments.csv")
sales = pd.read_csv(DATA_DIR / "sales.csv")

In [13]:
# Per-department salary rollup: payroll total, mean salary, and headcount.
# Each output column is declared as pd.NamedAgg(source column, aggregation).

dept_salary_metrics = (
    employees
    .groupby("department")
    .agg(
        total_salary=pd.NamedAgg(column="salary", aggfunc="sum"),
        avg_salary=pd.NamedAgg(column="salary", aggfunc="mean"),
        # "count" tallies the non-null emp_id values in each department
        employee_count=pd.NamedAgg(column="emp_id", aggfunc="count"),
    )
    .reset_index()  # move the department key from the index back to a column
)

dept_salary_metrics

Unnamed: 0,department,total_salary,avg_salary,employee_count
0,Engineering,578000,82571.428571,7
1,Finance,217000,72333.333333,3
2,HR,262000,52400.0,5
3,Sales,300000,60000.0,5


In [14]:
# Per-employee sales rollup: revenue total, number of sales, and mean sale size.

employee_sales_metrics = (
    sales
    .groupby("emp_id")
    .agg(
        total_sales=pd.NamedAgg(column="amount", aggfunc="sum"),
        # "count" tallies the non-null sale_id values for each employee
        transaction_count=pd.NamedAgg(column="sale_id", aggfunc="count"),
        avg_sale_value=pd.NamedAgg(column="amount", aggfunc="mean"),
    )
    .reset_index()  # move emp_id from the index back to a regular column
)

# Preview only the first rows rather than the full table
employee_sales_metrics.head()

Unnamed: 0,emp_id,total_sales,transaction_count,avg_sale_value
0,1,29000,3,9666.666667
1,2,30500,2,15250.0
2,3,4000,1,4000.0
3,5,16500,2,8250.0
4,6,7500,1,7500.0


In [15]:
# Joining sales (fact) with employees (dimension)
#
# how="left" keeps every sales row, even when its emp_id has no match in
# employees (the employee columns are then NaN for that row).
#
# validate="many_to_one" makes pandas check that emp_id is unique in
# employees and raise pandas.errors.MergeError otherwise. Without it, a
# duplicated emp_id in the dimension would silently duplicate sales rows and
# inflate every sum/count computed from this frame.

sales_with_employee = sales.merge(
    employees,
    on="emp_id",
    how="left",
    validate="many_to_one",
)

sales_with_employee.head()

Unnamed: 0,sale_id,emp_id,amount,sale_date,region,name,department,salary,joining_date
0,1,1,12000,2023-01-10,North,Amit,Engineering,80000,2021-06-15
1,2,2,15000,2023-01-12,East,Neha,Engineering,75000,2022-03-10
2,3,5,8000,2023-02-01,West,Karan,Sales,60000,2023-02-01
3,4,1,7000,2023-02-05,North,Amit,Engineering,80000,2021-06-15
4,5,3,4000,2023-02-10,South,Ravi,HR,50000,2020-01-20


In [16]:
# Department-level sales rollup built on the sales + employee join:
# revenue total and number of sales per department.
# NOTE(review): groupby skips rows whose department is NaN by default, so
# sales with no matching employee would not appear here — confirm that is
# the intended treatment.

department_sales = (
    sales_with_employee
    .groupby("department")
    .agg(
        total_sales=pd.NamedAgg(column="amount", aggfunc="sum"),
        total_transactions=pd.NamedAgg(column="sale_id", aggfunc="count"),
    )
    .reset_index()  # move department from the index back to a regular column
)

department_sales

Unnamed: 0,department,total_sales,total_transactions
0,Engineering,186000,12
1,Finance,13000,1
2,HR,4000,1
3,Sales,53500,6


In [None]:
# Parse sale_date into datetimes so the .dt accessor can be used on it.
# Note: this overwrites the sale_date column of `sales` in place.
sales["sale_date"] = pd.to_datetime(sales["sale_date"])

# Derive the calendar month of each sale as a monthly Period
# and store it as a new sale_month column on `sales`.
sales["sale_month"] = sales["sale_date"].dt.to_period("M")

# Monthly rollup: revenue total and number of sales per sale_month
monthly_revenue = (
    sales
    .groupby("sale_month")
    .agg(
        total_revenue=pd.NamedAgg(column="amount", aggfunc="sum"),
        transaction_count=pd.NamedAgg(column="sale_id", aggfunc="count"),
    )
    .reset_index()  # move sale_month from the index back to a regular column
)

monthly_revenue

Unnamed: 0,sale_month,total_revenue,transaction_count
0,2023-01-10,12000,1
1,2023-01-12,15000,1
2,2023-02-01,8000,1
3,2023-02-05,7000,1
4,2023-02-10,4000,1
5,2023-03-01,20000,1
6,2023-03-05,18000,1
7,2023-03-12,9000,1
8,2023-03-20,11000,1
9,2023-04-01,7500,1
