# Aggregating Data

In [2]:
import polars as pl

In [3]:
# Read math.parquet, then reduce the "G3" column to three summary statistics:
# its median, its mode, and its variance.
math_students = pl.read_parquet("math.parquet")

(
    math_students.select(
        pl.col("G3").median().alias("median_result"),
        pl.col("G3").mode().alias("most_frequent_result"),
        pl.col("G3").var().alias("variance"),
    )
)

median_result,most_frequent_result,variance
f64,i64,f64
11.0,10,20.989616


# Grouping Data

In [18]:
import polars as pl

# Read course.parquet and, for every (subject, internet) pair, report:
#   total      - count of "student_id" values in the group
#   passes     - count of "G3" values in the group that are greater than 12
#   percentage - passes as a share of total, rounded to 2 places, with a "%"
all_students = pl.read_parquet("course.parquet")

(
    all_students.group_by(["subject", "internet"])
    .agg(
        total=pl.col("student_id").count(),
        passes=pl.col("G3").filter(pl.col("G3") > 12).count(),
    )
    .select(
        pl.col("subject", "internet", "total", "passes"),
        percentage=pl.format(
            "{}%", (pl.col("passes") * 100 / pl.col("total")).round(2)
        ),
    )
    # group_by() does not guarantee the order of its groups, and sorting on
    # "subject" alone leaves the "internet" rows within each subject in an
    # arbitrary order. Sorting on both keys makes the displayed table
    # reproducible; "internet" is descending so "yes" is listed before "no".
    .sort("subject", "internet", descending=[False, True])
)

subject,internet,total,passes,percentage
str,str,u32,u32,str
"""M""","""yes""",329,117,"""35.56%"""
"""M""","""no""",66,14,"""21.21%"""
"""P""","""yes""",498,231,"""46.39%"""
"""P""","""no""",151,45,"""29.8%"""


# Grouping Time Series Data With .group_by_dynamic()

In [30]:
# Read math_classes.parquet and compute the mean of "absences" for each
# lecturer within one-month windows over "class_start".
math_attendance = pl.read_parquet("math_classes.parquet")

(
    math_attendance.group_by_dynamic(
        index_column="class_start",
        group_by="lecturer_initials",  # separate set of windows per lecturer
        every="1mo",
        closed="both",  # both edges of each window are inclusive
    ).agg(pl.mean("absences"))
)

lecturer_initials,class_start,absences
str,datetime[μs],f64
"""DH""",2024-01-01 00:00:00,4.0
"""DH""",2024-02-01 00:00:00,3.4
"""DH""",2024-03-01 00:00:00,4.25
"""DH""",2024-04-01 00:00:00,2.5
"""DH""",2024-05-01 00:00:00,2.0
…,…,…
"""LM""",2024-02-01 00:00:00,3.0
"""LM""",2024-03-01 00:00:00,3.4
"""LM""",2024-04-01 00:00:00,3.8
"""LM""",2024-05-01 00:00:00,3.666667


# Grouping and Aggregating Using Window Functions

In [45]:
# Read course.parquet, attach the per-(subject, reason) mean of G1 and G2 to
# every row via window expressions, then keep only the rows whose G1 and G2
# are both above their group's mean.
all_students = pl.read_parquet("course.parquet")

all_students.select(
    pl.col("subject", "reason", "G1"),
    mean_G1=pl.col("G1").mean().over(["subject", "reason"]),
    # G2 is selected by keyword so that it lands between the two mean columns.
    G2=pl.col("G2"),
    mean_G2=pl.col("G2").mean().over(["subject", "reason"]),
).filter(
    # Multiple predicates passed to filter() are combined with AND.
    pl.col("G1") > pl.col("mean_G1"),
    pl.col("G2") > pl.col("mean_G2"),
)

subject,reason,G1,mean_G1,G2,mean_G2
str,str,i64,f64,i64,f64
"""M""","""home""",15,10.816514,14,10.743119
"""M""","""reputation""",15,11.457143,15,11.257143
"""M""","""home""",12,10.816514,12,10.743119
"""M""","""home""",16,10.816514,18,10.743119
"""M""","""home""",14,10.816514,15,10.743119
…,…,…,…,…,…
"""P""","""home""",17,11.657718,18,11.785235
"""P""","""home""",14,11.657718,15,11.785235
"""P""","""other""",14,10.694444,17,10.777778
"""P""","""course""",15,10.982456,15,11.147368


# Grouping and Aggregating Using Pivot Tables

In [50]:
# Read course.parquet and build a pivot table: one row per (subject, sex),
# one column per combination of value column (G1, G2) and "school" value,
# each cell holding the mean of the matching rows.
all_students = pl.read_parquet("course.parquet")

all_students.pivot(
    aggregate_function="mean",
    values=["G1", "G2"],
    index=["subject", "sex"],
    on="school",
)

subject,sex,G1_GP,G1_MS,G2_GP,G2_MS
str,str,f64,f64,f64,f64
"""M""","""F""",10.579235,10.92,10.398907,10.32
"""M""","""M""",11.337349,10.380952,11.204819,10.047619
"""P""","""F""",12.28692,10.582192,12.50211,10.719178
"""P""","""M""",11.602151,9.7875,11.688172,10.0875
