# 14. Reshaping

In [None]:
import polars as pl
# Show the installed Polars version; the recorded output for this cell is '1.20.0'.
pl.__version__

'1.20.0'

## Wide Versus Long DataFrames

In [2]:
# Wide layout: one row per student, one column per subject.
grades_wide = pl.DataFrame(
    [
        ("Jeroen", 85, 90, 88),
        ("Thijs", 78, 82, 80),
        ("Ritchie", 92, 85, 87),
    ],
    schema=["student", "math", "science", "history"],
    orient="row",
)

grades_wide

student,math,science,history
str,i64,i64,i64
"""Jeroen""",85,90,88
"""Thijs""",78,82,80
"""Ritchie""",92,85,87


In [3]:
# Long layout: one row per (student, subject) pair, with the grade in a single column.
grades_long = pl.DataFrame(
    {
        # Each student is repeated three times, once per subject.
        "student": [
            name for name in ("Jeroen", "Thijs", "Ritchie") for _ in range(3)
        ],
        # The three subjects cycle once per student.
        "subject": ["Math", "Science", "History"] * 3,
        "grade": [85, 90, 88, 78, 82, 80, 92, 85, 87],
    }
)

grades_long

student,subject,grade
str,str,i64
"""Jeroen""","""Math""",85
"""Jeroen""","""Science""",90
"""Jeroen""","""History""",88
"""Thijs""","""Math""",78
"""Thijs""","""Science""",82
"""Thijs""","""History""",80
"""Ritchie""","""Math""",92
"""Ritchie""","""Science""",85
"""Ritchie""","""History""",87


## Pivot to a Wider DataFrame

In [4]:
# Long-format grades, written row by row: (student, subject, grade).
grades = pl.DataFrame(
    [
        ("Jeroen", "Math", 85),
        ("Jeroen", "Science", 90),
        ("Jeroen", "History", 88),
        ("Thijs", "Math", 78),
        ("Thijs", "Science", 82),
        ("Thijs", "History", 80),
        ("Ritchie", "Math", 92),
        ("Ritchie", "Science", 85),
        ("Ritchie", "History", 87),
    ],
    schema=["student", "subject", "grade"],
    orient="row",
)

grades

student,subject,grade
str,str,i64
"""Jeroen""","""Math""",85
"""Jeroen""","""Science""",90
"""Jeroen""","""History""",88
"""Thijs""","""Math""",78
"""Thijs""","""Science""",82
"""Thijs""","""History""",80
"""Ritchie""","""Math""",92
"""Ritchie""","""Science""",85
"""Ritchie""","""History""",87


In [5]:
# Turn each distinct `subject` into its own column, keeping one row per student.
grades.pivot(
    "subject",
    index="student",
    values="grade",
)

student,Math,Science,History
str,i64,i64,i64
"""Jeroen""",85,90,88
"""Thijs""",78,82,80
"""Ritchie""",92,85,87


In [6]:
# Two students, two subjects, three grades for every (student, subject) pair.
multiple_grades = pl.DataFrame(
    {
        "student": ["Jeroen"] * 6 + ["Thijs"] * 6,
        "subject": (["Math"] * 3 + ["Science"] * 3) * 2,
        "grade": [85, 88, 85, 60, 66, 63, 51, 79, 62, 82, 85, 82],
    }
)

multiple_grades

student,subject,grade
str,str,i64
"""Jeroen""","""Math""",85
"""Jeroen""","""Math""",88
"""Jeroen""","""Math""",85
"""Jeroen""","""Science""",60
"""Jeroen""","""Science""",66
…,…,…
"""Thijs""","""Math""",79
"""Thijs""","""Math""",62
"""Thijs""","""Science""",82
"""Thijs""","""Science""",85


In [7]:
# Every (student, subject) pair has several grades, so they are averaged
# into a single cell while pivoting.
multiple_grades.pivot(
    "subject",
    index="student",
    values="grade",
    aggregate_function="mean",
)

student,Math,Science
str,f64,f64
"""Jeroen""",86.0,63.0
"""Thijs""",64.0,83.0


In [8]:
# Custom aggregation: the spread (highest minus lowest grade) within each cell.
grade_spread = pl.element().max() - pl.element().min()

multiple_grades.pivot(
    "subject",
    index="student",
    values="grade",
    aggregate_function=grade_spread,
)

student,Math,Science
str,i64,i64
"""Jeroen""",3,6
"""Thijs""",28,3


In [9]:
# Build pivot-style columns on a LazyFrame by hand with group_by/agg:
# one aggregated output column per value listed in `unique_column_values`.
lf = pl.LazyFrame(
    {
        "col1": ["a", "a", "a", "b", "b", "b"],
        "col2": ["x", "x", "x", "x", "y", "y"],
        "col3": [6, 7, 3, 2, 5, 7],
    }
)

index = pl.col("col1")  # becomes the row key
on = pl.col("col2")  # its values become the new column names
values = pl.col("col3")  # aggregated into the new columns
# The output columns are spelled out explicitly; only these values are used.
unique_column_values = ["x", "y"]


def aggregate_function(col: pl.Expr) -> pl.Expr:
    """Return the mean of the hyperbolic tangent of `col`."""
    return col.tanh().mean()


# maintain_order=True keeps the groups in the order they first appear in the
# input, so the displayed row order is the same on every run.
lf.group_by(index, maintain_order=True).agg(
    [
        # Restrict `values` to the rows belonging to this output column, then
        # aggregate; a group with no matching rows yields a null.
        aggregate_function(values.filter(on == value)).alias(value)
        for value in unique_column_values
    ]
).collect()

col1,x,y
str,f64,f64
"""a""",0.998347,
"""b""",0.964028,0.999954


## Unpivot to a Longer DataFrame

In [10]:
# Wide grades table used as the starting point for unpivoting.
grades_wide = pl.DataFrame(
    dict(
        student=["Jeroen", "Thijs", "Ritchie"],
        math=[85, 78, 92],
        science=[90, 82, 85],
        history=[88, 80, 87],
    )
)

grades_wide

student,math,science,history
str,i64,i64,i64
"""Jeroen""",85,90,88
"""Thijs""",78,82,80
"""Ritchie""",92,85,87


In [11]:
# Stack the three subject columns into (subject, grade) pairs per student.
grades_wide.unpivot(
    ["math", "science", "history"],
    index="student",
    variable_name="subject",
    value_name="grade",
)

student,subject,grade
str,str,i64
"""Jeroen""","""math""",85
"""Thijs""","""math""",78
"""Ritchie""","""math""",92
"""Jeroen""","""science""",90
"""Thijs""","""science""",82
"""Ritchie""","""science""",85
"""Jeroen""","""history""",88
"""Thijs""","""history""",80
"""Ritchie""","""history""",87


In [12]:
# Three students observed in two classes/semesters, with several identifying
# columns next to the three grade columns.
df = pl.DataFrame(
    {
        "student": ["Jeroen", "Thijs", "Ritchie"] * 2,
        "class": ["Math101"] * 3 + ["Math102"] * 3,
        "age": [20, 21, 22] * 2,
        "semester": ["Fall"] * 3 + ["Spring"] * 3,
        "math": [85, 78, 92, 88, 79, 95],
        "science": [90, 82, 85, 92, 81, 87],
        "history": [88, 80, 87, 85, 82, 89],
    }
)
df

student,class,age,semester,math,science,history
str,str,i64,str,i64,i64,i64
"""Jeroen""","""Math101""",20,"""Fall""",85,90,88
"""Thijs""","""Math101""",21,"""Fall""",78,82,80
"""Ritchie""","""Math101""",22,"""Fall""",92,85,87
"""Jeroen""","""Math102""",20,"""Spring""",88,92,85
"""Thijs""","""Math102""",21,"""Spring""",79,81,82
"""Ritchie""","""Math102""",22,"""Spring""",95,87,89


In [13]:
# Columns that identify a row are kept as-is; the grade columns are stacked.
id_columns = ["student", "class", "age", "semester"]
grade_columns = ["math", "science", "history"]

df.unpivot(
    on=grade_columns,
    index=id_columns,
    variable_name="subject",
    value_name="grade",
)

student,class,age,semester,subject,grade
str,str,i64,str,str,i64
"""Jeroen""","""Math101""",20,"""Fall""","""math""",85
"""Thijs""","""Math101""",21,"""Fall""","""math""",78
"""Ritchie""","""Math101""",22,"""Fall""","""math""",92
"""Jeroen""","""Math102""",20,"""Spring""","""math""",88
"""Thijs""","""Math102""",21,"""Spring""","""math""",79
…,…,…,…,…,…
"""Thijs""","""Math101""",21,"""Fall""","""history""",80
"""Ritchie""","""Math101""",22,"""Fall""","""history""",87
"""Jeroen""","""Math102""",20,"""Spring""","""history""",85
"""Thijs""","""Math102""",21,"""Spring""","""history""",82


## Transposing

In [14]:
# Wide grades table used as the input for transposing, built column by column.
grades_wide = pl.DataFrame(
    [
        pl.Series("student", ["Jeroen", "Thijs", "Ritchie"]),
        pl.Series("math", [85, 78, 92]),
        pl.Series("science", [90, 82, 85]),
        pl.Series("history", [88, 80, 87]),
    ]
)

grades_wide

student,math,science,history
str,i64,i64,i64
"""Jeroen""",85,90,88
"""Thijs""",78,82,80
"""Ritchie""",92,85,87


In [15]:
# Every row of `grades_wide` becomes a column after transposing, so generate
# exactly one name per row. The names are derived from `grades_wide` itself
# rather than from an unrelated DataFrame defined in an earlier cell.
report_columns = [f"report_{i}" for i in range(1, grades_wide.height + 1)]

# The original column names are kept in a new first column, `original_headers`.
grades_wide.transpose(
    include_header=True,
    header_name="original_headers",
    column_names=report_columns,
)

original_headers,report_1,report_2,report_3
str,str,str,str
"""student""","""Jeroen""","""Thijs""","""Ritchie"""
"""math""","""85""","""78""","""92"""
"""science""","""90""","""82""","""85"""
"""history""","""88""","""80""","""87"""


## Exploding

In [16]:
# Each student carries a list of math grades in a single cell.
grades_nested = pl.DataFrame(
    [
        ("Jeroen", [85, 90, 88]),
        ("Thijs", [78, 82, 80]),
        ("Ritchie", [92, 85, 87]),
    ],
    schema=["student", "math"],
    orient="row",
)

grades_nested

student,math
str,list[i64]
"""Jeroen""","[85, 90, 88]"
"""Thijs""","[78, 82, 80]"
"""Ritchie""","[92, 85, 87]"


In [17]:
# Give every element of each `math` list its own row; `student` is repeated.
grades_nested.explode(["math"])

student,math
str,i64
"""Jeroen""",85
"""Jeroen""",90
"""Jeroen""",88
"""Thijs""",78
"""Thijs""",82
"""Thijs""",80
"""Ritchie""",92
"""Ritchie""",85
"""Ritchie""",87


In [18]:
# Three list columns; Thijs has three math grades but only two grades for
# science and history.
grades_nested = pl.DataFrame(
    [
        pl.Series("student", ["Jeroen", "Thijs", "Ritchie"]),
        pl.Series("math", [[85, 90, 88], [78, 82, 80], [92, 85, 87]]),
        pl.Series("science", [[85, 90, 88], [78, 82], [92, 85, 87]]),
        pl.Series("history", [[85, 90, 88], [78, 82], [92, 85, 87]]),
    ]
)

grades_nested

student,math,science,history
str,list[i64],list[i64],list[i64]
"""Jeroen""","[85, 90, 88]","[85, 90, 88]","[85, 90, 88]"
"""Thijs""","[78, 82, 80]","[78, 82]","[78, 82]"
"""Ritchie""","[92, 85, 87]","[92, 85, 87]","[92, 85, 87]"


In [None]:
# Exploding several list columns at once needs lists of equal length within
# each row. Thijs has three math grades but only two science and history
# grades, so this raises a ShapeError:
# grades_nested.explode("math", "science", "history")

In [19]:
# No `on` is given, so every column other than `student` is unpivoted; each
# list of grades stays intact in the `grade` column.
grades_nested_long = grades_nested.unpivot(
    index=["student"],
    variable_name="subject",
    value_name="grade",
)

grades_nested_long

student,subject,grade
str,str,list[i64]
"""Jeroen""","""math""","[85, 90, 88]"
"""Thijs""","""math""","[78, 82, 80]"
"""Ritchie""","""math""","[92, 85, 87]"
"""Jeroen""","""science""","[85, 90, 88]"
"""Thijs""","""science""","[78, 82]"
"""Ritchie""","""science""","[92, 85, 87]"
"""Jeroen""","""history""","[85, 90, 88]"
"""Thijs""","""history""","[78, 82]"
"""Ritchie""","""history""","[92, 85, 87]"


In [20]:
# With a single list column left, lists of different lengths explode without error.
grades_nested_long.explode(["grade"])

student,subject,grade
str,str,i64
"""Jeroen""","""math""",85
"""Jeroen""","""math""",90
"""Jeroen""","""math""",88
"""Thijs""","""math""",78
"""Thijs""","""math""",82
…,…,…
"""Thijs""","""history""",78
"""Thijs""","""history""",82
"""Ritchie""","""history""",92
"""Ritchie""","""history""",85


In [21]:
# `nested_value` holds lists of lists of strings (two levels of nesting).
nested_lists = pl.DataFrame(
    dict(
        id=[1, 2],
        nested_value=[
            [["a", "b"]],
            [["c"], ["d", "e"]],
        ],
    ),
    strict=False,
)
nested_lists

id,nested_value
i64,list[list[str]]
1,"[[""a"", ""b""]]"
2,"[[""c""], [""d"", ""e""]]"


In [22]:
# A single explode removes only the outer level: list[list[str]] -> list[str].
nested_lists.explode(["nested_value"])

id,nested_value
i64,list[str]
1,"[""a"", ""b""]"
2,"[""c""]"
2,"[""d"", ""e""]"


In [23]:
# Explode once per nesting level to reach the individual strings.
(
    nested_lists
    .explode("nested_value")  # list[list[str]] -> list[str]
    .explode("nested_value")  # list[str] -> str
)

id,nested_value
i64,str
1,"""a"""
1,"""b"""
2,"""c"""
2,"""d"""
2,"""e"""


## Partition into Multiple DataFrames

In [24]:
# Six orders, written row by row: (OrderID, Product, Quantity, Region).
sales = pl.DataFrame(
    [
        (1, "A", 10, "North"),
        (2, "B", 5, "South"),
        (3, "A", 8, "North"),
        (4, "C", 7, "West"),
        (5, "B", 3, "South"),
        (6, "A", 12, "West"),
    ],
    schema=["OrderID", "Product", "Quantity", "Region"],
    orient="row",
)

In [25]:
# Split `sales` into a list of DataFrames, one per distinct `Region`.
sales.partition_by(["Region"])

[shape: (2, 4)
 ┌─────────┬─────────┬──────────┬────────┐
 │ OrderID ┆ Product ┆ Quantity ┆ Region │
 │ ---     ┆ ---     ┆ ---      ┆ ---    │
 │ i64     ┆ str     ┆ i64      ┆ str    │
 ╞═════════╪═════════╪══════════╪════════╡
 │ 1       ┆ A       ┆ 10       ┆ North  │
 │ 3       ┆ A       ┆ 8        ┆ North  │
 └─────────┴─────────┴──────────┴────────┘,
 shape: (2, 4)
 ┌─────────┬─────────┬──────────┬────────┐
 │ OrderID ┆ Product ┆ Quantity ┆ Region │
 │ ---     ┆ ---     ┆ ---      ┆ ---    │
 │ i64     ┆ str     ┆ i64      ┆ str    │
 ╞═════════╪═════════╪══════════╪════════╡
 │ 2       ┆ B       ┆ 5        ┆ South  │
 │ 5       ┆ B       ┆ 3        ┆ South  │
 └─────────┴─────────┴──────────┴────────┘,
 shape: (2, 4)
 ┌─────────┬─────────┬──────────┬────────┐
 │ OrderID ┆ Product ┆ Quantity ┆ Region │
 │ ---     ┆ ---     ┆ ---      ┆ ---    │
 │ i64     ┆ str     ┆ i64      ┆ str    │
 ╞═════════╪═════════╪══════════╪════════╡
 │ 4       ┆ C       ┆ 7        ┆ West   │
 │ 6   

In [26]:
# Same split, but the `Region` column is left out of each resulting DataFrame.
sales.partition_by(
    ["Region"],
    include_key=False,
)

[shape: (2, 3)
 ┌─────────┬─────────┬──────────┐
 │ OrderID ┆ Product ┆ Quantity │
 │ ---     ┆ ---     ┆ ---      │
 │ i64     ┆ str     ┆ i64      │
 ╞═════════╪═════════╪══════════╡
 │ 1       ┆ A       ┆ 10       │
 │ 3       ┆ A       ┆ 8        │
 └─────────┴─────────┴──────────┘,
 shape: (2, 3)
 ┌─────────┬─────────┬──────────┐
 │ OrderID ┆ Product ┆ Quantity │
 │ ---     ┆ ---     ┆ ---      │
 │ i64     ┆ str     ┆ i64      │
 ╞═════════╪═════════╪══════════╡
 │ 2       ┆ B       ┆ 5        │
 │ 5       ┆ B       ┆ 3        │
 └─────────┴─────────┴──────────┘,
 shape: (2, 3)
 ┌─────────┬─────────┬──────────┐
 │ OrderID ┆ Product ┆ Quantity │
 │ ---     ┆ ---     ┆ ---      │
 │ i64     ┆ str     ┆ i64      │
 ╞═════════╪═════════╪══════════╡
 │ 4       ┆ C       ┆ 7        │
 │ 6       ┆ A       ┆ 12       │
 └─────────┴─────────┴──────────┘]

In [27]:
# Return a dictionary instead of a list; each key is a tuple holding the
# partition's `Region` value.
sales_dict = sales.partition_by(
    by=["Region"],
    as_dict=True,
)

sales_dict

{('North',): shape: (2, 4)
 ┌─────────┬─────────┬──────────┬────────┐
 │ OrderID ┆ Product ┆ Quantity ┆ Region │
 │ ---     ┆ ---     ┆ ---      ┆ ---    │
 │ i64     ┆ str     ┆ i64      ┆ str    │
 ╞═════════╪═════════╪══════════╪════════╡
 │ 1       ┆ A       ┆ 10       ┆ North  │
 │ 3       ┆ A       ┆ 8        ┆ North  │
 └─────────┴─────────┴──────────┴────────┘,
 ('South',): shape: (2, 4)
 ┌─────────┬─────────┬──────────┬────────┐
 │ OrderID ┆ Product ┆ Quantity ┆ Region │
 │ ---     ┆ ---     ┆ ---      ┆ ---    │
 │ i64     ┆ str     ┆ i64      ┆ str    │
 ╞═════════╪═════════╪══════════╪════════╡
 │ 2       ┆ B       ┆ 5        ┆ South  │
 │ 5       ┆ B       ┆ 3        ┆ South  │
 └─────────┴─────────┴──────────┴────────┘,
 ('West',): shape: (2, 4)
 ┌─────────┬─────────┬──────────┬────────┐
 │ OrderID ┆ Product ┆ Quantity ┆ Region │
 │ ---     ┆ ---     ┆ ---      ┆ ---    │
 │ i64     ┆ str     ┆ i64      ┆ str    │
 ╞═════════╪═════════╪══════════╪════════╡
 │ 4       ┆ C 

In [28]:
# Keys are tuples, so a single region is looked up with a one-element tuple.
north_key = ("North",)
sales_dict[north_key]

OrderID,Product,Quantity,Region
i64,str,i64,str
1,"""A""",10,"""North"""
3,"""A""",8,"""North"""


## Takeaways