# 13. Joining and Concatenating

In [1]:
import polars as pl
pl.__version__  # The book is built with Polars version 1.20.0

'1.20.0'

## Joining

### Join Strategies

In [2]:
df_left = pl.DataFrame({"key": ["A", "B", "C", "D"], "value": [1, 2, 3, 4]})

df_right = pl.DataFrame({"key": ["B", "C", "D", "E"], "value": [5, 6, 7, 8]})

#### Inner

In [3]:
df_left.join(df_right, on="key", how="inner")

key,value,value_right
str,i64,i64
"""B""",2,5
"""C""",3,6
"""D""",4,7


#### Full

In [4]:
df_left.join(df_right, on="key", how="full", suffix="_other")

key,value,key_other,value_other
str,i64,str,i64
"""B""",2.0,"""B""",5.0
"""C""",3.0,"""C""",6.0
"""D""",4.0,"""D""",7.0
,,"""E""",8.0
"""A""",1.0,,


#### Left

In [5]:
df_left.join(df_right, on="key", how="left")

key,value,value_right
str,i64,i64
"""A""",1,
"""B""",2,5.0
"""C""",3,6.0
"""D""",4,7.0


#### Right

In [6]:
df_left.join(df_right, on="key", how="right")

value,key,value_right
i64,str,i64
2.0,"""B""",5
3.0,"""C""",6
4.0,"""D""",7
,"""E""",8


#### Cross

In [7]:
df_left.join(df_right, how="cross")

key,value,key_right,value_right
str,i64,str,i64
"""A""",1,"""B""",5
"""A""",1,"""C""",6
"""A""",1,"""D""",7
"""A""",1,"""E""",8
"""B""",2,"""B""",5
…,…,…,…
"""C""",3,"""E""",8
"""D""",4,"""B""",5
"""D""",4,"""C""",6
"""D""",4,"""D""",7


#### Semi

In [8]:
df_left.join(df_right, on="key", how="semi")

key,value
str,i64
"""B""",2
"""C""",3
"""D""",4


#### Anti

In [9]:
df_left.join(df_right, on="key", how="anti")

key,value
str,i64
"""A""",1


### Joining on Multiple Columns

In [10]:
residences_left = pl.DataFrame(
    {
        "name": ["Alice", "Bob", "Charlie", "Dave"],
        "city": ["NY", "LA", "NY", "SF"],
        "age": [25, 30, 35, 40],
    }
)

departments_right = pl.DataFrame(
    {
        "name": ["Alice", "Bob", "Charlie", "Dave"],
        "city": ["NY", "LA", "NY", "Chicago"],
        "department": ["Finance", "Marketing", "Engineering", "Operations"],
    }
)

residences_left.join(departments_right, on=["name", "city"], how="inner")

name,city,age,department
str,str,i64,str
"""Alice""","""NY""",25,"""Finance"""
"""Bob""","""LA""",30,"""Marketing"""
"""Charlie""","""NY""",35,"""Engineering"""


### Validation

#### Many-to-many

#### One-to-many

#### Many-to-one

#### One-to-one

In [11]:
employees = pl.DataFrame(
    {
        "employee_id": [1, 2, 3, 4],
        "name": ["Alice", "Bob", "Charlie", "Dave"],
        "department_id": [10, 10, 30, 10],
    }
)

departments = pl.DataFrame(
    {
        "department_id": [10, 20, 30],
        "department_name": [
            "Information Technology",
            "Finance",
            "Human Resources",
        ],
    }
)

employees.join(departments, on="department_id", how="left", validate="m:1")

employee_id,name,department_id,department_name
i64,str,i64,str
1,"""Alice""",10,"""Information Technology"""
2,"""Bob""",10,"""Information Technology"""
3,"""Charlie""",30,"""Human Resources"""
4,"""Dave""",10,"""Information Technology"""


In [12]:
# This raises a ComputeError:
# departments = pl.DataFrame(
#     {
#         "department_id": [10, 20, 10],
#         "department_name": [
#             "Information Technology",
#             "Finance",
#             "Human Resources",
#         ],
#     }
# )

# employees.join(
#     departments, on="department_id", how="left", validate="m:1"
# )

## Inexact Joining

In [13]:
df_left = pl.DataFrame({"int_id": [10, 5], "value": ["b", "a"]})

df_right = pl.DataFrame({"int_id": [4, 7, 12], "value": [1, 2, 3]})

In [14]:
# This raises an InvalidOperationError:
# df_left.join_asof(df_right, on="int_id", tolerance=3)

In [15]:
df_left = df_left.sort("int_id")
df_right = df_right

df_left.join_asof(df_right, on="int_id")

int_id,value,value_right
i64,str,i64
5,"""a""",1
10,"""b""",2


In [16]:
df_left.join_asof(
    df_right,
    on="int_id",
    coalesce=False,
)

int_id,value,int_id_right,value_right
i64,str,i64,i64
5,"""a""",4,1
10,"""b""",7,2


In [17]:
df_left.join_asof(
    df_right.rename({"int_id": "int_id_right"}),
    left_on="int_id",
    right_on="int_id_right",
)

int_id,value,int_id_right,value_right
i64,str,i64,i64
5,"""a""",4,1
10,"""b""",7,2


### Inexact Join Strategies

In [18]:
print(df_left)
print(df_right)

shape: (2, 2)
┌────────┬───────┐
│ int_id ┆ value │
│ ---    ┆ ---   │
│ i64    ┆ str   │
╞════════╪═══════╡
│ 5      ┆ a     │
│ 10     ┆ b     │
└────────┴───────┘
shape: (3, 2)
┌────────┬───────┐
│ int_id ┆ value │
│ ---    ┆ ---   │
│ i64    ┆ i64   │
╞════════╪═══════╡
│ 4      ┆ 1     │
│ 7      ┆ 2     │
│ 12     ┆ 3     │
└────────┴───────┘


In [19]:
df_left.join_asof(
    df_right,
    on="int_id",
    tolerance=3,
    strategy="backward",
)

int_id,value,value_right
i64,str,i64
5,"""a""",1
10,"""b""",2


In [20]:
df_left.join_asof(
    df_right,
    on="int_id",
    tolerance=3,
    strategy="forward",
)

int_id,value,value_right
i64,str,i64
5,"""a""",2
10,"""b""",3


In [None]:
df_left.join_asof(
    df_right,
    on="int_id",
    tolerance=3,
    strategy="nearest",
)

### Additional Fine-Tuning

### Use Case: Marketing Campaign Attribution

In [21]:
campaigns = pl.scan_csv("data/campaigns.csv")
campaigns.head(1).collect()

Campaign Name,Campaign Date,Product Type
str,str,str
"""Launch""","""2023-01-01 20:00:00""","""Electronics"""


In [22]:
campaigns.select(pl.col("Product Type").unique()).collect()

Product Type
str
"""Electronics"""
"""Books"""
"""Clothing"""
"""Furniture"""


In [23]:
transactions = pl.scan_csv("data/transactions.csv")
transactions.head(1).collect()

Sale Date,Product Type,Quantity
str,str,i64
"""2023-01-01 02:00:00.000000000""","""Books""",7


In [24]:
transactions = transactions.with_columns(
    pl.col("Sale Date")
    .str.to_datetime("%Y-%m-%d %H:%M:%S%.f")
    .cast(pl.Datetime("us")),
)
campaigns = campaigns.with_columns(
    pl.col("Campaign Date").str.to_datetime("%Y-%m-%d %H:%M:%S"),
)

sales_with_campaign_df = (
    transactions.sort("Sale Date")
    .join_asof(
        campaigns.sort("Campaign Date"),
        left_on="Sale Date",
        right_on="Campaign Date",
        by="Product Type",
        strategy="backward",
        tolerance="60d",
    )
    .collect()
)
sales_with_campaign_df

Sale Date,Product Type,Quantity,Campaign Name,Campaign Date
datetime[μs],str,i64,str,datetime[μs]
2023-01-01 01:26:12.558627,"""Electronics""",2,,
2023-01-01 02:00:00,"""Books""",7,,
2023-01-01 06:14:30.703535,"""Toys""",9,,
2023-01-01 06:52:25.117255,"""Clothing""",9,,
2023-01-01 07:44:50.234511,"""Books""",7,,
…,…,…,…,…
2023-12-31 15:45:29.296464,"""Clothing""",10,,
2023-12-31 18:15:09.765488,"""Toys""",4,,
2023-12-31 18:33:47.441372,"""Electronics""",7,,
2023-12-31 18:37:54.413720,"""Books""",6,,


In [25]:
(
    sales_with_campaign_df.group_by("Product Type", "Campaign Name")
    .agg(pl.col("Quantity").mean())
    .sort("Product Type", "Campaign Name")
)

Product Type,Campaign Name,Quantity
str,str,f64
"""Books""",,5.527716
"""Clothing""",,5.433385
"""Clothing""","""New Arrivals""",8.200581
"""Electronics""",,5.486832
"""Electronics""","""Launch""",8.080775
"""Electronics""","""Seasonal Sale""",8.471406
"""Furniture""",,5.430222
"""Furniture""","""Discount""",8.191888
"""Toys""",,5.50318


In [26]:
campaigns.filter(pl.col("Product Type") == "Books").collect()

Campaign Name,Campaign Date,Product Type
str,datetime[μs],str
"""Clearance""",2023-12-31 21:00:00,"""Books"""


In [27]:
(
    transactions.filter(
        (pl.col("Product Type") == "Books")
        & (
            pl.col("Sale Date")
            > pl.lit("2023-12-31 21:00:00").str.to_datetime()
        )
    ).collect()
)

Sale Date,Product Type,Quantity
datetime[μs],str,i64


## Vertical and Horizontal Concatenation

### Vertical

In [28]:
df1 = pl.DataFrame(
    {
        "id": [1, 2, 3],
        "value": ["a", "b", "c"],
    }
)
df2 = pl.DataFrame(
    {
        "id": [4, 5],
        "value": ["d", "e"],
    }
)
pl.concat([df1, df2], how="vertical")

id,value
i64,str
1,"""a"""
2,"""b"""
3,"""c"""
4,"""d"""
5,"""e"""


### Horizontal

In [29]:
df1 = pl.DataFrame(
    {
        "id": [1, 2, 3],
        "value": ["a", "b", "c"],
    }
)
df2 = pl.DataFrame(
    {
        "value2": ["x", "y"],
    }
)
pl.concat([df1, df2], how="horizontal")

id,value,value2
i64,str,str
1,"""a""","""x"""
2,"""b""","""y"""
3,"""c""",


### Diagonal

In [30]:
df1 = pl.DataFrame(
    {
        "id": [1, 2, 3],
        "value": ["a", "b", "c"],
    }
)
df2 = pl.DataFrame(
    {
        "value": ["d", "e"],
        "value2": ["x", "y"],
    }
)
pl.concat([df1, df2], how="diagonal")

id,value,value2
i64,str,str
1.0,"""a""",
2.0,"""b""",
3.0,"""c""",
,"""d""","""x"""
,"""e""","""y"""


### Align

In [31]:
df1 = pl.DataFrame(
    {
        "id": [1, 2, 3],
        "value": ["a", "b", "c"],
    }
)
df2 = pl.DataFrame(
    {
        "value": ["a", "c", "d"],
        "value2": ["x", "y", "z"],
    }
)
pl.concat([df1, df2], how="align")

id,value,value2
i64,str,str
1.0,"""a""","""x"""
2.0,"""b""",
3.0,"""c""","""y"""
,"""d""","""z"""


In [32]:
df1 = pl.DataFrame(
    {
        "id": [1, 2, 2],
        "value": ["a", "c", "b"],
    }
)
df2 = pl.DataFrame(
    {
        "id": [2, 2],
        "value": ["x", "y"],
    }
)
pl.align_frames(df1, df2, on="id")

[shape: (5, 2)
 ┌─────┬───────┐
 │ id  ┆ value │
 │ --- ┆ ---   │
 │ i64 ┆ str   │
 ╞═════╪═══════╡
 │ 1   ┆ a     │
 │ 2   ┆ c     │
 │ 2   ┆ b     │
 │ 2   ┆ c     │
 │ 2   ┆ b     │
 └─────┴───────┘,
 shape: (5, 2)
 ┌─────┬───────┐
 │ id  ┆ value │
 │ --- ┆ ---   │
 │ i64 ┆ str   │
 ╞═════╪═══════╡
 │ 1   ┆ null  │
 │ 2   ┆ x     │
 │ 2   ┆ x     │
 │ 2   ┆ y     │
 │ 2   ┆ y     │
 └─────┴───────┘]

### Relaxed

In [33]:
# This raises a SchemaError:
# df1 = pl.DataFrame(
#     {
#         "id": [1, 2, 3],
#         "value": ["a", "b", "c"],
#     }
# )
# df2 = pl.DataFrame(
#     {
#         "id": [4.0, 5.0],
#         "value": [1, 2],
#     }
# )
# pl.concat([df1, df2], how="vertical")

In [34]:
pl.concat([df1, df2], how="vertical_relaxed")

id,value
i64,str
1,"""a"""
2,"""c"""
2,"""b"""
2,"""x"""
2,"""y"""


### Stacking

In [35]:
df1 = pl.DataFrame(
    {
        "id": [1, 2],
        "value": ["a", "b"],
    }
)
df2 = pl.DataFrame(
    {
        "id": [3, 4],
        "value": ["c", "d"],
    }
)
df1.vstack(df2)

id,value
i64,str
1,"""a"""
2,"""b"""
3,"""c"""
4,"""d"""


In [36]:
df1 = pl.DataFrame(
    {
        "id": [1, 2],
        "value": ["a", "b"],
    }
)
df2 = pl.DataFrame(
    {
        "value2": ["x", "y"],
    }
)
df1.hstack(df2)

id,value,value2
i64,str,str
1,"""a""","""x"""
2,"""b""","""y"""


### Appending

In [37]:
series_a = pl.Series("a", [1, 2])
series_b = pl.Series("b", [3, 4])
series_a.append(series_b)

a
i64
1
2
3
4


### Extending

In [38]:
df1 = pl.DataFrame(
    {
        "id": [1, 2],
        "value": ["a", "b"],
    }
)
df2 = pl.DataFrame(
    {
        "id": [3, 4],
        "value": ["c", "d"],
    }
)
df1.extend(df2)

id,value
i64,str
1,"""a"""
2,"""b"""
3,"""c"""
4,"""d"""
