In [2]:
import pandas as pd
import polars as pl
from datetime import datetime

In [21]:
data = {
    "id": [1, 2, 3, 4, 5],
    "name": ["Alice", "Bob", "Charlie", "David", "Eve"],
    "age": [25, 30, 35, 40, 28],
    "salary": [50000, 60000, 55000, 70000, 65000],
    "dept": ["HR", "IT", "IT", "Finance", "HR"]
}

dpd = pd.DataFrame(data)
dpd

Unnamed: 0,id,name,age,salary,dept
0,1,Alice,25,50000,HR
1,2,Bob,30,60000,IT
2,3,Charlie,35,55000,IT
3,4,David,40,70000,Finance
4,5,Eve,28,65000,HR


In [22]:
dpl = pl.DataFrame(data)
dpl

id,name,age,salary,dept
i64,str,i64,i64,str
1,"""Alice""",25,50000,"""HR"""
2,"""Bob""",30,60000,"""IT"""
3,"""Charlie""",35,55000,"""IT"""
4,"""David""",40,70000,"""Finance"""
5,"""Eve""",28,65000,"""HR"""


In [6]:
dpd['name']

0      Alice
1        Bob
2    Charlie
3      David
4        Eve
Name: name, dtype: object

In [7]:
dpl['name']

name
str
"""Alice"""
"""Bob"""
"""Charlie"""
"""David"""
"""Eve"""


In [8]:
dpl.select('name')

name
str
"""Alice"""
"""Bob"""
"""Charlie"""
"""David"""
"""Eve"""


In [9]:
pl.col("name")

In [10]:
dpl.select([pl.col("name"), pl.col("salary")])

name,salary
str,i64
"""Alice""",50000
"""Bob""",60000
"""Charlie""",55000
"""David""",70000
"""Eve""",65000


In [11]:
dpl.select(['name', 'salary'])

name,salary
str,i64
"""Alice""",50000
"""Bob""",60000
"""Charlie""",55000
"""David""",70000
"""Eve""",65000


##### select

In [12]:
# ❌❌ ERROR:- AttributeError: 'str' object has no attribute 'mean' ❌❌

# dpl.select(['salary'.mean().alias('avg_salary'),
#             'age'.max().alias('oldest_age')])

In [13]:
dpl.select([pl.col("salary").mean().alias('avg_salary'),
            pl.col("age").max().alias('oldest_age')])

avg_salary,oldest_age
f64,i64
60000.0,40


In [56]:
dpl.select([pl.all().cast(pl.Utf8)]).with_columns(pl.lit(datetime.now()).alias("load_timestamp"))

id,name,age,salary,dept,load_timestamp
str,str,str,str,str,datetime[μs]
"""1""","""Alice""","""25""","""50000""","""HR""",2025-08-28 12:39:16.174166
"""2""","""Bob""","""30""","""60000""","""IT""",2025-08-28 12:39:16.174166
"""3""","""Charlie""","""35""","""55000""","""IT""",2025-08-28 12:39:16.174166
"""4""","""David""","""40""","""70000""","""Finance""",2025-08-28 12:39:16.174166
"""5""","""Eve""","""28""","""65000""","""HR""",2025-08-28 12:39:16.174166


### Row_numbers

In [15]:
dpl.height 

5

In [16]:
range(dpl.height)

range(0, 5)

In [24]:
dpl

id,name,age,salary,dept
i64,str,i64,i64,str
1,"""Alice""",25,50000,"""HR"""
2,"""Bob""",30,60000,"""IT"""
3,"""Charlie""",35,55000,"""IT"""
4,"""David""",40,70000,"""Finance"""
5,"""Eve""",28,65000,"""HR"""


In [27]:
dpl.with_row_index("index")

index,id,name,age,salary,dept
u32,i64,str,i64,i64,str
0,1,"""Alice""",25,50000,"""HR"""
1,2,"""Bob""",30,60000,"""IT"""
2,3,"""Charlie""",35,55000,"""IT"""
3,4,"""David""",40,70000,"""Finance"""
4,5,"""Eve""",28,65000,"""HR"""


In [33]:
dpl_new = dpl.with_row_index("index").rename({"index": "row_numbers"})
dpl_new

row_numbers,id,name,age,salary,dept
u32,i64,str,i64,i64,str
0,1,"""Alice""",25,50000,"""HR"""
1,2,"""Bob""",30,60000,"""IT"""
2,3,"""Charlie""",35,55000,"""IT"""
3,4,"""David""",40,70000,"""Finance"""
4,5,"""Eve""",28,65000,"""HR"""


In [None]:
# dpl_new.drop(pl.col("row_numbers"))
# OR
dpl_new.drop("row_numbers")

id,name,age,salary,dept
i64,str,i64,i64,str
1,"""Alice""",25,50000,"""HR"""
2,"""Bob""",30,60000,"""IT"""
3,"""Charlie""",35,55000,"""IT"""
4,"""David""",40,70000,"""Finance"""
5,"""Eve""",28,65000,"""HR"""


### pl.col() VS pl.lit()
> pl.col("...") → refers to an existing column in your DataFrame.
- Used when you want to transform, filter, or compute with existing data.

> pl.lit(value) → creates a literal constant value (the same for every row).
- Used for constants, default values, or batch audit fields.

> Both are expressions you can use inside select, with_columns, filter, group_by, etc.

In [36]:
df = pl.DataFrame({"a": [1, 2, 3], "b": [10, 20, 30]})
df

a,b
i64,i64
1,10
2,20
3,30


In [37]:
df.select(
    (pl.col("a") + pl.col("b")).alias("sum")
)

sum
i64
11
22
33


In [47]:
pl.col("a") * 0.9

In [46]:
df.select(
    pl.col("a") * 0.9
)

a
f64
0.9
1.8
2.7


In [44]:
df.select(
    pl.lit("constant").alias("flag"),
    pl.lit(datetime.now()).alias("timestamp")
)

flag,timestamp
str,datetime[μs]
"""constant""",2025-08-28 12:00:30.372824


In [None]:
df.select(
    pl.col('a'),
    pl.col('b'),
    pl.lit("constant").alias("flag"),
    pl.lit(datetime.now()).alias("timestamp")
)

a,b,flag,timestamp
i64,i64,str,datetime[μs]
1,10,"""constant""",2025-08-28 12:00:11.781613
2,20,"""constant""",2025-08-28 12:00:11.781613
3,30,"""constant""",2025-08-28 12:00:11.781613


In [52]:
df.select(
    pl.col("a") == pl.lit(2)
)

a
bool
False
True
False


### pl.all()

In [3]:
df = pl.DataFrame({
    "product_id": ["P1", "P1", "P2"],
    "product_name": ["Phone", "PhoneX", "Laptop"],
    "brand": ["Apple", "Apple", "Dell"],
    "price": [1000, 1200, 2000]
})
print(df)

shape: (3, 4)
┌────────────┬──────────────┬───────┬───────┐
│ product_id ┆ product_name ┆ brand ┆ price │
│ ---        ┆ ---          ┆ ---   ┆ ---   │
│ str        ┆ str          ┆ str   ┆ i64   │
╞════════════╪══════════════╪═══════╪═══════╡
│ P1         ┆ Phone        ┆ Apple ┆ 1000  │
│ P1         ┆ PhoneX       ┆ Apple ┆ 1200  │
│ P2         ┆ Laptop       ┆ Dell  ┆ 2000  │
└────────────┴──────────────┴───────┴───────┘


#### 1. Why group_by?
- to collect all rows that share the same product_id together
- i.e. if we want one row per product_id
> exactly like your SQL ROW_NUMBER() PARTITION BY product_id … WHERE row_num=1

In [4]:
for k,v in df.group_by('product_id'):
    print(k, v, sep="\n")

('P2',)
shape: (1, 4)
┌────────────┬──────────────┬───────┬───────┐
│ product_id ┆ product_name ┆ brand ┆ price │
│ ---        ┆ ---          ┆ ---   ┆ ---   │
│ str        ┆ str          ┆ str   ┆ i64   │
╞════════════╪══════════════╪═══════╪═══════╡
│ P2         ┆ Laptop       ┆ Dell  ┆ 2000  │
└────────────┴──────────────┴───────┴───────┘
('P1',)
shape: (2, 4)
┌────────────┬──────────────┬───────┬───────┐
│ product_id ┆ product_name ┆ brand ┆ price │
│ ---        ┆ ---          ┆ ---   ┆ ---   │
│ str        ┆ str          ┆ str   ┆ i64   │
╞════════════╪══════════════╪═══════╪═══════╡
│ P1         ┆ Phone        ┆ Apple ┆ 1000  │
│ P1         ┆ PhoneX       ┆ Apple ┆ 1200  │
└────────────┴──────────────┴───────┴───────┘


#### 2. Why .agg()?

- Once you group, Polars asks: “How should I collapse multiple rows in each group into one row?”
- That’s what .agg() (aggregate) specifies.

> Common aggregates: .count(), .sum(), .mean(), .first(), .last(), etc.

In [6]:
df.group_by('product_id').agg(pl.count())

(Deprecated in version 0.20.5)
  df.group_by('product_id').agg(pl.count())


product_id,count
str,u32
"""P2""",1
"""P1""",2


#### 3. Why pl.all().first()?

- pl.all() → means: apply the same rule to all columns in the DataFrame.

- .first() → says: take the first value of that column within the group.

> So pl.all().first() = for every column in the DataFrame, collapse duplicates by keeping the first value within each group.

In [10]:
df.group_by('product_id').agg(pl.all().first())

product_id,product_name,brand,price
str,str,str,i64
"""P1""","""Phone""","""Apple""",1000
"""P2""","""Laptop""","""Dell""",2000


In [11]:
df.group_by('product_id').agg(pl.all().last())

product_id,product_name,brand,price
str,str,str,i64
"""P2""","""Laptop""","""Dell""",2000
"""P1""","""PhoneX""","""Apple""",1200


#### 5. Why Not Just drop_duplicates?

- Drops duplicate product_id rows,
- But Keeps the first occurrence (only). ==> Can't be customised for last row or other function

> Both are valid — .group_by().agg() is more flexible if you want different rules per column (e.g., take min(price) but first(product_name)).

In [14]:
df = df.unique(subset=["product_id"], keep="first")
df

product_id,product_name,brand,price
str,str,str,i64
"""P2""","""Laptop""","""Dell""",2000
"""P1""","""Phone""","""Apple""",1000


#### implode vs. explode

In [None]:
s = pl.Series("gender", ["Male", "Female", "Other"])
s

gender
str
"""Male"""
"""Female"""
"""Other"""


In [None]:
s.implode()

gender
list[str]
"[""Male"", ""Female"", ""Other""]"


In [None]:
s1 = s.implode()

shape: (1,)
Series: 'gender' [list[str]]
[
	["Male", "Female", "Other"]
]


In [None]:
s1.explode()

gender
str
"""Male"""
"""Female"""
"""Other"""


##### using implode with 'is_not_in'
> new_rows = df_src.filter(pl.col(key_col).is_not_in(df_tgt.select(key_col)[key_col].implode()))

- df_tgt.select(key_col) → returns a DataFrame with one column.

- df_tgt.select(key_col)[key_col] → gives you a Series (a column).

- But is_in() / is_not_in() expects a list-like-value (Series of values) to compare against.

- .implode() takes that Series of many rows and combines them into a single Series containing a list.

> .implode() → turns that Series into a single column vector (instead of multiple rows), so that is_in can compare against it.

In [4]:
df_tgt = pl.DataFrame({"id": [2, 3, 4]})
print(df_tgt.select("id")["id"])
print(df_tgt.select("id")["id"].implode())

shape: (3,)
Series: 'id' [i64]
[
	2
	3
	4
]
shape: (1,)
Series: 'id' [list[i64]]
[
	[2, 3, 4]
]


In [6]:
# Alternative
print(df_tgt["id"])
print(df_tgt["id"].to_list())

shape: (3,)
Series: 'id' [i64]
[
	2
	3
	4
]
[2, 3, 4]


##### is_in VS anti-join
- > Use anti join (Version B) whenever possible because:
- It’s clearer (SQL-style semantics).
- Handles multi-column keys naturally.
- More scalable (doesn’t materialize a giant list in memory).
- > Use is_in (Version A) only if:
- You’re dealing with a single column key.
- The target set (active_rows) is very small.
- You want to write a quick one-liner.

In [3]:
df_src = pl.DataFrame({"id": [1, 2, 3, 4]})
active_rows = pl.DataFrame({"id": [2, 4]})
print(df_src)
active_rows

shape: (4, 1)
┌─────┐
│ id  │
│ --- │
│ i64 │
╞═════╡
│ 1   │
│ 2   │
│ 3   │
│ 4   │
└─────┘


id
i64
2
4


In [5]:
df_src.filter(~pl.col("id").is_in(active_rows["id"].implode()))

id
i64
1
3


In [None]:
df_src.join(active_rows, on="id", how="anti")

id
i64
1
3


### Parquet > CSV
- Smaller size (column compression).
- Faster queries (skip irrelevant columns, predicate pushdown).
- Schema preserved (no need to recast each time).
- Serializes your DataFrame to a .parquet file on disk.
- Keeps column types (strings, ints, dates, timestamps).
- Compresses by default using Snappy (good balance of speed & size).
- DWH-like feel → exactly how “bronze/silver/gold” layers are often stored in lakehouse pipelines.
- Columnar storage system

## joins

In [4]:
left_df = pl.DataFrame({
    "id": [1, 2, 3, 4],
    "name": ["Alice", "Bob", "Charlie", "David"],
    "age": [25, 30, 35, 40]
})

right_df = pl.DataFrame({
    "id": [1, 2, 3, 5],
    "salary": [50000, 60000, 70000, 80000],
    "department": ["HR", "IT", "Finance", "IT"]
})

print("Left table:")
print(left_df)
print("\nRight table:")
print(right_df)

# Method 1 in action
result1 = (
    left_df
    .join(right_df, on="id", how="inner")
    # .select(left_df.columns)
)
print("\nResult keeping only left table columns:")
print(result1)

# Method 5 - Semi join (most efficient for filtering)
result2 = left_df.join(right_df, on="id", how="semi")
print("\nResult using semi join (filtering only):")
print(result2)

# Method 6 - Anti join (right rows without matches in left):")
result_anti = (
    right_df
    .join(left_df, on="id", how="anti")  # Right rows WITHOUT matches in left
)
print("\nMethod 3 - Anti join (right rows without matches in left):")
print(result_anti)

Left table:
shape: (4, 3)
┌─────┬─────────┬─────┐
│ id  ┆ name    ┆ age │
│ --- ┆ ---     ┆ --- │
│ i64 ┆ str     ┆ i64 │
╞═════╪═════════╪═════╡
│ 1   ┆ Alice   ┆ 25  │
│ 2   ┆ Bob     ┆ 30  │
│ 3   ┆ Charlie ┆ 35  │
│ 4   ┆ David   ┆ 40  │
└─────┴─────────┴─────┘

Right table:
shape: (4, 3)
┌─────┬────────┬────────────┐
│ id  ┆ salary ┆ department │
│ --- ┆ ---    ┆ ---        │
│ i64 ┆ i64    ┆ str        │
╞═════╪════════╪════════════╡
│ 1   ┆ 50000  ┆ HR         │
│ 2   ┆ 60000  ┆ IT         │
│ 3   ┆ 70000  ┆ Finance    │
│ 5   ┆ 80000  ┆ IT         │
└─────┴────────┴────────────┘

Result keeping only left table columns:
shape: (3, 5)
┌─────┬─────────┬─────┬────────┬────────────┐
│ id  ┆ name    ┆ age ┆ salary ┆ department │
│ --- ┆ ---     ┆ --- ┆ ---    ┆ ---        │
│ i64 ┆ str     ┆ i64 ┆ i64    ┆ str        │
╞═════╪═════════╪═════╪════════╪════════════╡
│ 1   ┆ Alice   ┆ 25  ┆ 50000  ┆ HR         │
│ 2   ┆ Bob     ┆ 30  ┆ 60000  ┆ IT         │
│ 3   ┆ Charlie ┆ 35  ┆ 70000