# 9. Selecting and Creating Columns

In [None]:
import polars as pl
pl.__version__  

'1.20.0'

In [2]:
starwars = pl.read_parquet("data/starwars.parquet")
rebels = starwars.drop("films").filter(
    pl.col("name").is_in(["Luke Skywalker", "Leia Organa", "Han Solo"])
)

print(rebels[:, :6])
print(rebels[:, 6:11])
print(rebels[:, 11:])

shape: (3, 6)
┌────────────────┬────────┬──────┬────────────┬────────────┬───────────┐
│ name           ┆ height ┆ mass ┆ hair_color ┆ skin_color ┆ eye_color │
│ ---            ┆ ---    ┆ ---  ┆ ---        ┆ ---        ┆ ---       │
│ str            ┆ u16    ┆ f64  ┆ str        ┆ str        ┆ str       │
╞════════════════╪════════╪══════╪════════════╪════════════╪═══════════╡
│ Han Solo       ┆ 180    ┆ 80.0 ┆ brown      ┆ fair       ┆ brown     │
│ Leia Organa    ┆ 150    ┆ 49.0 ┆ brown      ┆ light      ┆ brown     │
│ Luke Skywalker ┆ 172    ┆ 77.0 ┆ blond      ┆ fair       ┆ blue      │
└────────────────┴────────┴──────┴────────────┴────────────┴───────────┘
shape: (3, 5)
┌────────────┬────────┬───────────┬───────────┬─────────┐
│ birth_year ┆ sex    ┆ gender    ┆ homeworld ┆ species │
│ ---        ┆ ---    ┆ ---       ┆ ---       ┆ ---     │
│ f64        ┆ cat    ┆ cat       ┆ str       ┆ str     │
╞════════════╪════════╪═══════════╪═══════════╪═════════╡
│ 29.0       ┆ male   ┆ m

## Selecting Columns

In [3]:
rebels.select(
    "name",
    pl.col("homeworld"),
    pl.col("^.*_color$"),
    (pl.col("height") / 100).alias("height_m"),
)

name,homeworld,hair_color,skin_color,eye_color,height_m
str,str,str,str,str,f64
"""Han Solo""","""Corellia""","""brown""","""fair""","""brown""",1.8
"""Leia Organa""","""Alderaan""","""brown""","""light""","""brown""",1.5
"""Luke Skywalker""","""Tatooine""","""blond""","""fair""","""blue""",1.72


### Introducing Selectors

In [4]:
import polars.selectors as cs

In [5]:
rebels.select(
    "name",
    cs.by_name("homeworld"),
    cs.by_name("^.*_color$"),
    (cs.by_name("height") / 100).alias("height_m"),
)

name,homeworld,hair_color,skin_color,eye_color,height_m
str,str,str,str,str,f64
"""Han Solo""","""Corellia""","""brown""","""fair""","""brown""",1.8
"""Leia Organa""","""Alderaan""","""brown""","""light""","""brown""",1.5
"""Luke Skywalker""","""Tatooine""","""blond""","""fair""","""blue""",1.72


### Selecting Based on Name

In [6]:
rebels.select(cs.starts_with("birth_"))

birth_year,birth_date
f64,date
29.0,1948-06-01
19.0,1958-05-30
19.0,1958-05-30


In [7]:
rebels.select(cs.ends_with("_color"))

hair_color,skin_color,eye_color
str,str,str
"""brown""","""fair""","""brown"""
"""brown""","""light""","""brown"""
"""blond""","""fair""","""blue"""


In [8]:
rebels.select(cs.contains("_"))

hair_color,skin_color,eye_color,birth_year,birth_date,screen_time
str,str,str,f64,date,duration[μs]
"""brown""","""fair""","""brown""",29.0,1948-06-01,1h 12m 37s
"""brown""","""light""","""brown""",19.0,1958-05-30,1h 3m 40s
"""blond""","""fair""","""blue""",19.0,1958-05-30,1h 58m 44s


In [9]:
rebels.select(cs.matches("^[a-z]{4}$"))

name,mass
str,f64
"""Han Solo""",80.0
"""Leia Organa""",49.0
"""Luke Skywalker""",77.0


### Selecting Based on Data Type

In [10]:
rebels.group_by("hair_color").agg(cs.numeric().mean())

hair_color,height,mass,birth_year
str,f64,f64,f64
"""blond""",172.0,77.0,19.0
"""brown""",165.0,64.5,24.0


In [11]:
rebels.select(cs.string())

name,hair_color,skin_color,eye_color,homeworld,species
str,str,str,str,str,str
"""Han Solo""","""brown""","""fair""","""brown""","""Corellia""","""Human"""
"""Leia Organa""","""brown""","""light""","""brown""","""Alderaan""","""Human"""
"""Luke Skywalker""","""blond""","""fair""","""blue""","""Tatooine""","""Human"""


In [12]:
rebels.select(cs.temporal())

birth_date,screen_time
date,duration[μs]
1948-06-01,1h 12m 37s
1958-05-30,1h 3m 40s
1958-05-30,1h 58m 44s


In [13]:
rebels.select(cs.by_dtype(pl.List(pl.String)))

vehicles,starships
list[str],list[str]
,"[""Millennium Falcon"", ""Imperial shuttle""]"
"[""Imperial Speeder Bike""]",
"[""Snowspeeder"", ""Imperial Speeder Bike""]","[""X-wing"", ""Imperial shuttle""]"


### Selecting Based on Position

In [14]:
rebels.select(cs.by_index(range(0, 999, 3)))

name,hair_color,birth_year,homeworld,starships
str,str,f64,str,list[str]
"""Han Solo""","""brown""",29.0,"""Corellia""","[""Millennium Falcon"", ""Imperial shuttle""]"
"""Leia Organa""","""brown""",19.0,"""Alderaan""",
"""Luke Skywalker""","""blond""",19.0,"""Tatooine""","[""X-wing"", ""Imperial shuttle""]"


In [15]:
rebels.select("name", cs.by_index(range(-2, 0)))

name,birth_date,screen_time
str,date,duration[μs]
"""Han Solo""",1948-06-01,1h 12m 37s
"""Leia Organa""",1958-05-30,1h 3m 40s
"""Luke Skywalker""",1958-05-30,1h 58m 44s


In [None]:
# This raises a ColumnNotFoundError:
# rebels.select(cs.by_index(20))

In [16]:
rebels.select(cs.by_index(range(20, 22)))

### Combining Selectors

In [17]:
rebels.select(cs.by_name("hair_color") | cs.numeric())

height,mass,hair_color,birth_year
u16,f64,str,f64
180,80.0,"""brown""",29.0
150,49.0,"""brown""",19.0
172,77.0,"""blond""",19.0


In [18]:
df = pl.DataFrame({"d": 1, "i": True, "s": True, "c": True, "o": 1.0})

print(df)

x = cs.by_name("d", "i", "s")
y = cs.boolean()

print("\nselector => columns")

for s in ["x", "y", "x | y", "x & y", "x - y", "x ^ y", "~x", "x - x"]:
    print(f"{s:8} => {cs.expand_selector(df, eval(s))}")

shape: (1, 5)
┌─────┬──────┬──────┬──────┬─────┐
│ d   ┆ i    ┆ s    ┆ c    ┆ o   │
│ --- ┆ ---  ┆ ---  ┆ ---  ┆ --- │
│ i64 ┆ bool ┆ bool ┆ bool ┆ f64 │
╞═════╪══════╪══════╪══════╪═════╡
│ 1   ┆ true ┆ true ┆ true ┆ 1.0 │
└─────┴──────┴──────┴──────┴─────┘

selector => columns
x        => ('d', 'i', 's')
y        => ('i', 's', 'c')
x | y    => ('d', 'i', 's', 'c')
x & y    => ('i', 's')
x - y    => ('d',)
x ^ y    => ('d', 'c')
~x       => ('c', 'o')
x - x    => ()


In [19]:
df.select(x - x)

In [20]:
print(df.select(first := cs.by_name("c", "i"), ~first))
print(f"first: {first}, ~first: {~first}")

shape: (1, 5)
┌──────┬──────┬─────┬──────┬─────┐
│ c    ┆ i    ┆ d   ┆ s    ┆ o   │
│ ---  ┆ ---  ┆ --- ┆ ---  ┆ --- │
│ bool ┆ bool ┆ i64 ┆ bool ┆ f64 │
╞══════╪══════╪═════╪══════╪═════╡
│ true ┆ true ┆ 1   ┆ true ┆ 1.0 │
└──────┴──────┴─────┴──────┴─────┘
first: cols(["c", "i"]), ~first: selector


In [21]:
print(df.select(first := cs.last(), ~first))
print(f"first: {first}, ~first: {~first}")

shape: (1, 5)
┌─────┬─────┬──────┬──────┬──────┐
│ o   ┆ d   ┆ i    ┆ s    ┆ c    │
│ --- ┆ --- ┆ ---  ┆ ---  ┆ ---  │
│ f64 ┆ i64 ┆ bool ┆ bool ┆ bool │
╞═════╪═════╪══════╪══════╪══════╡
│ 1.0 ┆ 1   ┆ true ┆ true ┆ true │
└─────┴─────┴──────┴──────┴──────┘
first: nth(-1), ~first: selector


## Creating Columns

In [22]:
rebels.with_columns(bmi=pl.col("mass") / ((pl.col("height") / 100) ** 2))

name,height,mass,hair_color,skin_color,eye_color,birth_year,sex,gender,homeworld,species,vehicles,starships,birth_date,screen_time,bmi
str,u16,f64,str,str,str,f64,cat,cat,str,str,list[str],list[str],date,duration[μs],f64
"""Han Solo""",180,80.0,"""brown""","""fair""","""brown""",29.0,"""male""","""masculine""","""Corellia""","""Human""",,"[""Millennium Falcon"", ""Imperial shuttle""]",1948-06-01,1h 12m 37s,24.691358
"""Leia Organa""",150,49.0,"""brown""","""light""","""brown""",19.0,"""female""","""feminine""","""Alderaan""","""Human""","[""Imperial Speeder Bike""]",,1958-05-30,1h 3m 40s,21.777778
"""Luke Skywalker""",172,77.0,"""blond""","""fair""","""blue""",19.0,"""male""","""masculine""","""Tatooine""","""Human""","[""Snowspeeder"", ""Imperial Speeder Bike""]","[""X-wing"", ""Imperial shuttle""]",1958-05-30,1h 58m 44s,26.027582


In [23]:
df = pl.DataFrame({"a": [1, 2, 3]})
df.with_columns(pl.col("a") * 2)

a
i64
2
4
6


In [24]:
df.with_columns(a2=pl.col("a") * 2)

a,a2
i64,i64
1,2
2,4
3,6


In [25]:
rebels.with_columns(
    bmi=pl.col("mass") / ((pl.col("height") / 100) ** 2),
    age_destroy=(
        (pl.date(1983, 5, 25) - pl.col("birth_date")).dt.total_days() / 365
    ).cast(pl.UInt8),
)

name,height,mass,hair_color,skin_color,eye_color,birth_year,sex,gender,homeworld,species,vehicles,starships,birth_date,screen_time,bmi,age_destroy
str,u16,f64,str,str,str,f64,cat,cat,str,str,list[str],list[str],date,duration[μs],f64,u8
"""Han Solo""",180,80.0,"""brown""","""fair""","""brown""",29.0,"""male""","""masculine""","""Corellia""","""Human""",,"[""Millennium Falcon"", ""Imperial shuttle""]",1948-06-01,1h 12m 37s,24.691358,35
"""Leia Organa""",150,49.0,"""brown""","""light""","""brown""",19.0,"""female""","""feminine""","""Alderaan""","""Human""","[""Imperial Speeder Bike""]",,1958-05-30,1h 3m 40s,21.777778,25
"""Luke Skywalker""",172,77.0,"""blond""","""fair""","""blue""",19.0,"""male""","""masculine""","""Tatooine""","""Human""","[""Snowspeeder"", ""Imperial Speeder Bike""]","[""X-wing"", ""Imperial shuttle""]",1958-05-30,1h 58m 44s,26.027582,25


In [26]:
# This raises a ColumnNotFoundError:
# rebels.with_columns(
#     bmi=pl.col("mass") / ((pl.col("height") / 100) ** 2),
#     bmi_cat=pl.col("bmi").cut(
#         [18.5, 25], labels=["Underweight", "Normal", "Overweight"]
#     ),
# )

In [27]:
(
    rebels.with_columns(
        bmi=pl.col("mass") / ((pl.col("height") / 100) ** 2)
    ).with_columns(
        bmi_cat=pl.col("bmi").cut(
            [18.5, 25], labels=["Underweight", "Normal", "Overweight"]
        )
    )
)

name,height,mass,hair_color,skin_color,eye_color,birth_year,sex,gender,homeworld,species,vehicles,starships,birth_date,screen_time,bmi,bmi_cat
str,u16,f64,str,str,str,f64,cat,cat,str,str,list[str],list[str],date,duration[μs],f64,cat
"""Han Solo""",180,80.0,"""brown""","""fair""","""brown""",29.0,"""male""","""masculine""","""Corellia""","""Human""",,"[""Millennium Falcon"", ""Imperial shuttle""]",1948-06-01,1h 12m 37s,24.691358,"""Normal"""
"""Leia Organa""",150,49.0,"""brown""","""light""","""brown""",19.0,"""female""","""feminine""","""Alderaan""","""Human""","[""Imperial Speeder Bike""]",,1958-05-30,1h 3m 40s,21.777778,"""Normal"""
"""Luke Skywalker""",172,77.0,"""blond""","""fair""","""blue""",19.0,"""male""","""masculine""","""Tatooine""","""Human""","[""Snowspeeder"", ""Imperial Speeder Bike""]","[""X-wing"", ""Imperial shuttle""]",1958-05-30,1h 58m 44s,26.027582,"""Overweight"""


In [None]:
# This raises a SyntaxError:
# starwars.select(
#     "name",
#     bmi=(pl.col("mass") / ((pl.col("height") / 100) ** 2)),
#     "species",
# )

In [28]:
(
    starwars.select(
        "name",
        (pl.col("mass") / ((pl.col("height") / 100) ** 2)).alias("bmi"),
        "species",
    )
    .drop_nulls()
    .top_k(5, by="bmi")
)

name,bmi,species
str,f64,str
"""Jabba Desilijic Tiure""",443.428571,"""Hutt"""
"""Dud Bolt""",50.928022,"""Vulptereen"""
"""Yoda""",39.02663,"""Yoda's species"""
"""Owen Lars""",37.874006,"""Human"""
"""IG-88""",35.0,"""Droid"""


In [29]:
df.with_columns(pl.lit(1).alias("ones"))

a,ones
i64,i32
1,1
2,1
3,1


In [30]:
df.select(pl.all(), pl.lit(1).alias("ones"))

a,ones
i64,i32
1,1
2,1
3,1


## Related Column Operations

### Dropping

In [31]:
rebels.drop("name", "films", "screen_time", strict=False)

height,mass,hair_color,skin_color,eye_color,birth_year,sex,gender,homeworld,species,vehicles,starships,birth_date
u16,f64,str,str,str,f64,cat,cat,str,str,list[str],list[str],date
180,80.0,"""brown""","""fair""","""brown""",29.0,"""male""","""masculine""","""Corellia""","""Human""",,"[""Millennium Falcon"", ""Imperial shuttle""]",1948-06-01
150,49.0,"""brown""","""light""","""brown""",19.0,"""female""","""feminine""","""Alderaan""","""Human""","[""Imperial Speeder Bike""]",,1958-05-30
172,77.0,"""blond""","""fair""","""blue""",19.0,"""male""","""masculine""","""Tatooine""","""Human""","[""Snowspeeder"", ""Imperial Speeder Bike""]","[""X-wing"", ""Imperial shuttle""]",1958-05-30


In [32]:
rebels.select(~cs.by_name("name", "films", "screen_time"))

height,mass,hair_color,skin_color,eye_color,birth_year,sex,gender,homeworld,species,vehicles,starships,birth_date
u16,f64,str,str,str,f64,cat,cat,str,str,list[str],list[str],date
180,80.0,"""brown""","""fair""","""brown""",29.0,"""male""","""masculine""","""Corellia""","""Human""",,"[""Millennium Falcon"", ""Imperial shuttle""]",1948-06-01
150,49.0,"""brown""","""light""","""brown""",19.0,"""female""","""feminine""","""Alderaan""","""Human""","[""Imperial Speeder Bike""]",,1958-05-30
172,77.0,"""blond""","""fair""","""blue""",19.0,"""male""","""masculine""","""Tatooine""","""Human""","[""Snowspeeder"", ""Imperial Speeder Bike""]","[""X-wing"", ""Imperial shuttle""]",1958-05-30


In [33]:
rebels.select(cs.exclude("name", "films", "screen_time"))

height,mass,hair_color,skin_color,eye_color,birth_year,sex,gender,homeworld,species,vehicles,starships,birth_date
u16,f64,str,str,str,f64,cat,cat,str,str,list[str],list[str],date
180,80.0,"""brown""","""fair""","""brown""",29.0,"""male""","""masculine""","""Corellia""","""Human""",,"[""Millennium Falcon"", ""Imperial shuttle""]",1948-06-01
150,49.0,"""brown""","""light""","""brown""",19.0,"""female""","""feminine""","""Alderaan""","""Human""","[""Imperial Speeder Bike""]",,1958-05-30
172,77.0,"""blond""","""fair""","""blue""",19.0,"""male""","""masculine""","""Tatooine""","""Human""","[""Snowspeeder"", ""Imperial Speeder Bike""]","[""X-wing"", ""Imperial shuttle""]",1958-05-30


### Renaming

In [34]:
(
    rebels.rename({"homeworld": "planet", "mass": "weight"})
    .rename(lambda s: s.removesuffix("_color"))
    .select("name", "planet", "weight", "hair", "skin", "eye")
)

name,planet,weight,hair,skin,eye
str,str,f64,str,str,str
"""Han Solo""","""Corellia""",80.0,"""brown""","""fair""","""brown"""
"""Leia Organa""","""Alderaan""",49.0,"""brown""","""light""","""brown"""
"""Luke Skywalker""","""Tatooine""",77.0,"""blond""","""fair""","""blue"""


### Stacking

In [35]:
rebel_names = rebels.select("name")
rebel_colors = rebels.select(cs.ends_with("_color"))
rebel_quotes = pl.Series(
    "quote",
    [
        "You know, sometimes I amaze myself.",
        "That doesn't sound too hard.",
        "I have a bad feeling about this.",
    ],
)

(rebel_names.hstack(rebel_colors).hstack([rebel_quotes]))

name,hair_color,skin_color,eye_color,quote
str,str,str,str,str
"""Han Solo""","""brown""","""fair""","""brown""","""You know, sometimes I amaze my…"
"""Leia Organa""","""brown""","""light""","""brown""","""That doesn't sound too hard."""
"""Luke Skywalker""","""blond""","""fair""","""blue""","""I have a bad feeling about thi…"


### Adding Row Indices

In [36]:
rebels.with_row_index(name="rebel_id", offset=1)

rebel_id,name,height,mass,hair_color,skin_color,eye_color,birth_year,sex,gender,homeworld,species,vehicles,starships,birth_date,screen_time
u32,str,u16,f64,str,str,str,f64,cat,cat,str,str,list[str],list[str],date,duration[μs]
1,"""Han Solo""",180,80.0,"""brown""","""fair""","""brown""",29.0,"""male""","""masculine""","""Corellia""","""Human""",,"[""Millennium Falcon"", ""Imperial shuttle""]",1948-06-01,1h 12m 37s
2,"""Leia Organa""",150,49.0,"""brown""","""light""","""brown""",19.0,"""female""","""feminine""","""Alderaan""","""Human""","[""Imperial Speeder Bike""]",,1958-05-30,1h 3m 40s
3,"""Luke Skywalker""",172,77.0,"""blond""","""fair""","""blue""",19.0,"""male""","""masculine""","""Tatooine""","""Human""","[""Snowspeeder"", ""Imperial Speeder Bike""]","[""X-wing"", ""Imperial shuttle""]",1958-05-30,1h 58m 44s
