# 11. Working with Textual, Temporal, and Nested Data Types

In [None]:
import polars as pl
pl.__version__

'1.20.0'

## String

### String Methods

#### String methods for conversion

#### String methods for describing and querying

#### String methods for manipulation

### String Examples

In [8]:
# Sample strings carrying stray leading/trailing spaces, mixed case,
# underscores, and punctuation, used as raw input for the cells below.
corpus = pl.Series(
    "raw_text",
    [
        "  Data Science is amazing ",
        "Data_analysis > Data entry",
        " Python&Polars; Fast",
    ],
).to_frame()

corpus

raw_text
str
""" Data Science is amazing """
"""Data_analysis > Data entry"""
""" Python&Polars; Fast"""


In [9]:
# Derive a normalized `processed_text` column from `raw_text`:
#   1. strip surrounding whitespace,
#   2. lowercase everything,
#   3. turn underscores into spaces.
# `str.replace_all` treats its pattern as a regular expression unless told
# otherwise; `literal=True` states that a plain "_" is meant.
# The cell only reads `raw_text`, so re-running it yields the same frame.
corpus = corpus.with_columns(
    processed_text=pl.col("raw_text")
    .str.strip_chars()
    .str.to_lowercase()
    .str.replace_all("_", " ", literal=True)
)
corpus

raw_text,processed_text
str,str
""" Data Science is amazing ""","""data science is amazing"""
"""Data_analysis > Data entry""","""data analysis > data entry"""
""" Python&Polars; Fast""","""python&polars; fast"""


In [10]:
# Three ways of cutting pieces out of `processed_text`:
#   - the first five characters,
#   - the first and second token after splitting on single spaces.
corpus.with_columns(
    first_5_chars=pl.col("processed_text").str.slice(offset=0, length=5),
    first_word=pl.col("processed_text").str.split(by=" ").list.get(index=0),
    second_word=pl.col("processed_text").str.split(by=" ").list.get(index=1),
)

raw_text,processed_text,first_5_chars,first_word,second_word
str,str,str,str,str
""" Data Science is amazing ""","""data science is amazing""","""data ""","""data""","""science"""
"""Data_analysis > Data entry""","""data analysis > data entry""","""data ""","""data""","""analysis"""
""" Python&Polars; Fast""","""python&polars; fast""","""pytho""","""python&polars;""","""fast"""


In [11]:
# Describe each processed string: its length in characters, its length in
# bytes, and how many times the letter "a" occurs.
corpus.with_columns(
    len_chars=(
        pl.col("processed_text").str.len_chars()
    ),
    len_bytes=(
        pl.col("processed_text").str.len_bytes()
    ),
    count_a=(
        pl.col("processed_text").str.count_matches(pattern="a")
    ),
)

raw_text,processed_text,len_chars,len_bytes,count_a
str,str,u32,u32,u32
""" Data Science is amazing ""","""data science is amazing""",23,23,4
"""Data_analysis > Data entry""","""data analysis > data entry""",26,26,6
""" Python&Polars; Fast""","""python&polars; fast""",19,19,2


In [6]:
# Two sample posts: one containing hashtags and one without any.
posts = pl.DataFrame(
    {
        "post": [
            "Loving #python and #polars!",
            "A boomer post without a hashtag",
        ]
    }
)

# A "#" followed by one or more word characters.
hashtag_regex = r"#(\w+)"

# Gather every regex match per row into a list column.
posts.with_columns(
    hashtags=pl.col("post").str.extract_all(pattern=hashtag_regex),
)

post,hashtags
str,list[str]
"""Loving #python and #polars!""","[""#python"", ""#polars""]"
"""A boomer post without a hashta…",[]


## Categorical

In [7]:
# A Categorical column with a repeated value ("Lynx").
cats = pl.DataFrame(
    data={"name": ["Persian cat", "Siamese Cat", "Lynx", "Lynx"]},
    schema={"name": pl.Categorical},
)

# Show the integer code stored behind each category next to its label.
cats.with_columns(
    name_physical=pl.col("name").to_physical(),
)

name,name_physical
cat,u32
"""Persian cat""",0
"""Siamese Cat""",1
"""Lynx""",2
"""Lynx""",2


### Categorical Methods

### Categorical Examples

In [12]:
# A second Categorical frame that shares some names with `cats` but lists
# them in a different order.
more_cats = pl.DataFrame(
    data={"name": ["Maine Coon Cat", "Lynx", "Lynx", "Siamese Cat"]},
    schema={"name": pl.Categorical},
)

# Display the integer code behind each category of this second frame.
more_cats.with_columns(
    name_physical=pl.col("name").to_physical(),
)

name,name_physical
cat,u32
"""Maine Coon Cat""",0
"""Lynx""",1
"""Lynx""",1
"""Siamese Cat""",2


In [13]:
# Inner-join the two Categorical frames on `name`. They were built
# independently, and their physical codes for the same string differ
# ("Lynx" is 2 in `cats` but 1 in `more_cats`).
cats.join(
    more_cats,
    on="name",
    how="inner",
)

  cats.join(more_cats, on="name")


name
cat
"""Lynx"""
"""Lynx"""
"""Lynx"""
"""Lynx"""
"""Siamese Cat"""


In [14]:
# Create both frames inside a single StringCache context.
with pl.StringCache():
    left = pl.DataFrame(
        data={
            "categorical_column": ["value3", "value2", "value1"],
            "other": ["a", "b", "c"],
        },
        schema={
            "categorical_column": pl.Categorical,
            "other": pl.String,
        },
    )
    right = pl.DataFrame(
        data={
            "categorical_column": ["value2", "value3", "value4"],
            "other": ["d", "e", "f"],
        },
        schema={
            "categorical_column": pl.Categorical,
            "other": pl.String,
        },
    )

In [15]:
# Inner-join on the Categorical key; the clashing `other` column from
# `right` receives the "_right" suffix.
left.join(
    right,
    on="categorical_column",
    how="inner",
    suffix="_right",
)

categorical_column,other,other_right
cat,str,str
"""value2""","""b""","""d"""
"""value3""","""a""","""e"""


In [19]:
# Switch on Polars' global string cache.
# NOTE(review): no later cell in view switches it off again, so the cells
# that follow presumably all run with the cache enabled -- confirm this is
# intended.
pl.enable_string_cache()

In [20]:
# List the category labels of the Categorical column in `right`.
right.select(
    pl.col("categorical_column").cat.get_categories(),
)

categorical_column
str
"""value2"""
"""value3"""
"""value4"""


In [21]:
# Pair each category label (renamed to `cat_lexical`) with its physical
# integer code, as a starting point for comparing sort orders.
sorting_comparison_df = cats.select(
    cat_lexical=pl.col("name"),
    cat_physical=pl.col("name").to_physical(),
)

sorting_comparison_df

cat_lexical,cat_physical
cat,u32
"""Persian cat""",0
"""Siamese Cat""",1
"""Lynx""",2
"""Lynx""",2


In [22]:
# Cast to a Categorical with "physical" ordering, then sort on it; the
# displayed result follows the integer codes (0, 1, 2, 2).
(
    sorting_comparison_df
    .with_columns(
        pl.col("cat_lexical").cast(pl.Categorical(ordering="physical"))
    )
    .sort("cat_lexical")
)

cat_lexical,cat_physical
cat,u32
"""Persian cat""",0
"""Siamese Cat""",1
"""Lynx""",2
"""Lynx""",2


In [23]:
# Cast to a Categorical with "lexical" ordering, then sort on it; the
# displayed result follows the alphabetical order of the labels.
(
    sorting_comparison_df
    .with_columns(
        pl.col("cat_lexical").cast(pl.Categorical(ordering="lexical"))
    )
    .sort("cat_lexical")
)

cat_lexical,cat_physical
cat,u32
"""Lynx""",2
"""Lynx""",2
"""Persian cat""",0
"""Siamese Cat""",1


## Enum

In [24]:
# An Enum dtype declares its categories up front.
bear_enum_dtype = pl.Enum(["Polar", "Panda", "Brown"])

# The same five values stored once as an Enum ...
bear_enum_series = pl.Series(
    values=["Polar", "Panda", "Brown", "Brown", "Polar"],
    dtype=bear_enum_dtype,
)

# ... and once as a Categorical.
bear_cat_series = pl.Series(
    values=["Polar", "Panda", "Brown", "Brown", "Polar"],
    dtype=pl.Categorical,
)

## Temporal

### Temporal Methods

#### Temporal methods for conversion

#### Temporal methods for describing and querying

#### Temporal methods for manipulation

### Temporal Examples

#### Loading from a CSV file

In [25]:
# Read the stock prices and let Polars try to parse date-like columns;
# the `date` column is shown with dtype `date` in the output.
pl.read_csv(
    source="data/all_stocks.csv",
    try_parse_dates=True,
)

symbol,date,open,high,low,close,adj close,volume
str,date,f64,f64,f64,f64,f64,i64
"""ASML""",1999-01-04,11.765625,12.28125,11.765625,12.140625,7.522523,1801867
"""ASML""",1999-01-05,11.859375,14.25,11.71875,13.96875,8.655257,8241600
"""ASML""",1999-01-06,14.25,17.601563,14.203125,16.875,10.456018,16400267
"""ASML""",1999-01-07,14.742188,17.8125,14.53125,16.851563,10.441495,17722133
"""ASML""",1999-01-08,16.078125,16.289063,15.023438,15.796875,9.787995,10696000
…,…,…,…,…,…,…,…
"""TSM""",2023-06-26,102.019997,103.040001,100.089996,100.110001,99.125954,8560000
"""TSM""",2023-06-27,101.150002,102.790001,100.019997,102.080002,101.076591,9732000
"""TSM""",2023-06-28,100.5,101.879997,100.220001,100.919998,99.927986,8160900
"""TSM""",2023-06-29,101.339996,101.519997,100.019997,100.639999,99.650742,7383900


#### Converting to and from a String

In [26]:
# Start from ISO-formatted date strings and parse them into a Date column
# using an explicit format.
dates = (
    pl.DataFrame({"date_str": ["2023-12-31", "2024-02-29"]})
    .with_columns(
        date=pl.col("date_str").str.to_date(format="%Y-%m-%d"),
    )
)

dates

date_str,date
str,date
"""2023-12-31""",2023-12-31
"""2024-02-29""",2024-02-29


In [27]:
# Render the Date column back into text, this time as day-month-year.
dates.with_columns(
    formatted_date=pl.col("date").dt.to_string(format="%d-%m-%Y"),
)

date_str,date,formatted_date
str,date,str
"""2023-12-31""",2023-12-31,"""31-12-2023"""
"""2024-02-29""",2024-02-29,"""29-02-2024"""


#### Generating date ranges

In [28]:
# Weekly steps starting at 2024-10-28 and bounded by 2024-12-01;
# `eager=True` returns the materialized values rather than an expression.
pl.DataFrame(
    {
        "monday": pl.date_range(
            pl.date(2024, 10, 28),
            pl.date(2024, 12, 1),
            "1w",
            eager=True,
        ),
    }
)

monday
date
2024-10-28
2024-11-04
2024-11-11
2024-11-18
2024-11-25


#### Time zones

In [29]:
# Timestamps carrying different UTC offsets (+0100 and +0200).
# Step 1: parse them with an explicit format that includes the offset (%z);
#         the parsed column is displayed as datetime[us, UTC].
# Step 2: convert that column to the Europe/Amsterdam time zone.
(
    pl.DataFrame(
        {
            "utc_mixed_offset": [
                "2021-03-27T00:00:00+0100",
                "2021-03-28T00:00:00+0100",
                "2021-03-29T00:00:00+0200",
                "2021-03-30T00:00:00+0200",
            ]
        }
    )
    .with_columns(
        parsed=pl.col("utc_mixed_offset").str.to_datetime(
            format="%Y-%m-%dT%H:%M:%S%z"
        ),
    )
    .with_columns(
        converted=pl.col("parsed").dt.convert_time_zone(
            time_zone="Europe/Amsterdam"
        ),
    )
)

utc_mixed_offset,parsed,converted
str,"datetime[μs, UTC]","datetime[μs, Europe/Amsterdam]"
"""2021-03-27T00:00:00+0100""",2021-03-26 23:00:00 UTC,2021-03-27 00:00:00 CET
"""2021-03-28T00:00:00+0100""",2021-03-27 23:00:00 UTC,2021-03-28 00:00:00 CET
"""2021-03-29T00:00:00+0200""",2021-03-28 22:00:00 UTC,2021-03-29 00:00:00 CEST
"""2021-03-30T00:00:00+0200""",2021-03-29 22:00:00 UTC,2021-03-30 00:00:00 CEST


## List

### List Methods

### List Examples

In [30]:
# Lists of booleans with different lengths.
bools = pl.DataFrame(
    {
        "values": [
            [True, True],
            [False, False, True],
            [False],
        ]
    }
)

# Per row: are all elements true, and is at least one element true?
bools.with_columns(
    all_true=pl.col("values").list.all(),
    any_true=pl.col("values").list.any(),
)

values,all_true,any_true
list[bool],bool,bool
"[true, true]",True,True
"[false, false, true]",False,True
[false],False,False


In [31]:
# Each row holds the ages of one group.
groups = pl.DataFrame(
    {
        "ages": [
            [18, 21],
            [30, 40, 50],
            [42, 69],
        ]
    }
)

# Step 1: evaluate "older than 40" for every element of every list.
# Step 2: reduce each boolean list to a single "all over forty" flag.
# NOTE(review): `parallel=True` is kept from the original; the Polars docs
# advise enabling it only when there is enough work per list -- confirm it
# is wanted for an example this small.
(
    groups
    .with_columns(
        over_forty=pl.col("ages").list.eval(
            pl.element().gt(40),
            parallel=True,
        ),
    )
    .with_columns(
        all_over_forty=pl.col("over_forty").list.all(),
    )
)

ages,over_forty,all_over_forty
list[i64],list[bool],bool
"[18, 21]","[false, false]",False
"[30, 40, 50]","[false, false, true]",False
"[42, 69]","[true, true]",True


In [32]:
# Sort the elements inside each list from largest to smallest.
groups.with_columns(
    ages_sorted_descending=(
        pl.col("ages").list.sort(descending=True)
    ),
)

ages,ages_sorted_descending
list[i64],list[i64]
"[18, 21]","[21, 18]"
"[30, 40, 50]","[50, 40, 30]"
"[42, 69]","[69, 42]"


In [33]:
# Frame-level explode: every list element becomes its own row.
groups.explode(columns="ages")

ages
i64
18
21
30
40
50
42
69


In [34]:
# Expression-level explode of the same column.
groups.select(
    pl.col("ages").list.explode().alias("ages"),
)

ages
i64
18
21
30
40
50
42
69


## Array

### Array Methods

### Array Examples

In [35]:
# One row per location, with seven temperature readings stored in a
# fixed-width Array column.
events = pl.DataFrame(
    [
        pl.Series(
            name="location",
            values=["Paris", "Amsterdam", "Barcelona"],
            dtype=pl.String,
        ),
        pl.Series(
            name="temperatures",
            values=[
                [23, 27, 21, 22, 24, 23, 22],
                [17, 19, 15, 22, 18, 20, 21],
                [30, 32, 28, 29, 34, 33, 31],
            ],
            dtype=pl.Array(pl.Int64, shape=7),
        ),
    ]
)

events

location,temperatures
str,"array[i64, 7]"
"""Paris""","[23, 27, … 22]"
"""Amsterdam""","[17, 19, … 21]"
"""Barcelona""","[30, 32, … 31]"


In [36]:
# Per-row aggregates over the temperature array: the median, the maximum,
# and the zero-based position at which the maximum occurs.
events.with_columns(
    median=(
        pl.col("temperatures").arr.median()
    ),
    max=(
        pl.col("temperatures").arr.max()
    ),
    warmest_dow=(
        pl.col("temperatures").arr.arg_max()
    ),
)

location,temperatures,median,max,warmest_dow
str,"array[i64, 7]",f64,i64,u32
"""Paris""","[23, 27, … 22]",23.0,27,1
"""Amsterdam""","[17, 19, … 21]",19.0,22,3
"""Barcelona""","[30, 32, … 31]",31.0,34,4


## Struct

### Struct Methods

### Struct Examples

In [37]:
from datetime import date

# Each order is a dict with the same three keys; the resulting column is
# displayed with dtype struct[3].
orders = pl.DataFrame(
    {
        "customer_id": [2781, 6139, 5392],
        "order_details": [
            {
                "amount": 250.00,
                "date": date(year=2024, month=1, day=3),
                "items": 5,
            },
            {
                "amount": 150.00,
                "date": date(year=2024, month=1, day=5),
                "items": 1,
            },
            {
                "amount": 100.00,
                "date": date(year=2024, month=1, day=2),
                "items": 3,
            },
        ],
    },
)

orders

customer_id,order_details
i64,struct[3]
2781,"{250.0,2024-01-03,5}"
6139,"{150.0,2024-01-05,1}"
5392,"{100.0,2024-01-02,3}"


In [38]:
# Pull a single field out of the struct column.
orders.select(
    pl.col("order_details").struct.field("amount"),
)

amount
f64
250.0
150.0
100.0


In [39]:
# Expand the struct column so that each of its fields becomes a regular
# top-level column.
order_details_df = orders.unnest(columns="order_details")

order_details_df

customer_id,amount,date,items
i64,f64,date,i64
2781,250.0,2024-01-03,5
6139,150.0,2024-01-05,1
5392,100.0,2024-01-02,3


In [40]:
# The reverse direction: keep the three plain columns and also pack them
# back together into a single struct column.
order_details_df.select(
    "amount",
    "date",
    "items",
    order_details=pl.struct("amount", "date", "items"),
)

amount,date,items,order_details
f64,date,i64,struct[3]
250.0,2024-01-03,5,"{250.0,2024-01-03,5}"
150.0,2024-01-05,1,"{150.0,2024-01-05,1}"
100.0,2024-01-02,3,"{100.0,2024-01-02,3}"


In [41]:
# A small column of fruit names containing repeats.
basket = pl.DataFrame(
    {
        "fruit": [
            "cherry",
            "apple",
            "banana",
            "banana",
            "apple",
            "banana",
        ],
    }
)

basket

fruit
str
"""cherry"""
"""apple"""
"""banana"""
"""banana"""
"""apple"""
"""banana"""


In [42]:
# Count occurrences per fruit, most frequent first; the result is shown as
# a single struct column.
basket.select(
    pl.col("fruit").value_counts(sort=True),
)

fruit
struct[2]
"{""banana"",3}"
"{""apple"",2}"
"{""cherry"",1}"


In [43]:
# Same counts, with the struct expanded into separate `fruit` and `count`
# columns.
basket.select(
    pl.col("fruit")
    .value_counts(sort=True)
    .struct.unnest(),
)

fruit,count
str,u32
"""banana""",3
"""apple""",2
"""cherry""",1
