# polarsの基礎学習

In [1]:
import os

DATADIR = "../data/"
SAMPLE_DIR = os.path.join(DATADIR, "sample")


## クイックスタート

<https://pola-rs.github.io/polars-book/user-guide/quickstart/quick-exploration-guide.html>

In [2]:
from datetime import datetime, timedelta

import polars as pl
import numpy as np


### オブジェクトの作成

In [3]:
# key-value
series = pl.Series("a", [1, 2, 3, 4, 5])

print(series)


shape: (5,)
Series: 'a' [i64]
[
	1
	2
	3
	4
	5
]


In [4]:
# list
series = pl.Series([1, 2, 3, 4, 5])

print(series)


shape: (5,)
Series: '' [i64]
[
	1
	2
	3
	4
	5
]


In [5]:
dataframe = pl.DataFrame(
    {
        "integer": [1, 2, 3],
        "date": [(datetime(2022, 1, 1)), (datetime(2022, 1, 2)), (datetime(2022, 1, 3))],
        "float": [4.0, 5.0, 6.0],
    }
)

print(dataframe)


shape: (3, 3)
┌─────────┬─────────────────────┬───────┐
│ integer ┆ date                ┆ float │
│ ---     ┆ ---                 ┆ ---   │
│ i64     ┆ datetime[μs]        ┆ f64   │
╞═════════╪═════════════════════╪═══════╡
│ 1       ┆ 2022-01-01 00:00:00 ┆ 4.0   │
│ 2       ┆ 2022-01-02 00:00:00 ┆ 5.0   │
│ 3       ┆ 2022-01-03 00:00:00 ┆ 6.0   │
└─────────┴─────────────────────┴───────┘


In [6]:
fake_physical_csv = os.path.join(SAMPLE_DIR, "fake-physical.csv")
df_csv = pl.read_csv(fake_physical_csv)
print(df_csv)


shape: (300, 3)
┌────────────┬───────────┬─────┐
│ height     ┆ weight    ┆ age │
│ ---        ┆ ---       ┆ --- │
│ f64        ┆ f64       ┆ i64 │
╞════════════╪═══════════╪═════╡
│ 186.841723 ┆ 53.281552 ┆ 18  │
│ 160.624646 ┆ 42.62277  ┆ 23  │
│ 187.717041 ┆ 70.062486 ┆ 27  │
│ 179.71521  ┆ 46.055601 ┆ 19  │
│ ...        ┆ ...       ┆ ... │
│ 168.65566  ┆ 63.386823 ┆ 26  │
│ 169.872215 ┆ 75.888479 ┆ 31  │
│ 162.099499 ┆ 60.009295 ┆ 28  │
│ 165.639841 ┆ 38.386991 ┆ 23  │
└────────────┴───────────┴─────┘


### データの表示

pandasと似ている

In [7]:
df = pl.DataFrame(
    {
        "a": np.arange(0, 8),
        "b": np.random.rand(8),
        "c": [datetime(2022, 12, 1) + timedelta(days=idx) for idx in range(8)],
        "d": [1, 2.0, np.NaN, np.NaN, 0, -5, -42, None],
    }
)
df


a,b,c,d
i64,f64,datetime[μs],f64
0,0.944905,2022-12-01 00:00:00,1.0
1,0.588551,2022-12-02 00:00:00,2.0
2,0.037508,2022-12-03 00:00:00,
3,0.340911,2022-12-04 00:00:00,
4,0.819841,2022-12-05 00:00:00,0.0
5,0.560921,2022-12-06 00:00:00,-5.0
6,0.837768,2022-12-07 00:00:00,-42.0
7,0.449459,2022-12-08 00:00:00,


In [8]:
print(df.head(5))
print(df.tail(5))


shape: (5, 4)
┌─────┬──────────┬─────────────────────┬─────┐
│ a   ┆ b        ┆ c                   ┆ d   │
│ --- ┆ ---      ┆ ---                 ┆ --- │
│ i64 ┆ f64      ┆ datetime[μs]        ┆ f64 │
╞═════╪══════════╪═════════════════════╪═════╡
│ 0   ┆ 0.944905 ┆ 2022-12-01 00:00:00 ┆ 1.0 │
│ 1   ┆ 0.588551 ┆ 2022-12-02 00:00:00 ┆ 2.0 │
│ 2   ┆ 0.037508 ┆ 2022-12-03 00:00:00 ┆ NaN │
│ 3   ┆ 0.340911 ┆ 2022-12-04 00:00:00 ┆ NaN │
│ 4   ┆ 0.819841 ┆ 2022-12-05 00:00:00 ┆ 0.0 │
└─────┴──────────┴─────────────────────┴─────┘
shape: (5, 4)
┌─────┬──────────┬─────────────────────┬───────┐
│ a   ┆ b        ┆ c                   ┆ d     │
│ --- ┆ ---      ┆ ---                 ┆ ---   │
│ i64 ┆ f64      ┆ datetime[μs]        ┆ f64   │
╞═════╪══════════╪═════════════════════╪═══════╡
│ 3   ┆ 0.340911 ┆ 2022-12-04 00:00:00 ┆ NaN   │
│ 4   ┆ 0.819841 ┆ 2022-12-05 00:00:00 ┆ 0.0   │
│ 5   ┆ 0.560921 ┆ 2022-12-06 00:00:00 ┆ -5.0  │
│ 6   ┆ 0.837768 ┆ 2022-12-07 00:00:00 ┆ -42.0 │
│ 7   ┆ 0.4494

In [9]:
df.describe()


describe,a,b,c,d
str,f64,f64,str,f64
"""count""",8.0,8.0,"""8""",8.0
"""null_count""",0.0,0.0,"""0""",1.0
"""mean""",3.5,0.572483,,
"""std""",2.44949,0.298953,,
"""min""",0.0,0.037508,"""2022-12-01 00:...",-42.0
"""max""",7.0,0.944905,"""2022-12-08 00:...",2.0
"""median""",3.5,0.574736,,1.0


### 式

### 選択ステートメント

In [10]:
# 全ての列の取得
df.select(pl.col("*"))


a,b,c,d
i64,f64,datetime[μs],f64
0,0.944905,2022-12-01 00:00:00,1.0
1,0.588551,2022-12-02 00:00:00,2.0
2,0.037508,2022-12-03 00:00:00,
3,0.340911,2022-12-04 00:00:00,
4,0.819841,2022-12-05 00:00:00,0.0
5,0.560921,2022-12-06 00:00:00,-5.0
6,0.837768,2022-12-07 00:00:00,-42.0
7,0.449459,2022-12-08 00:00:00,


In [11]:
# 指定した列の取得
df.select(pl.col(["a", "b"]))

# 結果は一応変わらない
# df.select(['a', 'b'])


a,b
i64,f64
0,0.944905
1,0.588551
2,0.037508
3,0.340911
4,0.819841
5,0.560921
6,0.837768
7,0.449459


In [12]:
# 制限付きアクセス
df.select([pl.col("a"), pl.col("b")]).limit(3)


a,b
i64,f64
0,0.944905
1,0.588551
2,0.037508


In [13]:
# 除外検索
df.select([pl.exclude("a")])


b,c,d
f64,datetime[μs],f64
0.944905,2022-12-01 00:00:00,1.0
0.588551,2022-12-02 00:00:00,2.0
0.037508,2022-12-03 00:00:00,
0.340911,2022-12-04 00:00:00,
0.819841,2022-12-05 00:00:00,0.0
0.560921,2022-12-06 00:00:00,-5.0
0.837768,2022-12-07 00:00:00,-42.0
0.449459,2022-12-08 00:00:00,


#### フィルター

In [14]:
# 日付で検索
df.filter(
    pl.col("c").is_between(datetime(2022, 12, 2), datetime(2022, 12, 8)),
)


a,b,c,d
i64,f64,datetime[μs],f64
1,0.588551,2022-12-02 00:00:00,2.0
2,0.037508,2022-12-03 00:00:00,
3,0.340911,2022-12-04 00:00:00,
4,0.819841,2022-12-05 00:00:00,0.0
5,0.560921,2022-12-06 00:00:00,-5.0
6,0.837768,2022-12-07 00:00:00,-42.0
7,0.449459,2022-12-08 00:00:00,


In [15]:
# 要素の細かな指定で検索
df.filter((pl.col("a") <= 3) & (pl.col("d").is_not_nan()))


a,b,c,d
i64,f64,datetime[μs],f64
0,0.944905,2022-12-01 00:00:00,1.0
1,0.588551,2022-12-02 00:00:00,2.0


#### with_columns

In [16]:
df.with_columns([pl.col("b").sum().alias("e"), (pl.col("b") + 42).alias("b+42")])


a,b,c,d,e,b+42
i64,f64,datetime[μs],f64,f64,f64
0,0.944905,2022-12-01 00:00:00,1.0,4.579864,42.944905
1,0.588551,2022-12-02 00:00:00,2.0,4.579864,42.588551
2,0.037508,2022-12-03 00:00:00,,4.579864,42.037508
3,0.340911,2022-12-04 00:00:00,,4.579864,42.340911
4,0.819841,2022-12-05 00:00:00,0.0,4.579864,42.819841
5,0.560921,2022-12-06 00:00:00,-5.0,4.579864,42.560921
6,0.837768,2022-12-07 00:00:00,-42.0,4.579864,42.837768
7,0.449459,2022-12-08 00:00:00,,4.579864,42.449459


#### グループ列

In [17]:
df2 = pl.DataFrame(
    {
        "x": np.arange(0, 8),
        "y": ["A", "A", "A", "B", "B", "C", "X", "X"],
    }
)

print(df2)


shape: (8, 2)
┌─────┬─────┐
│ x   ┆ y   │
│ --- ┆ --- │
│ i64 ┆ str │
╞═════╪═════╡
│ 0   ┆ A   │
│ 1   ┆ A   │
│ 2   ┆ A   │
│ 3   ┆ B   │
│ 4   ┆ B   │
│ 5   ┆ C   │
│ 6   ┆ X   │
│ 7   ┆ X   │
└─────┴─────┘


In [18]:
# グルーピングする
df2.groupby("y", maintain_order=True).count()

y,count
str,u32
"""A""",3
"""B""",2
"""C""",1
"""X""",2


In [19]:
df2.groupby("y", maintain_order=True).agg([
    pl.col("*").count().alias("count"),
    pl.col("*").sum().alias("sum")
])

y,count,sum
str,u32,i64
"""A""",3,3
"""B""",2,7
"""C""",1,5
"""X""",2,13


#### 操作を結合する

複数の操作を同時に行う

In [20]:
df_x = df.with_columns((pl.col("a") * pl.col("b")).alias("a * b")).select([pl.all().exclude(["c", "d"])])

print(df_x)


shape: (8, 3)
┌─────┬──────────┬──────────┐
│ a   ┆ b        ┆ a * b    │
│ --- ┆ ---      ┆ ---      │
│ i64 ┆ f64      ┆ f64      │
╞═════╪══════════╪══════════╡
│ 0   ┆ 0.944905 ┆ 0.0      │
│ 1   ┆ 0.588551 ┆ 0.588551 │
│ 2   ┆ 0.037508 ┆ 0.075016 │
│ 3   ┆ 0.340911 ┆ 1.022733 │
│ 4   ┆ 0.819841 ┆ 3.279365 │
│ 5   ┆ 0.560921 ┆ 2.804607 │
│ 6   ┆ 0.837768 ┆ 5.026606 │
│ 7   ┆ 0.449459 ┆ 3.146211 │
└─────┴──────────┴──────────┘


### データフレームの結合

#### 接続

2つから1つ

In [21]:
df = pl.DataFrame(
    {
        "a": np.arange(0, 8),
        "b": np.random.rand(8),
        "c": [datetime(2022, 12, 1) + timedelta(days=idx) for idx in range(8)],
        "d": [1, 2.0, np.NaN, np.NaN, 0, -5, -42, None],
    }
)

df2 = pl.DataFrame(
    {
        "x": np.arange(0, 8),
        "y": ["A", "A", "A", "B", "B", "C", "X", "X"],
    }
)


In [22]:
df.join(df2, left_on="a", right_on="x")

a,b,c,d,y
i64,f64,datetime[μs],f64,str
0,0.214657,2022-12-01 00:00:00,1.0,"""A"""
1,0.903308,2022-12-02 00:00:00,2.0,"""A"""
2,0.551506,2022-12-03 00:00:00,,"""A"""
3,0.780019,2022-12-04 00:00:00,,"""B"""
4,0.852326,2022-12-05 00:00:00,0.0,"""B"""
5,0.820489,2022-12-06 00:00:00,-5.0,"""C"""
6,0.290284,2022-12-07 00:00:00,-42.0,"""X"""
7,0.459139,2022-12-08 00:00:00,,"""X"""


#### 連結

In [23]:
pl.concat([df, df2], how="horizontal")


a,b,c,d,x,y
i64,f64,datetime[μs],f64,i64,str
0,0.214657,2022-12-01 00:00:00,1.0,0,"""A"""
1,0.903308,2022-12-02 00:00:00,2.0,1,"""A"""
2,0.551506,2022-12-03 00:00:00,,2,"""A"""
3,0.780019,2022-12-04 00:00:00,,3,"""B"""
4,0.852326,2022-12-05 00:00:00,0.0,4,"""B"""
5,0.820489,2022-12-06 00:00:00,-5.0,5,"""C"""
6,0.290284,2022-12-07 00:00:00,-42.0,6,"""X"""
7,0.459139,2022-12-08 00:00:00,,7,"""X"""
