In [102]:
import polars as pl
import featuretools as ft
from datetime import datetime as dt
import numpy as np
import pandas as pd

# Primitives in Polars

# Absolute ✅

In [32]:
df = pl.DataFrame(
    {
        "a": [-20, 10, -30],
    }
)
df.select(pl.col("a").abs())

a
i64
20
10
30


# AddNumeric ✅

In [33]:
df = pl.DataFrame(
    {
        "a": [2, 1, 2],
        "b": [1, 2, 2],
    }
)
df.select(pl.map(["a", "b"], lambda s: s[0] + s[1]))

a
i64
3
3
4


# AddNumericScalar ✅

In [34]:
df = pl.DataFrame(
    {
        "a": [2, 1, 2],
    }
)
df.select(pl.col("a") + 3)

a
i64
5
4
5


# Age ❌

# And ✅

In [95]:
df = pl.DataFrame(
    {
        "a": [True, False, True],
        "b": [True, True, False],
    }
)
df.select(pl.map(["a", "b"], lambda s: s[0] or s[1]))

a
bool
True
False
True


# CityblockDistance ❌

# Cosine ✅

In [35]:
df = pl.DataFrame(
    {
        "a": [2, 1, 2],
    }
)
df.select(pl.col("a").cos())

a
f64
-0.416147
0.540302
-0.416147


# CumCount ✅

In [4]:
df = pl.DataFrame({"a": [1, 2, 3, 4]})
df.select(
    [
        pl.col("a").cumcount(),
        pl.col("a").cumcount(reverse=True).alias("a_reverse"),
    ]
)

a,a_reverse
u32,u32
0,3
1,2
2,1
3,0


# CumMax ✅

In [5]:
df = pl.DataFrame({"a": [1, 2, 3, 4]})
df.select(
    [
        pl.col("a").cummax(),
        pl.col("a").cummax(reverse=True).alias("a_reverse"),
    ]
)

a,a_reverse
i64,i64
1,4
2,4
3,4
4,4


# CumMean ✅

In [18]:
ft.primitives.CumMean()([1, 2, 3, 4, 5])

0    1.0
1    1.5
2    2.0
3    2.5
4    3.0
dtype: float64

In [12]:
df = pl.DataFrame({"values": [1, 2, 3, 4, 5]})
df.select(
    [
        pl.col("values").cumulative_eval(
            pl.element().mean()
        )
    ]
)

values
f64
1.0
1.5
2.0
2.5
3.0


# CumMin ✅

In [19]:
df = pl.DataFrame({"a": [1, 2, 3, 4]})
df.select(
    [
        pl.col("a").cummin(),
        pl.col("a").cummin(reverse=True).alias("a_reverse"),
    ]
)

a,a_reverse
i64,i64
1,1
1,2
1,3
1,4


# CumSum ✅

In [20]:
df = pl.DataFrame({"a": [1, 2, 3, 4]})
df.select(
    [
        pl.col("a").cumsum(),
        pl.col("a").cumsum(reverse=True).alias("a_reverse"),
    ]
)

a,a_reverse
i64,i64
1,10
3,9
6,7
10,4


# DateToHoliday ❌

# Day ✅

In [6]:
df = pl.DataFrame({"date": [dt.now(), dt.now(), dt.now()],})
df.select(
    [
        pl.col("date").dt.day()
    ]
)

date
u32
31
31
31


# DayOfYear ✅

In [7]:
df = pl.DataFrame({"date": [dt.now(), dt.now(), dt.now()],})
df.select(
    [
        pl.col("date").dt.ordinal_day()
    ]
)

date
u32
212
212
212


# DaysInMonth

# Diff ✅

In [3]:
df = pl.DataFrame(
    {
        "a": [20, 10, 30],
    }
)
df.select(pl.col("a").diff())

a
i64
""
-10.0
20.0


# Diff Datetime ✅

In [7]:
df = pl.DataFrame({"date": [dt(2019, 3, 1), dt(2019, 6, 30), dt(2019, 11, 17), dt(2020, 1, 30), dt(2020, 3, 11)],})
df.select(
    [
        pl.col("date").diff()
    ]
)

date
duration[μs]
""
121 days
140 days
74 days
41 days


# DistanceToHoliday

# DivideByFeature ✅

In [53]:
df = pl.DataFrame(
    {
        "a": [20, 10, 30],
    }
)
df.select(100 / pl.col("a"))

literal
f64
5.0
10.0
3.333333


# DivideNumeric ✅

In [14]:
df = pl.DataFrame(
    {
        "a": [20, 10, 30],
        "b": [10, 5, 15],
    }
)
df.select(pl.map(["a", "b"], lambda s: s[0]/s[1]))

a
f64
2.0
2.0
2.0


# DivideNumericScalar ✅

In [10]:
df = pl.DataFrame(
    {
        "a": [20, 10, 30],
    }
)
df.select(pl.col("a") / 3)

a
f64
6.666667
3.333333
10.0


# EmailAddressToDomain ✅

In [17]:
df = pl.DataFrame(
    {
        "a": ['name@gmail.com', 'name@featuretools.com']
    }
)
df.select(
    [
        pl.col("a").str.extract(r"@(.+)", 1),
    ]
)

a
str
"""gmail.com"""
"""featuretools.c..."


# Equal ✅

In [18]:
df = pl.DataFrame(
    {
        "a": [2, 1, 2],
        "b": [1, 2, 2],
    }
)
df.select(pl.map(["a", "b"], lambda s: s[0] == s[1]))

a
bool
False
False
True


# EqualScalar ✅

In [19]:
df = pl.DataFrame(
    {
        "a": [20, 10, 30],
    }
)
df.select(pl.col("a")  == 20)

a
bool
True
False
False


# GeoMidpoint

# GreaterThan ✅

In [20]:
df = pl.DataFrame(
    {
        "a": [2, 1, 2],
        "b": [1, 2, 2],
    }
)
df.select(pl.map(["a", "b"], lambda s: s[0] > s[1]))

a
bool
True
False
False


# GreaterThanEqualTo ✅

In [21]:
df = pl.DataFrame(
    {
        "a": [2, 1, 2],
        "b": [1, 2, 2],
    }
)
df.select(pl.map(["a", "b"], lambda s: s[0] >= s[1]))

a
bool
True
False
True


# GreaterThanEqualToScalar ✅

In [25]:
df = pl.DataFrame(
    {
        "a": [20, 10, 30],
    }
)
df.select(pl.col("a") >= 20)

a
bool
True
False
True


# GreaterThanScalar ✅

In [24]:
df = pl.DataFrame(
    {
        "a": [20, 10, 30],
    }
)
df.select(pl.col("a") > 20)

a
bool
False
False
True


# Haversine ❌

# Hour ✅

In [27]:
df = pl.DataFrame({"date": [dt(2019, 3, 1, 12, 22, 45), dt(2019, 6, 30, 14,30,0), ]})
df.select(
    [
        pl.col("date").dt.hour()
    ]
)

date
u32
12
14


# IsFederalHoliday ❌

# IsFreeEmailDomain ❌

# IsIn

# IsInGeoBox ❌

# IsLeapYear ❌

# IsLunchTime ❌

# IsMonthEnd ❌

# IsMonthStart ❌

# IsNull ✅

In [31]:
df = pl.DataFrame(
    {
        "a": [1, 2, None, 1, 5],
        "b": [1.0, 2.0, float("nan"), 1.0, 5.0],
        "c": [1.0, 2.0, np.nan, 1.0, 5.0],
    }
)
df.with_column(pl.all().is_null().suffix("_isnull"))  # nan != null
df.with_column(pl.all().is_nan().suffix("_isnan"))

a,b,c,a_isnan,b_isnan,c_isnan
i64,f64,f64,bool,bool,bool
1.0,1.0,1.0,False,False,False
2.0,2.0,2.0,False,False,False
,,,False,True,True
1.0,1.0,1.0,False,False,False
5.0,5.0,5.0,False,False,False


# IsQuarterEnd

# IsQuarterStart

# IsWeekend

# IsWorkingHours

# IsYearEnd

# IsYearStart

# Latitude

# LessThan ✅

In [40]:
df = pl.DataFrame(
    {
        "a": [2, 1, 2],
        "b": [1, 2, 2],
    }
)
df.select(pl.map(["a", "b"], lambda s: s[0] <= s[1]))

a
bool
False
True
True


# LessThanEqualTo ✅

In [39]:
df = pl.DataFrame(
    {
        "a": [2, 1, 2],
        "b": [1, 2, 2],
    }
)
df.select(pl.map(["a", "b"], lambda s: s[0] <= s[1]))

a
bool
False
True
True


# LessThanEqualToScalar ✅

In [37]:
df = pl.DataFrame(
    {
        "a": [20, 10, 30],
    }
)
df.select(pl.col("a") <= 20)

a
bool
True
True
False


# LessThanScalar ✅

In [36]:
df = pl.DataFrame(
    {
        "a": [20, 10, 30],
    }
)
df.select(pl.col("a") < 20)

a
bool
False
True
False


# Longitude

# Minute ✅

In [41]:
df = pl.DataFrame({"date": [dt(2019, 3, 1, 12, 22, 45), dt(2019, 6, 30, 14,30,0), ]})
df.select(
    [
        pl.col("date").dt.minute()
    ]
)

date
u32
22
30


# ModuloByFeature ✅

In [45]:
df = pl.DataFrame(
    {
        "a": [4, 1, 2],
    }
)
df.select(2 % pl.col("a"))

literal
i64
2
0
0


# ModuloNumeric ✅

In [43]:
df = pl.DataFrame(
    {
        "a": [2, 1, 5], 
        "b": [1, 2, 2],
    }
)
df.select(pl.map(["a", "b"], lambda s: s[0] % s[1]))

a
i64
0
1
1


# ModuloNumericScalar ✅

In [44]:
df = pl.DataFrame(
    {
        "a": [20, 10, 30],
    }
)
df.select(pl.col("a") % 21)

a
i64
20
10
9


# Month ✅

In [46]:
df = pl.DataFrame({"date": [dt(2019, 3, 1, 12, 22, 45), dt(2019, 6, 30, 14,30,0), ]})
df.select(
    [
        pl.col("date").dt.month()
    ]
)

date
u32
3
6


# MultiplyBoolean ✅

In [52]:
df = pl.DataFrame(
    {
        "a": [False, True, True],
        "b": [True, False, True],
    }
)
df.select(pl.map(["a", "b"], lambda s: s[0] & s[1]))

a
bool
False
False
True


# MultiplyNumeric ✅

In [48]:
df = pl.DataFrame(
    {
        "a": [2, 1, 2],
        "b": [1, 2, 2],
    }
)
df.select(pl.map(["a", "b"], lambda s: s[0] * s[1]))

a
i64
2
2
4


# MultiplyNumericBoolean ✅

In [50]:
df = pl.DataFrame(
    {
        "a": [2, 1, 2],
        "b": [True, False, True],
    }
)
df.select(pl.map(["a", "b"], lambda s: s[0] * s[1]))

a
i64
2
0
2


# MultiplyNumericScalar ✅

In [47]:
df = pl.DataFrame(
    {
        "a": [2, 1, 2],
    }
)
df.select(pl.col("a") * 3)

a
i64
6
3
6


# NaturalLogarithm ✅

In [54]:
df = pl.DataFrame(
    {
        "a": [2, 1, 2],
    }
)
df.select(pl.col("a").log())

a
f64
0.693147
0.0
0.693147


# Negate ✅

In [55]:
df = pl.DataFrame(
    {
        "a": [2, 1, 2],
    }
)
df.select(pl.col("a") * -1)

a
i64
-2
-1
-2


# Not

In [61]:
df = pl.DataFrame(
    {
        "a": [True, False, True],
    }
)
df.select(pl.col("a"))

a
bool
True
False
True


# NotEqual ✅

In [62]:
df = pl.DataFrame(
    {
        "a": [2, 1, 2],
        "b": [1, 2, 2],
    }
)
df.select(pl.map(["a", "b"], lambda s: s[0] != s[1]))

a
bool
True
True
False


# NotEqualScalar ✅

In [63]:
df = pl.DataFrame(
    {
        "a": [2, 1, 3],
    }
)
df.select(pl.col("a") != 2)

a
bool
False
True
True


# NumCharacters ✅

In [64]:
df = pl.DataFrame(
    {
        "a": ['This is a string', 'second item', 'final1'],
    }
)
df.select(pl.col("a").str.lengths())

a
u32
16
11
6


# NumWords ✅

In [67]:
df = pl.DataFrame({"s": ['This is a string', 'Two words', 'no-spaces', 'Also works with sentences. Second sentence!']})
df.select(pl.col("s").str.split(by=" ").arr.lengths())

s
u32
4
2
1
6


# NumericLag ✅

In [73]:
def numeric_lag(col, periods):
    return pl.col(col).shift(periods)

df = pl.DataFrame({"foo": [1, 2, 3, 4]})
df.select([
    numeric_lag("foo", 2).alias("foo_lag2"),
    numeric_lag("foo", 3).alias("foo_lag3"),
])

foo_lag2,foo_lag3
i64,i64
,
,
1.0,
2.0,1.0


# Or ✅

In [79]:
df = pl.DataFrame(
    {
        "a": [True, False, True],
        "b": [True, True, False],
    }
)
df.select(pl.map(["a", "b"], lambda s: s[0] or s[1]))

a
bool
True
False
True


# PartOfDay

# Percentile ✅

In [81]:
def percentile(col):
    return pl.col(col).rank() / pl.col(col).count()

df = pl.DataFrame({"foo": [10, 15, 1, 20]})
df.select([
    percentile("foo").alias("foo_lag2")
])

foo_lag2
f64
0.5
0.75
0.25
1.0


# Quarter ✅

In [82]:
df = pl.DataFrame({"date": [dt(2019, 3, 1, 12, 22, 45), dt(2019, 6, 30, 14,30,0), ]})
df.select(
    [
        pl.col("date").dt.quarter()
    ]
)

date
u32
1
2


# RollingCount

# RollingMax

# RollingMean

# RollingMin

In [115]:
df = pl.DataFrame({"a": [4, 3, 2, 1, 0]})
df.select(
    [
        pl.col("a").rolling_min(window_size=3, min_periods=1)
    ]
)

a
i64
4
4
3
2
1


# RollingSTD

In [113]:
df = pl.DataFrame({"a": [4, 3, 2, 1, 0]})
df.select(
    [
        pl.col("a").rolling_std(window_size=4, min_periods=1)
    ]
)

a
f64
0.0
0.707107
1.0
1.290994
1.290994


# RollingTrend

# ScalarSubtractNumericFeature ✅

In [87]:
df = pl.DataFrame(
    {
        "a": [3, 1, 2],
    }
)
df.select(2 - pl.col("a"))

a
i64
-1
1
0


# Second ✅

In [83]:
df = pl.DataFrame({"date": [dt(2019, 3, 1, 12, 22, 45), dt(2019, 6, 30, 14,30,0), ]})
df.select(
    [
        pl.col("date").dt.second()
    ]
)

date
u32
45
0


# Sine ✅

In [84]:
df = pl.DataFrame(
    {
        "a": [2, 1, 2],
    }
)
df.select(pl.col("a").sin())

a
f64
0.909297
0.841471
0.909297


# SquareRoot ✅

In [85]:
df = pl.DataFrame(
    {
        "a": [2, 1, 2],
    }
)
df.select(pl.col("a").sqrt())

a
f64
1.414214
1.0
1.414214


# SubtractNumeric ✅

In [86]:
df = pl.DataFrame(
    {
        "a": [2, 1, 2],
        "b": [1, 2, 2],
    }
)
df.select(pl.map(["a", "b"], lambda s: s[0] - s[1]))

a
i64
1
-1
0


# SubtractNumericScalar ✅

In [88]:
df = pl.DataFrame(
    {
        "a": [3, 1, 2],
    }
)
df.select(pl.col("a") - 2)

a
i64
1
-1
0


# Tangent ✅

In [89]:
df = pl.DataFrame(
    {
        "a": [2, 1, 2],
    }
)
df.select(pl.col("a").tan())

a
f64
-2.185
1.557408
-2.185


# TimeSince ✅

In [99]:
times = [dt(2019, 3, 1, 0, 0, 0, 1),
        dt(2019, 3, 1, 0, 0, 1, 0),
        dt(2019, 3, 1, 0, 2, 0, 0)]
cutoff_time = dt(2019, 3, 1, 0, 0, 0, 0)

df = pl.DataFrame({"date": times})
df.select(
    [
        cutoff_time - pl.col("date")
    ]
)

literal
duration[μs]
0 µs
-1 seconds
-2 minutes


# TimeSincePrevious ✅

In [100]:
dates = [dt(2019, 3, 1, 0, 0, 0), dt(2019, 3, 1, 0, 2, 0), dt(2019, 3, 1, 0, 3, 0), dt(2019, 3, 1, 0, 2, 30), dt(2019, 3, 1, 0, 10, 0)]
df = pl.DataFrame({"date": dates})
df.select(
    [
        pl.col("date").diff()
    ]
)

date
duration[μs]
""
2 minutes
1 minute
-30 seconds
7 minutes 30 seconds


# URLToDomain

# URLToProtocol

# URLToTLD

# Week ✅

In [90]:
df = pl.DataFrame({"date": [dt(2019, 3, 1, 12, 22, 45), dt(2019, 6, 30, 14,30,0), ]})
df.select(
    [
        pl.col("date").dt.week()
    ]
)

date
u32
9
26


# Weekday ✅

In [91]:
df = pl.DataFrame({"date": [dt(2019, 3, 1, 12, 22, 45), dt(2019, 6, 30, 14,30,0), ]})
df.select(
    [
        pl.col("date").dt.weekday()
    ]
)

date
u32
5
7


# Year ✅

In [94]:
df = pl.DataFrame({"date": [dt(2020, 3, 1, 12, 22, 45), dt(2019, 6, 30, 14,30,0), ]})
df.select(
    [
        pl.col("date").dt.year()
    ]
)

date
i32
2020
2019
