In [1]:
import polars as pl

# Small three-row example frame used by the selection, filtering and
# column-adding cells that follow.
df = pl.DataFrame(
    dict(
        Name=['Alice', 'Kumar', 'Mei'],
        Age=[25, 40, 35],
        City=['New York', 'Mumbai', 'Shanghai'],
    )
)
print(df)

shape: (3, 3)
┌───────┬─────┬──────────┐
│ Name  ┆ Age ┆ City     │
│ ---   ┆ --- ┆ ---      │
│ str   ┆ i64 ┆ str      │
╞═══════╪═════╪══════════╡
│ Alice ┆ 25  ┆ New York │
│ Kumar ┆ 40  ┆ Mumbai   │
│ Mei   ┆ 35  ┆ Shanghai │
└───────┴─────┴──────────┘


Selection and Filtering

In [2]:
# Indexing the frame with a column name returns that column as a Series.
print(df['Name'])

shape: (3,)
Series: 'Name' [str]
[
	"Alice"
	"Kumar"
	"Mei"
]


In [3]:
# select() with a list of column names returns a DataFrame holding only
# those columns (here a single-column frame rather than a Series).
print(df.select(['Name']))

shape: (3, 1)
┌───────┐
│ Name  │
│ ---   │
│ str   │
╞═══════╡
│ Alice │
│ Kumar │
│ Mei   │
└───────┘


In [4]:
# select() also accepts expressions: add 5 to every Age value. The resulting
# column is still named 'Age' because the expression is not aliased.
print(df.select(pl.col('Age') + 5))

shape: (3, 1)
┌─────┐
│ Age │
│ --- │
│ i64 │
╞═════╡
│ 30  │
│ 45  │
│ 40  │
└─────┘


In [5]:
# Indexing with an integer returns the row at that position as a one-row
# DataFrame (column names and dtypes are kept).
print(df[0])

shape: (1, 3)
┌───────┬─────┬──────────┐
│ Name  ┆ Age ┆ City     │
│ ---   ┆ --- ┆ ---      │
│ str   ┆ i64 ┆ str      │
╞═══════╪═════╪══════════╡
│ Alice ┆ 25  ┆ New York │
└───────┴─────┴──────────┘


In [6]:
# row() returns the row at the given index as a plain Python tuple.
print(df.row(0))

('Alice', 25, 'New York')


In [7]:
# filter() keeps only the rows for which the boolean expression is true
# (here: Age greater than 30).
print(df.filter(pl.col('Age') > 30))

shape: (2, 3)
┌───────┬─────┬──────────┐
│ Name  ┆ Age ┆ City     │
│ ---   ┆ --- ┆ ---      │
│ str   ┆ i64 ┆ str      │
╞═══════╪═════╪══════════╡
│ Kumar ┆ 40  ┆ Mumbai   │
│ Mei   ┆ 35  ┆ Shanghai │
└───────┴─────┴──────────┘


Adding New Columns

In [8]:
# Derive NewAge (Age + 5) as an additional column; the keyword name becomes
# the column name. with_columns returns a new frame, bound here to df_new.
df_new = df.with_columns(NewAge=pl.col('Age') + 5)
print(df_new)

shape: (3, 4)
┌───────┬─────┬──────────┬────────┐
│ Name  ┆ Age ┆ City     ┆ NewAge │
│ ---   ┆ --- ┆ ---      ┆ ---    │
│ str   ┆ i64 ┆ str      ┆ i64    │
╞═══════╪═════╪══════════╪════════╡
│ Alice ┆ 25  ┆ New York ┆ 30     │
│ Kumar ┆ 40  ┆ Mumbai   ┆ 45     │
│ Mei   ┆ 35  ┆ Shanghai ┆ 40     │
└───────┴─────┴──────────┴────────┘


Grouping and Aggregations


In [9]:
# Load the flights sample. Note that this rebinds `df`, replacing the small
# example frame built in the first cell.
df = pl.read_csv('data/flights_sample_3m.csv')

# Average departure delay per airline, largest average first.
result = df.group_by('AIRLINE').agg(
    mean_dep_delay=pl.col('DEP_DELAY').mean()
).sort(by='mean_dep_delay', descending=True)
print(result)

shape: (18, 2)
┌─────────────────────────────────┬────────────────┐
│ AIRLINE                         ┆ mean_dep_delay │
│ ---                             ┆ ---            │
│ str                             ┆ f64            │
╞═════════════════════════════════╪════════════════╡
│ JetBlue Airways                 ┆ 18.322555      │
│ Frontier Airlines Inc.          ┆ 16.033574      │
│ Allegiant Air                   ┆ 13.907797      │
│ Spirit Air Lines                ┆ 12.98188       │
│ ExpressJet Airlines LLC d/b/a … ┆ 12.774462      │
│ …                               ┆ …              │
│ Endeavor Air Inc.               ┆ 5.951135       │
│ Republic Airline                ┆ 5.804359       │
│ Hawaiian Airlines Inc.          ┆ 5.089537       │
│ Horizon Air                     ┆ 4.832775       │
│ Alaska Airlines Inc.            ┆ 4.640018       │
└─────────────────────────────────┴────────────────┘


Lazy Mode

In [10]:
# Lazy mode: scan_csv returns a LazyFrame and defers reading the file until
# collect() is called, which lets Polars optimise the whole query first (for
# example, by reading only the columns the query uses). Calling
# read_csv(...).lazy() instead would parse the entire file eagerly before the
# lazy plan is even built, so the file read itself could not be optimised.
df = pl.scan_csv('data/flights_sample_3m.csv')

# Same aggregation as the eager version: mean departure delay per airline,
# sorted from the largest mean delay to the smallest.
result = (
    df.group_by('AIRLINE')
    .agg(pl.col('DEP_DELAY').mean().alias('mean_dep_delay'))
    .sort('mean_dep_delay', descending=True)
    .collect()  # execute the optimised plan and materialise a DataFrame
)
print(result)

shape: (18, 2)
┌─────────────────────────────────┬────────────────┐
│ AIRLINE                         ┆ mean_dep_delay │
│ ---                             ┆ ---            │
│ str                             ┆ f64            │
╞═════════════════════════════════╪════════════════╡
│ JetBlue Airways                 ┆ 18.322555      │
│ Frontier Airlines Inc.          ┆ 16.033574      │
│ Allegiant Air                   ┆ 13.907797      │
│ Spirit Air Lines                ┆ 12.98188       │
│ ExpressJet Airlines LLC d/b/a … ┆ 12.774462      │
│ …                               ┆ …              │
│ Endeavor Air Inc.               ┆ 5.951135       │
│ Republic Airline                ┆ 5.804359       │
│ Hawaiian Airlines Inc.          ┆ 5.089537       │
│ Horizon Air                     ┆ 4.832775       │
│ Alaska Airlines Inc.            ┆ 4.640018       │
└─────────────────────────────────┴────────────────┘


Performance Comparison

In [11]:
import pandas as pd
import polars as pl

# Load the same CSV file into both Pandas and Polars
csv_path = 'data/flights_sample_3m.csv'

# Time reading the file with each library. %timeit executes the statement
# repeatedly and reports the mean ± std. dev. per loop; names assigned inside
# the timed statement are not available afterwards.
%timeit df_pandas = pd.read_csv(csv_path)
%timeit df_polars = pl.read_csv(csv_path)

9.13 s ± 497 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
563 ms ± 18.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [12]:
# Reload the data as %timeit does not persist the variable in memory
df_pandas = pd.read_csv(csv_path)
df_polars = pl.read_csv(csv_path)

# NOTE(review): neither timed statement runs an aggregation -- each one only
# creates the group-by object -- so these timings most likely reflect object
# construction rather than actual grouping work. To compare real grouping
# performance, time a full aggregation in both libraries (for example the
# per-airline mean of DEP_DELAY used in the earlier cells).
%timeit df_pandas.groupby('AIRLINE')
%timeit df_polars.group_by('AIRLINE')

27.7 µs ± 1.4 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
998 ns ± 87 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
