# Using DuckDB with Polars

## Introduction to Polars

In [1]:
%pip install polars

Collecting polars
  Downloading polars-1.33.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)
Downloading polars-1.33.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (39.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.7/39.7 MB[0m [31m32.0 MB/s[0m  [33m0:00:01[0mm0:00:01[0m00:01[0m
[?25hInstalling collected packages: polars
Successfully installed polars-1.33.1
Note: you may need to restart the kernel to use updated packages.


In [None]:
import polars
polars.__version__

'1.33.1'

## Creating a Polars DataFrame

In [None]:
import polars as pl

# A Polars DataFrame does not have an index.
# In the rendered output, the row below the column headers shows the data type
# of each column.
df = pl.DataFrame({
    "Model": [
        "Camry", "Corolla", "RAV4", "Mustang",
        "F-150", "Escape", "Golf", "Tiguan",
    ],
    "Year": [1982, 1966, 1994, 1964, 1975, 2000, 1974, 2007],
    "Engine_Min": [2.5, 1.8, 2.0, 2.3, 2.7, 1.5, 1.0, 1.4],
    "Engine_Max": [3.5, 2.0, 2.5, 5.0, 5.0, 2.5, 2.0, 2.0],
    "AWD": [False, False, True, False, True, True, True, True],
    "Company": [
        "Toyota", "Toyota", "Toyota", "Ford",
        "Ford", "Ford", "Volkswagen", "Volkswagen",
    ],
})
df

Model,Year,Engine_Min,Engine_Max,AWD,Company
str,i64,f64,f64,bool,str
"""Camry""",1982,2.5,3.5,False,"""Toyota"""
"""Corolla""",1966,1.8,2.0,False,"""Toyota"""
"""RAV4""",1994,2.0,2.5,True,"""Toyota"""
"""Mustang""",1964,2.3,5.0,False,"""Ford"""
"""F-150""",1975,2.7,5.0,True,"""Ford"""
"""Escape""",2000,1.5,2.5,True,"""Ford"""
"""Golf""",1974,1.0,2.0,True,"""Volkswagen"""
"""Tiguan""",2007,1.4,2.0,True,"""Volkswagen"""


In [None]:
# To display the full name of the data type of each column in the Polars DataFrame, type:
df.dtypes

[String, Int64, Float64, Float64, Boolean, String]

In [None]:
# To display the column names of the Polars DataFrame, type:
df.columns

['Model', 'Year', 'Engine_Min', 'Engine_Max', 'AWD', 'Company']

In [None]:
# To display the data in the Polars DataFrame as a list of tuples, type:
df.rows()

[('Camry', 1982, 2.5, 3.5, False, 'Toyota'),
 ('Corolla', 1966, 1.8, 2.0, False, 'Toyota'),
 ('RAV4', 1994, 2.0, 2.5, True, 'Toyota'),
 ('Mustang', 1964, 2.3, 5.0, False, 'Ford'),
 ('F-150', 1975, 2.7, 5.0, True, 'Ford'),
 ('Escape', 2000, 1.5, 2.5, True, 'Ford'),
 ('Golf', 1974, 1.0, 2.0, True, 'Volkswagen'),
 ('Tiguan', 2007, 1.4, 2.0, True, 'Volkswagen')]

### Selecting Columns

In [None]:
# Pick out a single column by passing its name to select().
df.select('Model')

Model
str
"""Camry"""
"""Corolla"""
"""RAV4"""
"""Mustang"""
"""F-150"""
"""Escape"""
"""Golf"""
"""Tiguan"""


In [7]:
# Square-bracket indexing also works, but it is not recommended (anti-pattern);
# prefer the select() call shown in the previous cell.
df['Model']

Model
str
"""Camry"""
"""Corolla"""
"""RAV4"""
"""Mustang"""
"""F-150"""
"""Escape"""
"""Golf"""
"""Tiguan"""


In [None]:
# Pass a list of column names to select() to keep several columns at once.
df.select(['Model', 'Company'])

Model,Company
str,str
"""Camry""","""Toyota"""
"""Corolla""","""Toyota"""
"""RAV4""","""Toyota"""
"""Mustang""","""Ford"""
"""F-150""","""Ford"""
"""Escape""","""Ford"""
"""Golf""","""Volkswagen"""
"""Tiguan""","""Volkswagen"""


In [10]:
# get all columns of type String (this dtype was called Utf8 in older Polars releases)
df.select(
    pl.col(pl.String)
)

Model,Company
str,str
"""Camry""","""Toyota"""
"""Corolla""","""Toyota"""
"""RAV4""","""Toyota"""
"""Mustang""","""Ford"""
"""F-150""","""Ford"""
"""Escape""","""Ford"""
"""Golf""","""Volkswagen"""
"""Tiguan""","""Volkswagen"""


In [11]:
# piping together expressions: pick three columns, then order their rows by
# Engine_Max ascending and, where Engine_Max is equal, by Year descending
df.select(
    pl.col(['Year','Model','Engine_Max'])
    .sort_by(['Engine_Max','Year'],descending = [False,True])
)

Year,Model,Engine_Max
i64,str,f64
2007,"""Tiguan""",2.0
1974,"""Golf""",2.0
1966,"""Corolla""",2.0
2000,"""Escape""",2.5
1994,"""RAV4""",2.5
1982,"""Camry""",3.5
1975,"""F-150""",5.0
1964,"""Mustang""",5.0


In [12]:
# Mix a dtype selector with a plain column name: every String column, plus Year.
df.select([pl.col(pl.String), 'Year'])

Model,Company,Year
str,str,i64
"""Camry""","""Toyota""",1982
"""Corolla""","""Toyota""",1966
"""RAV4""","""Toyota""",1994
"""Mustang""","""Ford""",1964
"""F-150""","""Ford""",1975
"""Escape""","""Ford""",2000
"""Golf""","""Volkswagen""",1974
"""Tiguan""","""Volkswagen""",2007


### Selecting Rows

In [13]:
df.row(0)  # get the first row as a tuple

('Camry', 1982, 2.5, 3.5, False, 'Toyota')

In [None]:
df[1:3]  # returns the second and third rows

Model,Year,Engine_Min,Engine_Max,AWD,Company
str,i64,f64,f64,bool,str
"""Corolla""",1966,1.8,2.0,False,"""Toyota"""
"""RAV4""",1994,2.0,2.5,True,"""Toyota"""


In [None]:
df.head() # top 5 rows

Model,Year,Engine_Min,Engine_Max,AWD,Company
str,i64,f64,f64,bool,str
"""Camry""",1982,2.5,3.5,False,"""Toyota"""
"""Corolla""",1966,1.8,2.0,False,"""Toyota"""
"""RAV4""",1994,2.0,2.5,True,"""Toyota"""
"""Mustang""",1964,2.3,5.0,False,"""Ford"""
"""F-150""",1975,2.7,5.0,True,"""Ford"""


In [None]:
df.tail() # bottom 5 rows

Model,Year,Engine_Min,Engine_Max,AWD,Company
str,i64,f64,f64,bool,str
"""Mustang""",1964,2.3,5.0,False,"""Ford"""
"""F-150""",1975,2.7,5.0,True,"""Ford"""
"""Escape""",2000,1.5,2.5,True,"""Ford"""
"""Golf""",1974,1.0,2.0,True,"""Volkswagen"""
"""Tiguan""",2007,1.4,2.0,True,"""Volkswagen"""


In [18]:
df.sample(3)  # get 3 random rows; no seed is passed, so the rows shown can differ on each run

Model,Year,Engine_Min,Engine_Max,AWD,Company
str,i64,f64,f64,bool,str
"""Tiguan""",2007,1.4,2.0,True,"""Volkswagen"""
"""Corolla""",1966,1.8,2.0,False,"""Toyota"""
"""Camry""",1982,2.5,3.5,False,"""Toyota"""


In [None]:
# filter rows based on a condition
df.filter(
    pl.col('Company') == 'Toyota'
)

Model,Year,Engine_Min,Engine_Max,AWD,Company
str,i64,f64,f64,bool,str
"""Camry""",1982,2.5,3.5,False,"""Toyota"""
"""Corolla""",1966,1.8,2.0,False,"""Toyota"""
"""RAV4""",1994,2.0,2.5,True,"""Toyota"""


In [19]:
# keep rows where Company is either Toyota or Ford
df.filter(
    (pl.col('Company') == 'Toyota') | (pl.col('Company') == 'Ford')
)

# parentheses are important: Python's | binds more tightly than ==, so each
# comparison must be wrapped before the two are combined

Model,Year,Engine_Min,Engine_Max,AWD,Company
str,i64,f64,f64,bool,str
"""Camry""",1982,2.5,3.5,False,"""Toyota"""
"""Corolla""",1966,1.8,2.0,False,"""Toyota"""
"""RAV4""",1994,2.0,2.5,True,"""Toyota"""
"""Mustang""",1964,2.3,5.0,False,"""Ford"""
"""F-150""",1975,2.7,5.0,True,"""Ford"""
"""Escape""",2000,1.5,2.5,True,"""Ford"""


In [21]:
# is_in() is a shorter alternative to chaining == comparisons with |.
df.filter(pl.col('Company').is_in(['Toyota', 'Ford']))

Model,Year,Engine_Min,Engine_Max,AWD,Company
str,i64,f64,f64,bool,str
"""Camry""",1982,2.5,3.5,False,"""Toyota"""
"""Corolla""",1966,1.8,2.0,False,"""Toyota"""
"""RAV4""",1994,2.0,2.5,True,"""Toyota"""
"""Mustang""",1964,2.3,5.0,False,"""Ford"""
"""F-150""",1975,2.7,5.0,True,"""Ford"""
"""Escape""",2000,1.5,2.5,True,"""Ford"""


In [20]:
# combine two conditions with & (both must hold): Toyota rows whose Year is greater than 1980
df.filter(
    (pl.col('Company') == 'Toyota') & (pl.col('Year') > 1980)
)

Model,Year,Engine_Min,Engine_Max,AWD,Company
str,i64,f64,f64,bool,str
"""Camry""",1982,2.5,3.5,False,"""Toyota"""
"""RAV4""",1994,2.0,2.5,True,"""Toyota"""


In [None]:
# Negate a condition with the ~ (not) operator: every car except Toyota.
df.filter(~(pl.col('Company') == 'Toyota'))

Model,Year,Engine_Min,Engine_Max,AWD,Company
str,i64,f64,f64,bool,str
"""Mustang""",1964,2.3,5.0,False,"""Ford"""
"""F-150""",1975,2.7,5.0,True,"""Ford"""
"""Escape""",2000,1.5,2.5,True,"""Ford"""
"""Golf""",1974,1.0,2.0,True,"""Volkswagen"""
"""Tiguan""",2007,1.4,2.0,True,"""Volkswagen"""


In [None]:
# The != operator is another way to get every car except Toyota.
df.filter(pl.col('Company') != 'Toyota')

Model,Year,Engine_Min,Engine_Max,AWD,Company
str,i64,f64,f64,bool,str
"""Mustang""",1964,2.3,5.0,False,"""Ford"""
"""F-150""",1975,2.7,5.0,True,"""Ford"""
"""Escape""",2000,1.5,2.5,True,"""Ford"""
"""Golf""",1974,1.0,2.0,True,"""Volkswagen"""
"""Tiguan""",2007,1.4,2.0,True,"""Volkswagen"""


### Selecting Rows and Columns

In [24]:
# Filter the rows first, then keep a single column of what remains.
df.filter(pl.col('Company') == 'Toyota').select('Model')

Model
str
"""Camry"""
"""Corolla"""
"""RAV4"""


In [25]:
# Same row filter, this time keeping two columns.
df.filter(pl.col('Company') == 'Toyota').select(['Model', 'Year'])

Model,Year
str,i64
"""Camry""",1982
"""Corolla""",1966
"""RAV4""",1994


### Using SQL on Polars

In [26]:
# Expression-API counterpart of SQL's SELECT * ... WHERE Company = 'Toyota':
# filter() picks the rows and select('*') keeps every column.
df.filter(
    pl.col('Company') == 'Toyota'
).select(
    '*'
)

Model,Year,Engine_Min,Engine_Max,AWD,Company
str,i64,f64,f64,bool,str
"""Camry""",1982,2.5,3.5,False,"""Toyota"""
"""Corolla""",1966,1.8,2.0,False,"""Toyota"""
"""RAV4""",1994,2.0,2.5,True,"""Toyota"""


In [27]:
# Register df under the table name "cars" so SQL statements can refer to it.
ctx = pl.SQLContext(cars = df)
# eager=True runs the query immediately and returns a DataFrame.
ctx.execute("SELECT * FROM cars", eager=True)

Model,Year,Engine_Min,Engine_Max,AWD,Company
str,i64,f64,f64,bool,str
"""Camry""",1982,2.5,3.5,False,"""Toyota"""
"""Corolla""",1966,1.8,2.0,False,"""Toyota"""
"""RAV4""",1994,2.0,2.5,True,"""Toyota"""
"""Mustang""",1964,2.3,5.0,False,"""Ford"""
"""F-150""",1975,2.7,5.0,True,"""Ford"""
"""Escape""",2000,1.5,2.5,True,"""Ford"""
"""Golf""",1974,1.0,2.0,True,"""Volkswagen"""
"""Tiguan""",2007,1.4,2.0,True,"""Volkswagen"""


In [28]:
# Average Engine_Min and Engine_Max per company.
# There is no ORDER BY, so the order of the groups in the result is not guaranteed.
ctx.execute('''
SELECT Company,
  AVG(Engine_Min) AS avg_engine_min,
  AVG(Engine_Max) AS avg_engine_max
FROM cars
GROUP BY Company;
''', eager=True)

Company,avg_engine_min,avg_engine_max
str,f64,f64
"""Volkswagen""",1.2,2.0
"""Ford""",2.166667,4.166667
"""Toyota""",2.1,2.666667


## Understanding Lazy Evaluation in Polars

In [29]:
import pandas as pd
# pandas reads the entire CSV into memory up front. Note that df now refers to
# a pandas DataFrame rather than the Polars one used earlier.
df = pd.read_csv('./datasets/flights/flights_sample.csv')
df

Unnamed: 0,YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,...,ARRIVAL_TIME,ARRIVAL_DELAY,DIVERTED,CANCELLED,CANCELLATION_REASON,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY
0,2015,1,1,4,AS,98,N407AS,ANC,SEA,5,...,408.0,-22.0,0,0,,,,,,
1,2015,1,1,4,AA,2336,N3KUAA,LAX,PBI,10,...,741.0,-9.0,0,0,,,,,,
2,2015,1,1,4,US,840,N171US,SFO,CLT,20,...,811.0,5.0,0,0,,,,,,
3,2015,1,1,4,AA,258,N3HYAA,LAX,MIA,20,...,756.0,-9.0,0,0,,,,,,
4,2015,1,1,4,AS,135,N527AS,SEA,ANC,25,...,259.0,-21.0,0,0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,2015,1,1,4,WN,552,N291WN,SAT,ELP,705,...,810.0,30.0,0,0,,1.0,0.0,29.0,0.0,0.0
996,2015,1,1,4,WN,3239,N905WN,TPA,BWI,705,...,907.0,-18.0,0,0,,,,,,
997,2015,1,1,4,WN,1912,N554WN,TUS,DEN,705,...,846.0,-9.0,0,0,,,,,,
998,2015,1,1,4,UA,1660,N17730,SNA,EWR,705,...,1433.0,-51.0,0,0,,,,,,


In [34]:
# pandas (eager): load every row and column first, then filter in memory for
# MONTH 1 flights from ANC to SEA.
df = pd.read_csv('./datasets/flights/flights_sample.csv')
df = df[(df['MONTH'] == 1) &
        (df['ORIGIN_AIRPORT'] == 'ANC') &
        (df['DESTINATION_AIRPORT'] == 'SEA')]
df

Unnamed: 0,YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,...,ARRIVAL_TIME,ARRIVAL_DELAY,DIVERTED,CANCELLED,CANCELLATION_REASON,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY
0,2015,1,1,4,AS,98,N407AS,ANC,SEA,5,...,408.0,-22.0,0,0,,,,,,
15,2015,1,1,4,AS,108,N309AS,ANC,SEA,45,...,455.0,-14.0,0,0,,,,,,
16,2015,1,1,4,DL,1560,N3743H,ANC,SEA,45,...,451.0,-24.0,0,0,,,,,,
32,2015,1,1,4,AS,136,N431AS,ANC,SEA,135,...,,,0,1,A,,,,,
38,2015,1,1,4,AS,134,N464AS,ANC,SEA,155,...,558.0,-35.0,0,0,,,,,,
47,2015,1,1,4,AS,114,N303AS,ANC,SEA,220,...,628.0,-12.0,0,0,,,,,,
61,2015,1,1,4,AS,730,N423AS,ANC,SEA,505,...,916.0,-14.0,0,0,,,,,,
890,2015,1,1,4,AS,112,N527AS,ANC,SEA,700,...,1115.0,-15.0,0,0,,,,,,


In [None]:
# for mac
%brew install graphviz

In [None]:
# for linux
%apt-get install graphviz

E: Could not open lock file /var/lib/dpkg/lock-frontend - open (13: Permission denied)
E: Unable to acquire the dpkg frontend lock (/var/lib/dpkg/lock-frontend), are you root?


In [None]:
# Windows using winget
%winget install Graphviz.Graphviz

In [None]:
%pip install graphviz

Collecting graphviz
  Downloading graphviz-0.21-py3-none-any.whl.metadata (12 kB)
Downloading graphviz-0.21-py3-none-any.whl (47 kB)
Installing collected packages: graphviz
Successfully installed graphviz-0.21
Note: you may need to restart the kernel to use updated packages.


### Implicit lazy evaluation

In [35]:
import polars as pl

# scan_csv() returns a LazyFrame, as the type() check below confirms.
q = pl.scan_csv('./datasets/flights/flights_sample.csv')
type(q)

polars.lazyframe.frame.LazyFrame

In [36]:
# read_csv() loads the file eagerly and returns a regular DataFrame.
df = pl.read_csv('./datasets/flights/flights_sample.csv')
type(df)

polars.dataframe.frame.DataFrame

In [37]:
# Build the lazy query one step at a time; each call returns a new LazyFrame
# and nothing is executed until collect() is called.
q = pl.scan_csv('./datasets/flights/flights_sample.csv')
q = q.select(['MONTH', 'ORIGIN_AIRPORT','DESTINATION_AIRPORT'])
q = q.filter(
    (pl.col('MONTH') == 1) &
    (pl.col('ORIGIN_AIRPORT') == 'ANC') &
    (pl.col('DESTINATION_AIRPORT') == 'SEA'))

In [38]:
# Same lazy query as in the previous cell, written as a single method chain.
q = (
    pl.scan_csv('./datasets/flights/flights_sample.csv')
    .select(['MONTH', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT'])
    .filter(
        (pl.col('MONTH') == 1)
        & (pl.col('ORIGIN_AIRPORT') == 'ANC')
        & (pl.col('DESTINATION_AIRPORT') == 'SEA')
    )
)

In [1]:
# Needs the q built in the cells above (re-run them after a kernel restart,
# otherwise this raises NameError) and the Graphviz tools installed earlier.
q.show_graph(optimized=True) 

NameError: name 'q' is not defined

In [None]:
# By default show_graph() renders the query plan in its optimized form.
# Pass optimized=False to render the non-optimized plan, which is also what
# you get if you print the q object itself.
q.show_graph(optimized=False)

In [43]:
# collect() executes the lazy query and returns the result as a DataFrame.
q.collect()

MONTH,ORIGIN_AIRPORT,DESTINATION_AIRPORT
i64,str,str
1,"""ANC""","""SEA"""
1,"""ANC""","""SEA"""
1,"""ANC""","""SEA"""
1,"""ANC""","""SEA"""
1,"""ANC""","""SEA"""
1,"""ANC""","""SEA"""
1,"""ANC""","""SEA"""
1,"""ANC""","""SEA"""


### Explicit lazy evaluation

In [44]:
# Eager version: read_csv() loads the whole file into a DataFrame first, and
# select()/filter() then run immediately on that DataFrame.
df = (
    pl.read_csv('./datasets/flights/flights_sample.csv')
    .select(['MONTH', 'ORIGIN_AIRPORT','DESTINATION_AIRPORT'])
    .filter(
        (pl.col('MONTH') == 1) &
        (pl.col('ORIGIN_AIRPORT') == 'ANC') &
        (pl.col('DESTINATION_AIRPORT') == 'SEA'))
)
df

MONTH,ORIGIN_AIRPORT,DESTINATION_AIRPORT
i64,str,str
1,"""ANC""","""SEA"""
1,"""ANC""","""SEA"""
1,"""ANC""","""SEA"""
1,"""ANC""","""SEA"""
1,"""ANC""","""SEA"""
1,"""ANC""","""SEA"""
1,"""ANC""","""SEA"""
1,"""ANC""","""SEA"""


In [None]:
# To ensure that all the queries applied after the CSV is loaded can be
# optimized, call lazy() immediately after read_csv(). The chained
# select()/filter() calls are then only executed when collect() is called.
q = (
    pl.read_csv('./datasets/flights/flights_sample.csv')
    .lazy()
    .select(['MONTH', 'ORIGIN_AIRPORT','DESTINATION_AIRPORT'])
    .filter(
        (pl.col('MONTH') == 1) &
        (pl.col('ORIGIN_AIRPORT') == 'ANC') &
        (pl.col('DESTINATION_AIRPORT') == 'SEA'))
)
df = q.collect()
display(df)

MONTH,ORIGIN_AIRPORT,DESTINATION_AIRPORT
i64,str,str
1,"""ANC""","""SEA"""
1,"""ANC""","""SEA"""
1,"""ANC""","""SEA"""
1,"""ANC""","""SEA"""
1,"""ANC""","""SEA"""
1,"""ANC""","""SEA"""
1,"""ANC""","""SEA"""
1,"""ANC""","""SEA"""


## Querying Polars DataFrames Using DuckDB

### Using the sql() Function

In [4]:
%pip install pyarrow
# Remember to restart the kernel after installing new packages

Note: you may need to restart the kernel to use updated packages.


In [1]:
import polars as pl

# Recreate the cars DataFrame so the DuckDB examples below have data to query.
df = pl.DataFrame({
    "Model": [
        "Camry", "Corolla", "RAV4", "Mustang",
        "F-150", "Escape", "Golf", "Tiguan",
    ],
    "Year": [1982, 1966, 1994, 1964, 1975, 2000, 1974, 2007],
    "Engine_Min": [2.5, 1.8, 2.0, 2.3, 2.7, 1.5, 1.0, 1.4],
    "Engine_Max": [3.5, 2.0, 2.5, 5.0, 5.0, 2.5, 2.0, 2.0],
    "AWD": [False, False, True, False, True, True, True, True],
    "Company": [
        "Toyota", "Toyota", "Toyota", "Ford",
        "Ford", "Ford", "Volkswagen", "Volkswagen",
    ],
})

In [None]:
import duckdb
# DuckDB resolves the name df inside the query to the Polars DataFrame
# variable defined in the previous cell.
result = duckdb.sql('''
  SELECT 
    *
  FROM df
''')
result
# The sql() function returns a duckdb.DuckDBPyRelation object.

┌─────────┬───────┬────────────┬────────────┬─────────┬────────────┐
│  Model  │ Year  │ Engine_Min │ Engine_Max │   AWD   │  Company   │
│ varchar │ int64 │   double   │   double   │ boolean │  varchar   │
├─────────┼───────┼────────────┼────────────┼─────────┼────────────┤
│ Camry   │  1982 │        2.5 │        3.5 │ false   │ Toyota     │
│ Corolla │  1966 │        1.8 │        2.0 │ false   │ Toyota     │
│ RAV4    │  1994 │        2.0 │        2.5 │ true    │ Toyota     │
│ Mustang │  1964 │        2.3 │        5.0 │ false   │ Ford       │
│ F-150   │  1975 │        2.7 │        5.0 │ true    │ Ford       │
│ Escape  │  2000 │        1.5 │        2.5 │ true    │ Ford       │
│ Golf    │  1974 │        1.0 │        2.0 │ true    │ Volkswagen │
│ Tiguan  │  2007 │        1.4 │        2.0 │ true    │ Volkswagen │
└─────────┴───────┴────────────┴────────────┴─────────┴────────────┘

In [None]:
result.pl()  # Displays the result as a Polars DataFrame

Model,Year,Engine_Min,Engine_Max,AWD,Company
str,i64,f64,f64,bool,str
"""Camry""",1982,2.5,3.5,False,"""Toyota"""
"""Corolla""",1966,1.8,2.0,False,"""Toyota"""
"""RAV4""",1994,2.0,2.5,True,"""Toyota"""
"""Mustang""",1964,2.3,5.0,False,"""Ford"""
"""F-150""",1975,2.7,5.0,True,"""Ford"""
"""Escape""",2000,1.5,2.5,True,"""Ford"""
"""Golf""",1974,1.0,2.0,True,"""Volkswagen"""
"""Tiguan""",2007,1.4,2.0,True,"""Volkswagen"""


In [None]:
result.describe() # Generate some basic statistics (count, mean, stddev, min, max, median)

┌─────────┬─────────┬────────────────────┬────────────────────┬───────────────────┬─────────┬────────────┐
│  aggr   │  Model  │        Year        │     Engine_Min     │    Engine_Max     │   AWD   │  Company   │
│ varchar │ varchar │       double       │       double       │      double       │ varchar │  varchar   │
├─────────┼─────────┼────────────────────┼────────────────────┼───────────────────┼─────────┼────────────┤
│ count   │ 8       │                8.0 │                8.0 │               8.0 │ 8       │ 8          │
│ mean    │ NULL    │            1982.75 │ 1.9000000000000001 │            3.0625 │ NULL    │ NULL       │
│ stddev  │ NULL    │ 15.953056133543761 │ 0.5855400437691198 │ 1.293872923766914 │ NULL    │ NULL       │
│ min     │ Camry   │             1964.0 │                1.0 │               2.0 │ false   │ Ford       │
│ max     │ Tiguan  │             2007.0 │                2.7 │               5.0 │ true    │ Volkswagen │
│ median  │ NULL    │             197

In [None]:
result.order('Year') # Order the result by the Year column

┌─────────┬───────┬────────────┬────────────┬─────────┬────────────┐
│  Model  │ Year  │ Engine_Min │ Engine_Max │   AWD   │  Company   │
│ varchar │ int64 │   double   │   double   │ boolean │  varchar   │
├─────────┼───────┼────────────┼────────────┼─────────┼────────────┤
│ Mustang │  1964 │        2.3 │        5.0 │ false   │ Ford       │
│ Corolla │  1966 │        1.8 │        2.0 │ false   │ Toyota     │
│ Golf    │  1974 │        1.0 │        2.0 │ true    │ Volkswagen │
│ F-150   │  1975 │        2.7 │        5.0 │ true    │ Ford       │
│ Camry   │  1982 │        2.5 │        3.5 │ false   │ Toyota     │
│ RAV4    │  1994 │        2.0 │        2.5 │ true    │ Toyota     │
│ Escape  │  2000 │        1.5 │        2.5 │ true    │ Ford       │
│ Tiguan  │  2007 │        1.4 │        2.0 │ true    │ Volkswagen │
└─────────┴───────┴────────────┴────────────┴─────────┴────────────┘

In [None]:
result.order('Year DESC') # Order the result by the Year column in descending order

┌─────────┬───────┬────────────┬────────────┬─────────┬────────────┐
│  Model  │ Year  │ Engine_Min │ Engine_Max │   AWD   │  Company   │
│ varchar │ int64 │   double   │   double   │ boolean │  varchar   │
├─────────┼───────┼────────────┼────────────┼─────────┼────────────┤
│ Tiguan  │  2007 │        1.4 │        2.0 │ true    │ Volkswagen │
│ Escape  │  2000 │        1.5 │        2.5 │ true    │ Ford       │
│ RAV4    │  1994 │        2.0 │        2.5 │ true    │ Toyota     │
│ Camry   │  1982 │        2.5 │        3.5 │ false   │ Toyota     │
│ F-150   │  1975 │        2.7 │        5.0 │ true    │ Ford       │
│ Golf    │  1974 │        1.0 │        2.0 │ true    │ Volkswagen │
│ Corolla │  1966 │        1.8 │        2.0 │ false   │ Toyota     │
│ Mustang │  1964 │        2.3 │        5.0 │ false   │ Ford       │
└─────────┴───────┴────────────┴────────────┴─────────┴────────────┘

In [None]:
result.apply('min', 'Year') # Use the apply() method to apply an aggregate function (here min) to a column.

┌─────────────┐
│ min("Year") │
│    int64    │
├─────────────┤
│        1964 │
└─────────────┘

In [10]:
# Query the Polars DataFrame with SQL, sorted by Company and then Model, and
# convert the result back to a Polars DataFrame with .pl().
duckdb.sql('''
  SELECT Company, Model
  FROM df
  ORDER by Company, Model
''').pl()

Company,Model
str,str
"""Ford""","""Escape"""
"""Ford""","""F-150"""
"""Ford""","""Mustang"""
"""Toyota""","""Camry"""
"""Toyota""","""Corolla"""
"""Toyota""","""RAV4"""
"""Volkswagen""","""Golf"""
"""Volkswagen""","""Tiguan"""


In [11]:
# Count the models per company. There is no ORDER BY, so the order of the
# rows in the result is not guaranteed.
duckdb.sql('''
  SELECT Company, count(Model) as count
  FROM df
  GROUP BY Company
''').pl()

Company,count
str,i64
"""Toyota""",3
"""Volkswagen""",2
"""Ford""",3


In [12]:
# The same per-company counts using Polars expressions: value_counts() packs
# each value and its count into a struct, and unnest() splits that struct
# into the separate Company and count columns.
result.pl().select(
    pl.col('Company').value_counts()
).unnest('Company')

Company,count
str,u32
"""Toyota""",3
"""Ford""",3
"""Volkswagen""",2


### Using the DuckDBPyRelation Object

In [13]:
import duckdb
# connect() with no arguments opens a fresh in-memory database.
conn = duckdb.connect()
# customers and products each have a single-column primary key.
conn.execute('''
  CREATE TABLE customers
    (customer_id INTEGER PRIMARY KEY, name STRING)
''')
conn.execute('''
  CREATE TABLE products
    (product_id INTEGER PRIMARY KEY, product_name STRING)
''')
# sales uses a composite primary key: one row per (customer, product) pair.
conn.execute('''
  CREATE TABLE sales
    (customer_id INTEGER, product_id INTEGER, qty INTEGER,
  PRIMARY KEY(customer_id,product_id))
''')

<duckdb.duckdb.DuckDBPyConnection at 0x7ac703f9a830>

In [None]:
customers_relation = conn.table('customers') # Load the customers table as a DuckDBPyRelation object

In [15]:
# convert to a pandas DataFrame
# (only the last expression in a cell is rendered, so this result is not shown)
customers_relation.df()

# convert to a Polars DataFrame
customers_relation.pl()

customer_id,name
i32,str


### Inserting Rows

In [16]:
# Each insert() adds one row, with values given in the table's column order
# (customer_id, name). customer_id is the primary key, so running this cell a
# second time on the same connection fails with a duplicate-key error.
customers_relation.insert([1, 'Alice'])
customers_relation.insert([2, 'Bob'])
customers_relation.insert([3, 'Charlie'])

In [17]:
# Populate products: values are (product_id, product_name).
products_relation = conn.table('products')
products_relation.insert([10, 'Paperclips'])
products_relation.insert([20, 'Staple'])
products_relation.insert([30, 'Notebook'])
# Populate sales: values are (customer_id, product_id, qty).
# Both tables have primary keys, so re-running this cell on the same
# connection fails with a duplicate-key error.
sales_relation = conn.table("sales")
sales_relation.insert([1,20,1])
sales_relation.insert([1,10,2])
sales_relation.insert([2,30,7])
sales_relation.insert([3,10,3])
sales_relation.insert([3,20,2])

### Joining tables

In [18]:
# Join customers to sales on customer_id, then join that to products on
# product_id. Giving a bare column name as the condition joins on that shared
# column, which then appears only once in the result.
result = customers_relation.join(
    sales_relation,
    condition = "customer_id",
    how = "inner"
).join(
    products_relation,
    condition = "product_id",
    how = "inner"
)    

In [19]:
result

┌─────────────┬─────────┬────────────┬───────┬──────────────┐
│ customer_id │  name   │ product_id │  qty  │ product_name │
│    int32    │ varchar │   int32    │ int32 │   varchar    │
├─────────────┼─────────┼────────────┼───────┼──────────────┤
│           1 │ Alice   │         20 │     1 │ Staple       │
│           1 │ Alice   │         10 │     2 │ Paperclips   │
│           2 │ Bob     │         30 │     7 │ Notebook     │
│           3 │ Charlie │         10 │     3 │ Paperclips   │
│           3 │ Charlie │         20 │     2 │ Staple       │
└─────────────┴─────────┴────────────┴───────┴──────────────┘

### Filtering rows

In [20]:
# filter() takes the condition as a SQL expression string.
result.filter('customer_id = 1')

┌─────────────┬─────────┬────────────┬───────┬──────────────┐
│ customer_id │  name   │ product_id │  qty  │ product_name │
│    int32    │ varchar │   int32    │ int32 │   varchar    │
├─────────────┼─────────┼────────────┼───────┼──────────────┤
│           1 │ Alice   │         10 │     2 │ Paperclips   │
│           1 │ Alice   │         20 │     1 │ Staple       │
└─────────────┴─────────┴────────────┴───────┴──────────────┘

In [21]:
# The same filter written in SQL: the query refers to the Python variable
# result (the joined relation) by name, and .pl() returns the rows as a
# Polars DataFrame.
conn.execute('''
  SELECT *
  FROM result
  WHERE customer_id = 1
''').pl()

customer_id,name,product_id,qty,product_name
i32,str,i32,i32,str
1,"""Alice""",10,2,"""Paperclips"""
1,"""Alice""",20,1,"""Staple"""


### Aggregating rows

In [22]:
# First argument: the aggregate expressions; second argument: the GROUP BY column.
result.aggregate(
    'customer_id, MAX(name) AS Name, SUM(qty) as "Total Qty"',
    'customer_id',
)

┌─────────────┬─────────┬───────────┐
│ customer_id │  Name   │ Total Qty │
│    int32    │ varchar │  int128   │
├─────────────┼─────────┼───────────┤
│           1 │ Alice   │         3 │
│           2 │ Bob     │         7 │
│           3 │ Charlie │         5 │
└─────────────┴─────────┴───────────┘

In [23]:
# The same aggregation written in SQL against the joined relation: total
# quantity per customer, returned as a Polars DataFrame.
conn.execute('''
  SELECT customer_id as 'Customer ID', MAX(name) AS Name, sum(qty) AS "Total Qty"
  FROM result
  GROUP BY customer_id
''').pl()

Customer ID,Name,Total Qty
i32,str,"decimal[38,0]"
1,"""Alice""",3
2,"""Bob""",7
3,"""Charlie""",5


### Projecting columns

In [24]:
# project() keeps only the listed columns, like the column list of a SQL SELECT.
result.project('name, qty, product_name')

┌─────────┬───────┬──────────────┐
│  name   │  qty  │ product_name │
│ varchar │ int32 │   varchar    │
├─────────┼───────┼──────────────┤
│ Alice   │     1 │ Staple       │
│ Alice   │     2 │ Paperclips   │
│ Bob     │     7 │ Notebook     │
│ Charlie │     3 │ Paperclips   │
│ Charlie │     2 │ Staple       │
└─────────┴───────┴──────────────┘

### Limiting rows

In [25]:
result.limit(3) # display only the first 3 rows

┌─────────────┬─────────┬────────────┬───────┬──────────────┐
│ customer_id │  name   │ product_id │  qty  │ product_name │
│    int32    │ varchar │   int32    │ int32 │   varchar    │
├─────────────┼─────────┼────────────┼───────┼──────────────┤
│           1 │ Alice   │         20 │     1 │ Staple       │
│           1 │ Alice   │         10 │     2 │ Paperclips   │
│           2 │ Bob     │         30 │     7 │ Notebook     │
└─────────────┴─────────┴────────────┴───────┴──────────────┘

In [26]:
result.limit(3,2) # display 3 rows, starting at offset 2 (third row)

┌─────────────┬─────────┬────────────┬───────┬──────────────┐
│ customer_id │  name   │ product_id │  qty  │ product_name │
│    int32    │ varchar │   int32    │ int32 │   varchar    │
├─────────────┼─────────┼────────────┼───────┼──────────────┤
│           2 │ Bob     │         30 │     7 │ Notebook     │
│           3 │ Charlie │         10 │     3 │ Paperclips   │
│           3 │ Charlie │         20 │     2 │ Staple       │
└─────────────┴─────────┴────────────┴───────┴──────────────┘