In [None]:
import duckdb as dd
import pandas as pd

In [None]:
# Create an in-memory DuckDB connection
# con = dd.connect(':memory:')

# Create a persistent DuckDB database
con = dd.connect('my_database.db')

In [3]:
#Running a basic SQL query
result = dd.sql("SELECT 'DuckDB_is_cool' AS answer").fetchall()
print(type(result))
print(result)

result = dd.sql("SELECT 'DuckDB_is_cool' AS answer")
print( type(result) )
print(result)

<class 'list'>
[('DuckDB_is_cool',)]
<class 'duckdb.duckdb.DuckDBPyRelation'>
┌────────────────┐
│     answer     │
│    varchar     │
├────────────────┤
│ DuckDB_is_cool │
└────────────────┘



In [5]:
# Create a relation from a SQL query
rel = dd.sql("SELECT * FROM range(10_00) AS tbl(ID)")
# Display the relation
rel.show()

┌────────────┐
│     ID     │
│   int64    │
├────────────┤
│          0 │
│          1 │
│          2 │
│          3 │
│          4 │
│          5 │
│          6 │
│          7 │
│          8 │
│          9 │
│          · │
│          · │
│          · │
│        990 │
│        991 │
│        992 │
│        993 │
│        994 │
│        995 │
│        996 │
│        997 │
│        998 │
│        999 │
├────────────┤
│ 1000 rows  │
│ (20 shown) │
└────────────┘



In [8]:
con = dd.connect('my_database.db')
con.sql('SHOW ALL TABLES')

┌──────────┬─────────┬─────────┬──────────────┬──────────────┬───────────┐
│ database │ schema  │  name   │ column_names │ column_types │ temporary │
│ varchar  │ varchar │ varchar │  varchar[]   │  varchar[]   │  boolean  │
├──────────┴─────────┴─────────┴──────────────┴──────────────┴───────────┤
│                                 0 rows                                 │
└────────────────────────────────────────────────────────────────────────┘

In [3]:
# Create a table
con.execute('''
CREATE OR REPLACE TABLE countries( 
    country VARCHAR,
    code VARCHAR,
    region VARCHAR,
    sub_region VARCHAR,
    intermediate_region VARCHAR
);
''')
# Insert some data
con.execute('''
INSERT INTO countries VALUES
('Australia', 'AUS', 'Oceania', 'Australia and New Zealand', ''),
('India', 'IND', 'Asia', 'Southern Asia', '');
''')

con.sql('SHOW ALL TABLES')

┌─────────────┬─────────┬───────────┬──────────────────────────────────────────────────────────┬───────────────────────────────────────────────┬───────────┐
│  database   │ schema  │   name    │                       column_names                       │                 column_types                  │ temporary │
│   varchar   │ varchar │  varchar  │                        varchar[]                         │                   varchar[]                   │  boolean  │
├─────────────┼─────────┼───────────┼──────────────────────────────────────────────────────────┼───────────────────────────────────────────────┼───────────┤
│ my_database │ main    │ countries │ [country, code, region, sub_region, intermediate_region] │ [VARCHAR, VARCHAR, VARCHAR, VARCHAR, VARCHAR] │ false     │
│ my_database │ main    │ employees │ [id, name, age, salary]                                  │ [INTEGER, VARCHAR, INTEGER, DOUBLE]           │ false     │
└─────────────┴─────────┴───────────┴─────────────────────

In [4]:
# Create second table
con.execute('''
CREATE OR REPLACE TABLE employees(
    id INTEGER,
    name VARCHAR,
    age INTEGER,
    salary DOUBLE
);
''')

# Insert some data in second table
con.execute('''
INSERT INTO employees VALUES
(1, 'Person 1', 30, 70000),
(2, 'Person 2', 25, 55000),
(3, 'Person 3', 35, 80000);
''')


# Result of showing tables after creating the second table
con.sql('SHOW ALL TABLES')

┌─────────────┬─────────┬───────────┬──────────────────────────────────────────────────────────┬───────────────────────────────────────────────┬───────────┐
│  database   │ schema  │   name    │                       column_names                       │                 column_types                  │ temporary │
│   varchar   │ varchar │  varchar  │                        varchar[]                         │                   varchar[]                   │  boolean  │
├─────────────┼─────────┼───────────┼──────────────────────────────────────────────────────────┼───────────────────────────────────────────────┼───────────┤
│ my_database │ main    │ countries │ [country, code, region, sub_region, intermediate_region] │ [VARCHAR, VARCHAR, VARCHAR, VARCHAR, VARCHAR] │ false     │
│ my_database │ main    │ employees │ [id, name, age, salary]                                  │ [INTEGER, VARCHAR, INTEGER, DOUBLE]           │ false     │
└─────────────┴─────────┴───────────┴─────────────────────

In [5]:
con.sql('SELECT * FROM countries;')

┌───────────┬─────────┬─────────┬───────────────────────────┬─────────────────────┐
│  country  │  code   │ region  │        sub_region         │ intermediate_region │
│  varchar  │ varchar │ varchar │          varchar          │       varchar       │
├───────────┼─────────┼─────────┼───────────────────────────┼─────────────────────┤
│ Australia │ AUS     │ Oceania │ Australia and New Zealand │                     │
│ India     │ IND     │ Asia    │ Southern Asia             │                     │
└───────────┴─────────┴─────────┴───────────────────────────┴─────────────────────┘

In [6]:
con.sql('SELECT * FROM employees;')

┌───────┬──────────┬───────┬─────────┐
│  id   │   name   │  age  │ salary  │
│ int32 │ varchar  │ int32 │ double  │
├───────┼──────────┼───────┼─────────┤
│     1 │ Person 1 │    30 │ 70000.0 │
│     2 │ Person 2 │    25 │ 55000.0 │
│     3 │ Person 3 │    35 │ 80000.0 │
└───────┴──────────┴───────┴─────────┘

In [7]:
con.sql('''
        INSERT INTO countries (country, code, region, sub_region, intermediate_region) 
        (SELECT * FROM "countries.csv")
''')

In [8]:
con.sql('SELECT * FROM countries;')

┌────────────────────────────────────┬─────────┬──────────┬─────────────────────────────────┬─────────────────────┐
│              country               │  code   │  region  │           sub_region            │ intermediate_region │
│              varchar               │ varchar │ varchar  │             varchar             │       varchar       │
├────────────────────────────────────┼─────────┼──────────┼─────────────────────────────────┼─────────────────────┤
│ Australia                          │ AUS     │ Oceania  │ Australia and New Zealand       │                     │
│ India                              │ IND     │ Asia     │ Southern Asia                   │                     │
│ Afghanistan                        │ AFG     │ Asia     │ Southern Asia                   │ NULL                │
│ Åland Islands                      │ ALA     │ Europe   │ Northern Europe                 │ NULL                │
│ Albania                            │ ALB     │ Europe   │ Southern Eur

In [16]:
con.sql('''
        SELECT * FROM countries 
        where 
        region = 'Oceania' 
        AND sub_region = 'Polynesia'
''')

┌───────────────────┬─────────┬─────────┬────────────┬─────────────────────┐
│      country      │  code   │ region  │ sub_region │ intermediate_region │
│      varchar      │ varchar │ varchar │  varchar   │       varchar       │
├───────────────────┼─────────┼─────────┼────────────┼─────────────────────┤
│ American Samoa    │ ASM     │ Oceania │ Polynesia  │ NULL                │
│ Cook Islands      │ COK     │ Oceania │ Polynesia  │ NULL                │
│ French Polynesia  │ PYF     │ Oceania │ Polynesia  │ NULL                │
│ Niue              │ NIU     │ Oceania │ Polynesia  │ NULL                │
│ Pitcairn          │ PCN     │ Oceania │ Polynesia  │ NULL                │
│ Samoa             │ WSM     │ Oceania │ Polynesia  │ NULL                │
│ Tokelau           │ TKL     │ Oceania │ Polynesia  │ NULL                │
│ Tonga             │ TON     │ Oceania │ Polynesia  │ NULL                │
│ Tuvalu            │ TUV     │ Oceania │ Polynesia  │ NULL                │

In [14]:
# you can also query directly from the file

con.sql('''
        SELECT  *
            FROM
                'countries.csv'
            WHERE
                region = 'Oceania'
                AND "sub-region" = 'Polynesia'
''')

┌───────────────────┬─────────┬─────────┬────────────┬─────────────────────┐
│       name        │  code   │ region  │ sub-region │ intermediate-region │
│      varchar      │ varchar │ varchar │  varchar   │       varchar       │
├───────────────────┼─────────┼─────────┼────────────┼─────────────────────┤
│ American Samoa    │ ASM     │ Oceania │ Polynesia  │ NULL                │
│ Cook Islands      │ COK     │ Oceania │ Polynesia  │ NULL                │
│ French Polynesia  │ PYF     │ Oceania │ Polynesia  │ NULL                │
│ Niue              │ NIU     │ Oceania │ Polynesia  │ NULL                │
│ Pitcairn          │ PCN     │ Oceania │ Polynesia  │ NULL                │
│ Samoa             │ WSM     │ Oceania │ Polynesia  │ NULL                │
│ Tokelau           │ TKL     │ Oceania │ Polynesia  │ NULL                │
│ Tonga             │ TON     │ Oceania │ Polynesia  │ NULL                │
│ Tuvalu            │ TUV     │ Oceania │ Polynesia  │ NULL                │

In [20]:
#you can use pandas dataframe

df = pd.DataFrame({
    'id': [4, 5, 6],
    'name': ['Person 4', 'Person 5', 'Person 6'],
    'age': [45, 40, 35],
    'salary': [100000, 85000, 75000]
})

con.sql('''
        INSERT INTO employees (id, name, age, salary)
        SELECT * FROM df
''')

con.sql('select * from employees')

┌───────┬──────────┬───────┬──────────┐
│  id   │   name   │  age  │  salary  │
│ int32 │ varchar  │ int32 │  double  │
├───────┼──────────┼───────┼──────────┤
│     1 │ Person 1 │    30 │  70000.0 │
│     2 │ Person 2 │    25 │  55000.0 │
│     3 │ Person 3 │    35 │  80000.0 │
│     4 │ Person 4 │    45 │ 100000.0 │
│     5 │ Person 5 │    40 │  85000.0 │
│     6 │ Person 6 │    35 │  75000.0 │
└───────┴──────────┴───────┴──────────┘

In [22]:
#you can generate a pandas dataframe from a result
df = con.sql('''
        SELECT  *
            FROM
                'countries.csv'
            WHERE
                region = 'Oceania'
                AND "sub-region" = 'Polynesia'
''').df()

df

Unnamed: 0,name,code,region,sub-region,intermediate-region
0,American Samoa,ASM,Oceania,Polynesia,
1,Cook Islands,COK,Oceania,Polynesia,
2,French Polynesia,PYF,Oceania,Polynesia,
3,Niue,NIU,Oceania,Polynesia,
4,Pitcairn,PCN,Oceania,Polynesia,
5,Samoa,WSM,Oceania,Polynesia,
6,Tokelau,TKL,Oceania,Polynesia,
7,Tonga,TON,Oceania,Polynesia,
8,Tuvalu,TUV,Oceania,Polynesia,
9,Wallis and Futuna,WLF,Oceania,Polynesia,


In [23]:
# Read data from a Parquet file
con.sql("SELECT * FROM 'countries.parquet'").df().head()

Unnamed: 0,name,code,region,sub-region,intermediate-region
0,Afghanistan,AFG,Asia,Southern Asia,
1,Åland Islands,ALA,Europe,Northern Europe,
2,Albania,ALB,Europe,Southern Europe,
3,Algeria,DZA,Africa,Northern Africa,
4,American Samoa,ASM,Oceania,Polynesia,


In [24]:
# you can have the usual analyticals from sql
con.sql('''
        SELECT  region
                , COUNT(DISTINCT country) AS country_counts
            FROM
                countries
            GROUP BY
                region
            ORDER BY
                country_counts DESC
''')

┌──────────┬────────────────┐
│  region  │ country_counts │
│ varchar  │     int64      │
├──────────┼────────────────┤
│ Africa   │             60 │
│ Americas │             57 │
│ Europe   │             51 │
│ Asia     │             51 │
│ Oceania  │             29 │
│ NULL     │              1 │
└──────────┴────────────────┘

In [25]:
# even CTE's
con.sql('''
        WITH avg_salary AS (
            SELECT
                    ROUND(AVG(salary),2) AS avg_salary
                FROM
                    employees
            )
        
        SELECT
                *
            FROM
                employees
            WHERE
                salary > (SELECT avg_salary FROM avg_salary)
        
''')

┌───────┬──────────┬───────┬──────────┐
│  id   │   name   │  age  │  salary  │
│ int32 │ varchar  │ int32 │  double  │
├───────┼──────────┼───────┼──────────┤
│     3 │ Person 3 │    35 │  80000.0 │
│     4 │ Person 4 │    45 │ 100000.0 │
│     5 │ Person 5 │    40 │  85000.0 │
└───────┴──────────┴───────┴──────────┘

In [26]:
con.close()