In [1]:
!pip install duckdb

Collecting duckdb
  Downloading duckdb-1.3.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (7.0 kB)
Downloading duckdb-1.3.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (21.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.1/21.1 MB[0m [31m25.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: duckdb
Successfully installed duckdb-1.3.2

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [2]:
!python3 -m pip install --upgrade pip

Collecting pip
  Downloading pip-25.2-py3-none-any.whl.metadata (4.7 kB)
Downloading pip-25.2-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 25.1.1
    Uninstalling pip-25.1.1:
      Successfully uninstalled pip-25.1.1
Successfully installed pip-25.2


In [3]:
import duckdb

# create a connection to a new DuckDB database file
conn = duckdb.connect('my_duckdb_database.db')

In [4]:
# alternatively, to create an in-memory database:
conn = duckdb.connect(':memory:')

In [5]:
# create a table
conn.execute('''
  CREATE TABLE employees (
    id INTEGER PRIMARY KEY,
    name VARCHAR,
    age INTEGER,
    department VARCHAR
  )
''')

<duckdb.duckdb.DuckDBPyConnection at 0x7a5482fa4cb0>

In [6]:
conn.execute('SHOW TABLES').df()

Unnamed: 0,name
0,employees


In [7]:
# insert data into the table
conn.execute('''
  INSERT INTO employees VALUES
    (1, 'Alice', 30, 'HR'), 
    (2, 'Bob', 35, 'Engineering'),
    (3, 'Charlie', 28, 'Marketing'),
    (4, 'David', 40, 'Engineering')
''')

<duckdb.duckdb.DuckDBPyConnection at 0x7a5482fa4cb0>

In [8]:
conn.execute('''
  SELECT * FROM employees
''').df()

Unnamed: 0,id,name,age,department
0,1,Alice,30,HR
1,2,Bob,35,Engineering
2,3,Charlie,28,Marketing
3,4,David,40,Engineering


In [9]:
conn.execute('''
  SELECT 
    department, 
    COUNT(*) AS employee_count
  FROM 
    employees
  GROUP BY 
    department
''').df()

Unnamed: 0,department,employee_count
0,Engineering,2
1,HR,1
2,Marketing,1


In [10]:
conn.execute('''
  SELECT 
    AVG(age) AS average_age
  FROM 
    employees
''').df()

Unnamed: 0,average_age
0,33.25


In [11]:
conn.execute('''
  SELECT 
    department, 
    MAX(age) AS oldest_age
  FROM 
    employees
  GROUP BY 
    department
''').df()

Unnamed: 0,department,oldest_age
0,Engineering,40
1,HR,30
2,Marketing,28


In [12]:
conn.execute('''
  SELECT 
    department, 
    AVG(age) AS average_age
  FROM 
    employees
  GROUP BY 
    department
''').df()

Unnamed: 0,department,average_age
0,Engineering,37.5
1,HR,30.0
2,Marketing,28.0


In [13]:
# create an in-memory copy of the database
conn = duckdb.connect()

# create first table - orders
conn.execute('''
  CREATE TABLE orders (
    order_id INTEGER, 
    customer_id INTEGER, 
    amount FLOAT)
  ''')

# add some records to the orders table
conn.execute('''
  INSERT INTO orders 
  VALUES (1, 1, 100.0), 
         (2, 2, 200.0), 
         (3, 1, 150.0)
''')

# create second table - customers
conn.execute('''
  CREATE TABLE customers (
    customer_id INTEGER, 
    name VARCHAR)
''')

conn.execute('''
  INSERT INTO customers 
  VALUES (1, 'Alice'), 
         (2, 'Bob')
''')


<duckdb.duckdb.DuckDBPyConnection at 0x7a5464331370>

In [14]:
display(conn.execute('''
  SELECT * FROM orders
''').df()
)
display(conn.execute('''
  SELECT * FROM customers
''').df()
)

Unnamed: 0,order_id,customer_id,amount
0,1,1,100.0
1,2,2,200.0
2,3,1,150.0


Unnamed: 0,customer_id,name
0,1,Alice
1,2,Bob


In [15]:
# join the two tables
conn.execute('''
  SELECT 
    customers.customer_id, 
    customers.name, 
    orders.amount,
  FROM 
    orders
  JOIN 
    customers 
  ON 
    orders.customer_id = customers.customer_id
  ORDER by customers.customer_id
''').df()


Unnamed: 0,customer_id,name,amount
0,1,Alice,100.0
1,1,Alice,150.0
2,2,Bob,200.0


In [16]:
# join the two tables
conn.execute('''
  SELECT 
    customers.customer_id, 
    customers.name, 
    SUM(orders.amount) as total_spent
  FROM 
    orders
  JOIN 
    customers 
  ON 
    orders.customer_id = customers.customer_id
  GROUP BY 
    customers.customer_id, 
    customers.name
  ORDER by 
    customers.customer_id
''').df()

Unnamed: 0,customer_id,name,total_spent
0,1,Alice,250.0
1,2,Bob,200.0


In [17]:
import pandas as pd

# Employee DataFrame
employees = pd.DataFrame({
    'employee_id': [1, 2, 3, 4],
    'name': ['Alice', 'Bob', 'Charlie', 'David'],
    'age': [30, 35, 28, 40],
    'department': ['HR', 'Engineering', 'Marketing', 'Engineering']
})

# Sales DataFrame
sales = pd.DataFrame({
    'sale_id': [101, 102, 103, 104, 105],
    'employee_id': [1, 2, 1, 3, 4],
    'sale_amount': [200, 500, 150, 300, 700],
    'sale_date': ['2023-01-01', '2023-01-03', '2023-01-04', '2023-01-05', '2023-01-07']
})
display(employees)
display(sales)

Unnamed: 0,employee_id,name,age,department
0,1,Alice,30,HR
1,2,Bob,35,Engineering
2,3,Charlie,28,Marketing
3,4,David,40,Engineering


Unnamed: 0,sale_id,employee_id,sale_amount,sale_date
0,101,1,200,2023-01-01
1,102,2,500,2023-01-03
2,103,1,150,2023-01-04
3,104,3,300,2023-01-05
4,105,4,700,2023-01-07


In [18]:
# create an in-memory copy of the database using DuckDB
conn = duckdb.connect()
# join the dataframes, group by department, and perform aggregations
query = '''
  SELECT
    e.department,
    SUM(s.sale_amount) AS total_sales,
    AVG(s.sale_amount) AS average_sale_per_employee,
    COUNT(DISTINCT e.employee_id) AS number_of_employees
  FROM
    employees e
  LEFT JOIN
    sales s ON e.employee_id = s.employee_id
  GROUP BY
    e.department
'''
conn.execute(query).df()

Unnamed: 0,department,total_sales,average_sale_per_employee,number_of_employees
0,Engineering,1200.0,600.0,2
1,Marketing,300.0,300.0,1
2,HR,350.0,175.0,1


In [19]:
query = '''
  SELECT
    e.department,
    e.name AS top_employee,
    MAX(s.sale_amount) AS top_sale_amount
  FROM
    employees e
  LEFT JOIN
    sales s ON e.employee_id = s.employee_id
  GROUP BY
    e.department,
    e.name
  ORDER BY
    top_sale_amount DESC
'''
conn.execute(query).df()

Unnamed: 0,department,top_employee,top_sale_amount
0,Engineering,David,700
1,Engineering,Bob,500
2,Marketing,Charlie,300
3,HR,Alice,200


In [1]:
import pandas as pd

# load the CSV file and time it
%timeit df = pd.read_csv('./datasets/flights/flights_sample.csv')

3.5 ms ± 134 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [2]:
df = pd.read_csv('./datasets/flights/flights_sample.csv')
%timeit df.groupby('AIRLINE')['ARRIVAL_DELAY'].mean().reset_index()

567 μs ± 19.4 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [3]:
df.groupby('AIRLINE')['ARRIVAL_DELAY'].mean().reset_index()

Unnamed: 0,AIRLINE,ARRIVAL_DELAY
0,AA,3.442478
1,AS,-6.050847
2,B6,-1.747253
3,DL,-10.068376
4,EV,5.792208
5,F9,5.5
6,HA,1.0
7,MQ,2.894737
8,NK,6.5
9,OO,3.557522


In [4]:
import duckdb

conn = duckdb.connect()
query = '''
  SELECT
    AIRLINE,
    AVG(ARRIVAL_DELAY) AS MEAN_ARRIVAL_DELAY
  FROM
    read_csv_auto('./datasets/flights/flights_sample.csv')
  GROUP BY
    AIRLINE
    ORDER BY
    AIRLINE;
'''
%timeit df = conn.execute(query).df()

30.8 ms ± 2.15 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [5]:
import psutil

def memory_usage():
    process = psutil.Process()
    return process.memory_info().rss / (1024 ** 2) # convert bytes to MB

In [8]:
import pandas as pd

# measure memory before query execution
memory_before = memory_usage()
print(f"Memory used before query: {memory_before:.2f} MB")

# load the CSV file
df = pd.read_csv('./datasets/flights/flights_sample.csv')

# measure memory after query execution
memory_after = memory_usage()

print(f"Memory used after query: {memory_after:.2f} MB")

Memory used before query: 161.59 MB
Memory used after query: 161.96 MB


In [9]:
df

Unnamed: 0,YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,...,ARRIVAL_TIME,ARRIVAL_DELAY,DIVERTED,CANCELLED,CANCELLATION_REASON,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY
0,2015,1,1,4,AS,98,N407AS,ANC,SEA,5,...,408.0,-22.0,0,0,,,,,,
1,2015,1,1,4,AA,2336,N3KUAA,LAX,PBI,10,...,741.0,-9.0,0,0,,,,,,
2,2015,1,1,4,US,840,N171US,SFO,CLT,20,...,811.0,5.0,0,0,,,,,,
3,2015,1,1,4,AA,258,N3HYAA,LAX,MIA,20,...,756.0,-9.0,0,0,,,,,,
4,2015,1,1,4,AS,135,N527AS,SEA,ANC,25,...,259.0,-21.0,0,0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,2015,1,1,4,WN,552,N291WN,SAT,ELP,705,...,810.0,30.0,0,0,,1.0,0.0,29.0,0.0,0.0
996,2015,1,1,4,WN,3239,N905WN,TPA,BWI,705,...,907.0,-18.0,0,0,,,,,,
997,2015,1,1,4,WN,1912,N554WN,TUS,DEN,705,...,846.0,-9.0,0,0,,,,,,
998,2015,1,1,4,UA,1660,N17730,SNA,EWR,705,...,1433.0,-51.0,0,0,,,,,,


In [10]:
import duckdb
conn = duckdb.connect()
query = '''
  SELECT
    AIRLINE,
    AVG(ARRIVAL_DELAY) AS MEAN_ARRIVAL_DELAY
  FROM
    read_csv_auto('./datasets/flights/flights_sample.csv')
  GROUP BY
    AIRLINE
    ORDER BY
    AIRLINE;
'''
# measure memory before query execution
memory_before = memory_usage()
print(f"Memory used before query: {memory_before:.2f} MB")

# run the query
df = conn.execute(query).df()

# measure memory after query execution
memory_after = memory_usage()
print(f"Memory used after query: {memory_after:.2f} MB")

Memory used before query: 154.89 MB
Memory used after query: 159.90 MB
