In [None]:
# %pip install duckdb  # uncomment to install into the active kernel's environment

In [None]:
import duckdb

# open a connection to a DuckDB database stored in a file on disk
conn = duckdb.connect(database='my_duckdb_database.db')

In [None]:
# alternative to the file-backed connection: keep the whole database in memory
conn = duckdb.connect(database=':memory:')

In [None]:
# create the employees table
# CREATE OR REPLACE keeps this cell re-runnable: a plain CREATE TABLE raises
# "table already exists" when the cell is executed twice, or on a second
# Run All against the file-backed database, where the table persists.
conn.execute('''
  CREATE OR REPLACE TABLE employees (
    id INTEGER PRIMARY KEY,
    name VARCHAR,
    age INTEGER,
    department VARCHAR
  )
''')

In [None]:
# list the tables defined in the connected database as a DataFrame
show_tables_sql = 'SHOW TABLES'
conn.execute(show_tables_sql).df()

In [None]:
# populate the employees table with four sample rows
insert_employees_sql = '''
  INSERT INTO employees VALUES
    (1, 'Alice', 30, 'HR'), 
    (2, 'Bob', 35, 'Engineering'),
    (3, 'Charlie', 28, 'Marketing'),
    (4, 'David', 40, 'Engineering')
'''
conn.execute(insert_employees_sql)

In [None]:
# fetch every row of the employees table as a pandas DataFrame
all_employees_sql = '''
  SELECT * FROM employees
'''
conn.execute(all_employees_sql).df()

In [None]:
# number of employees in each department
headcount_sql = '''
  SELECT 
    department, 
    COUNT(*) AS employee_count
  FROM 
    employees
  GROUP BY 
    department
'''
conn.execute(headcount_sql).df()

In [None]:
# mean age across all employees (single-row result)
average_age_sql = '''
  SELECT 
    AVG(age) AS average_age
  FROM 
    employees
'''
conn.execute(average_age_sql).df()

In [None]:
# highest age found within each department
oldest_age_sql = '''
  SELECT 
    department, 
    MAX(age) AS oldest_age
  FROM 
    employees
  GROUP BY 
    department
'''
conn.execute(oldest_age_sql).df()

In [None]:
# mean age of the employees in each department
average_age_by_department_sql = '''
  SELECT 
    department, 
    AVG(age) AS average_age
  FROM 
    employees
  GROUP BY 
    department
'''
conn.execute(average_age_by_department_sql).df()

In [None]:
# open a new connection with no database path; nothing is copied from the
# earlier connection, so the tables below are created from scratch
conn = duckdb.connect()

# DDL and sample rows for the orders table
create_orders_sql = '''
  CREATE TABLE orders (
    order_id INTEGER, 
    customer_id INTEGER, 
    amount FLOAT)
  '''
insert_orders_sql = '''
  INSERT INTO orders 
  VALUES (1, 1, 100.0), 
         (2, 2, 200.0), 
         (3, 1, 150.0)
'''

# DDL and sample rows for the customers table
create_customers_sql = '''
  CREATE TABLE customers (
    customer_id INTEGER, 
    name VARCHAR)
'''
insert_customers_sql = '''
  INSERT INTO customers 
  VALUES (1, 'Alice'), 
         (2, 'Bob')
'''

# run the statements in dependency order: each table is created before it is filled
conn.execute(create_orders_sql)
conn.execute(insert_orders_sql)
conn.execute(create_customers_sql)
conn.execute(insert_customers_sql)


In [None]:
# materialise both tables as DataFrames, then render them one after the other
orders_df = conn.execute('''
  SELECT * FROM orders
''').df()
customers_df = conn.execute('''
  SELECT * FROM customers
''').df()
display(orders_df, customers_df)

In [None]:
# join the two tables: one output row per order, with the customer who placed it
# - no trailing comma after the last SELECT column (only DuckDB's lenient
#   parser accepts one; other SQL engines reject it)
# - order_id is a tie-breaker so a customer's orders always appear in the
#   same order (customer 1 has two orders)
conn.execute('''
  SELECT
    customers.customer_id,
    customers.name,
    orders.amount
  FROM
    orders
  JOIN
    customers
  ON
    orders.customer_id = customers.customer_id
  ORDER BY
    customers.customer_id,
    orders.order_id
''').df()


In [None]:
# join orders to customers and total the order amounts per customer
total_spent_sql = '''
  SELECT 
    customers.customer_id, 
    customers.name, 
    SUM(orders.amount) as total_spent
  FROM 
    orders
  JOIN 
    customers 
  ON 
    orders.customer_id = customers.customer_id
  GROUP BY 
    customers.customer_id, 
    customers.name
  ORDER by 
    customers.customer_id
'''
conn.execute(total_spent_sql).df()

In [None]:
import pandas as pd

# Employee DataFrame, built row by row: (employee_id, name, age, department)
employee_rows = [
    (1, 'Alice', 30, 'HR'),
    (2, 'Bob', 35, 'Engineering'),
    (3, 'Charlie', 28, 'Marketing'),
    (4, 'David', 40, 'Engineering'),
]
employees = pd.DataFrame(
    employee_rows,
    columns=['employee_id', 'name', 'age', 'department'],
)

# Sales DataFrame, built row by row: (sale_id, employee_id, sale_amount, sale_date)
# sale_date is kept as plain strings, exactly as in the column-wise version
sale_rows = [
    (101, 1, 200, '2023-01-01'),
    (102, 2, 500, '2023-01-03'),
    (103, 1, 150, '2023-01-04'),
    (104, 3, 300, '2023-01-05'),
    (105, 4, 700, '2023-01-07'),
]
sales = pd.DataFrame(
    sale_rows,
    columns=['sale_id', 'employee_id', 'sale_amount', 'sale_date'],
)

# render both frames, employees first
display(employees, sales)

In [None]:
# open a fresh DuckDB connection; the query below refers to the pandas
# DataFrames `employees` and `sales` by their Python variable names
conn = duckdb.connect()
# join the dataframes, group by department, and perform aggregations
# average_sale_per_employee is total sales divided by headcount:
# AVG(s.sale_amount) would be the average per *sale*, which under-reports
# any employee with several sales (HR: 175 per sale vs 350 per employee)
# ORDER BY makes the row order reproducible between runs
query = '''
  SELECT
    e.department,
    SUM(s.sale_amount) AS total_sales,
    SUM(s.sale_amount) / COUNT(DISTINCT e.employee_id) AS average_sale_per_employee,
    COUNT(DISTINCT e.employee_id) AS number_of_employees
  FROM
    employees e
  LEFT JOIN
    sales s ON e.employee_id = s.employee_id
  GROUP BY
    e.department
  ORDER BY
    e.department
'''
conn.execute(query).df()

In [None]:
# one row per department: the employee with the single largest sale and
# the amount of that sale
# Grouping by (department, name) returned every employee's best sale, so
# Engineering showed both Bob and David as "top_employee". Grouping by
# department alone and picking the name with arg_max yields the real winner.
query = '''
  SELECT
    e.department,
    arg_max(e.name, s.sale_amount) AS top_employee,
    MAX(s.sale_amount) AS top_sale_amount
  FROM
    employees e
  LEFT JOIN
    sales s ON e.employee_id = s.employee_id
  GROUP BY
    e.department
  ORDER BY
    top_sale_amount DESC
'''
conn.execute(query).df()

In [None]:
import pandas as pd

# load the CSV file and time it
%timeit df = pd.read_csv('flights.csv')

In [None]:
# load the flights data once, outside the timed statement, so the timing
# below covers only the aggregation
df = pd.read_csv('flights.csv')
# time the per-airline mean of ARRIVAL_DELAY computed with pandas
%timeit df.groupby('AIRLINE')['ARRIVAL_DELAY'].mean().reset_index()

In [None]:
# mean arrival delay per airline, with AIRLINE kept as a regular column
df.groupby('AIRLINE', as_index=False)['ARRIVAL_DELAY'].mean()

In [None]:
import duckdb

conn = duckdb.connect()
query = '''
  SELECT
    AIRLINE,
    AVG(ARRIVAL_DELAY) AS MEAN_ARRIVAL_DELAY
  FROM
    read_csv_auto('flights.csv')
  GROUP BY
    AIRLINE
    ORDER BY
    AIRLINE;
'''
%timeit df = conn.execute(query).df()

In [None]:
import psutil

def memory_usage():
    """Return the resident set size (RSS) reported by ``psutil.Process()``, in MB.

    The byte count is divided by 1024 ** 2, so the unit is mebibytes.
    """
    rss_bytes = psutil.Process().memory_info().rss
    bytes_per_mb = 1024 ** 2
    return rss_bytes / bytes_per_mb

In [None]:
import pandas as pd

import gc

# Earlier cells leave a full copy of the flights data bound to `df`. Release
# it and collect garbage first; otherwise the "before" reading already
# includes the dataset and the difference says little about the load.
# (The allocator may not return every freed page to the OS, so treat the
# figures as approximate.)
df = None
gc.collect()

# measure memory before loading
memory_before = memory_usage()
print(f"Memory used before query: {memory_before:.2f} MB")

# load the CSV file
df = pd.read_csv('flights.csv')

# measure memory after loading
memory_after = memory_usage()

print(f"Memory used after query: {memory_after:.2f} MB")
# report the difference explicitly so the two approaches are easy to compare
print(f"Memory increase: {memory_after - memory_before:.2f} MB")

In [None]:
# show the DataFrame currently bound to `df` (the flights data loaded above)
df

In [None]:
import duckdb
import gc

conn = duckdb.connect()
# per-airline mean arrival delay, read straight from the CSV by DuckDB
query = '''
  SELECT
    AIRLINE,
    AVG(ARRIVAL_DELAY) AS MEAN_ARRIVAL_DELAY
  FROM
    read_csv_auto('flights.csv')
  GROUP BY
    AIRLINE
    ORDER BY
    AIRLINE;
'''

# `df` still holds the full pandas copy of the dataset from the earlier
# cells. Without releasing it, the "before" reading includes that copy and
# the reassignment below frees it, so "after" could even be lower than
# "before". Drop it and collect garbage so only the query is measured.
# (The allocator may not return every freed page to the OS, so treat the
# figures as approximate.)
df = None
gc.collect()

# measure memory before query execution
memory_before = memory_usage()
print(f"Memory used before query: {memory_before:.2f} MB")

# run the query
df = conn.execute(query).df()

# measure memory after query execution
memory_after = memory_usage()
print(f"Memory used after query: {memory_after:.2f} MB")
# report the difference explicitly so the two approaches are easy to compare
print(f"Memory increase: {memory_after - memory_before:.2f} MB")