# Connect to DuckDB

In [None]:
import duckdb

# Open (or create) the on-disk database file. The context manager closes the
# connection when the block exits.
with duckdb.connect("databases/hotels.duckdb") as conn:
    # The leading `--sql` is just a SQL line comment (presumably an editor
    # syntax-highlighting hint); it has no effect on the statement.
    conn.sql("""--sql
    CREATE TABLE IF NOT EXISTS bookings (
        date DATE,
        room_type STRING,
        price FLOAT,
        breakfast BOOLEAN,
        name STRING,
        email STRING,
        guests UINT8
    )
    """)

    # Seed the sample rows only when the table is still empty. The database
    # file persists between runs, so an unconditional INSERT would append
    # another copy of these 8 rows every time this cell is re-executed.
    (existing_rows,) = conn.sql("SELECT count(*) FROM bookings").fetchone()
    if existing_rows == 0:
        conn.sql("""--sql
        INSERT INTO bookings VALUES
            ('2024-01-15', 'Single', 150.00, True, 'John Doe', 'john@email.com', 2),
            ('2024-01-16', 'Double', 220.00, False, 'Jane Smith', 'jane@email.com', 3),
            ('2024-01-17', 'Suite', 450.00, True, 'Bob Johnson', 'bob@email.com', 1),
            ('2024-01-18', 'Single', 150.00, True, 'Alice Brown', 'alice@email.com', 1),
            ('2024-01-19', 'Double', 220.00, True, 'Charlie Davis', 'charlie@email.com', 4),
            ('2024-01-20', 'Suite', 450.00, False, 'Eva Wilson', 'eva@email.com', 2),
            ('2024-01-21', 'Single', 150.00, False, 'Frank Miller', 'frank@email.com', 1),
            ('2024-01-22', 'Double', 220.00, True, 'Grace Lee', 'grace@email.com', 3);
        """)

    # Materialise the table as a pandas DataFrame before the connection closes.
    bookings = conn.sql("FROM bookings;").df()
bookings

## Read a CSV file directly

In [14]:
# Query the CSV file in place with DuckDB's FROM-first syntax (no explicit
# SELECT), then convert the resulting relation to a pandas DataFrame.
invoice_rel = duckdb.sql("""
    FROM 'data/Leverantorsfaktura202408.csv';
""")
invoice = invoice_rel.df()

# NOTE(review): in the displayed preview "Belopp exkl moms" mixes values such
# as "9 835 315,00" and 87000, so it looks like it is loaded as text -- confirm
# the dtype and parse it before doing any arithmetic on the amounts.
invoice.head()

Unnamed: 0,Förvaltning,Leverantör,Organisationsnummer,Verifikationsnummer,Konto,Kontotext,Belopp exkl moms
0,Stadsmiljönämnden,TRACK TEC GMBH,106/5727/0626,4001291513,4101,Inköp anläggnings och underhållsmaterial,"9 835 315,00"
1,Kretslopp och Vatten,POLISMYNDIGHETEN I VÄSTRA GÖTALAND,2021000076,5601378982,6185,Anläggningsentreprenad,87000
2,Kretslopp och Vatten,POLISMYNDIGHETEN I VÄSTRA GÖTALAND,2021000076,5601377374,6185,Anläggningsentreprenad,87000
3,Kretslopp och Vatten,POLISMYNDIGHETEN I VÄSTRA GÖTALAND,2021000076,5601378519,6185,Anläggningsentreprenad,87000
4,Exploateringsnämnden,POLISMYNDIGHETEN I VÄSTRA GÖTALAND,2021000076,2001226894,7641,Diverse skatter och offentliga avgifter,"1 000,00"


In [17]:
# (rows, columns) of the full invoice frame, not just the head() preview.
invoice.shape

(92989, 7)

In [21]:
# Same pattern as for the invoice file: DuckDB reads the CSV directly from
# disk and the relation is then converted to a pandas DataFrame.
finance_rel = duckdb.sql("""
    FROM 'data/financial_data.csv';
""")
finance = finance_rel.df()

# Preview the first rows only; avoid dumping the whole frame.
finance.head()

Unnamed: 0,transaction_id,date,company,transaction_type,category,amount,currency,account_number,description,status,payment_method,tax_amount,net_amount
0,TXN1000,2024-01-15,UnitedHealth Group,Investment,Marketing,498789.6,JPY,ACC90616,Q1 revenue for r&d,Completed,Check,15640.11,483149.49
1,TXN1001,2024-12-22,Broadcom Inc,Expense,Administrative,407890.44,GBP,ACC72475,Q4 capital gain for it,Completed,Wire Transfer,25883.85,382006.59
2,TXN1002,2024-10-11,Starbucks,Capital Gain,Operations,363927.61,USD,ACC26222,Q3 expense for administrative,Pending,ACH,24814.56,339113.05
3,TXN1003,2024-09-30,Lockheed Martin,Dividend,R&D,336378.01,EUR,ACC86805,Q4 investment for marketing,Completed,Check,46359.17,290018.84
4,TXN1004,2024-07-02,IBM Corp,Operating Cost,Administrative,193284.51,GBP,ACC72617,Q2 expense for distribution,Completed,Credit Card,41529.35,151755.16


## Read several CSV files and combine them

In [23]:
# The glob pattern makes DuckDB read every file matching data/hotel*.csv and
# return them as one combined result.
# NOTE(review): this rebinds `bookings`, which earlier held the table read from
# databases/hotels.duckdb -- consider a distinct name if both are needed.
hotel_rel = duckdb.sql("""
    FROM 'data/hotel*.csv';
""")
bookings = hotel_rel.df()
bookings.head()

Unnamed: 0,booking_id,guest_name,check_in,check_out,room_type,guests,price_sek_per_night,breakfast,source,status
0,H2025-01-001,Anna Berg,2025-01-05,2025-01-07,Standard,2,1150,True,Direct,Confirmed
1,H2025-01-002,Johan Nilsson,2025-01-10,2025-01-11,Single,1,890,False,Booking.com,Confirmed
2,H2025-01-003,Sofia Lind,2025-01-14,2025-01-16,Deluxe,2,1450,True,Expedia,Confirmed
3,H2025-01-004,Erik Svensson,2025-01-20,2025-01-22,Standard,3,1250,True,Direct,Confirmed
4,H2025-01-005,Maria Johansson,2025-01-27,2025-01-28,Single,1,920,False,Hotels.com,Cancelled


In [24]:
# Show the last rows as well, to check that rows from the later files matched
# by the glob are present in the combined frame.
bookings.tail()

Unnamed: 0,booking_id,guest_name,check_in,check_out,room_type,guests,price_sek_per_night,breakfast,source,status
10,H2025-03-011,Maria Berg,2025-03-21,2025-03-22,Standard,3,1250,True,Hotels.com,Confirmed
11,H2025-03-012,Isak Axelsson,2025-03-15,2025-03-18,Single,1,1012,True,Direct,Confirmed
12,H2025-03-013,Elin Håkansson,2025-03-12,2025-03-13,Standard,2,1151,True,Direct,Cancelled
13,H2025-03-014,Elias Bengtsson,2025-03-08,2025-03-09,Single,2,1097,True,Hotels.com,Confirmed
14,H2025-03-015,Albin Lundberg,2025-03-01,2025-03-03,Single,1,1027,False,Expedia,Confirmed


### Read JSON data

In [26]:
# Preview the raw JSON as loaded: one row per library, with `books` still a
# nested list of records.
# The path uses the same lowercase spelling as the other cells that read this
# file, so the notebook does not depend on a case-insensitive filesystem.
# NOTE(review): confirm the file on disk is named `library.json`.
duckdb.sql("FROM 'data/library.json';").df()

Unnamed: 0,name,books
0,Coolu Libraru,"[{'id': 1, 'title': 'The Hitchhiker's Guide to..."


#### Unnesting

In [31]:
# Flatten the nested `books` list: UNNEST with max_depth := 2 expands the list
# into rows and the book records into columns, alongside the library name.
unnested_books_rel = duckdb.sql("""
    SELECT 
        l.name as library_name,
        UNNEST(l.books, max_depth := 2)
    FROM 'data/library.json' AS l 
""")
books_df = unnested_books_rel.df()

books_df

Unnamed: 0,library_name,id,title,author,year
0,Coolu Libraru,1,The Hitchhiker's Guide to the Galaxy,Douglas Adams,1979
1,Coolu Libraru,2,Pride and Prejudice,Jane Austen,1813
2,Coolu Libraru,3,1984,George Orwell,1949
3,Coolu Libraru,4,To Kill a Mockingbird,Harper Lee,1960
4,Coolu Libraru,5,The Great Gatsby,F. Scott Fitzgerald,1925
5,Coolu Libraru,6,Moby Dick,Herman Melville,1851
6,Coolu Libraru,7,War and Peace,Leo Tolstoy,1869
7,Coolu Libraru,8,The Lord of the Rings,J.R.R. Tolkien,1954
8,Coolu Libraru,9,Crime and Punishment,Fyodor Dostoevsky,1866
9,Coolu Libraru,10,Don Quixote,Miguel de Cervantes,1605


### Use a cross join

In [37]:
# Alternative flattening: cross join each library row with its unnested books
# and pick individual fields from the unnested record via `b.unnest.<field>`.
# Unlike the UNNEST(..., max_depth := 2) query, this selects only name, title,
# author and year (no book id).
# NOTE(review): this rebinds `books_df` from the previous cell.
cross_join_books_rel = duckdb.sql("""
SELECT
l.name,
b.unnest.title,
b.unnest.author,
b.unnest.year,
FROM 'data/library.json' l
CROSS JOIN UNNEST(l.books) b
""")
books_df = cross_join_books_rel.df()

In [40]:
# Write the flattened books to CSV; index=False leaves the DataFrame index out
# of the file. When run top to bottom, `books_df` here is the cross-join
# version assigned in the cell just before this one.
books_df.to_csv("data/books.csv", index=False)