# Chapter 2. Importing Data into DuckDB

## Creating DuckDB Databases

In [None]:
%pip install duckdb

In [16]:
import duckdb

conn = duckdb.connect()

In [None]:
conn - duckdb.connect(':memory:')

In [2]:
conn = duckdb.connect(database = 'mydb.duckdb', read_only = False)

## Loading Data from Different Data Sources and Formats

### Working with CSV Files

#### Loading using the SQL query method

In [4]:
import duckdb

conn = duckdb.connect()
conn.execute('''
  CREATE TABLE flights 
    as 
  SELECT 
    * 
  FROM read_csv_auto('./datasets/flights/flights_sample.csv')    
''')

<duckdb.duckdb.DuckDBPyConnection at 0x79f8286ab870>

In [None]:
conn.execute('''
  CREATE TABLE flights 
    as 
  SELECT 
    * 
  FROM read_csv_auto('./datasets/flights/flights_sample.csv')    
''').df()

Unnamed: 0,Count
0,1000


In [None]:
# SELECT clause is optional, as is read_csv_auto()
conn.execute('''
  CREATE TABLE flights 
    as 
  FROM './datasets/flights/flights_sample.csv'
''').df()

Unnamed: 0,Count
0,1000


In [None]:
# To avoid errors when it already exists, drop the existing table before creating a new one.
conn.execute('''
  DROP TABLE IF EXISTS flights;
  CREATE TABLE flights
  AS
  FROM './datasets/flights/flights_sample.csv'
''').df()

In [None]:
# Alternatively, use CREATE OR REPLACE TABLE to overwrite an existing table.
conn.execute('''
  CREATE OR REPLACE TABLE flights
  AS
  FROM './datasets/flights/flights_sample.csv'
''').df()

In [2]:
# If your CSV file is large, you can use the LIMIT clause to restrict the number of rows read.
conn.execute('''
  DROP TABLE IF EXISTS flights;
  CREATE TABLE flights
  AS
  FROM read_csv_auto('./datasets/flights/flights_sample.csv')
    LIMIT 100
''').df()

Unnamed: 0,Count
0,100


In [3]:
# To verify the table created in DuckDB database, use the SHOW TABLES command.
display(conn.execute('SHOW TABLES').df())

Unnamed: 0,name
0,flights


In [1]:
import duckdb
# flights.csv broken into 12 parquet files by year and month as GH does not allow large files in repo
# import all 12 parquet files in a directory using wildcard *
# ./datasets/flights/year=2015/month=??/flights.parquet
conn = duckdb.connect()
conn.execute('''
  CREATE TABLE flights 
    as 
  SELECT
    *
  FROM read_parquet('./datasets/flights/year=2015/month=??/flights.parquet')
             
''').df()

Unnamed: 0,Count
0,5819079


In [None]:
import duckdb
# flights.csv broken into 12 parquet files by year and month as GH does not allow large files in repo
# import all 12 parquet files in a directory using wildcard *
# ./datasets/flights/year=2015/month=??/flights.parquet
conn = duckdb.connect()
conn.execute('''
  CREATE TABLE flights 
    as 
  SELECT
    *
  FROM './datasets/flights/year=2015/month=??/flights.parquet' (FORMAT PARQUET)
             
''').df()

In [2]:
display(conn.execute('SHOW TABLES').df())

Unnamed: 0,name
0,flights


In [4]:
display(conn.execute('SELECT * FROM flights LIMIT 10').df())

Unnamed: 0,YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,...,ARRIVAL_TIME,ARRIVAL_DELAY,DIVERTED,CANCELLED,CANCELLATION_REASON,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY
0,2015,1,1,4,AS,98,N407AS,ANC,SEA,5,...,408,-22,0,0,,,,,,
1,2015,1,1,4,AA,2336,N3KUAA,LAX,PBI,10,...,741,-9,0,0,,,,,,
2,2015,1,1,4,US,840,N171US,SFO,CLT,20,...,811,5,0,0,,,,,,
3,2015,1,1,4,AA,258,N3HYAA,LAX,MIA,20,...,756,-9,0,0,,,,,,
4,2015,1,1,4,AS,135,N527AS,SEA,ANC,25,...,259,-21,0,0,,,,,,
5,2015,1,1,4,DL,806,N3730B,SFO,MSP,25,...,610,8,0,0,,,,,,
6,2015,1,1,4,NK,612,N635NK,LAS,MSP,25,...,509,-17,0,0,,,,,,
7,2015,1,1,4,US,2013,N584UW,LAX,CLT,30,...,753,-10,0,0,,,,,,
8,2015,1,1,4,AA,1112,N3LAAA,SFO,DFW,30,...,532,-13,0,0,,,,,,
9,2015,1,1,4,DL,1173,N826DN,LAS,ATL,30,...,656,-15,0,0,,,,,,


In [None]:
# Another way to load a CSV file is to create a table with the appropriate schema and then use the COPY command to load the data from the CSV file into the table.
conn.execute('''
  CREATE TABLE airports(
    IATA_CODE VARCHAR, AIRPORT VARCHAR, CITY VARCHAR,
    STATE VARCHAR, COUNTRY VARCHAR, LATITUDE VARCHAR,
    LONGITUDE VARCHAR); 
  COPY airports FROM './datasets/flights/airports.csv' (AUTO_DETECT TRUE);
''')

display(conn.execute('SELECT * FROM airports').df())

Unnamed: 0,IATA_CODE,AIRPORT,CITY,STATE,COUNTRY,LATITUDE,LONGITUDE
0,ABE,Lehigh Valley International Airport,Allentown,PA,USA,40.65236,-75.4404
1,ABI,Abilene Regional Airport,Abilene,TX,USA,32.41132,-99.6819
2,ABQ,Albuquerque International Sunport,Albuquerque,NM,USA,35.04022,-106.60919
3,ABR,Aberdeen Regional Airport,Aberdeen,SD,USA,45.44906,-98.42183
4,ABY,Southwest Georgia Regional Airport,Albany,GA,USA,31.53552,-84.19447
...,...,...,...,...,...,...,...
317,WRG,Wrangell Airport,Wrangell,AK,USA,56.48433,-132.36982
318,WYS,Westerly State Airport,West Yellowstone,MT,USA,44.6884,-111.11764
319,XNA,Northwest Arkansas Regional Airport,Fayetteville/Springdale/Rogers,AR,USA,36.28187,-94.30681
320,YAK,Yakutat Airport,Yakutat,AK,USA,59.50336,-139.66023


In [5]:
# An alternative is to set the names of the columns in the names parameter of read_csv_auto() function.
conn.execute('''
  DROP TABLE IF EXISTS airports;
  CREATE TABLE airports
   AS
  FROM read_csv('./datasets/flights/airports.csv',
             names=['IATA_CODE', 'AIRPORT', 'CITY', 'STATE', 'COUNTRY', 'LATITUDE', 'LONGITUDE'])
''')

display(conn.execute('SELECT * FROM airports').df())

Unnamed: 0,IATA_CODE,AIRPORT,CITY,STATE,COUNTRY,LATITUDE,LONGITUDE
0,ABE,Lehigh Valley International Airport,Allentown,PA,USA,40.65236,-75.44040
1,ABI,Abilene Regional Airport,Abilene,TX,USA,32.41132,-99.68190
2,ABQ,Albuquerque International Sunport,Albuquerque,NM,USA,35.04022,-106.60919
3,ABR,Aberdeen Regional Airport,Aberdeen,SD,USA,45.44906,-98.42183
4,ABY,Southwest Georgia Regional Airport,Albany,GA,USA,31.53552,-84.19447
...,...,...,...,...,...,...,...
317,WRG,Wrangell Airport,Wrangell,AK,USA,56.48433,-132.36982
318,WYS,Westerly State Airport,West Yellowstone,MT,USA,44.68840,-111.11764
319,XNA,Northwest Arkansas Regional Airport,Fayetteville/Springdale/Rogers,AR,USA,36.28187,-94.30681
320,YAK,Yakutat Airport,Yakutat,AK,USA,59.50336,-139.66023


In [7]:
# If you want to check total number of columns created for a table, use the information_schema.columns table.
result = conn.execute('''
  SELECT COUNT(*) AS column_count
  FROM information_schema.columns
  WHERE table_name = 'airports';
''').fetchall()

# Print the number of columns
print(f"Number of columns: {result[0][0]}")

Number of columns: 7


In [None]:
# If you want to treat all the columns in CSV file as string types (regardless of the actual data types in teh file), use the all_varchar=true parameter in read_csv() function.
conn.execute('''
  DROP TABLE IF EXISTS airports;
  CREATE TABLE airports
  AS
  FROM read_csv('./datasets/flights/airports.csv', all_varchar=true)
''')

<duckdb.duckdb.DuckDBPyConnection at 0x72a7e81e1af0>

In [6]:
display(conn.execute('SHOW TABLES').df())

Unnamed: 0,name
0,airports
1,flights


#### Loading using the register() method

In [7]:
airlines = conn.execute('''
  SELECT 
    * 
  FROM read_csv('./datasets/flights/airlines.csv',  
                Header = True, 
                Columns = {'IATA_CODE': 'VARCHAR', 'AIRLINE': 'VARCHAR'})
''').df()

airlines

Unnamed: 0,IATA_CODE,AIRLINE
0,UA,United Air Lines Inc.
1,AA,American Airlines Inc.
2,US,US Airways Inc.
3,F9,Frontier Airlines Inc.
4,B6,JetBlue Airways
5,OO,Skywest Airlines Inc.
6,AS,Alaska Airlines Inc.
7,NK,Spirit Air Lines
8,WN,Southwest Airlines Co.
9,DL,Delta Air Lines Inc.


In [None]:
# READ_CSV_AUTO() Versus Read_CSV()
airlines = conn.execute('''
  SELECT 
    * 
  FROM read_csv_auto('./datasets/flights/airlines.csv')
''').df()
airlines

Unnamed: 0,IATA_CODE,AIRLINE
0,UA,United Air Lines Inc.
1,AA,American Airlines Inc.
2,US,US Airways Inc.
3,F9,Frontier Airlines Inc.
4,B6,JetBlue Airways
5,OO,Skywest Airlines Inc.
6,AS,Alaska Airlines Inc.
7,NK,Spirit Air Lines
8,WN,Southwest Airlines Co.
9,DL,Delta Air Lines Inc.


In [9]:
conn.register("airlines", airlines)

<duckdb.duckdb.DuckDBPyConnection at 0x72a7e81e1af0>

In [10]:
display(conn.execute('SHOW TABLES').df())

Unnamed: 0,name
0,airlines
1,airports
2,flights


In [11]:
display(conn.execute('SELECT * FROM airlines').df())

Unnamed: 0,IATA_CODE,AIRLINE
0,UA,United Air Lines Inc.
1,AA,American Airlines Inc.
2,US,US Airways Inc.
3,F9,Frontier Airlines Inc.
4,B6,JetBlue Airways
5,OO,Skywest Airlines Inc.
6,AS,Alaska Airlines Inc.
7,NK,Spirit Air Lines
8,WN,Southwest Airlines Co.
9,DL,Delta Air Lines Inc.


In [None]:
import pandas as pd

# load the CSV using pandas
df_airlines = pd.read_csv("./datasets/flights/airlines.csv")

# associate the dataframe with the DuckDB database
conn.register("airlines", df_airlines)

#### Exporting a table to CSV

In [12]:
conn.execute('''
COPY
  (SELECT IATA_CODE, LATITUDE, LONGITUDE FROM airports)
TO
  './datasets/flights/airports_location.csv' WITH (HEADER 1, DELIMITER ',');
''')

<duckdb.duckdb.DuckDBPyConnection at 0x72a7e81e1af0>

In [17]:
conn.execute('''
COPY
 (SELECT
   IATA_CODE, LATITUDE, LONGITUDE
  FROM './datasets/flights/airports.csv'
  LIMIT 10)
  TO
    './datasets/flights/airports_location_sample.csv' WITH (HEADER 1, DELIMITER ',');
''')

<duckdb.duckdb.DuckDBPyConnection at 0x72a703346ef0>

In [18]:
conn.close()

### Working with Parquet Files

#### Loading Parquet files

In [None]:
%pip install fastparquet

Collecting fastparquet
  Downloading fastparquet-2024.11.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.2 kB)
Collecting cramjam>=2.3 (from fastparquet)
  Downloading cramjam-2.11.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Downloading fastparquet-2024.11.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m6.4 MB/s[0m  [33m0:00:00[0m eta [36m0:00:01[0m
[?25hDownloading cramjam-2.11.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m14.3 MB/s[0m  [33m0:00:00[0m
[?25hInstalling collected packages: cramjam, fastparquet
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [fastparquet]
[1A[2KSuccessfully installed cramjam-2.11.0 fastparquet-2024.11.0


In [19]:
import pandas as pd

df_airports = pd.read_csv("./datasets/flights/airports.csv")
df_airports.to_parquet('./datasets/flights/airports.parquet', engine='fastparquet')

In [20]:
import duckdb

conn = duckdb.connect()
conn.execute('''
  CREATE TABLE airports
    as
  SELECT * FROM read_parquet('./datasets/flights/airports.parquet')
  LIMIT 100
''')

<duckdb.duckdb.DuckDBPyConnection at 0x72a70237ad30>

In [21]:
display(conn.execute('SELECT * FROM airports').df())

Unnamed: 0,IATA_CODE,AIRPORT,CITY,STATE,COUNTRY,LATITUDE,LONGITUDE
0,ABE,Lehigh Valley International Airport,Allentown,PA,USA,40.65236,-75.44040
1,ABI,Abilene Regional Airport,Abilene,TX,USA,32.41132,-99.68190
2,ABQ,Albuquerque International Sunport,Albuquerque,NM,USA,35.04022,-106.60919
3,ABR,Aberdeen Regional Airport,Aberdeen,SD,USA,45.44906,-98.42183
4,ABY,Southwest Georgia Regional Airport,Albany,GA,USA,31.53552,-84.19447
...,...,...,...,...,...,...,...
95,EAU,Chippewa Valley Regional Airport,Eau Claire,WI,USA,44.86526,-91.48507
96,ECP,Northwest Florida Beaches International Airport,Panama City,FL,USA,,
97,EGE,Eagle County Regional Airport,Eagle,CO,USA,39.64257,-106.91770
98,EKO,Elko Regional Airport,Elko,NV,USA,40.82493,-115.79170


In [22]:
conn.execute('''
  INSERT INTO airports
  SELECT * FROM read_parquet('./datasets/flights/airports.parquet')
  ORDER BY 1 DESC
  LIMIT 100
''')
display(conn.execute('SELECT * FROM airports').df())

Unnamed: 0,IATA_CODE,AIRPORT,CITY,STATE,COUNTRY,LATITUDE,LONGITUDE
0,ABE,Lehigh Valley International Airport,Allentown,PA,USA,40.65236,-75.44040
1,ABI,Abilene Regional Airport,Abilene,TX,USA,32.41132,-99.68190
2,ABQ,Albuquerque International Sunport,Albuquerque,NM,USA,35.04022,-106.60919
3,ABR,Aberdeen Regional Airport,Aberdeen,SD,USA,45.44906,-98.42183
4,ABY,Southwest Georgia Regional Airport,Albany,GA,USA,31.53552,-84.19447
...,...,...,...,...,...,...,...
195,OME,Nome Airport,Nome,AK,USA,64.51220,-165.44525
196,OMA,Eppley Airfield,Omaha,NE,USA,41.30252,-95.89417
197,OKC,Will Rogers World Airport,Oklahoma City,OK,USA,35.39309,-97.60073
198,OGG,Kahului Airport,Kahului,HI,USA,20.89865,-156.43046


In [23]:
conn.execute('''
  COPY airports
  FROM './datasets/flights/airports.parquet' (FORMAT PARQUET);
''')

<duckdb.duckdb.DuckDBPyConnection at 0x72a70237ad30>

#### Exporting Parquet files

In [24]:
conn.execute('''
  COPY airports
  TO
    './datasets/flights/airports_all.parquet' (FORMAT PARQUET);
''')

<duckdb.duckdb.DuckDBPyConnection at 0x72a70237ad30>

In [25]:
conn.execute('''
  COPY
    (SELECT * FROM airports LIMIT 100)
  TO
    './datasets/flights/airports_100.parquet' (FORMAT PARQUET);
''')

<duckdb.duckdb.DuckDBPyConnection at 0x72a70237ad30>

## Working with Excel Files

### Loading Excel files

In [None]:
# older versions of DuckDB may not have the Excel extension
import duckdb

conn = duckdb.connect()

conn.execute('INSTALL spatial')
conn.execute('LOAD spatial')
conn.execute('''
  CREATE TABLE airports
  as
  SELECT * FROM st_read('./datasets/flights/airports_and_airlines.xlsx', layer='airports');
''')

display(conn.execute('SELECT * FROM airports').df())

In [28]:
# newer versions of DuckDB have the Excel extension
import duckdb

conn = duckdb.connect()

conn.execute('INSTALL excel')
conn.execute('LOAD excel')
conn.execute('''
  CREATE TABLE airports
  as
  SELECT * FROM read_xlsx('./datasets/flights/airports_and_airlines.xlsx', sheet='airports');
''')

display(conn.execute('SELECT * FROM airports').df())

Unnamed: 0,IATA_CODE,AIRPORT,CITY,STATE,COUNTRY,LATITUDE,LONGITUDE
0,ABE,Lehigh Valley International Airport,Allentown,PA,USA,40.65236,-75.44040
1,ABI,Abilene Regional Airport,Abilene,TX,USA,32.41132,-99.68190
2,ABQ,Albuquerque International Sunport,Albuquerque,NM,USA,35.04022,-106.60919
3,ABR,Aberdeen Regional Airport,Aberdeen,SD,USA,45.44906,-98.42183
4,ABY,Southwest Georgia Regional Airport,Albany,GA,USA,31.53552,-84.19447
...,...,...,...,...,...,...,...
317,WRG,Wrangell Airport,Wrangell,AK,USA,56.48433,-132.36982
318,WYS,Westerly State Airport,West Yellowstone,MT,USA,44.68840,-111.11764
319,XNA,Northwest Arkansas Regional Airport,Fayetteville/Springdale/Rogers,AR,USA,36.28187,-94.30681
320,YAK,Yakutat Airport,Yakutat,AK,USA,59.50336,-139.66023


In [12]:
# DuckDB supports a number of extensions.
conn.execute('''SELECT extension_name, installed, description
FROM duckdb_extensions();''').df()

Unnamed: 0,extension_name,installed,description
0,autocomplete,False,Adds support for autocomplete in the shell
1,aws,False,Provides features that depend on the AWS SDK
2,azure,False,Adds a filesystem abstraction for Azure blob s...
3,core_functions,True,Core function library
4,delta,False,Adds support for Delta Lake
5,ducklake,False,"Adds support for DuckLake, SQL as a Lakehouse ..."
6,encodings,False,All unicode encodings to UTF-8
7,excel,True,Adds support for Excel-like format strings
8,fts,False,Adds support for Full-Text Search Indexes
9,httpfs,True,Adds support for reading and writing files ove...


In [None]:
import os
os.environ['OGR_XLSX_HEADERS'] = 'DISABLE'

In [None]:
os.environ['OGR_XLSX_HEADERS'] = 'FORCE'

In [None]:
os.environ['OGR_XLSX_HEADERS'] = 'AUTO'

In [None]:
conn.execute('''
  CREATE TABLE airlines
  AS
  SELECT * FROM read_xlsx('./datasets/flights/airports_and_airlines.xlsx', sheet='airlines');
''')
display(conn.execute('SELECT * FROM airlines').df())

In [29]:
conn = duckdb.connect()

conn.execute('INSTALL excel')
conn.execute('LOAD excel')
conn.execute('''
  CREATE TABLE airlines (
    IATA_CODE STRING,
    AIRLINES STRING
  );
  INSERT INTO airlines
  SELECT * FROM read_xlsx('./datasets/flights/airports_and_airlines.xlsx', sheet='airlines');
''')
display(conn.execute('SELECT * FROM airlines').df())

Unnamed: 0,IATA_CODE,AIRLINES
0,AA,American Airlines Inc.
1,US,US Airways Inc.
2,F9,Frontier Airlines Inc.
3,B6,JetBlue Airways
4,OO,Skywest Airlines Inc.
5,AS,Alaska Airlines Inc.
6,NK,Spirit Air Lines
7,WN,Southwest Airlines Co.
8,DL,Delta Air Lines Inc.
9,EV,Atlantic Southeast Airlines


In [None]:
os.environ['OGR_XLSX_FIELD_TYPES'] = 'STRING' # default is AUTO

#### Export tables to Excel

In [None]:
# This is needed when running earlier versions of DuckDB without the Excel extension
conn.execute('''
  COPY airlines
  TO './datasets/flights/airlines.xlsx' WITH (FORMAT GDAL, DRIVER 'xlsx');
''')

In [None]:
# if you have installed DuckDB's Excel extension
# no need to set OGR_XLSX_FIELD_TYPES
conn.execute('''
  COPY airlines
  TO './datasets/flights/airlines.xlsx'
''')

<duckdb.duckdb.DuckDBPyConnection at 0x72a7030c73f0>

In [31]:
conn.execute('''
    SELECT * FROM read_xlsx('./datasets/flights/airlines.xlsx');
''').df()

Unnamed: 0,AA,American Airlines Inc.
0,US,US Airways Inc.
1,F9,Frontier Airlines Inc.
2,B6,JetBlue Airways
3,OO,Skywest Airlines Inc.
4,AS,Alaska Airlines Inc.
5,NK,Spirit Air Lines
6,WN,Southwest Airlines Co.
7,DL,Delta Air Lines Inc.
8,EV,Atlantic Southeast Airlines
9,HA,Hawaiian Airlines Inc.


## Working with MySQL

In [None]:
%pip install mysql-connector-python

In [None]:
import mysql.connector
import duckdb

# MySQL connection information
mysql_host = 'localhost'
mysql_user = 'user1'
mysql_password = 'password'
mysql_database = 'My_DB'
mysql_table = 'airlines'

# create a DuckDB connection
duckdb_conn = duckdb.connect()

# connect to MySQL
mysql_conn = mysql.connector.connect(
host = mysql_host,
user = mysql_user,
password = mysql_password,
database = mysql_database
)

# create a cursor for MySQL
mysql_cursor = mysql_conn.cursor()

# query data from MySQL
mysql_query = f'SELECT * FROM {mysql_table}'
mysql_cursor.execute(mysql_query)

# create a DuckDB table with the same schema as MySQL
duckdb_create_table_query = \
f'CREATE TABLE airlines (IATA_CODE VARCHAR(2), AIRLINES VARCHAR)'
duckdb_conn.execute(duckdb_create_table_query)

# get column names from MySQL result
mysql_columns = [column[0] for column in mysql_cursor.description]

# fetch data from MySQL and insert into DuckDB table
duckdb_insert_query = \
f'INSERT INTO airlines VALUES ({", ".join(["?" for _ in mysql_columns])})'

for row in mysql_cursor.fetchall():
    duckdb_conn.execute(duckdb_insert_query, row)

# query the data in DuckDB
display(duckdb_conn.execute('SELECT * FROM airlines').df())

# close the MySQL and DuckDB connections
mysql_cursor.close()
mysql_conn.close()
duckdb_conn.close()

In [None]:
import duckdb

# create a DuckDB connection
conn = duckdb.connect()

# install and load the MySQL extension
conn.execute('INSTALL mysql')
conn.execute('LOAD mysql')

# define MySQL connection parameters
mysql_host     = 'localhost'
mysql_user     = 'user1'
mysql_password = 'password'
mysql_database = 'My_DB'
mysql_table    = 'airlines'
mysql_port     = 3306

# create a MySQL connection
mysql_connection = f'mysql://{mysql_user}:{mysql_password}@{mysql_host}/{mysql_database}'

attach_command = f'''
    ATTACH 'host={mysql_host} 
    user={mysql_user} 
    password={mysql_password} 
    port={mysql_port} 
    database={mysql_database}' 
    AS mysqldb (TYPE MYSQL);
'''
conn.execute(attach_command)
conn.execute('USE mysqldb;')

display(conn.execute(f'''
    SELECT * FROM {mysql_table}
''').df())

display(conn.execute(f'''
    show tables
''').df())

# close the DuckDB connection
conn.close()

In [None]:
import duckdb
conn = duckdb.connect()
conn.execute('''
  CREATE OR REPLACE TABLE flights 
    as 
  FROM './datasets/flights/flights_sample.csv'
''').df()


Unnamed: 0,Count
0,1000


In [4]:
display(conn.execute('SELECT * FROM flights').df())

Unnamed: 0,YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,...,ARRIVAL_TIME,ARRIVAL_DELAY,DIVERTED,CANCELLED,CANCELLATION_REASON,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY
0,2015,1,1,4,AS,98,N407AS,ANC,SEA,0005,...,0408,-22,0,0,,,,,,
1,2015,1,1,4,AA,2336,N3KUAA,LAX,PBI,0010,...,0741,-9,0,0,,,,,,
2,2015,1,1,4,US,840,N171US,SFO,CLT,0020,...,0811,5,0,0,,,,,,
3,2015,1,1,4,AA,258,N3HYAA,LAX,MIA,0020,...,0756,-9,0,0,,,,,,
4,2015,1,1,4,AS,135,N527AS,SEA,ANC,0025,...,0259,-21,0,0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,2015,1,1,4,WN,552,N291WN,SAT,ELP,0705,...,0810,30,0,0,,1,0,29,0,0
996,2015,1,1,4,WN,3239,N905WN,TPA,BWI,0705,...,0907,-18,0,0,,,,,,
997,2015,1,1,4,WN,1912,N554WN,TUS,DEN,0705,...,0846,-9,0,0,,,,,,
998,2015,1,1,4,UA,1660,N17730,SNA,EWR,0705,...,1433,-51,0,0,,,,,,


In [8]:
# identify the top origin airport and destination airport pairs by month
display(conn.execute('''
  SELECT 
    ORIGIN_AIRPORT, DESTINATION_AIRPORT, COUNT(*) AS flight_count, MONTH AS month
  FROM flights
  GROUP BY ORIGIN_AIRPORT, DESTINATION_AIRPORT, month
  ORDER BY flight_count DESC
  LIMIT 10;
''').df())

Unnamed: 0,ORIGIN_AIRPORT,DESTINATION_AIRPORT,flight_count,month
0,ANC,SEA,8,1
1,SFO,IAH,4,1
2,JFK,LAX,4,1
3,HNL,KOA,3,1
4,ORD,PHX,3,1
5,BOS,JFK,3,1
6,LAX,DFW,3,1
7,LAX,IAH,3,1
8,SEA,DFW,3,1
9,DFW,MIA,3,1
