# Chapter 6. Using DuckDB with JSON Files

JSON supports the following data types:
- Object
- String
- Boolean
- Number
- Array
- null

## Loading JSON Files into DuckDB

### Using read_json_auto() Function

In [1]:
import duckdb

conn = duckdb.connect()
conn.execute('''
  SELECT 
  *
  FROM read_json_auto('./datasets/json1.json')
''').df()

Unnamed: 0,id,name,address,email,weight
0,1,Sarah Johnson,"4321 Oak Street Apartment 304 Los Angeles, CA ...",sarah_johnson478@gmail.com,140.5
1,2,David Martinez,"789 Maple Avenue Suite 102 New York, NY 10001",david_martinez431@gmail.com,155.0
2,3,Emily Wilson,"567 Pine Road Unit 5B Chicago, IL 60601",emily_wilson998@gmail.com,200.1


In [4]:
conn.execute('''
  CREATE TABLE People
  as
  FROM './datasets/json1.json'
''')

<duckdb.duckdb.DuckDBPyConnection at 0x7e4efc3679f0>

In [5]:
conn.execute('''
  SELECT 
  *
  FROM read_json_auto('./datasets/json1.json', records = false)
''').df()

Unnamed: 0,json
0,"{'id': 1, 'name': 'Sarah Johnson', 'address': ..."
1,"{'id': 2, 'name': 'David Martinez', 'address':..."
2,"{'id': 3, 'name': 'Emily Wilson', 'address': '..."


In [6]:
conn.execute('''
  SELECT
    name, email
  FROM read_json_auto('./datasets/json1.json')
''').df()

Unnamed: 0,name,email
0,Sarah Johnson,sarah_johnson478@gmail.com
1,David Martinez,david_martinez431@gmail.com
2,Emily Wilson,emily_wilson998@gmail.com


### Using the read_json() Function

#### Array of JSON Objects

In [None]:
conn.execute('''
  SELECT 
    *
  FROM read_json('./datasets/json1.json',
                 format = 'auto',
                 columns = {
                   id:'INTEGER',
                   name:'STRING',
                   weight:'FLOAT'
                 })
''').df()
# JSON supports the following data types: Object | String | Boolean | Number | Array | null
# For the format parameter, you can specify one of the following values: array | newline_delimited or nd | unstructured | auto

In [7]:
conn.execute('''
  SELECT
    *
  FROM read_json('./datasets/json1.json',
                 format = 'auto')
''').df()


Unnamed: 0,id,name,address,email,weight
0,1,Sarah Johnson,"4321 Oak Street Apartment 304 Los Angeles, CA ...",sarah_johnson478@gmail.com,140.5
1,2,David Martinez,"789 Maple Avenue Suite 102 New York, NY 10001",david_martinez431@gmail.com,155.0
2,3,Emily Wilson,"567 Pine Road Unit 5B Chicago, IL 60601",emily_wilson998@gmail.com,200.1


#### Newline-delimited (ND) JSON

In [8]:
conn.execute('''
  SELECT
    *
  FROM read_json('./datasets/json1_a.json',
                 format = 'newline_delimited',
                 columns = {
                   id:'INTEGER',
                   name:'STRING',
                   weight:'FLOAT'
                })
''').df()

Unnamed: 0,id,name,weight
0,1,Sarah Johnson,140.5
1,2,David Martinez,155.0
2,3,Emily Wilson,200.100006


In [9]:
conn.execute('''
  SELECT
    *
  FROM read_ndjson_auto('./datasets/json1_a.json',
                        columns = {
                          id:'INTEGER',
                          name:'STRING',
                          weight:'FLOAT'
                        })
''').df()

Unnamed: 0,id,name,weight
0,1,Sarah Johnson,140.5
1,2,David Martinez,155.0
2,3,Emily Wilson,200.100006


#### Nested JSON

In [10]:
conn.execute('''
  SELECT
    *
  FROM read_json('./datasets/json2.json')
''').df()

Unnamed: 0,id,name,address,email,weight
0,1,Sarah Johnson,"{'line1': '4321 Oak Street Apartment', 'line2'...",sarah_johnson478@gmail.com,140.5
1,2,David Martinez,"{'line1': '789 Maple Avenue ', 'line2': 'Suite...",david_martinez431@gmail.com,155.0
2,3,Emily Wilson,"{'line1': '567 Pine Road Unit 5B Chicago', 'li...",emily_wilson998@gmail.com,200.1


In [11]:
conn.execute('''
  SELECT
    id,
    name,
    address['line1'] as line1,
    address['line2'] as line2,
    address['state'] as state,
    address['zip'] as zip,
    email,
    weight
  FROM read_json('./datasets/json2.json')
''').df()

Unnamed: 0,id,name,line1,line2,state,zip,email,weight
0,1,Sarah Johnson,4321 Oak Street Apartment,304 Los Angeles,CA,90001,sarah_johnson478@gmail.com,140.5
1,2,David Martinez,789 Maple Avenue,Suite 102 New York,NY,10001,david_martinez431@gmail.com,155.0
2,3,Emily Wilson,567 Pine Road Unit 5B Chicago,,IL,60601,emily_wilson998@gmail.com,200.1


In [12]:
conn.execute('''
  SELECT
    address['line1'] as line1,
    address['line2'] as line2,
    address['location']['state'] as state,
    address['location']['city'] as city,
    address['location']['zip'] as zip,
    email,
    weight
  FROM read_json('./datasets/json2_a.json')
''').df()

Unnamed: 0,line1,line2,state,city,zip,email,weight
0,4321 Oak Street Apartment,304 Los Angeles,CA,Calexico,90001,sarah_johnson478@gmail.com,140.5
1,789 Maple Avenue,Suite 102 New York,NY,Coney Island,10001,david_martinez431@gmail.com,155.0
2,567 Pine Road Unit 5B Chicago,,IL,Brookfield,60601,emily_wilson998@gmail.com,200.1


#### Custom JSON File

In [13]:
conn.execute('''
  SELECT 
    *
  FROM read_json('./datasets/json3.json')
''').df()

Unnamed: 0,people
0,"[{'id': 1, 'name': 'Sarah Johnson', 'address':..."


In [14]:
conn.execute('''
  SELECT unnest(people) p
    FROM read_json('./datasets/json3.json')
''').df()

Unnamed: 0,p
0,"{'id': 1, 'name': 'Sarah Johnson', 'address': ..."
1,"{'id': 2, 'name': 'David Martinez', 'address':..."
2,"{'id': 3, 'name': 'Emily Wilson', 'address': {..."


In [15]:
conn.execute('''
  SELECT
    p.id,
    p.name,
    p.address['line1'] as line1,
    p.address['line2'] as line2,
    p.address['state'] as state,
    p.address['zip'] as zip,
    p.email,
    p.weight
  FROM
    (
      SELECT unnest(people) p
      FROM read_json('./datasets/json3.json')
    )
''').df()

Unnamed: 0,id,name,line1,line2,state,zip,email,weight
0,1,Sarah Johnson,4321 Oak Street Apartment,304 Los Angeles,CA,90001,sarah_johnson478@gmail.com,140.5
1,2,David Martinez,789 Maple Avenue,Suite 102 New York,NY,10001,david_martinez431@gmail.com,155.0
2,3,Emily Wilson,567 Pine Road Unit 5B Chicago,,IL,60601,emily_wilson998@gmail.com,200.1


In [16]:
conn.execute('''
  SELECT
    p['id'],
    p['name'],
    p['address']['line1'] as line1,
    p['address']['line2'] as line2,
    p['address']['state'] as state,
    p['address']['zip'] as zip,
    p['email'],
    p['weight']
  FROM
    (
    SELECT unnest(people) p
    FROM read_json('./datasets/json3.json')
    )
''').df()

Unnamed: 0,p['id'],p['name'],line1,line2,state,zip,p['email'],p['weight']
0,1,Sarah Johnson,4321 Oak Street Apartment,304 Los Angeles,CA,90001,sarah_johnson478@gmail.com,140.5
1,2,David Martinez,789 Maple Avenue,Suite 102 New York,NY,10001,david_martinez431@gmail.com,155.0
2,3,Emily Wilson,567 Pine Road Unit 5B Chicago,,IL,60601,emily_wilson998@gmail.com,200.1


#### Loading multiple JSON files

In [17]:
conn.execute('''
  SELECT
    *
  FROM read_json(['./datasets/json4.json','./datasets/json5.json'])
''').df()

Unnamed: 0,id,name,address,email,weight,height
0,1,Sarah Johnson,"""4321 Oak Street Apartment 304 Los Angeles, CA...",sarah_johnson478@gmail.com,140.5,
1,2,David Martinez,"""789 Maple Avenue Suite 102 New York, NY 10001""",david_martinez431@gmail.com,155.0,
2,3,Emily Wilson,"{""line1"":""567 Pine Road Unit 5B Chicago"",""stat...",,,66.0


In [None]:
conn.execute('''
  SELECT
    *
  FROM read_json('./datasets/json*.json')
''').df()
# Wildcard symbols: *, **, ?, [abc], [a-z]
# Try increasing 'sample_size', reducing 'maximum_depth', specifying 'columns', 'format' or 'records' manually, setting 'ignore_errors' to true, or setting 'union_by_name' to true when reading multiple files with a different structure.

InvalidInputException: Invalid Input Error: JSON transform error in file "./datasets/json5.json", in record/value 1: Object {"id":3,"name":"Emily Wilson","address":{"line1":"... has unknown key "height"
Try increasing 'sample_size', reducing 'maximum_depth', specifying 'columns', 'format' or 'records' manually, setting 'ignore_errors' to true, or setting 'union_by_name' to true when reading multiple files with a different structure.

In [19]:
conn.execute('''
  SELECT
    *
  FROM read_json('./datasets/json?.json')
''').df()

InvalidInputException: Invalid Input Error: JSON transform error in file "./datasets/json5.json", in record/value 1: Object {"id":3,"name":"Emily Wilson","address":{"line1":"... has unknown key "height"
Try increasing 'sample_size', reducing 'maximum_depth', specifying 'columns', 'format' or 'records' manually, setting 'ignore_errors' to true, or setting 'union_by_name' to true when reading multiple files with a different structure.

### Using the COPY-FROM Statement

In [None]:
conn = duckdb.connect()
conn.execute('''
  CREATE TABLE people (id INT, name STRING, address STRING,
                       email STRING, weight FLOAT);
  COPY people FROM './datasets/json1.json' (FORMAT JSON, AUTO_DETECT true);
  SELECT * FROM people;
''').df()
# This offers significant advantages over read_json(), particularly when dealing with large datasets. COPY-FROM is optimized for performance and scalability.

Unnamed: 0,id,name,address,email,weight
0,1,Sarah Johnson,"4321 Oak Street Apartment 304 Los Angeles, CA ...",sarah_johnson478@gmail.com,140.5
1,2,David Martinez,"789 Maple Avenue Suite 102 New York, NY 10001",david_martinez431@gmail.com,155.0
2,3,Emily Wilson,"567 Pine Road Unit 5B Chicago, IL 60601",emily_wilson998@gmail.com,200.100006


### Exporting Tables to JSON

In [21]:
conn = duckdb.connect()
conn.execute('''
  CREATE OR REPLACE TABLE people
    as
  SELECT
    name,
    weight
  FROM read_ndjson_auto('./datasets/json1_a.json')
''')
display(conn.execute('SELECT * FROM people').df())

Unnamed: 0,name,weight
0,Sarah Johnson,140.5
1,David Martinez,155.0
2,Emily Wilson,200.1


In [24]:
conn.execute('''
  COPY people
    TO
  './datasets/people.json' (FORMAT JSON);
''')

<duckdb.duckdb.DuckDBPyConnection at 0x7e4ea9111530>

In [23]:
conn.execute('''
  COPY people
    TO
  './datasets/people_array.json' (ARRAY TRUE);
''')

<duckdb.duckdb.DuckDBPyConnection at 0x7e4ea9111530>