In [1]:
import json
from json_schema_to_glue_columns import *

In [2]:
# Relative paths of the three sample JSON Schema files used in this notebook.
schema_location_1, schema_location_2, schema_location_3 = (
    "./sample1.schema.json",
    "./sample2.schema.json",
    "./sample3.schema.json",
)

In [3]:
# Parse each sample schema file into a Python object.
# The encoding is given explicitly because open() otherwise falls back to the
# platform locale, which can mis-decode non-ASCII characters in JSON files.
with open(schema_location_1, encoding="utf-8") as schema_file_1:
    schema_1 = json.load(schema_file_1)

with open(schema_location_2, encoding="utf-8") as schema_file_2:
    schema_2 = json.load(schema_file_2)

with open(schema_location_3, encoding="utf-8") as schema_file_3:
    schema_3 = json.load(schema_file_3)

In [4]:
# Convert schema_1 with the default options; nested objects show up as STRUCT<...> column types.
convert_json_schema_to_glue_columns(schema_1)

[{'Name': 'id', 'Type': 'DOUBLE'},
 {'Name': 'isbn', 'Type': 'STRING'},
 {'Name': 'author', 'Type': 'STRUCT<lastname:STRING,firstname:STRING>'},
 {'Name': 'editor', 'Type': 'STRUCT<lastname:STRING,firstname:STRING>'},
 {'Name': 'title', 'Type': 'STRING'},
 {'Name': 'category', 'Type': 'ARRAY<STRING>'},
 {'Name': 'tags', 'Type': 'ARRAY<STRUCT<key:STRING,value:STRING>>'}]

In [5]:
# Same schema with flatten=True: fields of nested objects become separate columns
# named parent__child, while arrays (including arrays of structs) keep their type.
convert_json_schema_to_glue_columns(schema_1, flatten=True, delimiter='__')

[{'Name': 'id', 'Type': 'DOUBLE'},
 {'Name': 'isbn', 'Type': 'STRING'},
 {'Name': 'author__lastname', 'Type': 'STRING'},
 {'Name': 'author__firstname', 'Type': 'STRING'},
 {'Name': 'editor__lastname', 'Type': 'STRING'},
 {'Name': 'editor__firstname', 'Type': 'STRING'},
 {'Name': 'title', 'Type': 'STRING'},
 {'Name': 'category', 'Type': 'ARRAY<STRING>'},
 {'Name': 'tags', 'Type': 'ARRAY<STRUCT<key:STRING,value:STRING>>'}]

In [6]:
# Convert schema_2, which mixes scalar types, arrays and deeply nested objects.
convert_json_schema_to_glue_columns(schema_2)

[{'Name': 'a', 'Type': 'STRING'},
 {'Name': 'b', 'Type': 'BIGINT'},
 {'Name': 'c', 'Type': 'DOUBLE'},
 {'Name': 'd', 'Type': 'BOOLEAN'},
 {'Name': 'e', 'Type': 'ARRAY<BIGINT>'},
 {'Name': 'f',
  'Type': 'STRUCT<fa:STRING,fb:BIGINT,fc:DOUBLE,fd:BOOLEAN,fe:ARRAY<STRING>,ff:STRUCT<ffa:ARRAY<STRUCT<fffa:BIGINT,fffb:STRING>>>>'},
 {'Name': 'g', 'Type': 'ARRAY<STRUCT<ga:BIGINT,gb:ARRAY<BIGINT>>>'}]

In [7]:
# Flattened conversion of schema_2: nesting is unwrapped at every object level
# (e.g. f__ff__ffa), joined by the '__' delimiter.
convert_json_schema_to_glue_columns(schema_2, flatten=True, delimiter='__')

[{'Name': 'a', 'Type': 'STRING'},
 {'Name': 'b', 'Type': 'BIGINT'},
 {'Name': 'c', 'Type': 'DOUBLE'},
 {'Name': 'd', 'Type': 'BOOLEAN'},
 {'Name': 'e', 'Type': 'ARRAY<BIGINT>'},
 {'Name': 'f__fa', 'Type': 'STRING'},
 {'Name': 'f__fb', 'Type': 'BIGINT'},
 {'Name': 'f__fc', 'Type': 'DOUBLE'},
 {'Name': 'f__fd', 'Type': 'BOOLEAN'},
 {'Name': 'f__fe', 'Type': 'ARRAY<STRING>'},
 {'Name': 'f__ff__ffa', 'Type': 'ARRAY<STRUCT<fffa:BIGINT,fffb:STRING>>'},
 {'Name': 'g', 'Type': 'ARRAY<STRUCT<ga:BIGINT,gb:ARRAY<BIGINT>>>'}]

In [6]:
# schema_3 is expected to be rejected by the converter. Catch and print the error
# so the failure is shown without aborting a Restart & Run All of the notebook.
try:
    convert_json_schema_to_glue_columns(schema_3)
except Exception as error:
    print(f"{type(error).__name__}: {error}")

Exception: Empty arrays are not allowed in Glue

In [6]:
def flatten_json_schema(json_schema, prefix='', delimiter='_'):
    """
    Flattens a JSON schema by unwrapping nested structs and creating flattened property names.

    Only properties whose "type" is exactly "object" and that carry their own
    "properties" mapping are unwrapped. Every other property (scalars, arrays,
    "$ref"/"anyOf" entries without a "type", union types given as a list, and
    objects without declared properties) is kept as a leaf and copied by reference.

    Args:
        json_schema (dict): The JSON schema. A schema without a "properties"
            key is treated as having no properties.
        prefix (str): Prefix for the flattened property names (used for recursion).
        delimiter (str): Delimiter to separate flattened property names.

    Returns:
        dict: Mapping of flattened property name to that property's schema.
            If two properties flatten to the same name, the later one wins.
    """
    flattened_schema = {}

    # .get() so a schema without declared properties yields an empty result
    # instead of raising KeyError.
    for key, value in json_schema.get("properties", {}).items():
        flattened_key = f"{prefix}{key}"

        # Recurse only when there is something to unwrap. Property schemas are
        # not guaranteed to have a "type" key, and an "object" is not
        # guaranteed to list "properties".
        is_nested_object = (
            isinstance(value, dict)
            and value.get("type") == "object"
            and isinstance(value.get("properties"), dict)
        )

        if is_nested_object:
            # Flatten the nested struct recursively, extending the prefix
            flattened_schema.update(
                flatten_json_schema(
                    value,
                    prefix=f"{flattened_key}{delimiter}",
                    delimiter=delimiter,
                )
            )
        else:
            flattened_schema[flattened_key] = value

    return flattened_schema

In [7]:
# Flatten schema_2 with the default '_' delimiter; the result maps each
# flattened property name (e.g. 'f_fa') to that property's original schema.
flatten_json_schema(schema_2)

{'a': {'type': 'string',
  'default': '',
  'title': 'The a Schema',
  'examples': ['some_string']},
 'b': {'type': 'integer',
  'default': 0,
  'title': 'The b Schema',
  'examples': [1]},
 'c': {'type': 'number',
  'default': 0.0,
  'title': 'The c Schema',
  'examples': [3.14]},
 'd': {'type': 'boolean',
  'default': False,
  'title': 'The d Schema',
  'examples': [True]},
 'e': {'type': 'array',
  'default': [],
  'title': 'The e Schema',
  'items': {'type': 'integer', 'title': 'A Schema', 'examples': [1, 2, 3]},
  'examples': [[1, 2, 3]]},
 'f_fa': {'type': 'string',
  'default': '',
  'title': 'The fa Schema',
  'examples': ['some_other_string']},
 'f_fb': {'type': 'integer',
  'default': 0,
  'title': 'The fb Schema',
  'examples': [2]},
 'f_fc': {'type': 'number',
  'default': 0.0,
  'title': 'The fc Schema',
  'examples': [4.25]},
 'f_fd': {'type': 'boolean',
  'default': False,
  'title': 'The fd Schema',
  'examples': [False]},
 'f_fe': {'type': 'array',
  'default': [],
  '