In [1]:
from json_schema_to_glue_columns import *

In [2]:
# Table settings for each supported data format, keyed by the lower-case
# format name. Every entry provides:
#   input_format / output_format : fully-qualified Hadoop/Hive class names
#   serde_info                   : SerDe library, plus optional "Parameters"
#   parameters                   : table-level parameters (the classification)
#
# CSV and JSON are both plain-text formats and use the same input/output
# classes, so the class names are defined once to keep the two entries in sync.
# The output class is spelled "HiveIgnoreKeyTextOutputFormat" (capital "I" in
# "Ignore") and lives in the package "org.apache.hadoop.hive.ql.io".
_TEXT_INPUT_FORMAT = "org.apache.hadoop.mapred.TextInputFormat"
_TEXT_OUTPUT_FORMAT = "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat"

glue_data_formats_mapping = {
    "csv": {
        "input_format": _TEXT_INPUT_FORMAT,
        "output_format": _TEXT_OUTPUT_FORMAT,
        "serde_info": {
            "SerializationLibrary": "org.apache.hadoop.hive.serde2.OpenCSVSerde",
            "Parameters": {
                "separatorChar": ","
            }
        },
        "parameters": {
            "classification": "csv"
        }
    },
    "json": {
        "input_format": _TEXT_INPUT_FORMAT,
        "output_format": _TEXT_OUTPUT_FORMAT,
        "serde_info": {
            "SerializationLibrary": "org.openx.data.jsonserde.JsonSerDe"
        },
        "parameters": {
            "classification": "json"
        }
    },
    "parquet": {
        "input_format": "org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat",
        "output_format": "org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat",
        "serde_info": {
            "SerializationLibrary": "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe",
            "Parameters": {
                "serialization.format": "1"
            }
        },
        "parameters": {
            "classification": "parquet"
        }
    }
}

In [3]:
# Input files for this run: the JSON schema and the matching YAML config.
schema_file_location, config_file_location = (
    "./sample1.schema.json",
    "./sample1.config.yml",
)

In [4]:
# Load the JSON schema from schema_file_location. The bare `schema` on the
# last line makes the notebook display the loaded value as the cell output.
schema = load_json_schema(schema_file_location)
schema

{'title': 'Model',
 'type': 'object',
 'properties': {'glossary': {'title': 'Glossary',
   'type': 'object',
   'properties': {'title': {'title': 'Title', 'type': 'string'},
    'GlossDiv': {'title': 'GlossDiv',
     'type': 'object',
     'properties': {'title': {'title': 'Title', 'type': 'string'},
      'GlossList': {'title': 'GlossList',
       'type': 'object',
       'properties': {'GlossEntry': {'title': 'GlossEntry',
         'type': 'object',
         'properties': {'ID': {'title': 'Id', 'type': 'string'},
          'SortAs': {'title': 'Sortas', 'type': 'string'},
          'GlossTerm': {'title': 'Glossterm', 'type': 'string'},
          'Acronym': {'title': 'Acronym', 'type': 'string'},
          'Abbrev': {'title': 'Abbrev', 'type': 'string'},
          'GlossDef': {'title': 'GlossDef',
           'type': 'object',
           'properties': {'para': {'title': 'Para', 'type': 'string'},
            'GlossSeeAlso': {'title': 'Glossseealso',
             'type': 'array',
       

In [5]:
# Placeholder tokens handed to the config loader together with the config path.
# NOTE(review): presumably each token found in the YAML is replaced by its
# value here (the displayed config contains "dev" and "abc") - confirm against
# load_yaml_config.
placeholders = {"[aws_env]": "dev", "[logical_env]": "abc"}

# Load the pipeline config and display it as the cell output.
config = load_yaml_config(config_file_location, placeholders)
config

{'data_format': 'json',
 'pipeline_type': 'SCD1',
 'storage_zones': {'raw': {'database': 'abc_source_raw',
   'table': 'sample',
   's3_location': 's3://org-dev-abc-raw/prefix'},
  'staging': {'database': 'abc_source_staging',
   'table': 'sample',
   's3_location': 's3://org-dev-abc-staging/prefix'}}}

In [6]:
# Partition keys are declared as "name:type" strings, one per partition level,
# and converted into Name/Type dictionaries by create_partition_keys.
partition_key_list = [
    "year:int",
    "month:int",
    "day:int",
    "hour:int",
    "minute:int",
]
partition_keys = create_partition_keys(partition_key_list)
partition_keys

[{'Name': 'year', 'Type': 'int'},
 {'Name': 'month', 'Type': 'int'},
 {'Name': 'day', 'Type': 'int'},
 {'Name': 'hour', 'Type': 'int'},
 {'Name': 'minute', 'Type': 'int'}]

In [7]:
# Convert the schema into column definitions for the raw table. With the
# default arguments the nesting is kept: each top-level property becomes one
# column whose type is a nested STRUCT<...>/ARRAY<...> expression.
raw_glue_columns = convert_json_schema_to_glue_columns(schema)
raw_glue_columns

[{'Name': 'glossary',
  'Type': 'STRUCT<title:STRING,glossdiv:STRUCT<title:STRING,glosslist:STRUCT<glossentry:STRUCT<id:STRING,sortas:STRING,glossterm:STRING,acronym:STRING,abbrev:STRING,glossdef:STRUCT<para:STRING,glossseealso:ARRAY<STRING>>,glosssee:STRING>>>>'},
 {'Name': 'local',
  'Type': 'STRUCT<menu:STRUCT<id:STRING,value:STRING,popup:STRUCT<menuitem:ARRAY<STRUCT<value:STRING,onclick:STRING>>>>>'},
 {'Name': 'external',
  'Type': 'STRUCT<viewer:STRUCT<header:STRING,items:ARRAY<STRUCT<id:STRING,label:STRING>>>>'}]

In [8]:
# Raw-zone settings: database, table and S3 location come from the config.
raw_zone_config = config["storage_zones"]["raw"]
raw_database, raw_table, raw_s3_location = (
    raw_zone_config[field] for field in ("database", "table", "s3_location")
)

# The raw table uses the data format named in the config; the name is
# lower-cased so it matches the keys of glue_data_formats_mapping.
raw_glue_data_formats = glue_data_formats_mapping[config["data_format"].lower()]
raw_input_format, raw_output_format, raw_serde_info, raw_parameters = (
    raw_glue_data_formats[field]
    for field in ("input_format", "output_format", "serde_info", "parameters")
)

In [9]:
# Manager for the raw-zone table, built from the values prepared in the
# previous cells. Arguments are passed by keyword and grouped by concern.
raw_table_manager = GlueTableManager(
    # Identity of the table.
    database_name=raw_database,
    table_name=raw_table,
    table_type="EXTERNAL_TABLE",
    description="This is a sample table",
    # Layout of the data.
    columns=raw_glue_columns,
    partition_keys=partition_keys,
    # Where and how the data is stored.
    location=raw_s3_location,
    input_format=raw_input_format,
    output_format=raw_output_format,
    serde_info=raw_serde_info,
    parameters=raw_parameters,
)

In [10]:
# Create the raw table, or update it if it already exists (per the method
# name; presumably in the AWS Glue Data Catalog - confirm in GlueTableManager).
# The returned response is displayed as the cell output.
raw_table_manager.create_or_update_table()

{'ResponseMetadata': {'RequestId': 'fe776c54-9f74-4708-8805-1704072f5b04',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'date': 'Thu, 08 Jun 2023 14:22:10 GMT',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '2',
   'connection': 'keep-alive',
   'x-amzn-requestid': 'fe776c54-9f74-4708-8805-1704072f5b04'},
  'RetryAttempts': 0}}

In [11]:
# Convert the same schema again for the staging table, this time flattened:
# nested object fields become top-level columns whose names join the property
# path with the '__' delimiter (e.g. glossary__glossdiv__title), while arrays
# keep their ARRAY<...> type.
staging_glue_columns = convert_json_schema_to_glue_columns(schema, flatten=True, delimiter='__')
staging_glue_columns

[{'Name': 'glossary__title', 'Type': 'STRING'},
 {'Name': 'glossary__glossdiv__title', 'Type': 'STRING'},
 {'Name': 'glossary__glossdiv__glosslist__glossentry__id', 'Type': 'STRING'},
 {'Name': 'glossary__glossdiv__glosslist__glossentry__sortas',
  'Type': 'STRING'},
 {'Name': 'glossary__glossdiv__glosslist__glossentry__glossterm',
  'Type': 'STRING'},
 {'Name': 'glossary__glossdiv__glosslist__glossentry__acronym',
  'Type': 'STRING'},
 {'Name': 'glossary__glossdiv__glosslist__glossentry__abbrev',
  'Type': 'STRING'},
 {'Name': 'glossary__glossdiv__glosslist__glossentry__glossdef__para',
  'Type': 'STRING'},
 {'Name': 'glossary__glossdiv__glosslist__glossentry__glossdef__glossseealso',
  'Type': 'ARRAY<STRING>'},
 {'Name': 'glossary__glossdiv__glosslist__glossentry__glosssee',
  'Type': 'STRING'},
 {'Name': 'local__menu__id', 'Type': 'STRING'},
 {'Name': 'local__menu__value', 'Type': 'STRING'},
 {'Name': 'local__menu__popup__menuitem',
  'Type': 'ARRAY<STRUCT<value:STRING,onclick:STRIN

In [12]:
# Add the extra columns required by the configured pipeline type to the
# staging columns.
pipeline_type = config["pipeline_type"]
additional_columns = get_additional_columns(pipeline_type)

# Rebuild the list instead of extending it in place with `+=`. An in-place
# extend appends the extra columns again every time this cell is re-run,
# leaving duplicate column definitions in the staging table. Dropping any
# entries equal to the extra columns before appending them makes the cell
# idempotent and yields the same result as `+=` on the first run.
staging_glue_columns = [
    column for column in staging_glue_columns if column not in additional_columns
] + list(additional_columns)
staging_glue_columns

[{'Name': 'glossary__title', 'Type': 'STRING'},
 {'Name': 'glossary__glossdiv__title', 'Type': 'STRING'},
 {'Name': 'glossary__glossdiv__glosslist__glossentry__id', 'Type': 'STRING'},
 {'Name': 'glossary__glossdiv__glosslist__glossentry__sortas',
  'Type': 'STRING'},
 {'Name': 'glossary__glossdiv__glosslist__glossentry__glossterm',
  'Type': 'STRING'},
 {'Name': 'glossary__glossdiv__glosslist__glossentry__acronym',
  'Type': 'STRING'},
 {'Name': 'glossary__glossdiv__glosslist__glossentry__abbrev',
  'Type': 'STRING'},
 {'Name': 'glossary__glossdiv__glosslist__glossentry__glossdef__para',
  'Type': 'STRING'},
 {'Name': 'glossary__glossdiv__glosslist__glossentry__glossdef__glossseealso',
  'Type': 'ARRAY<STRING>'},
 {'Name': 'glossary__glossdiv__glosslist__glossentry__glosssee',
  'Type': 'STRING'},
 {'Name': 'local__menu__id', 'Type': 'STRING'},
 {'Name': 'local__menu__value', 'Type': 'STRING'},
 {'Name': 'local__menu__popup__menuitem',
  'Type': 'ARRAY<STRUCT<value:STRING,onclick:STRIN

In [13]:
# Staging-zone settings: database, table and S3 location come from the config.
staging_zone_config = config["storage_zones"]["staging"]
staging_database, staging_table, staging_s3_location = (
    staging_zone_config[field] for field in ("database", "table", "s3_location")
)

# The staging table always uses the "parquet" entry of the format mapping,
# regardless of the data format named in the config.
staging_glue_data_formats = glue_data_formats_mapping["parquet"]
(
    staging_input_format,
    staging_output_format,
    staging_serde_info,
    staging_parameters,
) = (
    staging_glue_data_formats[field]
    for field in ("input_format", "output_format", "serde_info", "parameters")
)

In [14]:
# Manager for the staging-zone table, built from the values prepared in the
# previous cells. Arguments are passed by keyword and grouped by concern.
staging_table_manager = GlueTableManager(
    # Identity of the table.
    database_name=staging_database,
    table_name=staging_table,
    table_type="EXTERNAL_TABLE",
    description="This is a sample table",
    # Layout of the data.
    columns=staging_glue_columns,
    partition_keys=partition_keys,
    # Where and how the data is stored.
    location=staging_s3_location,
    input_format=staging_input_format,
    output_format=staging_output_format,
    serde_info=staging_serde_info,
    parameters=staging_parameters,
)

In [15]:
# Create the staging table, or update it if it already exists (per the method
# name; presumably in the AWS Glue Data Catalog - confirm in GlueTableManager).
# The returned response is displayed as the cell output.
staging_table_manager.create_or_update_table()

{'ResponseMetadata': {'RequestId': 'f44f4e95-daaa-49a4-a0b9-b65465c699a7',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'date': 'Thu, 08 Jun 2023 14:22:39 GMT',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '2',
   'connection': 'keep-alive',
   'x-amzn-requestid': 'f44f4e95-daaa-49a4-a0b9-b65465c699a7'},
  'RetryAttempts': 0}}