In [1]:
# Generate the SparkSession
from lib.spark_session import get_spark_session

spark = get_spark_session("Generate DDL")
print("SPARK_APP: Spark Session UI - "+ spark.sparkContext.uiWebUrl)

SPARK_APP: Spark Session UI - http://03205cdd01e3:4040


In [16]:
# Get all Tables in Landing Schema
schema_name = 'edw_ld'
table_list_df = spark.sql(f"show tables in {schema_name}")

for table in table_list_df.collect():
    table_name = f"{schema_name}.{table['tableName']}"
    #print(table_name)
    print(f"-- DDL for {table_name}")
    df = spark.read.table(table_name)
    cols:str = ""
    for col in df.dtypes:
        #print(col)
        cols += f"\t {col[0]} {col[1]}, \r\n"
    #print(cols)
    print(f"CREATE EXTERNAL TABLE {table_name} (")
    print(cols[:-4])
    print(")")
    print(f"""ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION 's3://deltalake12/dw-with-pyspark/warehouse/{schema_name}.db/{table['tableName']}/_symlink_format_manifest/'
;""")


-- DDL for edw_ld.dim_customer_ld
CREATE EXTERNAL TABLE edw_ld.dim_customer_ld (
	 customer_id string, 
	 name string, 
	 address string, 
	 city string, 
	 state string, 
	 zip_code string, 
	 phone_number string, 
	 email string, 
	 date_of_birth string, 
	 plan_type string, 
	 insert_dt timestamp, 
	 rundate string
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION 's3://deltalake12/dw-with-pyspark/warehouse/edw_ld.db/dim_customer_ld/_symlink_format_manifest/'
;
-- DDL for edw_ld.dim_date_ld
CREATE EXTERNAL TABLE edw_ld.dim_date_ld (
	 date string, 
	 day string, 
	 month string, 
	 year string, 
	 day_of_week string, 
	 insert_dt timestamp, 
	 rundate string
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.Symlink

In [17]:
# Get all Tables in Staging Schema
schema_name = 'edw_stg'
table_list_df = spark.sql(f"show tables in {schema_name}")

for table in table_list_df.collect():
    table_name = f"{schema_name}.{table['tableName']}"
    #print(table_name)
    print(f"-- DDL for {table_name}")
    df = spark.read.table(table_name)
    cols:str = ""
    for col in df.dtypes:
        #print(col)
        cols += f"\t {col[0]} {col[1]}, \r\n"
    #print(cols)
    print(f"CREATE EXTERNAL TABLE {table_name} (")
    print(cols[:-4])
    print(")")
    print(f"""ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION 's3://deltalake12/dw-with-pyspark/warehouse/{schema_name}.db/{table['tableName']}/_symlink_format_manifest/'
;""")

-- DDL for edw_stg.dim_customer_stg
CREATE EXTERNAL TABLE edw_stg.dim_customer_stg (
	 customer_id string, 
	 name string, 
	 address string, 
	 city string, 
	 state string, 
	 zip_code string, 
	 phone_number string, 
	 email string, 
	 date_of_birth date, 
	 plan_type string, 
	 insert_dt timestamp, 
	 rundate string, 
	 first_name string, 
	 last_name string, 
	 effective_start_date timestamp, 
	 effective_end_date timestamp, 
	 active_flag int, 
	 update_dt timestamp
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION 's3://deltalake12/dw-with-pyspark/warehouse/edw_stg.db/dim_customer_stg/_symlink_format_manifest/'
;
-- DDL for edw_stg.dim_date_stg
CREATE EXTERNAL TABLE edw_stg.dim_date_stg (
	 date date, 
	 day int, 
	 month int, 
	 year int, 
	 day_of_week string, 
	 insert_dt timestamp, 
	 run

In [18]:
# Get all Tables in Final Schema
schema_name = 'edw'
table_list_df = spark.sql(f"show tables in {schema_name}")

for table in table_list_df.collect():
    table_name = f"{schema_name}.{table['tableName']}"
    #print(table_name)
    print(f"-- DDL for {table_name}")
    df = spark.read.table(table_name)
    cols:str = ""
    for col in df.dtypes:
        #print(col)
        cols += f"\t {col[0]} {col[1]}, \r\n"
    #print(cols)
    print(f"CREATE EXTERNAL TABLE {table_name} (")
    print(cols[:-4])
    print(")")
    print(f"""ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION 's3://deltalake12/dw-with-pyspark/warehouse/{schema_name}.db/{table['tableName']}/_symlink_format_manifest/'
;""")

-- DDL for edw.dim_customer
CREATE EXTERNAL TABLE edw.dim_customer (
	 row_wid string, 
	 customer_id string, 
	 first_name string, 
	 last_name string, 
	 address string, 
	 city string, 
	 state string, 
	 zip_code string, 
	 phone_number string, 
	 email string, 
	 date_of_birth date, 
	 plan_type string, 
	 effective_start_date timestamp, 
	 effective_end_date timestamp, 
	 active_flag int, 
	 rundate string, 
	 insert_dt timestamp, 
	 update_dt timestamp
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION 's3://deltalake12/dw-with-pyspark/warehouse/edw.db/dim_customer/_symlink_format_manifest/'
;
-- DDL for edw.dim_date
CREATE EXTERNAL TABLE edw.dim_date (
	 row_wid string, 
	 date date, 
	 day int, 
	 month int, 
	 year int, 
	 day_of_week string, 
	 rundate string, 
	 insert_dt timestamp, 
	 up