In [1]:
#Spark
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit
from pyspark.sql.types import *
#Standard Library
import os
#Custom
from spark_job_functions import spark_config

In [2]:
# Warehouse location is taken from the environment; .get() yields None when
# ICEBERG_WAREHOUSE is unset, and that value is handed to spark_config as-is.
# NOTE(review): how spark_config treats a None warehouse is not visible here - confirm.
iceberg_warehouse = os.environ.get('ICEBERG_WAREHOUSE')
spark = (
    SparkSession.builder
    .config(conf=spark_config(iceberg_warehouse))
    .getOrCreate()
)

In [3]:
# DDL for the Iceberg table nyt.db.article_ids with two columns:
# fact_id (INT) and article_id (VARCHAR(255)).
# CREATE OR REPLACE replaces an existing table of the same name instead of failing.
# Presumably this links the integer fact_id used by the other tables to the
# original article identifier - confirm against the loader.
create_article_ids = """
                CREATE OR REPLACE TABLE nyt.db.article_ids (
                fact_id INT,
                article_id VARCHAR(255)
                ) USING iceberg
                """

# DDL for the Iceberg table nyt.db.facts, partitioned by publication_date.
# article_type_id, news_desk_id and section_name_id share their names with the
# key columns of the dim_* tables declared in this file; the DDL itself declares
# no constraints between them.
# NOTE(review): print_page is FLOAT while the other numeric columns are INT -
# confirm that a fractional type is intended for page numbers.
create_facts = """
                CREATE OR REPLACE TABLE nyt.db.facts (
                fact_id INT,
                publication_date DATE,
                word_count INT,
                total_keywords INT,
                total_authors INT,
                words_in_headline INT,
                in_print BOOLEAN,
                print_page FLOAT,
                print_section CHAR(5),
                article_type_id INT,
                news_desk_id INT,
                section_name_id INT
                ) USING iceberg
                PARTITIONED BY (publication_date)
                """

# DDL for the Iceberg table nyt.db.authors (unpartitioned): author name parts,
# role and role_rank, keyed by table_id and carrying a fact_id column.
# NOTE(review): table_id is FLOAT here (as in subject_people and subject_others)
# while every other id column in this file is INT - confirm this is intentional.
create_authors = """
                CREATE OR REPLACE TABLE nyt.db.authors (
                table_id FLOAT,
                fact_id INT,
                author_role VARCHAR(255),
                role_rank INT,
                first_name VARCHAR(255),
                middle VARCHAR(255),
                last_name VARCHAR(255),
                qualifier VARCHAR(255)
                ) USING iceberg
                """
# DDL for the Iceberg table nyt.db.subject_people (unpartitioned): name parts of
# a person subject plus subject_id, subject_rank and a major_subject flag,
# with table_id and fact_id columns.
# NOTE(review): table_id is declared FLOAT - confirm an integer type is not intended.
create_subject_people = """
                CREATE OR REPLACE TABLE nyt.db.subject_people (
                table_id FLOAT,
                fact_id INT,
                subject_id INT,
                subject_rank INT,
                major_subject BOOLEAN,
                first_name VARCHAR(255),
                middle VARCHAR(255),
                last_name VARCHAR(255),
                qualifier VARCHAR(255)
                ) USING iceberg
                """

# DDL for the Iceberg table nyt.db.subject_others, partitioned by subject_id.
# Holds a free-text subject column plus subject_id, subject_rank and a
# major_subject flag, with table_id and fact_id columns.
# NOTE(review): table_id is declared FLOAT - confirm an integer type is not intended.
create_subject_others = """
                CREATE OR REPLACE TABLE nyt.db.subject_others (
                table_id FLOAT,
                fact_id INT,
                subject_id INT,
                subject_rank INT,
                major_subject BOOLEAN,
                subject VARCHAR(255)
                ) USING iceberg
                PARTITIONED BY (subject_id)
                """

# DDL for the dimension table nyt.db.dim_article_types:
# article_type_id (INT) paired with its article_type label (VARCHAR(255)).
create_dim_article_types = """
                CREATE OR REPLACE TABLE nyt.db.dim_article_types (
                article_type_id INT,
                article_type VARCHAR(255)
                ) USING iceberg
                """

# DDL for the dimension table nyt.db.dim_news_desks:
# news_desk_id (INT) paired with its news_desk label (VARCHAR(255)).
create_dim_news_desks = """
                CREATE OR REPLACE TABLE nyt.db.dim_news_desks (
                news_desk_id INT,
                news_desk VARCHAR(255)
                ) USING iceberg
                """

# DDL for the dimension table nyt.db.dim_section_names:
# section_name_id (INT) paired with its section_name label (VARCHAR(255)).
create_dim_section_names = """
                CREATE OR REPLACE TABLE nyt.db.dim_section_names (
                section_name_id INT,
                section_name VARCHAR(255)
                ) USING iceberg
                """

# DDL for the dimension table nyt.db.dim_subject_ids:
# subject_id (INT) paired with its subject_name label (VARCHAR(255)).
create_dim_subject_ids = """
                CREATE OR REPLACE TABLE nyt.db.dim_subject_ids (
                subject_id INT,
                subject_name VARCHAR(255)
                ) USING iceberg
                """

In [4]:
# Collect every CREATE OR REPLACE statement, in the order it should be executed
create_tables = [
    create_article_ids,
    create_facts,
    create_authors,
    create_subject_people,
    create_subject_others,
    create_dim_article_types,
    create_dim_news_desks,
    create_dim_section_names,
    create_dim_subject_ids,
]

# Submit each DDL statement to the active Spark session, one at a time
for ddl_statement in create_tables:
    spark.sql(ddl_statement)

In [5]:
#Fill dimensional tables

def _read_dim_csv(csv_path, id_column):
    """Read a dimension CSV that has a header row and cast its key column to INT.

    Args:
        csv_path: Path of the CSV file to read with the active Spark session.
        id_column: Name of the column to cast to IntegerType; the CSV reader
            is used without a schema, so the column arrives as a string.

    Returns:
        A DataFrame with the same columns as the file, with ``id_column``
        cast to IntegerType.
    """
    raw = spark.read.option('header', True).csv(csv_path)
    return raw.withColumn(id_column, raw[id_column].cast(IntegerType()))


dim_dir = 'dimensional_tables'
article_path = os.path.join(dim_dir, 'article_types.csv')
news_path = os.path.join(dim_dir, 'news_desks.csv')
section_path = os.path.join(dim_dir, 'section_names.csv')

#Read the CSV-backed dimensions with their key columns already cast to ints
dim_article = _read_dim_csv(article_path, 'article_type_id')
dim_news = _read_dim_csv(news_path, 'news_desk_id')
dim_section = _read_dim_csv(section_path, 'section_name_id')

#Subject categories are defined inline rather than read from a file
subject_ids = [
    (1, 'persons'),
    (2, 'organizations'),
    (3, 'glocations'),
    (4, 'subject')
]
dim_subjects = spark.createDataFrame(subject_ids, schema=['subject_id', 'subject_name'])
#Cast the key column to IntegerType to match the INT column in the table DDL
dim_subjects = dim_subjects.withColumn('subject_id', dim_subjects['subject_id'].cast(IntegerType()))

#Write to tables
#Each dimension is a complete snapshot, so replace the table contents instead of
#appending: re-running this cell then leaves exactly one copy of every row
#rather than duplicating the dimension rows on each run.
dim_loads = {
    'nyt.db.dim_article_types': dim_article,
    'nyt.db.dim_news_desks': dim_news,
    'nyt.db.dim_section_names': dim_section,
    'nyt.db.dim_subject_ids': dim_subjects,
}
for table_name, dim_frame in dim_loads.items():
    #lit(True) matches every existing row, so the whole table is overwritten
    dim_frame.writeTo(table_name).overwrite(lit(True))