In [1]:
#Spark
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql.types import *
#Python standard library
import json
import os
#Custom
from spark_job_functions import *

In [2]:
# Warehouse location comes from the environment (None when ICEBERG_WAREHOUSE is unset).
iceberg_warehouse = os.getenv('ICEBERG_WAREHOUSE')

# Build the Spark configuration with the spark_config helper
# (presumably provided by the spark_job_functions star import -- verify),
# then create the session or reuse one that is already active.
session_conf = spark_config(iceberg_warehouse)
spark = SparkSession.builder.config(conf=session_conf).getOrCreate()

In [3]:
# Relative path to the JSON file of text scores that is read with json.load further down.
# NOTE(review): relative to the notebook's working directory; presumably produced by the
# TensorFlow model training step, judging by the directory name -- confirm.
model_scores = '../../tensorflow_model_train/DATA/nyt_text_scores.json'

In [9]:
# Parse the scores JSON into `data`.
# The context manager closes the file handle once parsing is done, and the handle
# is deliberately not named `f`, which would overwrite the `pyspark.sql.functions as f`
# alias imported at the top of the notebook.
with open(model_scores) as scores_file:
    data = json.load(scores_file)

In [14]:
# Flatten the parsed JSON into (article_id, score) tuples for Spark.
# Only the first element of each value is used; the dictionary keys are ignored.
# The score is converted to a Python float.
spark_list = [
    (entries[0].get('article_id'), float(entries[0].get('score')))
    for entries in data.values()
]

In [17]:
# Explicit schema for the (article_id, score) tuples: both columns are non-nullable.
score_fields = [
    StructField('article_id', StringType(), nullable=False),
    StructField('headline_score', FloatType(), nullable=False),
]
schema = StructType(score_fields)

# Materialise the Python list as a Spark DataFrame and preview the first rows.
spark_df = spark.createDataFrame(data=spark_list, schema=schema)
spark_df.show(n=5)

+--------------------+--------------+
|          article_id|headline_score|
+--------------------+--------------+
|000e1b1f-0b85-5fa...|       0.10597|
|0034945d-a1db-5d4...|        5.0E-5|
|005c6dda-b20c-56d...|       0.04274|
|0065ad23-e757-5eb...|       0.00866|
|008676d5-264e-52d...|        0.0831|
+--------------------+--------------+
only showing top 5 rows



### Get each article's fact ID and attach its headline_score

In [20]:
# Load the article_ids table, which is joined to the scores on article_id.
id_df = spark.sql('SELECT * FROM nyt.db.article_ids')

# Inner join on article_id (the default join type), then drop the join key
# so that only the remaining columns are kept.
scores_with_ids = id_df.join(spark_df, on=['article_id'], how='inner')
joined_scores = scores_with_ids.drop('article_id')
joined_scores.show(5)

+-------+--------------+
|fact_id|headline_score|
+-------+--------------+
|   1000|       0.10597|
|   1001|        5.0E-5|
|   1002|       0.04274|
|   1003|       0.00866|
|   1004|        0.0831|
+-------+--------------+
only showing top 5 rows



### ALTER the fact table: add the headline_sentiment_score column

In [22]:
# Schema change: add a float column to nyt.db.facts that will hold the headline score.
alter_statement = """
                ALTER TABLE nyt.db.facts 
                ADD COLUMN
                headline_sentiment_score float
                ;
                """
# ADD COLUMN is not idempotent: running it again after the column exists raises an
# error. Only issue the DDL when the column is missing so the notebook survives
# a Restart & Run All.
if 'headline_sentiment_score' not in spark.table('nyt.db.facts').columns:
    spark.sql(alter_statement)

DataFrame[]

### Add the scores to the table

In [31]:
# Expose the joined scores to Spark SQL under the name "scores".
# createOrReplaceTempView is the replacement for registerTempTable (deprecated since
# Spark 2.0); it also overwrites an existing view of the same name, so this cell can
# be re-run safely.
joined_scores.createOrReplaceTempView("scores")

# Copy each score into the fact row with the matching fact_id.
# There is no WHEN NOT MATCHED clause, so rows without a match are left untouched.
merge_sql = """
    MERGE INTO nyt.db.facts t USING scores s
    ON (t.fact_id = s.fact_id)
    WHEN MATCHED
        THEN UPDATE SET headline_sentiment_score = s.headline_score
"""
spark.sql(merge_sql)





DataFrame[]

In [35]:
#Check for NULLS - Should be no rows returning
test = spark.sql('SELECT * FROM nyt.db.facts WHERE headline_sentiment_score IS NULL')
# Show any offending rows first so they are visible if the assertion below fails.
test.show()

# Enforce the expectation instead of relying on someone reading the output:
# every fact row must have received a score from the MERGE.
null_score_rows = test.count()
assert null_score_rows == 0, f"{null_score_rows} fact rows have no headline_sentiment_score"

+-------+----------------+----------+--------------+-------------+-----------------+--------+----------+-------------+---------------+------------+---------------+------------------------+
|fact_id|publication_date|word_count|total_keywords|total_authors|words_in_headline|in_print|print_page|print_section|article_type_id|news_desk_id|section_name_id|headline_sentiment_score|
+-------+----------------+----------+--------------+-------------+-----------------+--------+----------+-------------+---------------+------------+---------------+------------------------+
+-------+----------------+----------+--------------+-------------+-----------------+--------+----------+-------------+---------------+------------+---------------+------------------------+

