In [1]:
#Custom
from nyt_article_search import JSONParse
from spark_job_functions import *
#Spark
from pyspark.sql import SparkSession
from pyspark.sql import Window
from pyspark.sql import functions as f
from pyspark.sql.types import *
#Python standard library
from itertools import chain
import json
import os

### Load JSONs and transform into lists of tuples

In [2]:
# Accumulators that the parsing loop appends to, one JSON response at a time
article_ids: list = []
fact_data: list = []
author_data: list = []
# Keyword rows, kept in separate lists per keyword type
subject_data: list = []
people_data: list = []
org_data: list = []
loc_data: list = []
# Free text per article: headline, lead paragraph, abstract and web url
# are stored in this dictionary
big_text: dict = {}

In [3]:
# Folder holding the JSON files written by api_call
# - expected to contain 1 folder for every state
data_folder = r'../../api_call/DATA'
state_folders = os.listdir(data_folder)
print(state_folders)

['ALABAMA', 'ALASKA', 'ARIZONA', 'ARKANSAS', 'CALIFORNIA', 'COLORADO', 'CONNECTICUT', 'DELAWARE', 'FLORIDA', 'GEORGIA', 'HAWAII', 'IDAHO', 'ILLINOIS', 'INDIANA', 'IOWA', 'KANSAS', 'KENTUCKY', 'LOUISIANA', 'MAINE', 'MARYLAND', 'MASSACHUSETTS', 'MICHIGAN', 'MINNESOTA', 'MISSISSIPPI', 'MISSOURI', 'MONTANA', 'NEBRASKA', 'NEVADA', 'NEW HAMPSHIRE', 'NEW JERSEY', 'NEW MEXICO', 'NEW YORK', 'NORTH CAROLINA', 'NORTH DAKOTA', 'OHIO', 'OKLAHOMA', 'OREGON', 'PENNSYLVANIA', 'RHODE ISLAND', 'SOUTH CAROLINA', 'SOUTH DAKOTA', 'TENNESSEE', 'TEXAS', 'UTAH', 'VERMONT', 'VIRGINIA', 'WASHINGTON', 'WEST VIRGINIA', 'WISCONSIN', 'WYOMING']


In [4]:

# Walk <data_folder>/<state>/<file>.json and parse every article in each file.
# NOTE(review): os.listdir returns entries in arbitrary order, so the order of
# article_ids (and therefore the fact_id assigned later) depends on the
# filesystem — confirm that is acceptable.

# Set mirror of article_ids so the duplicate test is a hash lookup instead of
# a scan of the whole list for every article.
seen_article_ids = set(article_ids)

states = os.listdir(data_folder)
for state in states:
    state_folder = os.path.join(data_folder, state)
    json_files = os.listdir(state_folder)
    for file in json_files:
        filepath = os.path.join(state_folder, file)
        # Context manager closes the handle even if json.load raises
        with open(filepath, 'r') as json_file:
            data = json.load(json_file)
        responses = data['response']['docs']
        # Parse each json response in the json file
        for response in responses:
            j = JSONParse(response)
            # Some JSON files share the same articles:
            # skip any article_id that has already been recorded
            if j.article_id in seen_article_ids:
                continue
            seen_article_ids.add(j.article_id)
            article_ids.append(j.article_id)
            fact_data.append(j.get_article_facts())
            extend_list(author_data, j.get_article_authors())
            extend_list(subject_data, j.search_article_keywords('subject'))
            extend_list(people_data, j.search_article_keywords('persons'))
            extend_list(org_data, j.search_article_keywords('organizations'))
            extend_list(loc_data, j.search_article_keywords('glocations'))
            # get_text(...) is indexed with [1] to pick the text element
            big_text[j.article_id] = {
                'headline': j.get_text('headline')[1],
                'abstract': j.get_text('abstract')[1],
                'lead_paragraph': j.get_text('lead_paragraph')[1],
                'web_url': j.get_text('web_url')[1],
            }

In [5]:
# Warehouse location comes from the environment (None when the variable is unset)
iceberg_warehouse = os.environ.get('ICEBERG_WAREHOUSE')
# Build the session configuration first, then start (or reuse) the session
spark_conf = spark_config(iceberg_warehouse)
spark = SparkSession.builder.config(conf=spark_conf).getOrCreate()

In [6]:
# Turn each parsed list into a Spark DataFrame
def _to_df(rows, header_key):
    """Build a DataFrame from `rows` with the schema returned by get_table_headers(header_key)."""
    return spark.createDataFrame(rows, schema=get_table_headers(header_key))


facts = _to_df(fact_data, 'facts')
authors = _to_df(author_data, 'authors')
# All four keyword frames are created with the 'subjects' headers
subjects = _to_df(subject_data, 'subjects')
people = _to_df(people_data, 'subjects')
organizations = _to_df(org_data, 'subjects')
locations = _to_df(loc_data, 'subjects')

### Step 1: Create Primary Keys in tables

In [7]:
# Give every article_id an integer primary key, numbered from 1000 in list order
# NOTE(review): numbering restarts at 1000 on every run while the tables are
# appended to later in the notebook — confirm ids cannot collide across runs.
ids = list(enumerate(article_ids, start=1000))

# Schema for the id lookup frame; fact_id is the integer primary key
id_schema = StructType([
    StructField('fact_id', IntegerType(), False),
    StructField('article_id', StringType(), False),
])

### THIS SHOULD NOT CHANGE ANY FURTHER
id_df = spark.createDataFrame(ids, schema=id_schema)


def _with_fact_id(df):
    """Inner-join `df` to id_df on article_id and drop article_id, leaving fact_id as the key."""
    joined = id_df.join(df, ['article_id'], how='inner')
    return joined.drop('article_id')


# Swap article_id for fact_id in every table
facts = _with_fact_id(facts)
authors = _with_fact_id(authors)
subjects = _with_fact_id(subjects)
people = _with_fact_id(people)
organizations = _with_fact_id(organizations)
locations = _with_fact_id(locations)

In [8]:
# Stack the subjects, organizations and locations frames into a single frame,
# ordered by fact_id and rank
places_and_things = subjects.union(organizations)
places_and_things = places_and_things.union(locations)
places_and_things = places_and_things.orderBy(['fact_id', 'rank'])

# Same key arguments for all three tables; later cells read a 'table_id'
# column from these frames
pk_args = ('table_id', 'fact_id', 'rank')
places_and_things = create_primary_key(places_and_things, *pk_args)
authors = create_primary_key(authors, *pk_args)
people = create_primary_key(people, *pk_args)

### Step 2: Text standardization

In [9]:
# Load the dimension tables from the catalog
news_desk_df, material_df, section_df, subject_id_df = map(
    spark.table,
    (
        'nyt.db.dim_news_desks',
        'nyt.db.dim_article_types',
        'nyt.db.dim_section_names',
        'nyt.db.dim_subject_ids',
    ),
)

In [10]:
# Standardize text in the article_type, news_desk and section_name columns
# that isn't in the dimensional tables
# EX: National Desk vs National - should be National
for column_name, dim_df in (
    ('article_type', material_df),
    ('news_desk', news_desk_df),
    ('section_name', section_df),
):
    facts = standardize_text(facts, column_name, dim_df)



In [11]:
# Clean the print_page column in the facts df.
# Some print pages aren't numeric, e.g. '25A' (the 'A' is already in the
# print_section column), so each distinct value is converted on the driver and
# the column is rewritten through a lookup map.
def _clean_print_page(page):
    """Convert one raw print_page value to a float, or None when it has no usable number.

    The value is parsed as-is first; if that fails, a single trailing
    character is dropped and parsing is retried ('25A' -> 25.0).
    Empty strings and values that are still not numeric map to None
    instead of aborting the whole job.
    """
    try:
        return float(page)
    except (TypeError, ValueError):
        pass
    if not isinstance(page, str) or page == '':
        return None
    try:
        return float(page[:-1])
    except ValueError:
        return None


# Distinct raw page values, collected to the driver
page_rdd = facts.select(['print_page']).distinct().rdd.flatMap(lambda x: x).collect()
# Nulls need no mapping: a missing key already yields null in the lookup
pages = [p for p in set(page_rdd) if p is not None]

pages_cleaned = {page: _clean_print_page(page) for page in pages}

if pages_cleaned:
    # create_map takes alternating key, value literals
    mapping = f.create_map([f.lit(x) for x in chain(*pages_cleaned.items())])
    # The cast keeps the column double even when every cleaned value is None
    facts = facts.withColumn('print_page', mapping[facts['print_page']].cast('double'))
else:
    # No non-null pages at all: keep the column, typed like the mapped result
    facts = facts.withColumn('print_page', f.lit(None).cast('double'))


In [12]:
# In the people df create standardized last_name, first_name, middle and
# qualifier columns from the raw 'value' column.
# Everything is done with column expressions, so no names are collected to the
# driver and no per-name lookup map or self-join is needed.

# Last name: text left of the first comma, upper-cased
people = people.withColumn('last_name', f.upper(f.substring_index('value', ',', 1)))
# First and middle name: text right of the last comma, upper-cased
people = people.withColumn('first_middle', f.upper(f.substring_index('value', ',', -1)))
# Remove every character that is not an ASCII letter or a space, but keep
# inner white space to separate the middle name from the first.
# NOTE(review): this also removes accented letters, hyphens and apostrophes.
people = people.withColumn('first_middle', f.trim(f.regexp_replace('first_middle', '[^a-zA-Z ]', '')))

# Name qualifiers - SR, JR, II, III, IV, V
# EX - Dale Earnhardt vs Dale Earnhardt Jr
qualifier_list = ['JR', 'SR', 'II', 'III', 'IV', 'V']
last_token = f.substring_index(f.col('first_middle'), ' ', -1)
# The trailing token only counts as a qualifier when something precedes it;
# a name that consists solely of e.g. 'JR' is left untouched.
has_qualifier = f.col('first_middle').contains(' ') & last_token.isin(qualifier_list)

# Record the qualifier before it is stripped from first_middle (null when absent)
people = people.withColumn('qualifier', f.when(has_qualifier, last_token))
# Drop the final space-separated token (the qualifier) from qualified names
people = people.withColumn(
    'first_middle',
    f.when(has_qualifier, f.regexp_replace('first_middle', ' [^ ]*$', ''))
     .otherwise(f.col('first_middle'))
)

# Middle name / initial: the last token, or null when there is only one token
people = people.withColumn(
    'middle',
    f.when(f.size(f.split(f.col('first_middle'), ' ', -1)) == 1, None)
     .otherwise(f.substring_index(f.col('first_middle'), ' ', -1))
)

# First name: the first token
people = people.withColumn('first_name', f.substring_index(f.col('first_middle'), ' ', 1))

# The raw value and the working column are no longer needed
people = people.drop('value').drop('first_middle')


In [13]:
# Authors - align column names with the people df
rename_authors_cols = {
    'lastname': 'last_name',
    'firstname': 'first_name',
    'middlename': 'middle',
    'rank': 'role_rank',
    'role': 'author_role',
}
authors = authors.withColumnsRenamed(rename_authors_cols)

# Upper-case the author name columns
for name_col in ('last_name', 'first_name', 'middle'):
    authors = authors.withColumn(name_col, f.upper(name_col))


def _major_flag(df):
    """Boolean column expression: True where major_subject equals 'Y', otherwise False."""
    return f.when(df['major_subject'] == 'Y', True).otherwise(False)


# people - rename columns, then make major_subject true or false
peole_col_renamed = {
    'rank': 'subject_rank',
    'major': 'major_subject',
}
people = people.withColumnsRenamed(peole_col_renamed)
people = people.withColumn('major_subject', _major_flag(people))

# places_and_things - upper-case the value, rename columns, then make
# major_subject true or false
places_and_things_col_renamed = {
    'rank': 'subject_rank',
    'major': 'major_subject',
    'value': 'subject',
}
places_and_things = places_and_things.withColumn('value', f.upper('value'))
places_and_things = places_and_things.withColumnsRenamed(places_and_things_col_renamed)
places_and_things = places_and_things.withColumn('major_subject', _major_flag(places_and_things))


### Step 3: Use dimensional IDs instead of text

- Dimensional Tables:
    - article_type_ids: material_df
    - news_desk_ids: news_desk_df
    - section_ids: section_df
    - subjects_ids: create df

In [14]:
# Ensure the id column of every dimension table is an integer
def _id_as_int(df, id_column):
    """Return `df` with `id_column` cast to IntegerType."""
    return df.withColumn(id_column, df[id_column].cast(IntegerType()))


material_df = _id_as_int(material_df, 'article_type_id')
news_desk_df = _id_as_int(news_desk_df, 'news_desk_id')
section_df = _id_as_int(section_df, 'section_name_id')
subject_id_df = _id_as_int(subject_id_df, 'subject_id')


In [15]:

# Left-join each dimension table on its text column, then drop the text
# columns so that only the dimension tables' remaining columns stay on facts
facts = facts.join(material_df, ['article_type'], how='left')
facts = facts.join(news_desk_df, ['news_desk'], how='left')
facts = facts.join(section_df, ['section_name'], how='left')
facts = facts.drop('section_name', 'news_desk', 'article_type')


In [16]:
# Attach the subject dimension to places_and_things: rename subject_name to
# 'name' so both frames share the join column, left-join, then drop 'name'
subject_join = subject_id_df.withColumnRenamed('subject_name', 'name')
places_and_things = places_and_things.join(subject_join, on=['name'], how='left')
places_and_things = places_and_things.drop('name')


In [17]:
# Select each table's columns in the order returned by final_col_order
facts = facts.select(final_col_order('facts'))
authors = authors.select(final_col_order('authors'))

# Every people row gets the constant subject_id 1 before its columns are selected
people = people.withColumn('subject_id', f.lit(1))
people_columns = final_col_order('subject_people')
people = people.select(people_columns)

places_and_things = places_and_things.select(final_col_order('subject_others'))

In [18]:
# (DataFrame, key column, table name) for every table that will be written
quality_checker = [
    (id_df, 'article_id', 'article_ids'),
    (facts, 'fact_id', 'facts'),
    (authors, 'table_id', 'authors'),
    (people, 'table_id', 'subject_people'),
    (places_and_things, 'table_id', 'subject_others'),
]

# Merge the result of each duplicate_check call into a single dictionary
quality_check_dict = {}
for checked_df, key_column, table_name in quality_checker:
    quality_check_dict.update(duplicate_check(checked_df, key_column, table_name))


In [19]:
# Stop before writing anything if a table failed the quality check
if not all(quality_check_dict.values()):
    table_failures = [name for name, passed in quality_check_dict.items() if passed == False]
    tables_joined = ', '.join(table_failures)
    message = f'The following tables failed the duplicate primary key quality check: {tables_joined}'
    print(message)
    raise Exception(message)


def _revalidate(df, table_name):
    """Collect `df` to the driver and rebuild it under the schema from schema_validation(table_name)."""
    rows = df.rdd.collect()
    return spark.createDataFrame(rows, schema=StructType(schema_validation(table_name)))


# Validate schemas - all five tables are rebuilt before the first write
facts = _revalidate(facts, 'facts')
authors = _revalidate(authors, 'authors')
people = _revalidate(people, 'subject_people')
places_and_things = _revalidate(places_and_things, 'subject_others')
id_df = _revalidate(id_df, 'article_ids')

# Sort the partitioned tables by their partition column
facts = facts.orderBy('publication_date')
places_and_things = places_and_things.orderBy('subject_id')

# Write to iceberg tables
# No partitions
authors.writeTo('nyt.db.authors').append()
people.writeTo('nyt.db.subject_people').append()
id_df.writeTo('nyt.db.article_ids').append()
# Partitions
# NOTE(review): confirm partitionedBy has an effect together with append()
facts.writeTo('nyt.db.facts').partitionedBy('publication_date').append()
places_and_things.writeTo('nyt.db.subject_others').partitionedBy('subject_id').append()


----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 64390)
Traceback (most recent call last):
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.10_3.10.3056.0_x64__qbz5n2kfra8p0\lib\socketserver.py", line 316, in _handle_request_noblock
    self.process_request(request, client_address)
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.10_3.10.3056.0_x64__qbz5n2kfra8p0\lib\socketserver.py", line 347, in process_request
    self.finish_request(request, client_address)
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.10_3.10.3056.0_x64__qbz5n2kfra8p0\lib\socketserver.py", line 360, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.10_3.10.3056.0_x64__qbz5n2kfra8p0\lib\socketserver.py", line 747, in __init__
    self.handle()
  File "d:\Projects\news\lib\site-packages\pyspar

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "d:\Projects\news\lib\site-packages\py4j\clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.10_3.10.3056.0_x64__qbz5n2kfra8p0\lib\socket.py", line 705, in readinto
    return self._sock.recv_into(b)
ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "d:\Projects\news\lib\site-packages\py4j\java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "d:\Projects\news\lib\site-packages\py4j\clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving


ConnectionRefusedError: [WinError 10061] No connection could be made because the target machine actively refused it

In [34]:
# Write text files for sentiment analysis

# Output location: ../DATA/SENTIMENT_ANALYSIS_FILES
# The path is built with os.path.join rather than a hard-coded '\' so that it
# is valid on every platform and contains no invalid '\D' escape sequence.
data_folder = os.path.join('..', 'DATA')
analysis_folder = os.path.join(data_folder, 'SENTIMENT_ANALYSIS_FILES')

# Creates data_folder too when it is missing; does nothing if both already exist
os.makedirs(analysis_folder, exist_ok=True)




In [35]:
#Function to write text files for sentiment analysis
def write_text_file(text_type):
    """Write one text field of the current article to its own UTF-8 file.

    Reads three names from the notebook's global scope rather than taking
    them as parameters:
      - v: dict of text fields for the current article
      - i: id used as the file name prefix
      - analysis_folder: directory the file is written to

    The file is named '<i>-<text_type>.txt'. Nothing is written when the
    field's value is None.

    Raises:
        KeyError: if text_type is not a key of v.
    """
    text = v[text_type]
    if text is None:
        return
    text_file = os.path.join(analysis_folder, f'{i}-{text_type}.txt')
    # Context manager closes the file even if the write fails
    with open(text_file, 'w', encoding='utf-8') as text_writer:
        text_writer.write(text)


# Each file is named with the fact_id + text type and holds the text string
# EX: fact_id 1000 = 1000-headline.txt, 1000-abstract.txt, 1000-lead_paragraph.txt
id_dict = {article_id: fact_id for fact_id, article_id in ids}

for k, v in big_text.items():
    # write_text_file reads the loop names `v` and `i` from the global scope,
    # so these two names must not be changed here
    i = id_dict.get(k)
    # Headline, abstract, lead paragraph - in that order
    for text_type in ('headline', 'abstract', 'lead_paragraph'):
        write_text_file(text_type)




