In [5]:
import time
time.sleep(3000) # so we don't overlap with the other thread.

In [6]:
# NYT Article Data Parser

import csv
from collections import defaultdict


# pull in the data from the Boydstun study
data_path = '/home/ryan/work/nytparser/data/'
data_file = 'Boydstun_random_1000.csv'
#data_file = 'Boydstun_NYT_FrontPage_Dataset TEST9Items.csv'
data_location = data_path + data_file

# topic codes used in the Boydstun study
boydstun_topic_codes = {'1':'Macroeconomics',
                        '2':'Civil Rights, Minority Issues, and Civil Liberties',
                        '3':'Health',
                        '4':'Agriculture',
                        '5':'Labor, Employment, and Immigration',
                        '6':'Education',
                        '7':'Environment',
                        '8':'Energy',
                        "9":'Quality of Life',
                        '10':'Transportation',
                        '12':'Law, Crime, and Family Issues',
                        '13':'Social Welfare',
                        '14':'Community Development and Housing Issues',
                        '15':'Banking, Finance, and Domestic Commerce',
                        '16':'Defense',
                        '17':'Space, Science, Technology and Communications',
                        '18':'Foreign Trade',
                        '19':'International Affairs and Foreign Aid',
                        '20':'Government Operations',
                        '21':'Public Lands and Water Management',
                        '24':'State and Local Government Administration',
                        '26':'Weather and Natural Disasters',
                        '27':'Fires',
                        '28':'Arts and Entertainment',
                        '29':'Sports and Recreation',
                        '30':'Death Notices',
                        '31':'Churches and Religion',
                        '99':'Other, Miscellaneous, and Human Interest',
                       }
    
columns = defaultdict(list)

with open(data_location, newline='') as csvfile:
    rdr = csv.DictReader(csvfile)
    for row in rdr:
        for(k,v) in row.items():
            columns[k].append(v)

boydstun_article_titles = columns['title']
boydstun_article_classes = columns['topic_2digit']

In [7]:
import urllib

# Prepare article titles for search 
       
URL_encoded_titles = []

for title in boydstun_article_titles:
    title = "\"" + title + "\""
    #print(title)
    URL_encoded_title = urllib.parse.quote(title)
    #print(URL_encoded_title)
    URL_encoded_titles.append(URL_encoded_title)

#print(URL_encoded_titles)

In [8]:
import json
import time

# Get article data from NYT Developer API

# Build the query

# For reference
#base_url = 'https://api.nytimes.com/svc/search/v2/articlesearch.json?'
#API_key = 'MEt3Ki6vTVUvG4unee31Sb6MuSq1ACVO'
# Test URL
#url = 'https://api.nytimes.com/svc/search/v2/articlesearch.json?api-key=MEt3Ki6vTVUvG4unee31Sb6MuSq1ACVO&begin_date=19960101&end_date=20061231&fq=headline:(%22Tax%20Panel%20Says%20Popular%20Breaks%20Should%20Be%20Cut%22)&fq=print_page:1'

articles_to_parse = []
article_descriptions = []

print("Found " + str(len(URL_encoded_titles)) + " article titles to search for.")
i = 0

for title in URL_encoded_titles:
    print("---> Fetching desriptions and class for article " + str(i))
    url = 'https://api.nytimes.com/svc/search/v2/articlesearch.json?api-key=MEt3Ki6vTVUvG4unee31Sb6MuSq1ACVO&begin_date=19960101&end_date=20061231&fq=headline:(' + title + ')&fq=print_page:1'
    #print(title, url)
    response = urllib.request.urlopen(url)
    json_dict = json.loads(response.read())
    #print(json.dumps(json_dict, indent=4))
    articles_to_parse.append(json_dict)
    
    # Go through articles and extract what we need
    for doc in json_dict['response']['docs']:
        if 'snippet' in doc:
            #print(doc['snippet'])
            snippet = doc['snippet']
        else:
            snippet = ''
                
        if 'lead_paragraph' in doc:
            #print(doc['lead_paragraph'])
            lead_paragraph = doc['lead_paragraph']
        else:
            lead_paragraph = ''

        article_description = snippet + lead_paragraph
        article_descriptions.append(article_description)
    
    i += 1
    time.sleep(6) #limit enforced by NYT

print("Articles pulled:" + str(len(articles_to_parse)))

Found 1000 article titles to search for.
---> Fetching desriptions and class for article 0
---> Fetching desriptions and class for article 1
---> Fetching desriptions and class for article 2
---> Fetching desriptions and class for article 3
---> Fetching desriptions and class for article 4
---> Fetching desriptions and class for article 5
---> Fetching desriptions and class for article 6
---> Fetching desriptions and class for article 7
---> Fetching desriptions and class for article 8
---> Fetching desriptions and class for article 9
---> Fetching desriptions and class for article 10
---> Fetching desriptions and class for article 11
---> Fetching desriptions and class for article 12
---> Fetching desriptions and class for article 13
---> Fetching desriptions and class for article 14
---> Fetching desriptions and class for article 15
---> Fetching desriptions and class for article 16
---> Fetching desriptions and class for article 17
---> Fetching desriptions and class for article 18


HTTPError: HTTP Error 400: Bad Request

In [None]:
# Transform to useful format


#pickling the imported data so I don't have to wait for it forever again.
import pickle
try:
    pickle_file = open(data_location + '.pkl', 'wb')
    pickle.dump(article_descriptions, pickle_file)
    pickle_file.close()

"""
Goal output:

{"text": "some text", "label": "politics"}
{"text": "some other text", "label": "science"}
"""

output_list = []

print(len(article_descriptions))


for i in range(len(article_descriptions)):
    
    print('working on article ' + str(i) + ': topic: ' + str(boydstun_topic_codes[boydstun_article_classes[i]]))
    article_dict = {}
    try:
        article_dict['text'] = article_descriptions[i]
        article_dict['label'] = str(boydstun_topic_codes[boydstun_article_classes[i]])
        output_list.append(article_dict)
    except Exception as e:
        print(type(e))
        
        

with open(data_path + 'nyt_article_data.json', 'w') as outfile:
    json.dump(output_list, outfile, separators=(',', ':'))