# Scrape Data using NYT API

### Wharton S&P Data

In [1]:
import wrds
db = wrds.Connection()

In [9]:
db.list_tables(library='crsp')

['acti',
 'asia',
 'asib',
 'asic',
 'asio',
 'asix',
 'bmdebt',
 'bmheader',
 'bmpaymts',
 'bmquotes',
 'bmyield',
 'bndprt06',
 'bndprt12',
 'bxcalind',
 'bxdlyind',
 'bxmthind',
 'bxquotes',
 'bxyield',
 'cap',
 'ccm_lookup',
 'ccm_qvards',
 'ccmxpf_linktable',
 'ccmxpf_lnkhist',
 'ccmxpf_lnkrng',
 'ccmxpf_lnkused',
 'comphead',
 'comphist',
 'compmaster',
 'contact_info',
 'crsp_cik_map',
 'crsp_daily_data',
 'crsp_header',
 'crsp_monthly_data',
 'crsp_names',
 'crsp_portno_map',
 'crsp_ziman_daily_index',
 'crsp_ziman_monthly_index',
 'cs20yr',
 'cs5yr',
 'cs90d',
 'cst_hist',
 'daily_nav',
 'daily_nav_ret',
 'daily_returns',
 'dividends',
 'dport1',
 'dport2',
 'dport3',
 'dport4',
 'dport5',
 'dport6',
 'dport7',
 'dport8',
 'dport9',
 'dsbc',
 'dsbo',
 'dse',
 'dse62',
 'dse62delist',
 'dse62dist',
 'dse62exchdates',
 'dse62names',
 'dse62nasdin',
 'dse62shares',
 'dseall',
 'dseall62',
 'dsedelist',
 'dsedist',
 'dseexchdates',
 'dsenames',
 'dsenasdin',
 'dseshares',
 'dsf',


### NYT Data 
src: https://towardsdatascience.com/collecting-data-from-the-new-york-times-over-any-period-of-time-3e365504004

In [90]:
import os
import json
import time
import requests
import datetime
import dateutil
import pandas as pd
import glob
from dateutil.relativedelta import relativedelta

In [35]:
end = datetime.date.today()
start = end - relativedelta(years=40)
months_in_range = [x.split(' ') for x in pd.date_range(start, end, freq='MS').strftime("%Y %-m").tolist()]

In [88]:
def send_request(date):
    '''Sends a request to the NYT Archive API for given date.'''
    base_url = 'https://api.nytimes.com/svc/archive/v1/'
    month = str(int(date[0][5:7]))
    url = base_url + '/' + date[0][0:4] + '/' + month + '.json?api-key=' + "Jha8a5QFDpLMrDm53n5aXK4vAlt5d1Sf"
    print(url)
    response = requests.get(url).json()
    time.sleep(6)
    return response


def is_valid(article, date):
    '''An article is only worth checking if it is in range, and has a headline.'''
    is_in_range = date > start and date < end
    has_headline = type(article['headline']) == dict and 'main' in article['headline'].keys()
    return is_in_range and has_headline


def parse_response(response):
    '''Parses and returns response as pandas data frame.'''
    data = {'headline': [],  
        'date': [], 
        'doc_type': [],
        'material_type': [],
        'section': [],
        'keywords': []}
    
    articles = response['response']['docs'] 
    for article in articles: # For each article, make sure it falls within our date range
        date = dateutil.parser.parse(article['pub_date']).date()
        if is_valid(article, date):
            data['date'].append(date)
            data['headline'].append(article['headline']['main']) 
            if 'section' in article:
                data['section'].append(article['section_name'])
            else:
                data['section'].append(None)
            data['doc_type'].append(article['document_type'])
            if 'type_of_material' in article: 
                data['material_type'].append(article['type_of_material'])
            else:
                data['material_type'].append(None)
            keywords = [keyword['value'] for keyword in article['keywords'] if keyword['name'] == 'subject']
            data['keywords'].append(keywords)
    return pd.DataFrame(data) 


def get_data(dates):
    '''Sends and parses request/response to/from NYT Archive API for given dates.'''
    total = 0
    print('Date range: ' + str(dates[0]) + ' to ' + str(dates[-1]))
    if not os.path.exists('headlines'):
        os.mkdir('headlines')
    for date in dates:
        print(date[5:7])
        response = send_request(date)
        df = parse_response(response)
        total += len(df)
        df.to_csv('headlines/' + date[0][0:4] + '-' + date[0][5:7] + '.csv', index=False)
        print('Saving headlines/' + date[0] + '-' + date[1] + '.csv...')
    print('Number of articles collected: ' + str(total))

In [None]:
get_data(months_in_range)
os.chdir('./headlines/')
extension = 'csv'
all_filenames = [i for i in glob.glob('*.{}'.format(extension))]
combined_csv = pd.concat([pd.read_csv(f) for f in all_filenames ])
combined_csv.to_csv( "NYTData.csv", index=False, encoding='utf-8-sig')
try:
    all_filenames.remove("NYTData.csv")
except ValueError:
    print("Success")
for file in all_filenames:
    os.remove(file)

Date range: ['1980-11-01', '00:00:00'] to ['2020-11-01', '00:00:00']
[]
https://api.nytimes.com/svc/archive/v1//1980/11.json?api-key=Jha8a5QFDpLMrDm53n5aXK4vAlt5d1Sf
Saving headlines/1980-11-01-00:00:00.csv...
[]
https://api.nytimes.com/svc/archive/v1//1980/12.json?api-key=Jha8a5QFDpLMrDm53n5aXK4vAlt5d1Sf
Saving headlines/1980-12-01-00:00:00.csv...
[]
https://api.nytimes.com/svc/archive/v1//1981/1.json?api-key=Jha8a5QFDpLMrDm53n5aXK4vAlt5d1Sf
Saving headlines/1981-01-01-00:00:00.csv...
[]
https://api.nytimes.com/svc/archive/v1//1981/2.json?api-key=Jha8a5QFDpLMrDm53n5aXK4vAlt5d1Sf
Saving headlines/1981-02-01-00:00:00.csv...
[]
https://api.nytimes.com/svc/archive/v1//1981/3.json?api-key=Jha8a5QFDpLMrDm53n5aXK4vAlt5d1Sf
Saving headlines/1981-03-01-00:00:00.csv...
[]
https://api.nytimes.com/svc/archive/v1//1981/4.json?api-key=Jha8a5QFDpLMrDm53n5aXK4vAlt5d1Sf
Saving headlines/1981-04-01-00:00:00.csv...
[]
https://api.nytimes.com/svc/archive/v1//1981/5.json?api-key=Jha8a5QFDpLMrDm53n5aXK4vAl

In [113]:
os.chdir('..')
os.getcwd()

'C:\\Users\\smuke\\OneDrive\\Desktop\\NYTvMarket\\NYTvMarket'