# Scrape Data using NYT API

### Wharton S&P Data

In [150]:
import wrds
import yfinance as yf
import pandas as pd
import numpy as np

In [None]:
db = wrds.Connection()

In [154]:
spy = yf.download("SPY")
spy.head()

[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1993-01-29,43.96875,43.96875,43.75,43.9375,26.079659,1003200
1993-02-01,43.96875,44.25,43.96875,44.25,26.265144,480500
1993-02-02,44.21875,44.375,44.125,44.34375,26.320782,201300
1993-02-03,44.40625,44.84375,44.375,44.8125,26.599014,529400
1993-02-04,44.96875,45.09375,44.46875,45.0,26.710312,531500


### NYT Data 
src: https://towardsdatascience.com/collecting-data-from-the-new-york-times-over-any-period-of-time-3e365504004

In [90]:
import os
import json
import time
import requests
import datetime
import dateutil
import pandas as pd
import glob
from dateutil.relativedelta import relativedelta

In [35]:
end = datetime.date.today()
start = end - relativedelta(years=40)
months_in_range = [x.split(' ') for x in pd.date_range(start, end, freq='MS').strftime("%Y %-m").tolist()]

In [88]:
def send_request(date):
    '''Sends a request to the NYT Archive API for given date.'''
    base_url = 'https://api.nytimes.com/svc/archive/v1/'
    month = str(int(date[0][5:7]))
    url = base_url + '/' + date[0][0:4] + '/' + month + '.json?api-key=' + "Jha8a5QFDpLMrDm53n5aXK4vAlt5d1Sf"
    print(url)
    response = requests.get(url).json()
    time.sleep(6)
    return response


def is_valid(article, date):
    '''An article is only worth checking if it is in range, and has a headline.'''
    is_in_range = date > start and date < end
    has_headline = type(article['headline']) == dict and 'main' in article['headline'].keys()
    return is_in_range and has_headline


def parse_response(response):
    '''Parses and returns response as pandas data frame.'''
    data = {'headline': [],  
        'date': [], 
        'doc_type': [],
        'material_type': [],
        'section': [],
        'keywords': []}
    
    articles = response['response']['docs'] 
    for article in articles: # For each article, make sure it falls within our date range
        date = dateutil.parser.parse(article['pub_date']).date()
        if is_valid(article, date):
            data['date'].append(date)
            data['headline'].append(article['headline']['main']) 
            if 'section' in article:
                data['section'].append(article['section_name'])
            else:
                data['section'].append(None)
            data['doc_type'].append(article['document_type'])
            if 'type_of_material' in article: 
                data['material_type'].append(article['type_of_material'])
            else:
                data['material_type'].append(None)
            keywords = [keyword['value'] for keyword in article['keywords'] if keyword['name'] == 'subject']
            data['keywords'].append(keywords)
    return pd.DataFrame(data) 


def get_data(dates):
    '''Sends and parses request/response to/from NYT Archive API for given dates.'''
    total = 0
    print('Date range: ' + str(dates[0]) + ' to ' + str(dates[-1]))
    if not os.path.exists('headlines'):
        os.mkdir('headlines')
    for date in dates:
        print(date[5:7])
        response = send_request(date)
        df = parse_response(response)
        total += len(df)
        df.to_csv('headlines/' + date[0][0:4] + '-' + date[0][5:7] + '.csv', index=False)
        print('Saving headlines/' + date[0][0:4] + '-' + date[1][5:7] + '.csv...')
    print('Number of articles collected: ' + str(total))

In [114]:
get_data(months_in_range)
os.chdir('./headlines/')
extension = 'csv'
all_filenames = [i for i in glob.glob('*.{}'.format(extension))]
combined_csv = pd.concat([pd.read_csv(f) for f in all_filenames ])
combined_csv.to_csv( "NYTData.csv", index=False, encoding='utf-8-sig')
try:
    all_filenames.remove("NYTData.csv")
except ValueError:
    print("Success")
for file in all_filenames:
    os.remove(file)

Date range: ['1980-11-01', '00:00:00'] to ['2020-11-01', '00:00:00']
[]
https://api.nytimes.com/svc/archive/v1//1980/11.json?api-key=Jha8a5QFDpLMrDm53n5aXK4vAlt5d1Sf
Saving headlines/1980-11-01-00:00:00.csv...
[]
https://api.nytimes.com/svc/archive/v1//1980/12.json?api-key=Jha8a5QFDpLMrDm53n5aXK4vAlt5d1Sf
Saving headlines/1980-12-01-00:00:00.csv...
[]
https://api.nytimes.com/svc/archive/v1//1981/1.json?api-key=Jha8a5QFDpLMrDm53n5aXK4vAlt5d1Sf
Saving headlines/1981-01-01-00:00:00.csv...
[]
https://api.nytimes.com/svc/archive/v1//1981/2.json?api-key=Jha8a5QFDpLMrDm53n5aXK4vAlt5d1Sf
Saving headlines/1981-02-01-00:00:00.csv...
[]
https://api.nytimes.com/svc/archive/v1//1981/3.json?api-key=Jha8a5QFDpLMrDm53n5aXK4vAlt5d1Sf
Saving headlines/1981-03-01-00:00:00.csv...
[]
https://api.nytimes.com/svc/archive/v1//1981/4.json?api-key=Jha8a5QFDpLMrDm53n5aXK4vAlt5d1Sf
Saving headlines/1981-04-01-00:00:00.csv...
[]
https://api.nytimes.com/svc/archive/v1//1981/5.json?api-key=Jha8a5QFDpLMrDm53n5aXK4vAl

Saving headlines/1985-09-01-00:00:00.csv...
[]
https://api.nytimes.com/svc/archive/v1//1985/10.json?api-key=Jha8a5QFDpLMrDm53n5aXK4vAlt5d1Sf
Saving headlines/1985-10-01-00:00:00.csv...
[]
https://api.nytimes.com/svc/archive/v1//1985/11.json?api-key=Jha8a5QFDpLMrDm53n5aXK4vAlt5d1Sf
Saving headlines/1985-11-01-00:00:00.csv...
[]
https://api.nytimes.com/svc/archive/v1//1985/12.json?api-key=Jha8a5QFDpLMrDm53n5aXK4vAlt5d1Sf
Saving headlines/1985-12-01-00:00:00.csv...
[]
https://api.nytimes.com/svc/archive/v1//1986/1.json?api-key=Jha8a5QFDpLMrDm53n5aXK4vAlt5d1Sf
Saving headlines/1986-01-01-00:00:00.csv...
[]
https://api.nytimes.com/svc/archive/v1//1986/2.json?api-key=Jha8a5QFDpLMrDm53n5aXK4vAlt5d1Sf
Saving headlines/1986-02-01-00:00:00.csv...
[]
https://api.nytimes.com/svc/archive/v1//1986/3.json?api-key=Jha8a5QFDpLMrDm53n5aXK4vAlt5d1Sf
Saving headlines/1986-03-01-00:00:00.csv...
[]
https://api.nytimes.com/svc/archive/v1//1986/4.json?api-key=Jha8a5QFDpLMrDm53n5aXK4vAlt5d1Sf
Saving headlines/

Saving headlines/1990-08-01-00:00:00.csv...
[]
https://api.nytimes.com/svc/archive/v1//1990/9.json?api-key=Jha8a5QFDpLMrDm53n5aXK4vAlt5d1Sf
Saving headlines/1990-09-01-00:00:00.csv...
[]
https://api.nytimes.com/svc/archive/v1//1990/10.json?api-key=Jha8a5QFDpLMrDm53n5aXK4vAlt5d1Sf
Saving headlines/1990-10-01-00:00:00.csv...
[]
https://api.nytimes.com/svc/archive/v1//1990/11.json?api-key=Jha8a5QFDpLMrDm53n5aXK4vAlt5d1Sf
Saving headlines/1990-11-01-00:00:00.csv...
[]
https://api.nytimes.com/svc/archive/v1//1990/12.json?api-key=Jha8a5QFDpLMrDm53n5aXK4vAlt5d1Sf
Saving headlines/1990-12-01-00:00:00.csv...
[]
https://api.nytimes.com/svc/archive/v1//1991/1.json?api-key=Jha8a5QFDpLMrDm53n5aXK4vAlt5d1Sf
Saving headlines/1991-01-01-00:00:00.csv...
[]
https://api.nytimes.com/svc/archive/v1//1991/2.json?api-key=Jha8a5QFDpLMrDm53n5aXK4vAlt5d1Sf
Saving headlines/1991-02-01-00:00:00.csv...
[]
https://api.nytimes.com/svc/archive/v1//1991/3.json?api-key=Jha8a5QFDpLMrDm53n5aXK4vAlt5d1Sf
Saving headlines/

Saving headlines/1995-07-01-00:00:00.csv...
[]
https://api.nytimes.com/svc/archive/v1//1995/8.json?api-key=Jha8a5QFDpLMrDm53n5aXK4vAlt5d1Sf
Saving headlines/1995-08-01-00:00:00.csv...
[]
https://api.nytimes.com/svc/archive/v1//1995/9.json?api-key=Jha8a5QFDpLMrDm53n5aXK4vAlt5d1Sf
Saving headlines/1995-09-01-00:00:00.csv...
[]
https://api.nytimes.com/svc/archive/v1//1995/10.json?api-key=Jha8a5QFDpLMrDm53n5aXK4vAlt5d1Sf
Saving headlines/1995-10-01-00:00:00.csv...
[]
https://api.nytimes.com/svc/archive/v1//1995/11.json?api-key=Jha8a5QFDpLMrDm53n5aXK4vAlt5d1Sf
Saving headlines/1995-11-01-00:00:00.csv...
[]
https://api.nytimes.com/svc/archive/v1//1995/12.json?api-key=Jha8a5QFDpLMrDm53n5aXK4vAlt5d1Sf
Saving headlines/1995-12-01-00:00:00.csv...
[]
https://api.nytimes.com/svc/archive/v1//1996/1.json?api-key=Jha8a5QFDpLMrDm53n5aXK4vAlt5d1Sf
Saving headlines/1996-01-01-00:00:00.csv...
[]
https://api.nytimes.com/svc/archive/v1//1996/2.json?api-key=Jha8a5QFDpLMrDm53n5aXK4vAlt5d1Sf
Saving headlines/

Saving headlines/2000-06-01-00:00:00.csv...
[]
https://api.nytimes.com/svc/archive/v1//2000/7.json?api-key=Jha8a5QFDpLMrDm53n5aXK4vAlt5d1Sf
Saving headlines/2000-07-01-00:00:00.csv...
[]
https://api.nytimes.com/svc/archive/v1//2000/8.json?api-key=Jha8a5QFDpLMrDm53n5aXK4vAlt5d1Sf
Saving headlines/2000-08-01-00:00:00.csv...
[]
https://api.nytimes.com/svc/archive/v1//2000/9.json?api-key=Jha8a5QFDpLMrDm53n5aXK4vAlt5d1Sf
Saving headlines/2000-09-01-00:00:00.csv...
[]
https://api.nytimes.com/svc/archive/v1//2000/10.json?api-key=Jha8a5QFDpLMrDm53n5aXK4vAlt5d1Sf
Saving headlines/2000-10-01-00:00:00.csv...
[]
https://api.nytimes.com/svc/archive/v1//2000/11.json?api-key=Jha8a5QFDpLMrDm53n5aXK4vAlt5d1Sf
Saving headlines/2000-11-01-00:00:00.csv...
[]
https://api.nytimes.com/svc/archive/v1//2000/12.json?api-key=Jha8a5QFDpLMrDm53n5aXK4vAlt5d1Sf
Saving headlines/2000-12-01-00:00:00.csv...
[]
https://api.nytimes.com/svc/archive/v1//2001/1.json?api-key=Jha8a5QFDpLMrDm53n5aXK4vAlt5d1Sf
Saving headlines/

Saving headlines/2005-05-01-00:00:00.csv...
[]
https://api.nytimes.com/svc/archive/v1//2005/6.json?api-key=Jha8a5QFDpLMrDm53n5aXK4vAlt5d1Sf
Saving headlines/2005-06-01-00:00:00.csv...
[]
https://api.nytimes.com/svc/archive/v1//2005/7.json?api-key=Jha8a5QFDpLMrDm53n5aXK4vAlt5d1Sf
Saving headlines/2005-07-01-00:00:00.csv...
[]
https://api.nytimes.com/svc/archive/v1//2005/8.json?api-key=Jha8a5QFDpLMrDm53n5aXK4vAlt5d1Sf
Saving headlines/2005-08-01-00:00:00.csv...
[]
https://api.nytimes.com/svc/archive/v1//2005/9.json?api-key=Jha8a5QFDpLMrDm53n5aXK4vAlt5d1Sf
Saving headlines/2005-09-01-00:00:00.csv...
[]
https://api.nytimes.com/svc/archive/v1//2005/10.json?api-key=Jha8a5QFDpLMrDm53n5aXK4vAlt5d1Sf
Saving headlines/2005-10-01-00:00:00.csv...
[]
https://api.nytimes.com/svc/archive/v1//2005/11.json?api-key=Jha8a5QFDpLMrDm53n5aXK4vAlt5d1Sf
Saving headlines/2005-11-01-00:00:00.csv...
[]
https://api.nytimes.com/svc/archive/v1//2005/12.json?api-key=Jha8a5QFDpLMrDm53n5aXK4vAlt5d1Sf
Saving headlines/

Saving headlines/2010-04-01-00:00:00.csv...
[]
https://api.nytimes.com/svc/archive/v1//2010/5.json?api-key=Jha8a5QFDpLMrDm53n5aXK4vAlt5d1Sf
Saving headlines/2010-05-01-00:00:00.csv...
[]
https://api.nytimes.com/svc/archive/v1//2010/6.json?api-key=Jha8a5QFDpLMrDm53n5aXK4vAlt5d1Sf
Saving headlines/2010-06-01-00:00:00.csv...
[]
https://api.nytimes.com/svc/archive/v1//2010/7.json?api-key=Jha8a5QFDpLMrDm53n5aXK4vAlt5d1Sf
Saving headlines/2010-07-01-00:00:00.csv...
[]
https://api.nytimes.com/svc/archive/v1//2010/8.json?api-key=Jha8a5QFDpLMrDm53n5aXK4vAlt5d1Sf
Saving headlines/2010-08-01-00:00:00.csv...
[]
https://api.nytimes.com/svc/archive/v1//2010/9.json?api-key=Jha8a5QFDpLMrDm53n5aXK4vAlt5d1Sf
Saving headlines/2010-09-01-00:00:00.csv...
[]
https://api.nytimes.com/svc/archive/v1//2010/10.json?api-key=Jha8a5QFDpLMrDm53n5aXK4vAlt5d1Sf
Saving headlines/2010-10-01-00:00:00.csv...
[]
https://api.nytimes.com/svc/archive/v1//2010/11.json?api-key=Jha8a5QFDpLMrDm53n5aXK4vAlt5d1Sf
Saving headlines/2

Saving headlines/2015-03-01-00:00:00.csv...
[]
https://api.nytimes.com/svc/archive/v1//2015/4.json?api-key=Jha8a5QFDpLMrDm53n5aXK4vAlt5d1Sf
Saving headlines/2015-04-01-00:00:00.csv...
[]
https://api.nytimes.com/svc/archive/v1//2015/5.json?api-key=Jha8a5QFDpLMrDm53n5aXK4vAlt5d1Sf
Saving headlines/2015-05-01-00:00:00.csv...
[]
https://api.nytimes.com/svc/archive/v1//2015/6.json?api-key=Jha8a5QFDpLMrDm53n5aXK4vAlt5d1Sf
Saving headlines/2015-06-01-00:00:00.csv...
[]
https://api.nytimes.com/svc/archive/v1//2015/7.json?api-key=Jha8a5QFDpLMrDm53n5aXK4vAlt5d1Sf
Saving headlines/2015-07-01-00:00:00.csv...
[]
https://api.nytimes.com/svc/archive/v1//2015/8.json?api-key=Jha8a5QFDpLMrDm53n5aXK4vAlt5d1Sf
Saving headlines/2015-08-01-00:00:00.csv...
[]
https://api.nytimes.com/svc/archive/v1//2015/9.json?api-key=Jha8a5QFDpLMrDm53n5aXK4vAlt5d1Sf
Saving headlines/2015-09-01-00:00:00.csv...
[]
https://api.nytimes.com/svc/archive/v1//2015/10.json?api-key=Jha8a5QFDpLMrDm53n5aXK4vAlt5d1Sf
Saving headlines/20

Saving headlines/2020-02-01-00:00:00.csv...
[]
https://api.nytimes.com/svc/archive/v1//2020/3.json?api-key=Jha8a5QFDpLMrDm53n5aXK4vAlt5d1Sf
Saving headlines/2020-03-01-00:00:00.csv...
[]
https://api.nytimes.com/svc/archive/v1//2020/4.json?api-key=Jha8a5QFDpLMrDm53n5aXK4vAlt5d1Sf
Saving headlines/2020-04-01-00:00:00.csv...
[]
https://api.nytimes.com/svc/archive/v1//2020/5.json?api-key=Jha8a5QFDpLMrDm53n5aXK4vAlt5d1Sf
Saving headlines/2020-05-01-00:00:00.csv...
[]
https://api.nytimes.com/svc/archive/v1//2020/6.json?api-key=Jha8a5QFDpLMrDm53n5aXK4vAlt5d1Sf
Saving headlines/2020-06-01-00:00:00.csv...
[]
https://api.nytimes.com/svc/archive/v1//2020/7.json?api-key=Jha8a5QFDpLMrDm53n5aXK4vAlt5d1Sf
Saving headlines/2020-07-01-00:00:00.csv...
[]
https://api.nytimes.com/svc/archive/v1//2020/8.json?api-key=Jha8a5QFDpLMrDm53n5aXK4vAlt5d1Sf
Saving headlines/2020-08-01-00:00:00.csv...
[]
https://api.nytimes.com/svc/archive/v1//2020/9.json?api-key=Jha8a5QFDpLMrDm53n5aXK4vAlt5d1Sf
Saving headlines/202

In [115]:
os.chdir('..')
os.getcwd()

'C:\\Users\\smuke\\OneDrive\\Desktop\\NYTvMarket\\NYTvMarket'