In [11]:
import requests
import numpy as np
import csv, json
import pandas as pd
import config# get my NYT keys

# NYT API

In [3]:
class APIKeyException(Exception):
    def __init__(self, message): self.message = message 

class InvalidQueryException(Exception):
    def __init__(self, message): self.message = message 

class ArchiveAPI(object):
    def __init__(self, key=None):
        """
        Initializes the ArchiveAPI class. Raises an exception if no API key is given.
        :param key: New York Times API Key
        """
        self.key = key
        self.root = 'http://api.nytimes.com/svc/archive/v1/{}/{}.json?api-key={}' 
        if not self.key:
            nyt_dev_page = 'http://developer.nytimes.com/docs/reference/keys'
            exception_str = 'Warning: API Key required. Please visit {}'
            raise NoAPIKeyException(exception_str.format(nyt_dev_page))

    def query(self, year=None, month=None, key=None,):
        """
        Calls the archive API and returns the results as a dictionary.
        :param key: Defaults to the API key used to initialize the ArchiveAPI class.
        """
        if not key: key = self.key
        if (year < 1882) or not (0 < month < 13):
            # currently the Archive API only supports year >= 1882
            exception_str = 'Invalid query: See http://developer.nytimes.com/archive_api.json'
            raise InvalidQueryException(exception_str)
        url = self.root.format(year, month, key)
        r = requests.get(url)
        return r.json()


api = ArchiveAPI(config.nytkey)


years = [2009]
months = [1,2,3,4,5,6,7,8,9,10,11,12]

for year in years:
    for month in months:
        mydict = api.query(year, month)
        file_str = '/Users/zoe/' + str(year) + '-' + '{:02}'.format(month) + '.json'
        with open(file_str, 'w') as fout:
            json.dump(mydict, fout)
        fout.close()
    

In [12]:
years = [2009]
months = [1,2,3,4,5,6,7,8,9,10,11,12]


In [13]:
file_str = '/Users/zoe/' + str(years) + '-' + '{:02}'.format(months) + '.json'

TypeError: unsupported format string passed to list.__format__

In [6]:
df = pd.read_csv('DJIA indices data.csv')

In [14]:
with open('DJIA indices data.csv', 'r') as csvfile:
    spamreader = csv.reader(csvfile, delimiter=',')
    # Converting the csv file reader to a lists 
    data_list = list(spamreader)

# Separating header from the data
header = data_list[0] 
data_list = data_list[1:] 

data_list = np.asarray(data_list)

# Selecting date and close value for each day
selected_data = data_list[:, [0, 4, 6]]

In [15]:
selected_data

array([['2016-12-30', '19762.599609', '19762.599609'],
       ['2016-12-29', '19819.779297', '19819.779297'],
       ['2016-12-28', '19833.679688', '19833.679688'],
       ...,
       ['2007-01-04', '12480.69043', '12480.69043'],
       ['2007-01-03', '12474.519531', '12474.519531'],
       ['2006-12-29', '12463.150391', '12463.150391']], dtype='<U12')

In [10]:
df = pd.DataFrame(data=selected_data[0:,1:],
             index=selected_data[0:,0],
                                columns=['close', 'adj close'],
                                        dtype='float64')

# Reference for pandas interpolation http://pandas.pydata.org/pandas-docs/stable/missing_data.html
# Adding missing dates to the dataframe
df1 = df
idx = pd.date_range('12-29-2008', '12-31-2009')
df1.index = pd.DatetimeIndex(df1.index)
df1 = df1.reindex(idx, fill_value=np.NaN)
# df1.count() # gives 2518 count
interpolated_df = df1.interpolate()
interpolated_df.count() # gives 3651 count

# Removing extra date rows added in data for calculating interpolation
interpolated_df = interpolated_df[3:]

In [11]:
interpolated_df

Unnamed: 0,close,adj close
2009-01-01,8905.540039,8905.540039
2009-01-02,9034.690430,9034.690430
2009-01-03,9007.423503,9007.423503
2009-01-04,8980.156575,8980.156575
2009-01-05,8952.889648,8952.889648
...,...,...
2009-12-27,10540.332519,10540.332519
2009-12-28,10547.080078,10547.080078
2009-12-29,10545.410156,10545.410156
2009-12-30,10548.509766,10548.509766


In [67]:
date_format = ["%Y-%m-%dT%H:%M:%SZ", "%Y-%m-%dT%H:%M:%S+%f"]
def try_parsing_date(text):
    for fmt in date_format:
        #return datetime.strptime(text, fmt)
        try:
            return datetime.strptime(text, fmt).strftime('%Y-%m-%d')
        except ValueError:
            pass
    raise ValueError('no valid date format found')


## Preparing NYTimes data
# Function to parse and convert date format

In [3]:
years = [2009]
months = [1,2,3,4,5,6,7,8,9,10,11,12]

dict_keys = ['pub_date', 'headline'] #, 'lead_paragraph']
articles_dict = dict.fromkeys(dict_keys)
# Filtering list for type_of_material
type_of_material_list = ['blog', 'brief', 'news', 'editorial', 'op-ed', 'list','analysis']
# Filtering list for section_name
section_name_list = ['business', 'national', 'world', 'u.s.' , 'politics', 'opinion', 'tech', 'science',  'health']
news_desk_list = ['business', 'national', 'world', 'u.s.' , 'politics', 'opinion', 'tech', 'science',  'health', 'foreign']

current_date = '2020-01-09'
from datetime import datetime


current_article_str = ''      

## Adding article column to dataframe
interpolated_df["articles"] = ''
count_articles_filtered = 0
count_total_articles = 0
count_main_not_exist = 0               
count_unicode_error = 0     
count_attribute_error = 0   


for date, row in interpolated_df.T.iteritems():   
    if len(interpolated_df.loc[date, 'articles']) <= 400:
        #print interpolated_df.loc[date, 'articles']
        #print date
        month = date.month
        year = date.year
        file_str = '/Users/zoe/' + str(2020) + '-' + '{:02}'.format(month) + '.json'
        with open(file_str) as data_file:    
            NYTimes_data = json.load(data_file)
        count_total_articles = count_total_articles + len(NYTimes_data["response"]["docs"][:])
        interpolated_df.set_value(date.strftime('%Y-%m-%d'), 'articles', '')
        for i in range(len(NYTimes_data["response"]["docs"][:])):
            try:
                
                articles_dict = { your_key: NYTimes_data["response"]["docs"][:][i][your_key] for your_key in dict_keys }
                articles_dict['headline'] = articles_dict['headline']['main'] # Selecting just 'main' from headline
                #articles_dict['headline'] = articles_dict['lead_paragraph'] # Selecting lead_paragraph       
                pub_date = try_parsing_date(articles_dict['pub_date'])
                #print 'article_dict: ' + articles_dict['headline']
                if date.strftime('%Y-%m-%d') == pub_date: 
                    interpolated_df.set_value(pub_date, 'articles', interpolated_df.loc[pub_date, 'articles'] + '. ' + articles_dict['headline'])  
                
            except KeyError:
                print ('key error')
                #print NYTimes_data["response"]["docs"][:][i]
                #count_main_not_exist += 1
                pass   
            except TypeError:
                print ("type error")
                #print NYTimes_data["response"]["docs"][:][i]
                #count_main_not_exist += 1
                pass





NameError: name 'interpolated_df' is not defined

In [70]:
print (count_articles_filtered) 
print (count_total_articles)                     
print (count_main_not_exist)
print (count_unicode_error)

0
43741
0
0


In [None]:
interpolated_df

## Putting all articles if no section_name or news_desk not found


In [1]:
months = [1,2,3,4,5,6,7,8,9,10,11,12]


In [2]:
for date, row in interpolated_df.T.iteritems():   
    if len(interpolated_df.loc[date,'articles']) <= 400:
        #print interpolated_df.loc[date, 'articles']
        #print date
        month = date.month
        year = date.year
        file_str = '/Users/zoe/' + str(2009) + '-' + '{:02}'.format(month) + '.json'
        with open(file_str) as data_file:    
            NYTimes_data = json.load(data_file)
        count_total_articles = count_total_articles + len(NYTimes_data["response"]["docs"][:])
        interpolated_df.set_value(date.strftime('%Y-%m-%d'), 'articles', '')
       
        for i in range(len(NYTimes_data["response"]["docs"][:])):
            try:
                
                articles_dict = { your_key: NYTimes_data["response"]["docs"][:][i][your_key] for your_key in dict_keys }
                articles_dict['headline'] = articles_dict['headline']['main'] # Selecting just 'main' from headline
                #articles_dict['headline'] = articles_dict['lead_paragraph'] # Selecting lead_paragraph       
                pub_date = try_parsing_date(articles_dict['pub_date'])
                #print 'article_dict: ' + articles_dict['headline']
                if date.strftime('%Y-%m-%d') == pub_date: 
                    interpolated_df.set_value(pub_date, 'articles', interpolated_df.loc[pub_date, 'articles'] + '. ' + articles_dict['headline'])  
                
            except KeyError:
                print ('key error')
                #print NYTimes_data["response"]["docs"][:][i]
                #count_main_not_exist += 1
                pass   
            except TypeError:
                print ("type error")
                #print NYTimes_data["response"]["docs"][:][i]
                #count_main_not_exist += 1
                pass


# Saving the data as pickle file
interpolated_df.to_pickle('/Data/zoe_pickeled.pkl')  


# Save pandas frame in csv form
interpolated_df.to_csv('/Data/zoe_csv.csv',
                       sep='\t', encoding='utf-8')

NameError: name 'interpolated_df' is not defined

In [50]:
interpolated_df.to_csv('2017.csv')

In [49]:
interpolated_df

Unnamed: 0,close,adj close,articles
2017-01-01,,,". Angela Merkel, Russia’s Next Target. Little ..."
2017-01-02,,,". John Berger, Provocative Art Critic, Dies at..."
2017-01-03,19881.759766,339180000.0,. 5 Must-See Shows if You’re in New York This ...
2017-01-04,19942.160156,280010000.0,". Karel Husa, Pulitzer Prize-Winning Composer,..."
2017-01-05,19899.289063,269920000.0,". A Higher Minimum Wage in 2017. Dan Coats, th..."
...,...,...,...
2017-12-27,24774.300781,225890000.0,". Coping With Alzheimer’s, Together and Apart...."
2017-12-28,24837.509766,200960000.0,". Redefining the Ballet Dancer, 50 Years Later..."
2017-12-29,24719.220703,270760000.0,. Listen to ‘The Daily’: Blue-Collar Jobs in t...
2017-12-30,24719.220703,270760000.0,". In Deadly Bronx Blaze, Responders Battled Fi..."


In [82]:
interpolated_df = interpolated_df.iloc[:8,]

In [80]:
interpolated_df

Unnamed: 0,close,adj close,articles
2020-01-01,28703.620117,222580000.0,. Elizabeth Warren Isn’t Talking Much About ‘M...
2020-01-02,28868.800781,251820000.0,". N.B.A. Superstars, Growth and Lockouts: The ..."
2020-01-03,28634.880859,239590000.0,. ‘The New Pope’ Looks a Lot Like John Malkovi...
2020-01-04,28657.714192,243980000.0,. Does the Buyer Have to Pay if the Seller Wal...
2020-01-05,28680.547526,248370000.0,. Vikings’ Overtime Touchdown Upsets Saints’ P...
2020-01-06,28703.380859,252760000.0,". No Corrections: Jan. 6, 2020. A Typical Trum..."


In [78]:
interpolated_df.to_csv('2020.csv')