In [1]:
import requests

API_ROOT = 'http://api.nytimes.com/svc/search/v2/articlesearch.'

API_SIGNUP_PAGE = 'http://developer.nytimes.com/docs/reference/keys'


class NoAPIKeyException(Exception):
    def __init__(self, value):
        self.value = value

    def __str__(self):
        return repr(self.value)


class articleAPI(object):
    def __init__(self, key=None):
        """
        Initializes the articleAPI class with a developer key. Raises an exception if a key is not given.
        Request a key at http://developer.nytimes.com/docs/reference/keys
        :param key: New York Times Developer Key
        """
        self.key = key
        self.response_format = 'json'

        if self.key is None:
            raise NoAPIKeyException('Warning: Missing API Key. Please visit ' + API_SIGNUP_PAGE + ' to register for a key.')

    def _bool_encode(self, d):
        """
        Converts bool values to lowercase strings
        """
        for k, v in d.items():
            if isinstance(v, bool):
                d[k] = str(v).lower()

        return d

    def _options(self, **kwargs):
        """
        Formats search parameters/values for use with API
        :param \*\*kwargs: search parameters/values
        """
        def _format_fq(d):
            for k, v in d.items():
                if isinstance(v, list):
                    d[k] = ' '.join(map(lambda x: '"' + x + '"', v))
                else:
                    d[k] = '"' + str(v) + '"'
            values = []
            for k, v in d.items():
                value = '%s:(%s)' % (k, v)
                values.append(value)
            values = ' AND '.join(values)
            return values

        kwargs = self._bool_encode(kwargs)

        values = ''

        for k, v in kwargs.items():
            if k is 'fq' and isinstance(v, dict):
                v = _format_fq(v)
            elif isinstance(v, list):
                v = ','.join(v)
            values += '%s=%s&' % (k, v)

        return values

    def search(self,
               response_format=None,
               key=None,
               **kwargs):
        """
        Calls the API and returns a dictionary of the search results
        :param response_format: the format that the API uses for its response,
                                includes JSON (.json) and JSONP (.jsonp).
                                Defaults to '.json'.
        :param key: a developer key. Defaults to key given when the articleAPI class was initialized.
        """
        if response_format is None:
            response_format = self.response_format
        if key is None:
            key = self.key

        url = '%s%s?%sapi-key=%s' % (
            API_ROOT, response_format, self._options(**kwargs), key
        )

        r = requests.get(url)
        return r.json()

In [2]:
api = articleAPI('234342221c0c4f2fa969f69d92a6f700')

In [3]:
def parse_articles(articles):
    '''
    This function takes in a response to the NYT api and parses
    the articles into a list of dictionaries
    '''
    news = []
    for i in articles['response']['docs']:
        dic = {}
        dic['id'] = i['_id']
        if i['abstract'] is not None:
            dic['abstract'] = i['abstract'].encode("utf8")
        dic['headline'] = i['headline']['main'].encode("utf8")
        dic['desk'] = i['news_desk']
        dic['date'] = i['pub_date'][0:10] # cutting time of day.
        dic['section'] = i['section_name']
        if i['snippet'] is not None:
            dic['snippet'] = i['snippet'].encode("utf8")
        dic['source'] = i['source']
        dic['type'] = i['type_of_material']
        dic['url'] = i['web_url']
        dic['word_count'] = i['word_count']
        # locations
        locations = []
        for x in range(0,len(i['keywords'])):
            if 'glocations' in i['keywords'][x]['name']:
                locations.append(i['keywords'][x]['value'])
        dic['locations'] = locations
        # subject
        subjects = []
        for x in range(0,len(i['keywords'])):
            if 'subject' in i['keywords'][x]['name']:
                subjects.append(i['keywords'][x]['value'])
        dic['subjects'] = subjects   
        news.append(dic)
    return(news)

In [10]:
def get_articles(query):
    '''
    This function needs to change  begin_date  and number of pages 10 artilces per page
    '''
    all_articles = []
    for i in range(0,30): #NYT limits pager to first 100 pages. But rarely will you find over 100 pages of results anyway.
        articles = api.search(q = query,
               fq = {'source':['Reuters','AP', 'The New York Times']},
               begin_date = 20160101,
               sort='newest',
               page = str(i))
        articles = parse_articles(articles)
        all_articles = all_articles + articles
    return(all_articles)

# GIVE ME A KEY WORD OR COMPANY NAME TO RUN 

In [11]:
UPSall = get_articles("United Parcel Service")
len(UPSall)

112

In [38]:
import pandas as pd
ups = pd.DataFrame(UPSall)

In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [40]:
ups.head()

Unnamed: 0,abstract,date,desk,headline,id,locations,section,snippet,source,subjects,type,url,word_count
0,,2017-05-05,,b'Two Die in Cargo Plane Crash at West Virgini...,590c782a7c459f24986ddea5,[],U.S.,b'Two people died when an incoming cargo plane...,Reuters,[],News,https://www.nytimes.com/reuters/2017/05/05/us/...,260
1,,2017-04-30,,b'UPS Air Maintenance Workers Threaten Strike ...,590607567c459f24986dcf3c,[],Business Day,"b'A union representing 1,200 U.S. air maintena...",Reuters,[],News,https://www.nytimes.com/reuters/2017/04/30/bus...,408
2,,2017-04-27,,b'Markets Right Now: Meager Gains Are Enough f...,5901f5707c459f24986dc718,[],U.S.,b'The latest on developments in financial mark...,AP,[],News,https://www.nytimes.com/aponline/2017/04/27/us...,340
3,,2017-04-27,,b'UPS First-Quarter Profit Tops Estimates as R...,5901df937c459f24986dc6c0,[],Business Day,b'United Parcel Service Inc reported a higher-...,Reuters,[],News,https://www.nytimes.com/reuters/2017/04/27/bus...,380
4,,2017-04-27,,b'UPS Tops Street 1Q Forecasts',5901ded77c459f24986dc6be,[],Business Day,b'United Parcel Service Inc. is reporting firs...,AP,[],News,https://www.nytimes.com/aponline/2017/04/27/bu...,161


In [41]:
ds = ups
ds.section = ds["section"].str.decode("utf-8")
ds.headline =ds["headline"].str.decode("utf-8")
ds.snippet = ds["snippet"].str.decode("utf-8")

In [47]:
ds["headline+snippet"] = ds.headline.astype(str) + ds.snippet

In [51]:
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(ds["headline+snippet"])

cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

results = {}

for idx, row in ds.iterrows():
    similar_indices = cosine_similarities[idx].argsort()[:-100:-1]
    similar_items = [(cosine_similarities[idx][i], ds['headline'][i]) for i in similar_indices]

    # First item is the item itself, so remove it.
    # Each dictionary entry is like: [(1,2), (3,4)], with each tuple being (score, item_id)
    results[row['headline']] = similar_items[1:]
    
print('done!')
print(results)

done!
{'Ford Motor Criticizes Trump Immigration Order': [(0.0581160612078266, 'Trump, Aviation Executives to Discuss Infrastructure Thursday: Sources'), (0.057993628945975879, 'Anti-Trump Energy Flags During a Second Day of Protests'), (0.044903361068034039, "Factbox: Contenders for Senior Jobs in Trump's Administration"), (0.044903361068034039, "Factbox: Contenders for Senior Jobs in Trump's Administration"), (0.041273411292472154, 'In Trump Cabinet, Commerce Secretary Will Run Trade Policy'), (0.023813281630649413, "Gorsuch Rulings in Some Notable Employee's Rights Cases"), (0.022238129354081267, "Notable Workers' Rights Cases Heard by Neil Gorsuch"), (0.014314419700491609, 'Companies Bail Out on GOP Convention After Facing Pressure'), (0.010712014782420877, 'UPS Tests Drone Deliveries, Eyes Future Price Changes'), (0.010502212181656562, "Worried Analysts Question Amazon's Logistics Plans"), (0.010107495466796176, 'Man Who Impersonated Dad in Maxim Magazine Bid Gets Prison'), (0.0100

In [70]:
# Just reads the results out of the dictionary. 
def recommend(headline, num):
    print("Recommending all related articles similar to " + headline)
    print("-------")
    recs = results[headline][:num]
    for rec in recs:
        print("Recommended: " + rec[1] + " (score:" + str(rec[0]) + ")")

In [71]:
##look for 10 relavant similar articles to the headline you include first
recommend('UPS, SAP Team Up for On-Demand 3D Printing Network', num=5)

Recommending all related articles similar to UPS, SAP Team Up for On-Demand 3D Printing Network
-------
Recommended: With Deliv Investment, UPS Hopes to Study Same-Day Delivery Market (score:0.138217644993)
Recommended: UPS Reaches Tentative Five-Year Deal With Pilots Union (score:0.120744221136)
Recommended: New York, UPS Spar Over Proposed $872 Million Fine for Cigarette Shipments (score:0.0930604333523)
Recommended: UPS Air Maintenance Workers Vote 98 Percent to Authorize Strike (score:0.0838024261687)
Recommended: UPS Wins Court Challenge Against EU Block to TNT Bid (score:0.0793597916567)


In [109]:
##create a hot news table
hotTopic = pd.DataFrame(columns = ["Headline", "HeatLevel"])
hotTopic["Headline"] = [i for i in results.keys()]
for i in range(len(hotTopic["Headline"])):
    hotTopic["HeatLevel"][i] = len([t for t in results[hotTopic["Headline"][i]] if t[0] > 0.05])   

In [114]:
pd.options.display.max_colwidth = 100
hotTopic.sort(columns="HeatLevel", axis=0, ascending=False)

  from ipykernel import kernelapp as app


Unnamed: 0,Headline,HeatLevel
24,"New York, UPS Spar Over Proposed $872 Million Fine for Cigarette Shipments",10
42,"UPS 1st-Quarter Profit Rises 10 Percent, Beating Forecasts",8
15,Judge Finds UPS Liable to New York Over Cigarette Shipments,8
84,FedEx Fails to End New York Lawsuit Over Illegal Cigarettes,8
71,UPS Reaches Tentative Five-Year Deal With Pilots Union,7
33,Amazon Starts Flexing Muscle in New Space-Air Cargo,7
17,Amazon Starts Flexing Muscle in New Space: Air Cargo,7
7,UPS 3Q Profit Up 1 Percent as US Revenue Increases,7
58,UPS Wins Court Challenge Against EU Block to TNT Bid,7
57,"With Deliv Investment, UPS Hopes to Study Same-Day Delivery Market",7
