In [None]:
import os

import requests

API_ROOT = 'http://api.nytimes.com/svc/search/v2/articlesearch.'

API_SIGNUP_PAGE = 'http://developer.nytimes.com/docs/reference/keys'


class NoAPIKeyException(Exception):
    """Error raised when the API client is created without a developer key."""

    def __init__(self, value):
        # Keep the payload (normally the message text) on the instance.
        self.value = value

    def __str__(self):
        # Same text as repr(self.value), so string messages appear quoted.
        return '%r' % (self.value,)


class articleAPI(object):
    """
    Thin client for the New York Times Article Search API.

    Builds the request URL from API_ROOT, the response format and the search
    parameters, performs a GET request and returns the decoded JSON body.
    """

    def __init__(self, key=None):
        """
        Initializes the articleAPI class with a developer key. Raises an exception if a key is not given.
        Request a key at http://developer.nytimes.com/docs/reference/keys
        :param key: New York Times Developer Key
        :raises NoAPIKeyException: if key is None
        """
        self.key = key
        self.response_format = 'json'

        if self.key is None:
            raise NoAPIKeyException('Warning: Missing API Key. Please visit ' + API_SIGNUP_PAGE + ' to register for a key.')

    def _bool_encode(self, d):
        """
        Converts bool values to lowercase strings ('true' / 'false').
        The dict is updated in place and also returned.
        :param d: dict of search parameters
        :return: the same dict
        """
        for k, v in d.items():
            if isinstance(v, bool):
                d[k] = str(v).lower()

        return d

    def _options(self, **kwargs):
        r"""
        Formats search parameters/values for use with API
        :param \*\*kwargs: search parameters/values. A dict passed as ``fq`` is
                           rendered as 'field:("a" "b") AND other:("c")'; list
                           values of other parameters are comma-joined.
        :return: query-string fragment of the form 'k1=v1&k2=v2&'
                 (every pair, including the last, is followed by '&')
        """
        def _format_fq(d):
            # Build the clauses without writing back into ``d``: overwriting the
            # caller's dict with quoted strings would double-quote the values
            # the next time the same filter dict is used.
            clauses = []
            for field, wanted in d.items():
                if isinstance(wanted, list):
                    # str() lets non-string items (e.g. ints) be used as terms
                    quoted = ' '.join('"' + str(x) + '"' for x in wanted)
                else:
                    quoted = '"' + str(wanted) + '"'
                clauses.append('%s:(%s)' % (field, quoted))
            return ' AND '.join(clauses)

        kwargs = self._bool_encode(kwargs)

        values = ''

        for k, v in kwargs.items():
            # Compare by value: whether two equal strings are the same object
            # is an interpreter detail, so 'is' must not be used here.
            if k == 'fq' and isinstance(v, dict):
                v = _format_fq(v)
            elif isinstance(v, list):
                v = ','.join(str(x) for x in v)
            values += '%s=%s&' % (k, v)

        return values

    def search(self,
               response_format=None,
               key=None,
               **kwargs):
        """
        Calls the API and returns a dictionary of the search results
        :param response_format: the format that the API uses for its response,
                                includes JSON (json) and JSONP (jsonp).
                                Defaults to the instance's response_format ('json').
        :param key: a developer key. Defaults to key given when the articleAPI class was initialized.
        :param kwargs: search parameters, formatted by _options
        :return: the response body decoded with ``Response.json()``
        """
        if response_format is None:
            response_format = self.response_format
        if key is None:
            key = self.key

        # _options() output already ends with '&', so api-key is appended directly.
        url = '%s%s?%sapi-key=%s' % (
            API_ROOT, response_format, self._options(**kwargs), key
        )

        r = requests.get(url)
        return r.json()

In [None]:
# The developer key is read from the NYT_API_KEY environment variable so that
# it is not stored in the notebook. Set the variable before starting the
# kernel; when it is missing, articleAPI raises NoAPIKeyException.
api = articleAPI(os.environ.get('NYT_API_KEY'))

In [None]:
def parse_articles(articles):
    '''
    This function takes in a response to the NYT api and parses
    the articles into a list of dictionaries.

    :param articles: decoded JSON response; the documents are read from
                     articles["response"]["docs"]
    :return: one dict per document with the keys id, headline, desk, date,
             section, source, type, url, word_count, locations and subjects,
             plus abstract and snippet when the document provides them.
             abstract, headline and snippet are UTF-8 encoded to bytes.
    '''
    news = []
    for doc in articles["response"]['docs']:
        dic = {}
        dic['id'] = doc['_id']
        # abstract / snippet may be null or absent; the key is then left out
        abstract = doc.get('abstract')
        if abstract is not None:
            dic['abstract'] = abstract.encode("utf8")
        dic['headline'] = doc['headline']['main'].encode("utf8")
        dic['desk'] = doc['news_desk']
        dic['date'] = doc['pub_date'][0:10] # cutting time of day.
        dic['section'] = doc['section_name']
        snippet = doc.get('snippet')
        if snippet is not None:
            dic['snippet'] = snippet.encode("utf8")
        dic['source'] = doc['source']
        dic['type'] = doc['type_of_material']
        dic['url'] = doc['web_url']
        dic['word_count'] = doc['word_count']
        # A missing or null keyword list is treated as "no keywords".
        keywords = doc.get('keywords') or []
        # locations: keywords whose name contains 'glocations'
        dic['locations'] = [kw['value'] for kw in keywords if 'glocations' in kw['name']]
        # subjects: keywords whose name contains 'subject'
        dic['subjects'] = [kw['value'] for kw in keywords if 'subject' in kw['name']]
        news.append(dic)
    return news

In [None]:
def get_articles(begindate, enddate, query, max_pages=20):
    '''
    Download and parse the result pages for one query within a date range.

    Uses the module-level ``api`` client and restricts results to the sources
    Reuters, AP and The New York Times, newest first.

    :param begindate: passed to the API as begin_date (the notebook passes
                      integers such as 20160101)
    :param enddate: passed to the API as end_date
    :param query: search terms, passed to the API as q
    :param max_pages: number of result pages to request (pages 0 .. max_pages-1).
                      The original note says NYT limits the pager to the first
                      100 pages with 10 articles per page.
    :return: list of article dicts as produced by parse_articles
    '''
    all_articles = []
    for page in range(max_pages):
        articles = api.search(q = query,
               fq = {'source':['Reuters','AP', 'The New York Times']},
               begin_date = begindate,
               end_date = enddate,
               sort='newest',
               page = str(page))
        print ("page" + str(page), list(articles.keys()))
        # Only payloads that carry a 'response' section can be parsed; error
        # payloads (for example one holding just a 'message') are skipped.
        if 'response' in articles:
            all_articles.extend(parse_articles(articles))
    return all_articles


# GIVE ME A KEY WORD OR COMPANY NAME TO RUN 

In [None]:
# Collect every parsed article for the search phrase between the two dates.
UPSall = get_articles(begindate=20160101,
                      enddate=20170514,
                      query="supply chain management")
len(UPSall)

In [None]:
import pandas as pd

# Tabulate the article dicts: one row per article, one column per dict key.
ups = pd.DataFrame(data=UPSall)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [None]:
def _to_text(value):
    """Return bytes decoded as UTF-8; leave every other value unchanged."""
    return value.decode("utf-8") if isinstance(value, bytes) else value


# NOTE: ds is the same object as ups (not a copy), so the decoded columns are
# also visible through ups.
ds = ups
# Decode value by value: parse_articles stores headline and snippet as bytes
# but leaves section as text, so only bytes values need decoding.
for column in ("section", "headline", "snippet"):
    ds[column] = ds[column].map(_to_text)
# Missing snippets become '' so the combined text is never NaN, and a space
# keeps the last headline word from fusing with the first snippet word.
ds["headline+snippet"] = ds["headline"].astype(str) + " " + ds["snippet"].fillna("").astype(str)

In [None]:
# TF-IDF over 1- to 3-word n-grams. min_df=1 keeps every term, exactly as
# min_df=0 did, but is also accepted by scikit-learn releases that validate
# integer min_df values to be at least 1.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(ds["headline+snippet"])

# Pairwise similarity between all articles (rows and columns follow ds order).
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

results = {}
headlines = ds['headline'].tolist()
MAX_NEIGHBOURS = 98  # previously: 99 best scores with the first one dropped

for pos, headline in enumerate(headlines):
    ranked = cosine_similarities[pos].argsort()[::-1]
    # Exclude the article itself by position. It is not guaranteed to be
    # ranked first (duplicate texts tie with it, and an all-zero row scores 0
    # against itself), so dropping "the first item" could drop a real match.
    neighbours = [i for i in ranked if i != pos][:MAX_NEIGHBOURS]

    # Each dictionary entry is a list of (score, headline) tuples, best first.
    results[headline] = [(cosine_similarities[pos][i], headlines[i]) for i in neighbours]

print('done!')

# Rank the popularity of articles by their similarity to other articles; in other words, this ranks the top stories by popularity across the news agencies

In [None]:
##create a hot news table
ups["HeatLevel"] = pd.Series()
for i in range(len(ups["headline"])):
    ups["HeatLevel"][i] = len([t for t in results[ups["headline"][i]] if t[0] > 0.05])   
pd.options.display.max_colwidth = 100
Ranking = ups.sort(columns="HeatLevel", axis=0, ascending=False)

In [None]:
# Write the ranked table to ranked_ups.csv (path relative to the working directory).
Ranking.to_csv(path_or_buf="ranked_ups.csv")

# What general topics appear across all the news, and how frequent is each topic?

In [None]:
import numpy as np  
import sklearn.feature_extraction.text as text

documents = ds["headline+snippet"].tolist()
# The documents belong in fit_transform only: the first constructor parameter
# of CountVectorizer is ``input`` (a mode flag), not the corpus.
vectorizer = text.CountVectorizer(stop_words='english')
# Dense document-term matrix: one row per document, one column per vocabulary word.
dtm = vectorizer.fit_transform(documents).toarray()
# get_feature_names() was removed from scikit-learn; use its replacement when
# it exists and fall back to the old name on older releases.
_feature_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
vocab = np.array(list(_feature_names()))
print(len(vocab))

In [None]:
from sklearn import decomposition
def print_topic_words(num_topics, num_top_words):
    """
    Fit an NMF model on the module-level ``dtm`` and list the strongest words per topic.

    Despite its name the function prints nothing; it returns the word lists.

    :param num_topics: number of NMF components to fit
    :param num_top_words: how many of the highest-weighted words to keep per topic
    :return: list with one entry per topic, each a list of words from ``vocab``
             ordered from highest to lowest weight
    """
    model = decomposition.NMF(n_components=num_topics, random_state=1)
    model.fit_transform(dtm)
    return [
        [vocab[word_pos] for word_pos in np.argsort(weights)[::-1][0:num_top_words]]
        for weights in model.components_
    ]

In [None]:
## The user needs to pick the number of topics and the number of top words per topic
topic_words = print_topic_words(20, 15)
topic_words

In [None]:
num_topics = 20
clf = decomposition.NMF(n_components=num_topics, random_state=1)
# doctopic: one row per document, one column per topic.
doctopic = clf.fit_transform(dtm)

# Total weight of each topic summed over all documents. The former
# doctopic.copy() was overwritten immediately and has been dropped.
doctopic_orig = np.sum(doctopic, axis=0)
# Topic indices ordered from the largest total weight to the smallest.
ranking = np.argsort(doctopic_orig)[::-1]  

In [None]:
# Word lists of the topics, arranged in the order given by ``ranking``.
reordered = [topic_words[topic_idx] for topic_idx in ranking]

# The number printed is the position in that ordering, not the topic's own index.
for position, words in enumerate(reordered):
    print("Topic {}: {}".format(position, ' '.join(words[:15])))

In [None]:
# Average the topic weights of documents that share the same text, giving one
# row per distinct "headline+snippet" value (rows follow the sorted texts).
novel_names = np.asarray(ds["headline+snippet"])
unique_names = sorted(set(novel_names))
# Size the result from the data; a fixed row count only fits one result set.
doctopic_grouped = np.zeros((len(unique_names), doctopic.shape[1]))
for i, name in enumerate(unique_names):
    doctopic_grouped[i, :] = np.mean(doctopic[novel_names == name, :], axis=0)
doctopic = doctopic_grouped

In [None]:
### Search words whose topics are printed by the following cell.
# NOTE(review): "fullfillment" looks like a misspelling of "fulfillment" - confirm.
test = ["logistics", "fullfillment", "UPS"]

In [None]:
# Normalise each row of doctopic to proportions; rows that sum to zero are
# left as zeros instead of turning into NaN.
row_totals = np.sum(doctopic, axis=1, keepdims=True)
doctopic = doctopic / np.where(row_totals == 0, 1, row_totals)

# Rank topics for each search word through the word's column in the NMF
# topic-term matrix (clf.components_ has one row per topic and one column per
# vocab entry). Rows of doctopic are documents, so they cannot be selected by
# the position of a word in ``test``.
word_columns = {word: col for col, word in enumerate(vocab)}
for word in test:
    # Try the word as given, then lower-cased (CountVectorizer lower-cases by default).
    col = word_columns.get(word, word_columns.get(word.lower()))
    if col is None:
        print("{}: not in vocabulary".format(word))
        continue
    top_topics = np.argsort(clf.components_[:, col])[::-1][:3]
    top_topics_str = ' '.join(str(t) for t in top_topics)
    print("{}: in topic {}".format(word, top_topics_str))

# This recommends similar articles for you to read further!
# recommend("*headline of your article*", number of articles you want)

In [None]:
# Looks the headline up in the precomputed ``results`` dictionary and prints the matches.
def recommend(headline, num):
    """
    Print up to ``num`` stored recommendations for ``headline``.

    :param headline: key into the module-level ``results`` dictionary
    :param num: maximum number of recommendations to print
    """
    print("Recommending all related articles similar to " + headline)
    print("-------")
    for score, title in results[headline][:num]:
        print("Recommended: " + title + " (score:" + str(score) + ")")