In [114]:
import pandas as pd 
import json
import requests
import re 
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
stopwords = set(stopwords.words("english"))
stemmer = PorterStemmer()

In [159]:
from watson_developer_cloud import AuthorizationV1 as WatsonAuthorization
from watson_developer_cloud import AlchemyLanguageV1 as AlchemyLanguage

In [162]:
import os 

In [163]:
alchemy = AlchemyLanguage(api_key=os.environ.get("ALCHEMY_API_KEY"))

In [68]:
def URL_builder(api_key, begin_date, end_date, page=0):
    '''
    INPUT: NYT API key, begin date (YYYYMMDD), end date, page
    OUTPUT: URL with JSON information about given day's headlines 
    '''
    URL = 'https://api.nytimes.com/svc/search/v2/articlesearch.json?{}={}&{}={}&{}={}&{}={}&fq=news_desk:("World")'.\
        format('api-key', api_key, 'begin-date', begin_date, 'end-date', end_date, 'page', page)
    return URL

In [244]:
def get_week_strings(seed):
    return [str(i+1) for i in range(seed-7, seed)]

def create_headline_df(date_list):
    '''
    INPUT: date in format 'YYYYMMDD' (Calling multiple dates can time out API)
    OUTPUT: Pandas dataframe with columns for headlines and associated date 
    '''
    api_key = 'aa955d9919794bb9a66785e7c52c3cb5'
    data_tuples = []
    for date in date_list:
        print(date)
        re = requests.get(URL_builder(api_key, date, date), timeout = 1000)
        json_dict = json.loads(re.text)

    #         JSON Dictionaries are pretty icky ... we'll crawl through them and find what we need 
        for item in json_dict['response']['docs']: 
            if 'main' in item['headline'].keys():
    #           data_tuples.append((item['pub_date'], item['headline']['main']))
                data_tuples.append((date, item['headline']['main']))

    labels = ['Date', 'Headline']
    return pd.DataFrame.from_records(data_tuples, columns=labels)
        

In [257]:
api_key = 'aa955d9919794bb9a66785e7c52c3cb5'
date = '20170211'
re = requests.get(URL_builder(api_key, date, '20170129'), timeout = 1000)
json_dict = json.loads(re.text)
data_tuples = []
    #         JSON Dictionaries are pretty icky ... we'll crawl through them and find what we need 
for item in json_dict['response']['docs']: 
    if 'main' in item['headline'].keys():
#           data_tuples.append((item['pub_date'], item['headline']['main']))
        data_tuples.append((date, item['headline']['main']))


In [258]:
data_tuples

[('20170211', 'Deadly Earthquake in the Philippines'),
 ('20170211',
  "Trump to Abe: 'U.S.-Japan Alliance is the Cornerstone of Peace and Stability'"),
 ('20170211', 'Iranian President Urges "Respect and Reverence" for Country'),
 ('20170211', 'A Slap on the Wrist for Aiding Migrants'),
 ('20170211', 'Explanatory Memorandum by Muslim Brotherhood From 1991'),
 ('20170211', 'Scottish Lawmakers Whistle the European Anthem'),
 ('20170211', 'Pope’s Plea: Pray for Myanmar Muslims'),
 ('20170211', 'Before the Wall: Life Along the U.S.-Mexico Border'),
 ('20170211', 'Iran’s Supreme Leader ‘Thankful’ for Trump'),
 ('20170211', 'On a Secret Expedition to Old Hong Kong')]

In [None]:
get_week_strings(seed)

In [245]:
create_headline_df(get_week_strings(20170131))

20170125
20170126
20170127
20170128
20170129
20170130
20170131


Unnamed: 0,Date,Headline
0,20170125,Deadly Earthquake in the Philippines
1,20170125,Trump to Abe: 'U.S.-Japan Alliance is the Corn...
2,20170125,"Iranian President Urges ""Respect and Reverence..."
3,20170125,A Slap on the Wrist for Aiding Migrants
4,20170125,Explanatory Memorandum by Muslim Brotherhood F...
5,20170125,Scottish Lawmakers Whistle the European Anthem
6,20170125,Pope’s Plea: Pray for Myanmar Muslims
7,20170125,Before the Wall: Life Along the U.S.-Mexico Bo...
8,20170125,Iran’s Supreme Leader ‘Thankful’ for Trump
9,20170125,On a Secret Expedition to Old Hong Kong


In [246]:
df = word_cleaning(_)

In [247]:
df

Unnamed: 0,Date,headline_words
0,20170125,deadly earthquake philippines trump abe usjapa...
1,20170126,deadly earthquake philippines trump abe usjapa...
2,20170127,deadly earthquake philippines trump abe usjapa...
3,20170128,deadly earthquake philippines trump abe usjapa...
4,20170129,deadly earthquake philippines trump abe usjapa...
5,20170130,deadly earthquake philippines trump abe usjapa...
6,20170131,deadly earthquake philippines trump abe usjapa...


In [205]:
def word_cleaning(df):
    '''
    OUTPUT: Dataframe, grouped by date, Headline is stemmed headline words 
    '''
    data_tuples = []
    # Look at each date, clean strings, remove stopwords, etc.
    for row in df.groupby('Date').agg(lambda x : ' '.join(x)).iterrows():
        words = row[1]['Headline']
        #Clean darta 
        letters_only = re.sub("[^\sa-zA-Z]", "", words) 
        tokenized_words = [word for word in letters_only.lower().split() if not word in stopwords]
        word_cloud = ' '.join(tokenized_words)
        data_tuples.append((row[0], word_cloud))
    labels = ['Date', 'headline_words']
    
    #Return results as pandas dataframe 
    return pd.DataFrame.from_records(data_tuples, columns=labels)

In [229]:
def get_sentiment(df):
    '''
    INPUT: Pandas dataframe with date and words
    OUTPUT: Score of how terrible the day was
    '''
    row_avg_scores = []
    for row in df.iterrows():
        score = 0 
        for word in row[1]['headline_words'].split():
            print(word)
            print('-')
            result = alchemy.sentiment(word)
            try:
                val = float(result['docSentiment']['score']) 
            except KeyError: #neutral words 
                val = 0
            score += val 
        row_avg_scores.append((row[1]['Date'], score/len(row[1]['headline_words'])))
    labels = ['Date', 'avg_score']
    return pd.DataFrame.from_records(row_avg_scores, columns=labels)

In [177]:
sent_obj = alchemy.sentiment("earthquake")

In [182]:
float(sent_obj['docSentiment']['score'])

-0.406711

In [189]:
sent_obj = alchemy.sentiment("Zachary")

In [191]:
sent_obj['docSentiment']

{'type': 'neutral'}

In [185]:
try:
    val = float(sent_obj['docSentiment']['score']) #KeyError
except KeyError:
    if sent_obj['docSentiment']['type'] == 'neutral':
        val = 0 
    #I don't think words get assigned score with positive/negative value, but just in case ... 
    elif sent_obj['docSentiment']['type'] == 'positive': 
        val = .25
    else:
        val = -.25

KeyError: 'score'

In [69]:
df = Create_Headline_Df('20170131')

In [206]:
dfbig = word_cleaning(df)

In [270]:
dfbig.ix[0,'headline_words']

['d',
 'e',
 'a',
 'd',
 'l',
 'y',
 ' ',
 'e',
 'a',
 'r',
 't',
 'h',
 'q',
 'u',
 'a',
 'k',
 'e',
 ' ',
 'p',
 'h',
 'i',
 'l',
 'i',
 'p',
 'p',
 'i',
 'n',
 'e',
 's',
 ' ',
 't',
 'r',
 'u',
 'm',
 'p',
 ' ',
 'a',
 'b',
 'e',
 ' ',
 'u',
 's',
 'j',
 'a',
 'p',
 'a',
 'n',
 ' ',
 'a',
 'l',
 'l',
 'i',
 'a',
 'n',
 'c',
 'e',
 ' ',
 'c',
 'o',
 'r',
 'n',
 'e',
 'r',
 's',
 't',
 'o',
 'n',
 'e',
 ' ',
 'p',
 'e',
 'a',
 'c',
 'e',
 ' ',
 's',
 't',
 'a',
 'b',
 'i',
 'l',
 'i',
 't',
 'y',
 ' ',
 'i',
 'r',
 'a',
 'n',
 'i',
 'a',
 'n',
 ' ',
 'p',
 'r',
 'e',
 's',
 'i',
 'd',
 'e',
 'n',
 't',
 ' ',
 'u',
 'r',
 'g',
 'e',
 's',
 ' ',
 'r',
 'e',
 's',
 'p',
 'e',
 'c',
 't',
 ' ',
 'r',
 'e',
 'v',
 'e',
 'r',
 'e',
 'n',
 'c',
 'e',
 ' ',
 'c',
 'o',
 'u',
 'n',
 't',
 'r',
 'y',
 ' ',
 's',
 'l',
 'a',
 'p',
 ' ',
 'w',
 'r',
 'i',
 's',
 't',
 ' ',
 'a',
 'i',
 'd',
 'i',
 'n',
 'g',
 ' ',
 'm',
 'i',
 'g',
 'r',
 'a',
 'n',
 't',
 's',
 ' ',
 'e',
 'x',
 'p',
 'l',
 'a'

In [259]:
dfsent = get_sentiment(dfbig)

deadly
-
earthquake
-
philippines
-
trump
-
abe
-
usjapan
-
alliance
-
cornerstone
-
peace
-
stability
-
iranian
-
president
-
urges
-
respect
-
reverence
-
country
-
slap
-
wrist
-
aiding
-
migrants
-
explanatory
-
memorandum
-
muslim
-
brotherhood
-
scottish
-
lawmakers
-
whistle
-
european
-
anthem
-
popes
-
plea
-
pray
-
myanmar
-
muslims
-
wall
-
life
-
along
-
usmexico
-
border
-
irans
-
supreme
-
leader
-
thankful
-
trump
-
secret
-
expedition
-
old
-
hong
-
kong
-


In [262]:
dfsent.ix[0,'avg_score']

-0.0045865899470899476

In [215]:
for row in dfbig.iterrows():
    print(row[1]['headline_words'])
    print(row[1]['Date'])

deadly earthquake philippines trump abe usjapan alliance cornerstone peace stability iranian president urges respect reverence country slap wrist aiding migrants explanatory memorandum muslim brotherhood scottish lawmakers whistle european anthem popes plea pray myanmar muslims wall life along usmexico border irans supreme leader thankful trump secret expedition old hong kong
20170131


In [133]:
for row in df.groupby('Date').agg(lambda x : ' '.join(x)).iterrows():
    print(row[0])

20170131


In [118]:
df

Unnamed: 0,Date,Headline
0,20170131,Deadly Earthquake in the Philippines
1,20170131,Trump to Abe: 'U.S.-Japan Alliance is the Corn...
2,20170131,"Iranian President Urges ""Respect and Reverence..."
3,20170131,A Slap on the Wrist for Aiding Migrants
4,20170131,Explanatory Memorandum by Muslim Brotherhood F...
5,20170131,Scottish Lawmakers Whistle the European Anthem
6,20170131,Pope’s Plea: Pray for Myanmar Muslims
7,20170131,Before the Wall: Life Along the U.S.-Mexico Bo...
8,20170131,Iran’s Supreme Leader ‘Thankful’ for Trump
9,20170131,On a Secret Expedition to Old Hong Kong


In [116]:
for row in df.groupby('Date').agg(lambda x : ' '.join(x)).iterrows():
    words = row[1]['Headline']
    letters_only = re.sub("[^\sa-zA-Z]", "", words)
    tokenized_words = [word for word in letters_only.lower().split() if not word in stopwords]
#     cleaned_words = [stemmer.stem(word) for word in tokenized_words]

In [138]:
' '.join(tokenized_words)

'deadly earthquake philippines trump abe usjapan alliance cornerstone peace stability iranian president urges respect reverence country slap wrist aiding migrants explanatory memorandum muslim brotherhood scottish lawmakers whistle european anthem popes plea pray myanmar muslims wall life along usmexico border irans supreme leader thankful trump secret expedition old hong kong'

In [57]:
jd['response'].keys()

dict_keys(['meta', 'docs'])

In [12]:
URL_builder('aa955d9919794bb9a66785e7c52c3cb5', '20170101', '20170102')

'https://api.nytimes.com/svc/search/v2/articlesearch.json?api-key=aa955d9919794bb9a66785e7c52c3cb5&begin-date=20170101&end-date=20170102&page=0'

In [24]:
r = requests.get(URL_builder('aa955d9919794bb9a66785e7c52c3cb5', '20170101', '20170102'))
f1 = json.loads(r.text)

# soup = BeautifulSoup(content)


In [41]:
r2 = requests.get(URL_builder('aa955d9919794bb9a66785e7c52c3cb5', '20170101', '20170101'))

In [38]:
for item in f1['response']['docs']: 
    if 'main' in item['headline'].keys():
        print(item['headline']['main'])
        print(item['pub_date'])

A Presidential Golf Outing, With a Twist: Trump Owns the Place
2017-02-12T01:33:28+0000
Japan Defense Minister Says Gathering Information on North Korea Missile Launch
2017-02-12T01:30:30+0000
Cartel Power Struggle Fuelling Rising Violence in Mexico-Official
2017-02-12T01:30:28+0000
