In [1]:
import pandas as pd
import requests, json 
import math
import requests
import numpy as np
from bs4 import BeautifulSoup
import random

### Page Request
This URL contains links to all NYT politics stories

In [2]:
page = requests.get("https://www.nytimes.com/pages/politics/index.html")
page

<Response [200]>

In [3]:
soup = BeautifulSoup(page.content, 'html.parser')

### Extract Article URLs
The code below uses Python's BeautifulSoup library and parses the HTML content. It extracts all the links that are present on the page we requested earlier. The list displayed contains each URL. We then retain only those URLs that are specific to politics and remove duplicates.

In [4]:
soup = BeautifulSoup(page.content, 'html.parser')
all_links = soup.find_all('a')
links_list = []
for link in all_links:
    links_list.append(link.get("href"))

Extract only those articles that are specific to politics.

In [5]:
politics_article_list = []
for i in range(len(links_list)):
    if links_list[i] is not None and ("us/politics/" in links_list[i] or "ref=politics" in links_list[i]):
        politics_article_list.append(links_list[i])

Remove duplicate URLs.

In [77]:
politics_article_list = list(set(politics_article_list))
politics_article_list

[u'https://www.nytimes.com/2017/05/05/us/politics/pre-existing-conditions-health-care.html?ref=politics',
 u'https://www.nytimes.com/2017/05/05/us/politics/white-house-chief-usher-angella-reid-fired.html?ref=politics',
 u'https://www.nytimes.com/interactive/2017/01/15/us/politics/you-draw-obama-legacy.html?ref=politics',
 u'https://www.nytimes.com/2017/05/05/us/politics/white-house-proposes-cutting-drug-control-office-funding-by-95.html?ref=politics',
 u'https://www.nytimes.com/2017/05/05/us/politics/republican-health-care-bill-pre-existing-conditions.html?ref=politics',
 u'https://www.nytimes.com/2017/05/07/us/politics/epa-dismisses-members-of-major-scientific-review-board.html?ref=politics',
 u'https://www.nytimes.com/interactive/2016/12/08/us/trump-climate-change.html?ref=politics',
 u'https://www.nytimes.com/2017/05/07/us/politics/trump-wall-faces-barrier-in-texas.html?ref=politics',
 u'https://www.nytimes.com/2017/05/06/us/politics/health-care-vote.html?ref=politics',
 u'https://w

### Data Processing and Clearning
The text if first normalized by replacing writing contractions by their corresponding words/phrases.

In [78]:
import re
import string

def NormalizeContraction(text):
    text = text.replace("can't", "can not")
    text = text.replace("couldn't", "could not")
    text = text.replace("don't", "do not")
    text = text.replace("didn't", "did not")
    text = text.replace("doesn't", "does not")
    text = text.replace("shouldn't", "should not")
    text = text.replace("haven't", "have not")
    text = text.replace("aren't", "are not")
    text = text.replace("weren't", "were not")
    text = text.replace("wouldn't", "would not")
    text = text.replace("hasn't", "has not")
    text = text.replace("hadn't", "had not")
    text = text.replace("won't", "will not")
    text = text.replace("wasn't", "was not")
    text = text.replace("can't", "can not")
    text = text.replace("isn't", "is not")
    text = text.replace("ain't", "is not")
    text = text.replace("it's", "it is")
    text = text.replace("i'm", "i am")
    text = text.replace("i'm", "i am")
    text = text.replace("i've", "i have")
    text = text.replace("i'll", "i will")
    text = text.replace("i'd", "i would")
    text = text.replace("we've", "we have")
    text = text.replace("we'", "we will")
    text = text.replace("we'd", "we would")
    text = text.replace("we're", "we are")
    text = text.replace("you've", "you have")
    text = text.replace("you'll", "you will")
    text = text.replace("you'd", "you would")
    text = text.replace("you're", "you are")
    text = text.replace("he'll", "he will")
    text = text.replace("he'd", "he would")
    text = text.replace("he's", "he has")
    text = text.replace("she'll", "she will")
    text = text.replace("she'd", "she would")
    text = text.replace("she's", "she has")
    text = text.replace("they've", "they have")
    text = text.replace("they'll", "they will")
    text = text.replace("they'd", "they would")
    text = text.replace("they're", "they are")
    text = text.replace("that'll", "that will")
    text = text.replace("that's", "that is")
    text = text.replace("there's", "there is")
    return text

Text is then cleaned to strip line breaks, punctuation (excluding "$" and "%") and condense double spaces.

In [79]:
def cleanText(text):    
    r = re.compile(r"[\r\n]+")
    text = re.sub(r, "", text)

    text = NormalizeContraction(text)

    punctuations = string.punctuation
    excluded_punctuations = ["$", "%"]
    for p in punctuations:
        if p not in excluded_punctuations:
            text = text.replace(p, " ")

    text = text.replace("  ", " ")
    return text

Append text of all articles in one list

In [80]:
article_text =[]
for article_url in range(len(politics_article_list)):
    text = ""
    r = requests.get(politics_article_list[article_url])
    data = r.text
    soup = BeautifulSoup(data, "lxml")
    for i in range(len(soup.findAll('p'))):
        text = text + soup.findAll('p')[i].get_text()
        text = text.lower()
        text = cleanText(text)
    article_text.append(text)

Create a dataframe containing URLs of all articles

In [81]:
df_articles_wordcount= pd.DataFrame()
se = pd.Series(politics_article_list)
df_articles_wordcount['URL'] = se.values
df_articles_wordcount

Unnamed: 0,URL
0,https://www.nytimes.com/2017/05/05/us/politics...
1,https://www.nytimes.com/2017/05/05/us/politics...
2,https://www.nytimes.com/interactive/2017/01/15...
3,https://www.nytimes.com/2017/05/05/us/politics...
4,https://www.nytimes.com/2017/05/05/us/politics...
5,https://www.nytimes.com/2017/05/07/us/politics...
6,https://www.nytimes.com/interactive/2016/12/08...
7,https://www.nytimes.com/2017/05/07/us/politics...
8,https://www.nytimes.com/2017/05/06/us/politics...
9,https://www.nytimes.com/2017/05/07/business/ec...


### Article Wordcount
Calculate the number of words in each article and populate a new column in the dataframe with the wordcount

In [82]:
wordcount_list = []
word_count = 0
for item in article_text:
    word_count = word_count + len(item.split())
    wordcount_list.append(len(item.split()))
wordcount_se = pd.Series(wordcount_list)
df_articles_wordcount['Word_Count'] = wordcount_se.values
df_articles_wordcount

Unnamed: 0,URL,Word_Count
0,https://www.nytimes.com/2017/05/05/us/politics...,1119
1,https://www.nytimes.com/2017/05/05/us/politics...,376
2,https://www.nytimes.com/interactive/2017/01/15...,314
3,https://www.nytimes.com/2017/05/05/us/politics...,1139
4,https://www.nytimes.com/2017/05/05/us/politics...,1740
5,https://www.nytimes.com/2017/05/07/us/politics...,1267
6,https://www.nytimes.com/interactive/2016/12/08...,613
7,https://www.nytimes.com/2017/05/07/us/politics...,1415
8,https://www.nytimes.com/2017/05/06/us/politics...,1860
9,https://www.nytimes.com/2017/05/07/business/ec...,1374


Find the mean, median, minimum and maximum values for wordcount from the list of articles we collected earlier

In [83]:
print df_articles_wordcount['Word_Count'].mean()
print df_articles_wordcount['Word_Count'].median()
print df_articles_wordcount['Word_Count'].min()
print df_articles_wordcount['Word_Count'].max()

1182.16666667
1129.0
78
3168


Remove articles which have a wordcount of less than 500. Most of these articles are interactive vizualizations and they will not contain weasel words. Retaining these will only skew our statistics.

In [84]:
articles_count = []
articles_url = []
temp_article_count = []
num=0
def removeArticles(df):
    global num
    if df['Word_Count'] >= 500:
        articles_count.append(df['Word_Count'])
        articles_url.append(df['URL'])
    else:
        temp_article_count.append(num)
    num+=1

df_articles_wordcount.apply(removeArticles, axis=1)
df_articles= pd.DataFrame({'URL': articles_url, 'Word_Count': articles_count})
df_articles

Unnamed: 0,URL,Word_Count
0,https://www.nytimes.com/2017/05/05/us/politics...,1119
1,https://www.nytimes.com/2017/05/05/us/politics...,1139
2,https://www.nytimes.com/2017/05/05/us/politics...,1740
3,https://www.nytimes.com/2017/05/07/us/politics...,1267
4,https://www.nytimes.com/interactive/2016/12/08...,613
5,https://www.nytimes.com/2017/05/07/us/politics...,1415
6,https://www.nytimes.com/2017/05/06/us/politics...,1860
7,https://www.nytimes.com/2017/05/07/business/ec...,1374
8,https://www.nytimes.com/2017/05/06/us/politics...,2317
9,https://www.nytimes.com/2017/05/05/us/politics...,777


Initial number of articles, before the shorter ones were removed

In [85]:
len(article_text)

42

Number of articles remaining after removing the shorter ones

In [86]:
article_content = []
for i in range(len(article_text)):
    if i not in temp_article_count:
        article_content.append(article_text[i])
len(article_content)

33

### Weasel Words
Create a list of weasel words. There are approximately 175 weasel words/phrases in the list below.

In [87]:
weasel_words = ['condition of anonymity',
'with the promise of anonymity',
'who were granted anonymity',
'on the condition that only his first name',
'on the condition that only her first name',
'on the condition that I not use his real name',
'name not be used',
'who asked for anonymity',
'unauthorized to speak publicly',
'who asked not to be identified',
'according to law enforcement officials',
'said a law enforcement official',
'law enforcement officials said',
'law enforcement official said',
'law enforcement officials told',
'law enforcement official told',
'some officials said',
'some officials say',
'some officials told',
'police officials said',
'a police official said',
'police officials told',
'police official told',
'police said',
'said a government official',
'government officials said',
'government official said',
'government official told',
'administration official said',
'administration official told',
'administration officials said',
'administration officials told',
'member of the administration said',
'member of the administration told',
'security official said',
'the authorities said',
'security official told',
'military officials said',
'military officials told',
'said a person who',
'a Democratic operative close to',
'a House Democratic aide',
'a House Republican aide',
'a Republican operative close to',
'a Senate Democratic aide',
'a Senate Republican aide',
'a defense department official said',
'a government official said',
'a justice department official said',
'a person briefed on the matter',
'a person close to',
'a person familiar with the matter',
'a person familiar with the situation',
'a person involved in the negotiations',
'a senior Democratic aide',
'a senior Democratic committee aide',
'a senior Republican aide',
'a senior Republican committee aide',
'a senior White House official',
'a senior administration official said',
'a senior government official said',
'a source close to',
'a source familiar with the situation',
'a source involved in the negotiations',
'a source said',
'a state department official said',
'according to a person close to',
'according to a person familiar with',
'according to a person who was briefed on the matter',
'according to a person with direct knowledge',
'according to one person who was briefed on the matter',
'according to people briefed on the matter',
'according to people close to',
'according to people familiar with',
'according to people who were briefed on the matter',
'according to people with direct knowledge',
'according to someone close to',
'according to someone familiar with',
'according to three people familiar with',
'according to two people familiar with',
'according to two people with direct knowledge',
'an aide familiar with the situation',
'an aide involved in the negotiations',
'an anonymous source',
'an fbi official said',
'an official close to',
'an official familiar with the situation',
'an official involved in the negotiations',
'asked not to be named',
'asked that I identify her by her first name',
'asked that I identify her by her last name',
'asked that I identify her only by her first name',
'asked that I identify her only by her last name',
'asked that I identify him by his first name',
'asked that I identify him by his last name',
'asked that I identify him only by his first name',
'asked that I identify him only by his last name',
'asked to be identified by her first name',
'asked to be identified by her last name',
'asked to be identified by his first name',
'asked to be identified by his last name',
'asked to be identified only by her first name',
'asked to be identified only by her last name',
'asked to be identified only by his first name',
'asked to be identified only by his last name',
'asked to remain anonymous',
'chose to remain anonymous',
'comment off the record',
'declined to be identified',
'declined to be named',
'declined to give her first name',
'declined to give her last name',
'declined to give her name',
'declined to give his first name',
'declined to give his last name',
'declined to give his name',
'declined to provide her first name',
'declined to provide her last name',
'declined to provide his first name',
'declined to provide his last name',
'declined to speak for attribution',
'defense department officials said',
'did not want her first name used',
'did not want her last name used',
'did not want her name used',
'did not want his first name used',
'did not want his last name used',
'did not want his name used',
'did not want to be identified',
'did not wish to be identified',
'executive briefed on the matter',
'fbi officials said',
'insisted on anonymity',
'justice department officials said',
'not authorized to speak on the record',
'official briefed on the matter',
'officials briefed on the matter',
'people briefed on the matter',
'people familiar with the matter',
'person briefed on the matter',
'refused to be identified',
'refused to be named',
'refused to give her name',
'refused to give his name',
'refused to speak for attribution',
'requested anonymity',
'senior administration officials said',
'source briefed on the matter',
'sources briefed on the matter',
'sources close to',
'sources familiar with the matter',
'sources said',
'sources with specific knowledge',
'speak off the record',
'speaking off the record',
'speaking on background',
'spoke off the record',
'spoke on background',
'state department officials said',
'the source said',
'would not speak for attribution',
'would not give her name',
'would not give his name',
'many feel',
'people feel',
'many say',
'people say',
'people think',
'many think',
'overall opinion',
'according to general opinion',
'opinion polls say',
'according to opinion polls']

Calculate the number of weasel words per article and populate it in the dataframe. Also print the weasel phrases that were used.

In [88]:
weasel_word_count = []
for item in article_content:
    count = 0
    for word in weasel_words:
        if word in item:
            count = count + 1
            print word
    weasel_word_count.append(count)

df_articles['Weasel_Words'] = weasel_word_count
df_articles

declined to be identified
administration official said
condition of anonymity


Unnamed: 0,URL,Word_Count,Weasel_Words
0,https://www.nytimes.com/2017/05/05/us/politics...,1119,0
1,https://www.nytimes.com/2017/05/05/us/politics...,1139,0
2,https://www.nytimes.com/2017/05/05/us/politics...,1740,0
3,https://www.nytimes.com/2017/05/07/us/politics...,1267,0
4,https://www.nytimes.com/interactive/2016/12/08...,613,0
5,https://www.nytimes.com/2017/05/07/us/politics...,1415,0
6,https://www.nytimes.com/2017/05/06/us/politics...,1860,0
7,https://www.nytimes.com/2017/05/07/business/ec...,1374,0
8,https://www.nytimes.com/2017/05/06/us/politics...,2317,0
9,https://www.nytimes.com/2017/05/05/us/politics...,777,0


Find the maximum, minimum, mean and total number of weasel words used

In [89]:
print "Mean Number of Weasel Words:", df_articles['Weasel_Words'].mean()
print "Minimum number of Weasel Words in any given article:", df_articles['Weasel_Words'].min()
print "Maximum number of Weasel Words in any given article:", df_articles['Weasel_Words'].max()
print "Total number of Weasel words in all articles today:", df_articles['Weasel_Words'].sum()

Mean Number of Weasel Words: 0.0909090909091
Minimum number of Weasel Words in any given article: 0
Maximum number of Weasel Words in any given article: 1
Total number of Weasel words in all articles today: 3


### Posting on Twitter
Setup and authenticate Tweepy.
Follow the instructions here: http://www.compjour.org/tutorials/getting-started-with-tweepy/ to create a twitter app and generate the keys needed to authenticate.
Print screen name to test if authentication was successful.

In [129]:
import tweepy

In [130]:
CONSUMER_KEY = "bot_consumer_key"
CONSUMER_SECRET = "bot_consumer_secret"
ACCESS_TOKEN = "bot_access_token"
ACCESS_TOKEN_SECRET = "bot_access_token_secret"

auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
api = tweepy.API(auth)

me = api.me()
print me.screen_name

NYTweaselbot


In [131]:
import jinja2 as jj
import math
from datetime import date

Use a jinja template for the tweet text that is posted. This template enable 3 different versions of weasel alerts that are tweeted by the bot.

In [132]:
tweet_summary = """WEASEL REPORT: NYT politics stories posted today contains {{ num_weasel_words }} weasel words."""

The code below will actually post the tweet. It tweets the total number of weasel words present in NYT politics stories posted today.

In [141]:
def tweet_text(df_articles):
    template = jj.Template(tweet_summary)
    return template.render(num_weasel_words = df_articles['Weasel_Words'].sum())

print tweet_text(df_articles)
api.update_status(status = tweet_text(df_articles))

WEASEL REPORT: NYT politics stories posted today contains 3 weasel words.


Status(contributors=None, truncated=False, text=u'WEASEL REPORT: NYT politics stories posted today contains 3 weasel words.', is_quote_status=False, in_reply_to_status_id=None, id=861482659796721664L, favorite_count=0, _api=<tweepy.api.API object at 0x00000000098FF8D0>, author=User(follow_request_sent=False, has_extended_profile=False, profile_use_background_image=False, _json={u'follow_request_sent': False, u'has_extended_profile': False, u'profile_use_background_image': False, u'default_profile_image': False, u'id': 859049107578650627L, u'profile_background_image_url_https': u'https://abs.twimg.com/images/themes/theme1/bg.png', u'verified': False, u'translator_type': u'none', u'profile_text_color': u'000000', u'profile_image_url_https': u'https://pbs.twimg.com/profile_images/859067843622699009/HppSt_Is_normal.jpg', u'profile_sidebar_fill_color': u'000000', u'entities': {u'description': {u'urls': []}}, u'followers_count': 3, u'profile_sidebar_border_color': u'000000', u'id_str': u'859

Select a random number which decides which phrase is used in the tweet

In [139]:
random_tweet_text = [1,2,3]
random_tweet_text

[1, 2, 3]

Tweets the article url if the number of weasel words in the article is 3 or more

In [142]:
def TweetArticle(df_articles):
    random_number = random.choice(random_tweet_text)
    if df_articles['Weasel_Words'] > 2:
        if random_number == 1:
            api.update_status("WEASEL ALERT! High levels of weasel words detected! %s" % df_articles['URL'])
        elif random_number == 2:
            api.update_status("WEASEL ALERT! Sources not clearly identified! %s" % df_articles['URL'])
        else:
            api.update_status("WEASEL ALERT! I smell fake news! %s" % df_articles['URL'])

df_articles.apply(TweetArticle, axis=1)

0     None
1     None
2     None
3     None
4     None
5     None
6     None
7     None
8     None
9     None
10    None
11    None
12    None
13    None
14    None
15    None
16    None
17    None
18    None
19    None
20    None
21    None
22    None
23    None
24    None
25    None
26    None
27    None
28    None
29    None
30    None
31    None
32    None
dtype: object

### Historical Information For the Week
The dataframe below holds information regarding all articles published during the week. This has already been created and then commented so that it is not run again during the week. Purge this at the end of every Saturday's run.

In [123]:
# weekly_df_articles = pd.DataFrame(columns = df_articles.columns)

Dataframe containing url, word count and number of weasel words of all articles posted this week (Sunday to Saturday)

In [124]:
weekly_df_articles = weekly_df_articles.append(df_articles)

Unnamed: 0,URL,Word_Count,Weasel_Words
0,https://www.nytimes.com/2017/05/05/us/politics...,1119.0,0.0
1,https://www.nytimes.com/2017/05/05/us/politics...,1139.0,0.0
2,https://www.nytimes.com/2017/05/05/us/politics...,1740.0,0.0
3,https://www.nytimes.com/2017/05/07/us/politics...,1267.0,0.0
4,https://www.nytimes.com/interactive/2016/12/08...,613.0,0.0
5,https://www.nytimes.com/2017/05/07/us/politics...,1415.0,0.0
6,https://www.nytimes.com/2017/05/06/us/politics...,1860.0,0.0
7,https://www.nytimes.com/2017/05/07/business/ec...,1374.0,0.0
8,https://www.nytimes.com/2017/05/06/us/politics...,2317.0,0.0
9,https://www.nytimes.com/2017/05/05/us/politics...,777.0,0.0


Calculate the total number of weasel words used in NYT politics story this week. The variable representing total number of weasel words was initialized to 0 and then commented. Re-initialize to 0 at the end of every Saturday's run.

In [125]:
# total_weasel_words = 0
total_weasel_words = total_weasel_words + df_articles['Weasel_Words'].sum()
total_weasel_words

33

Find the article url with the maximum number of weasel words

In [126]:
weekly_df_articles_sort = pd.DataFrame(columns = weekly_df_articles.columns)
weekly_df_articles_sort = weekly_df_articles.sort_values(['Weasel_Words'],ascending = [False])[['URL']][0:1]
weekly_df_articles_sort

Unnamed: 0,URL
31,https://www.nytimes.com/2017/04/22/us/politics...


Find the index of the url in the dataframe

In [127]:
idx = weekly_df_articles_sort.index.tolist()
idx[0]

31

Tweet the weekly weasel words summary

In [145]:
if total_weasel_words > 0:
    api.update_status("NYT politics stories posted this week contained %d weasel words!!! This one was the most weasely of them all. %s" % (total_weasel_words, weekly_df_articles_sort['URL'].loc[idx[0]]))
else:
    api.update_status("NYT politics stories posted had no weasel words this week! Don't mind me, I'm just going to weasel my way out.")

In [1]:
# ---END OF CODE--- #