In [3]:
import requests
import urllib.request
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from collections import defaultdict
from string import punctuation
from heapq import nlargest

In [4]:
# Browser-like request headers sent when fetching a section index page.
# Header values follow the HTTP list syntax: items are separated by ',' and
# each item's quality weight is attached with ';q='.
hdr = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
    # '*/*' needs a ';' before its q-value, otherwise the media range is malformed.
    'Accept': 'text/html, application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    # Charsets are comma-separated; the weight belongs to utf-8 via ';q='.
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Accept-Encoding': 'none',
    'Accept-Language': 'en-US,en;q=0.8',
    'Connection': 'keep-alive'
}

def get_text_from_paragraphs(article):
    """Return the text of every <p> element under `article`, joined by single spaces.

    `article` must expose `find_all('p')` yielding objects with a `.text`
    attribute (e.g. a BeautifulSoup tag).
    """
    paragraph_texts = [paragraph.text for paragraph in article.find_all('p')]
    return ' '.join(paragraph_texts)

def getWashPostText(url,token):
    """Download `url` and return `(text, title)` for the article it contains.

    Args:
        url: address of the article page.
        token: name of the tag that wraps the article body (the notebook
            passes 'article'); the text of all <p> elements inside every
            such tag is collected.

    Returns:
        `(text, title)` on success. `text` is '' when no `token` tag is
        found and `title` is None when the page has no <title> element.
        `(None, None)` when the page cannot be downloaded or decoded.
    """
    try:
        # The context manager closes the HTTP response even if decoding fails.
        with urllib.request.urlopen(url) as response:
            page = response.read().decode('utf8')
    except Exception:
        # Best effort: a single unreachable article must not abort the scrape.
        # `Exception` (not a bare except) lets KeyboardInterrupt through.
        return (None,None)
    soup = BeautifulSoup(page, "html5lib")
    # find_all returns an empty result rather than None, so no guard is needed.
    text = ''.join(map(get_text_from_paragraphs, soup.find_all(token)))
    title = soup.title.text if soup.title is not None else None
    return text, title

def getNYTText(url,token):
    """Download `url` and return `(text, title)` for the article it contains.

    The body text is the concatenation of every <p> element whose class
    attribute is "story-body-text story-content".

    Args:
        url: address of the article page.
        token: unused; accepted so the signature matches the other scraper
            functions passed to `scrapeSource`.

    Returns:
        `(text, title)`; `title` is None when the page has no <title> element.

    Raises:
        Whatever `urllib.request.urlopen` or the UTF-8 decode raises; errors
        are deliberately not swallowed here.
    """
    # The context manager closes the HTTP response once the body has been read.
    with urllib.request.urlopen(url) as response:
        res = response.read().decode('utf8')
    soup = BeautifulSoup(res, "html5lib")
    titleTag = soup.find('title')
    title = titleTag.text if titleTag is not None else None
    paragraphs = soup.find_all("p", {"class":"story-body-text story-content"})
    text = ''.join(p.text for p in paragraphs)
    return text, title

def scrapeSource(url, magicFrag='2017',scraperFunction=getNYTText,token='None'):
    """Scrape every article linked from the index page at `url`.

    Args:
        url: address of the index page whose <a> links are followed.
        magicFrag: only links containing this substring are scraped; pass
            None to follow every link.
        scraperFunction: callable `(link, token)` returning the article body,
            normally a `(text, title)` tuple.
        token: forwarded unchanged to `scraperFunction`.

    Returns:
        dict mapping each scraped link to what `scraperFunction` returned.
        Links whose scrape failed or produced no text are omitted. The dict
        is empty when the index page itself cannot be fetched.
    """
    urlBodies = {}
    try:
        req = urllib.request.Request(url,headers=hdr)
        # The context manager closes the HTTP response once the body is read.
        with urllib.request.urlopen(req) as response:
            res = response.read().decode('utf8')
        soup = BeautifulSoup(res, "html5lib")
    except Exception:
        print("Failed to get", url)
        return urlBodies

    numErrors = 0
    for a in soup.find_all('a'):
        # A distinct name keeps the `url` parameter (the index page) intact.
        link = a.get('href')
        if not link or link in urlBodies:
            continue
        if magicFrag is not None and magicFrag not in link:
            continue
        try:
            body = scraperFunction(link,token)
        except Exception:
            # Best effort: one broken article must not stop the whole scrape.
            numErrors += 1
            continue
        # Scrapers return (text, title); len() of that tuple is always 2, so
        # look at the text itself to decide whether anything was scraped.
        text = body[0] if isinstance(body, tuple) else body
        if text:
            urlBodies[link] = body
        print(link)
    if numErrors:
        print("Failed to scrape", numErrors, "link(s) from", url)
    return urlBodies


In [5]:
class FrequencySummarizer:
    """Word-frequency based keyword extractor and extractive summarizer.

    Articles are passed as `(text, title)` tuples; only the text is used.
    """

    def __init__(self,min_cut=0.1,max_cut=0.9):
        """Store the frequency cut-offs and build the stop-word set.

        Args:
            min_cut: words whose normalized frequency is <= this are dropped.
            max_cut: words whose normalized frequency is >= this are dropped.
        """
        self._min_cut = min_cut
        self._max_cut = max_cut
        self._stopwords = set(stopwords.words('english') +
                              list(punctuation) +
                              [u"'s",'"'])

    @staticmethod
    def _tokenize(text):
        """Return `(sentences, word_sent)`: raw sentences and their lower-cased word lists."""
        sentences = sent_tokenize(text)
        word_sent = [word_tokenize(s.lower()) for s in sentences]
        return sentences, word_sent

    def _compute_frequencies(self,word_sent,customStopWords=None):
        """Return normalized frequencies of the non-stop-words in `word_sent`.

        Args:
            word_sent: iterable of sentences, each a list of word tokens.
            customStopWords: optional extra words to ignore in addition to
                the instance's stop-word set.

        Returns:
            defaultdict mapping word -> count / highest count, with words at
            or beyond either cut-off removed. Empty when no word is counted.
        """
        freq = defaultdict(int)
        # Named `ignored` so the imported `stopwords` module is not shadowed.
        if customStopWords is None:
            ignored = self._stopwords
        else:
            ignored = set(customStopWords).union(self._stopwords)
        for sentence in word_sent:
            for word in sentence:
                if word not in ignored:
                    freq[word] += 1
        if not freq:
            # Nothing to normalize; max() on an empty sequence would raise.
            return freq
        m = float(max(freq.values()))
        # Iterate over a snapshot of the keys because entries are deleted.
        for word in list(freq.keys()):
            freq[word] = freq[word]/m
            if freq[word] >= self._max_cut or freq[word] <= self._min_cut:
                del freq[word]
        return freq

    def extractFeatures(self,article,n,customStopWords=None):
        """Return the `n` highest-frequency words of `article`, best first.

        Args:
            article: `(text, title)` tuple.
            n: number of words to return; a negative value returns every
                word that survived the frequency cut-offs.
            customStopWords: optional extra words to ignore.
        """
        _, word_sent = self._tokenize(article[0])
        self._freq = self._compute_frequencies(word_sent,customStopWords)
        if n < 0:
            n = len(self._freq)
        return nlargest(n,self._freq,key=self._freq.get)

    def extractRawFrequencies(self, article):
        """Return a defaultdict of raw (unnormalized) counts of non-stop-words in `article`."""
        _, word_sent = self._tokenize(article[0])
        freq = defaultdict(int)
        for s in word_sent:
            for word in s:
                if word not in self._stopwords:
                    freq[word] += 1
        return freq

    def summarize(self, article,n):
        """Return the `n` sentences of `article` with the highest summed word frequency.

        Sentences are returned in ranking order (best first), not in
        document order. Sentences containing no scored word are never
        returned.
        """
        sentences, word_sent = self._tokenize(article[0])
        self._freq = self._compute_frequencies(word_sent)
        ranking = defaultdict(int)
        for i,sentence in enumerate(word_sent):
            for word in sentence:
                if word in self._freq:
                    ranking[i] += self._freq[word]
        sentences_index = nlargest(n,ranking,key=ranking.get)

        return [sentences[j] for j in sentences_index]


In [6]:
# Section index pages used as training sources: the technology sections
# supply the 'Tech' examples and the sports sections the 'Non-Tech' ones.
urlWashingtonPostNonTech = "https://www.washingtonpost.com/sports"
urlNewYorkTimesNonTech = "https://www.nytimes.com/pages/sports/index.html"
urlWashingtonPostTech = "https://www.washingtonpost.com/business/technology"
urlNewYorkTimesTech = "http://www.nytimes.com/pages/technology/index.html"

# Washington Post pages: article text sits inside <article> tags.
washingtonPostTechArticles = scrapeSource(
    urlWashingtonPostTech,
    magicFrag='2017',
    scraperFunction=getWashPostText,
    token='article',
)
washingtonPostNonTechArticles = scrapeSource(
    urlWashingtonPostNonTech,
    magicFrag='2017',
    scraperFunction=getWashPostText,
    token='article',
)

# New York Times pages: the scraper ignores the token, so None is passed.
newYorkTimesTechArticles = scrapeSource(
    urlNewYorkTimesTech,
    magicFrag='2017',
    scraperFunction=getNYTText,
    token=None,
)
newYorkTimesNonTechArticles = scrapeSource(
    urlNewYorkTimesNonTech,
    magicFrag='2017',
    scraperFunction=getNYTText,
    token=None,
)


https://www.washingtonpost.com/news/post-nation/wp/2017/08/31/david-a-clarke-jr-resigns-as-milwaukee-county-sheriff/?utm_term=.2493790a74c1
https://www.washingtonpost.com/news/post-nation/wp/2017/09/02/erratic-wildfire-near-los-angeles-burns-3000-acres-and-forces-hundreds-to-evacuate/?utm_term=.aa6fe826447e
https://www.washingtonpost.com/local/trumps-ms-13-crackdown-going-after-suspected-gang-members-for-immigration-violations/2017/08/31/8c467bcc-8bf7-11e7-8df5-c2e5cf46c1e2_story.html
https://www.washingtonpost.com/news/the-intersect/wp/2017/08/31/what-happened-when-jake-paul-youtubes-most-divisive-star-decided-to-rescue-texas-from-harvey/
https://www.washingtonpost.com/news/arts-and-entertainment/wp/2017/09/02/tyler-perry-gives-1-million-to-hurricane-harvey-relief-including-250000-to-joel-osteens-church/?utm_term=.058076350e9e
https://www.washingtonpost.com/news/football-insider/wp/2017/09/01/redskins-release-veteran-safety-will-blackmon-among-first-cuts/?utm_term=.672c67dfaf4e
https:

https://www.washingtonpost.com/news/sports/wp/2017/09/02/italian-tennis-player-suspended-from-u-s-open-for-calling-female-umpire-a-whore/
https://www.washingtonpost.com/news/sports/wp/2017/09/03/if-we-learned-anything-from-its-victory-over-fsu-its-that-alabama-is-still-as-good-as-it-gets/
https://www.washingtonpost.com/sports/tv-and-radio-listings-september-3/2017/09/03/959d6f10-9060-11e7-84c0-02cc069f2c37_story.html
https://www.washingtonpost.com/sports/orioles-fall-to-blue-jays-lose-ground-in-wild-card-chase/2017/09/02/105ef6c8-8f7c-11e7-91d5-ab4e4bb76a3a_story.html
https://www.washingtonpost.com/news/early-lead/wp/2017/09/02/blind-long-snapper-jake-olson-helps-usc-score-extra-point/
https://www.washingtonpost.com/news/early-lead/wp/2017/09/02/curt-schillings-mission-to-deliver-supplies-to-harvey-victims-briefly-derailed/
https://www.washingtonpost.com/sports/coco-vandeweghe-beats-agnieszka-radwanska-to-reach-fourth-round-at-us-open/2017/09/02/a59e523a-8f7a-11e7-91d5-ab4e4bb76a3a_sto

https://www.nytimes.com/2017/08/30/technology/doxxing-protests.html
https://www.nytimes.com/2017/08/30/us/politics/eric-schmidt-google-new-america.html
https://www.nytimes.com/2017/08/30/technology/amazon-alexa-microsoft-cortana.html
https://www.nytimes.com/2017/08/29/technology/uber-ceo-board.html
https://www.nytimes.com/2017/08/29/business/ford-driverless-pizza-delivery-dominos.html
https://www.nytimes.com/2017/08/29/dining/restaurant-reservation-apps.html
https://www.nytimes.com/2017/08/29/dining/opentable-restaurant-reservations.html
https://www.nytimes.com/2017/08/29/technology/personaltech/go-bold-or-italics-with-gmail-formatting-options.html
https://www.nytimes.com/2017/08/29/travel/travel-photography-tours.html
https://www.nytimes.com/2017/08/29/magazine/in-our-cynical-age-no-one-fails-anymore-everybody-pivots.html
https://www.nytimes.com/2017/08/28/business/dealbook/tim-cook-apple-moral-responsibility.html
https://www.nytimes.com/2017/08/28/technology/uber-new-ceo.html
https:/

In [7]:
# Build one labelled feature vector (the 25 most frequent words) for every
# scraped article that actually has text.
articleSummaries = {}
labelledSources = [
    ('Tech', [newYorkTimesTechArticles, washingtonPostTechArticles]),
    ('Non-Tech', [newYorkTimesNonTechArticles, washingtonPostNonTechArticles]),
]
for articleLabel, urlDictionaries in labelledSources:
    for urlDictionary in urlDictionaries:
        for articleUrl in urlDictionary:
            articleText = urlDictionary[articleUrl][0]
            # Skip failed scrapes (None) and pages with no body text.
            if articleText is None or len(articleText) == 0:
                continue
            fs = FrequencySummarizer()
            summary = fs.extractFeatures(urlDictionary[articleUrl],25)
            articleSummaries[articleUrl] = {'feature-vector': summary,
                                            'label': articleLabel}

In [8]:
def getDoxyDonkeyText(testUrl,token):
    """Download `testUrl` and return `(text, title)` for the posts on the page.

    Args:
        testUrl: address of the page to fetch.
        token: CSS class of the <div> elements that hold the post bodies.

    Returns:
        `(text, title)`: `text` is the concatenated text of every matching
        <div>; `title` is None when the page has no <title> element.
    """
    # Without a timeout a stalled server would block the notebook indefinitely.
    response = requests.get(testUrl, timeout=30)
    soup = BeautifulSoup(response.content, "html5lib")
    titleTag = soup.find("title")
    title = titleTag.text if titleTag is not None else None
    mydivs = soup.find_all("div", {"class":token})
    text = ''.join(div.text for div in mydivs)
    # The test instance is a (text, title) tuple, the same shape the
    # training-data scrapers return.
    return text,title

# The page to classify: fetch it, then reduce it to its 25 most frequent
# words, the same feature size used for the training articles.
testUrl = "http://doxydonkey.blogspot.in"
testArticle = getDoxyDonkeyText(testUrl, token="post-body")

fs = FrequencySummarizer()
testArticleSummary = fs.extractFeatures(article=testArticle, n=25)

In [9]:
# Similarity between the test article and a training article is the number
# of feature words they have in common.
testFeatures = set(testArticleSummary)
similarities = {
    articleUrl: len(testFeatures.intersection(set(entry['feature-vector'])))
    for articleUrl, entry in articleSummaries.items()
}

# The five most similar training articles each cast one vote for their label.
knn = nlargest(5, similarities, key=similarities.get)
labels = defaultdict(int)
for oneNeighbor in knn:
    neighborLabel = articleSummaries[oneNeighbor]['label']
    labels[neighborLabel] += 1

# Cell output: the label with the most votes.
nlargest(1, labels, key=labels.get)

['Tech']

In [10]:
# Sum the raw word counts of every training article, per label.
cumulativeRawFrequencies = {'Tech':defaultdict(int),'Non-Tech':defaultdict(int)}
trainingData = {'Tech':newYorkTimesTechArticles,'Non-Tech':newYorkTimesNonTechArticles}
# extractRawFrequencies keeps no per-article state, so one summarizer is
# enough; building one per article only rebuilds the stop-word set.
fs = FrequencySummarizer()
for label in trainingData:
    for articleUrl in trainingData[label]:
        article = trainingData[label][articleUrl]
        # Skip failed scrapes (text is None) as well as empty bodies;
        # len(None) would raise TypeError.
        if not article[0]:
            continue
        rawFrequencies = fs.extractRawFrequencies(article)
        for word in rawFrequencies:
            cumulativeRawFrequencies[label][word] += rawFrequencies[word]

In [11]:
# Naive-Bayes style score: multiply, for each feature word of the test
# article, that word's relative frequency within each class.
# Every factor for a known word is multiplied by 1e3, and a word unseen in
# a class divides that class's score by 1e3 instead.
scaleFactor = 1e3

# The class totals do not change inside the loop, so compute them once.
techTotal = float(sum(cumulativeRawFrequencies['Tech'].values()))
nontechTotal = float(sum(cumulativeRawFrequencies['Non-Tech'].values()))
if techTotal + nontechTotal == 0:
    raise ValueError("No training words collected; cannot compute class priors")

techiness = 1.0
nontechiness = 1.0
for word in testArticleSummary:
    if word in cumulativeRawFrequencies['Tech']:
        techiness *= scaleFactor*cumulativeRawFrequencies['Tech'][word] / techTotal
    else:
        techiness /= scaleFactor
    if word in cumulativeRawFrequencies['Non-Tech']:
        nontechiness *= scaleFactor*cumulativeRawFrequencies['Non-Tech'][word] / nontechTotal
    else:
        nontechiness /= scaleFactor

# Weight each score by its class prior, taken as the class's share of all
# counted training words.
techiness *= techTotal / (techTotal + nontechTotal)
nontechiness *= nontechTotal / (techTotal + nontechTotal)
if techiness > nontechiness:
    label = 'Tech'
else:
    label = 'Non-Tech'
print(label, techiness, nontechiness)


Tech 6.911242240936221e-07 7.237510698151429e-31
