In [None]:
import requests
import urllib2
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from collections import defaultdict
from string import punctuation
from heapq import nlargest
from math import log

In [None]:
def getWashPostText(url, token):
    """Download a Washington Post article and return ``(text, title)``.

    url   -- address of the article page.
    token -- name of the tag(s) between which the article body is located
             (the notebook passes 'article').

    Returns ``(None, None)`` when the page cannot be downloaded or decoded.
    Otherwise returns the concatenated paragraph text found inside the
    ``token`` tags (an empty string when nothing matches) and the page title
    (``None`` when the page has no <title> tag).
    """
    try:
        response = urllib2.urlopen(url)
        try:
            page = response.read().decode('utf8')
        finally:
            # Release the connection even when reading or decoding fails.
            response.close()
    except Exception:
        # Best effort: an unreachable or undecodable page is reported as "no
        # article".  Unlike a bare ``except:``, this lets KeyboardInterrupt
        # and SystemExit propagate.
        return (None, None)

    soup = BeautifulSoup(page, "lxml")

    # find_all() returns a (possibly empty) list, never None, so no None
    # checks are needed; an empty result simply yields empty text.
    text = ''.join(tag.text for tag in soup.find_all(token))
    # The extracted text is parsed a second time and only its <p> content is kept.
    # NOTE(review): presumably the parser wraps bare text in a <p>, which would
    # make this step close to a no-op -- confirm before simplifying.
    soup2 = BeautifulSoup(text, "lxml")
    text = ''.join(p.text for p in soup2.find_all('p'))

    # Pages without a <title> used to raise AttributeError here.
    title = soup.title.text if soup.title is not None else None
    return (text, title)

In [None]:
def getNYText(url, token):
    """Download a New York Times article and return ``(text, title)``.

    url   -- address of the article page.
    token -- accepted so the signature matches the other scraper functions;
             it is not used by this function.

    The title is an empty string when the page has no <title> tag.  Network
    errors raised by ``requests`` propagate to the caller.
    """
    response = requests.get(url)

    soup = BeautifulSoup(response.content, "lxml")
    # A page without a <title> used to raise AttributeError and lose the article.
    titleTag = soup.find('title')
    title = titleTag.text if titleTag is not None else ""
    # NOTE(review): the whole dict is passed as the tag-name filter instead of
    # ("p", attrs), so the class constraint is presumably not applied and every
    # <p> is matched -- confirm against the Beautiful Soup version in use
    # before changing the selector.
    mydivs = soup.findAll({"p": {"class":"story-body-text story-content"}})
    text = ''.join(p.text for p in mydivs)

    return text, title

In [None]:
def scrapeSource(url, magicFrag='2015', scraperFunction=getNYText, token='None'):
    """Scrape every article linked from the page at ``url``.

    url             -- page whose <a href> links are followed.
    magicFrag       -- only links containing this fragment are scraped;
                       ``None`` means every link is scraped.
    scraperFunction -- callable ``(link, token)`` returning ``(text, title)``.
    token           -- passed through unchanged to ``scraperFunction``.
                       NOTE(review): the default is the string 'None', not
                       the ``None`` object -- presumably a typo, kept for
                       backward compatibility.

    Returns a dict mapping each scraped link to its ``(text, title)`` tuple.
    Links whose scrape produced no text are left out.  Every attempted link is
    printed; links that raise are skipped and counted, and the count is
    reported once at the end.
    """
    urlBodies = {}
    response = requests.get(url).text
    soup = BeautifulSoup(response, "lxml")
    numErrors = 0

    for anchor in soup.findAll('a'):
        try:
            link = anchor['href']
            # Never scrape the same link twice.  The old `A and B or C` test
            # skipped this check whenever magicFrag was None.
            if link in urlBodies:
                continue
            if magicFrag is not None and magicFrag not in link:
                continue

            body = scraperFunction(link, token)
            # The scrapers return a 2-tuple, so len(body) > 0 was always true
            # and (None, None) failures were stored; test the text instead.
            if body and body[0]:
                urlBodies[link] = body
            print(link)
        except Exception:
            # Best effort: one bad link must not abort the whole crawl.
            numErrors += 1

    if numErrors:
        print("scrapeSource: skipped {} link(s) on {} because of errors".format(numErrors, url))
    return urlBodies

In [None]:
# Now we make the frequency summarizer class. Given an article as an (article_text, title)
# tuple -- the shape returned by the scraper functions in this notebook -- this class has
# easy methods to find the 'most important' sentences and words. (Most important is defined
# by the frequency of occurrence after excluding stopwords.)

class FrequencySummarizer(object):
    """Rank the words and sentences of an article by normalised word frequency.

    Every method that takes an ``article`` expects a ``(text, title)`` tuple;
    only the text is analysed.
    """

    def __init__(self, min_cut = 0, max_cut = 0.9):
        """Store the frequency cut-offs and build the stop-word set.

        Words whose normalised frequency is <= ``min_cut`` or >= ``max_cut``
        are dropped by ``_compute_frequencies``.
        """
        self._min_cut = min_cut
        self._max_cut = max_cut
        self._stopwords = set(stopwords.words('english') +
                              list(punctuation) +
                              [u"'s", '"'])

    def _tokenize(self, text):
        """Return ``(sentences, word_sent)``: the sentences of ``text`` and,
        for each sentence, its list of lower-cased word tokens."""
        sentences = sent_tokenize(text)
        word_sent = [word_tokenize(s.lower()) for s in sentences]
        return sentences, word_sent

    def _compute_frequencies(self, word_sent, customStopWords = None):
        """Return a dict mapping word -> frequency normalised by the top count.

        word_sent       -- list of sentences, each a list of word tokens.
        customStopWords -- optional extra words to ignore, in addition to the
                           built-in stop words.

        Words at or beyond the cut-offs are left out.  An input with no
        countable words yields an empty dict.
        """
        if customStopWords is None:
            ignored = self._stopwords
        else:
            ignored = set(customStopWords).union(self._stopwords)

        counts = defaultdict(int)
        for sent in word_sent:
            for word in sent:
                if word not in ignored:
                    counts[word] += 1

        freq = defaultdict(int)
        if not counts:
            # max() of an empty sequence would raise ValueError.
            return freq

        m = float(max(counts.values()))
        for word, count in counts.items():
            relative = count / m
            # Keep only words strictly between the cut-offs.  The previous
            # test joined the two bounds with `and`, which can never be true
            # for min_cut < max_cut, so nothing was ever filtered.  Building a
            # new dict also avoids deleting keys while iterating.
            if self._min_cut < relative < self._max_cut:
                freq[word] = relative
        return freq

    def extractFeatures(self, article, n, customStopWords = None):
        """Return the ``n`` most frequent words of the article text.

        A negative ``n`` returns every retained word, most frequent first.
        The frequency table is also stored on ``self._freq``.
        """
        text = article[0]
        _, word_sent = self._tokenize(text)
        self._freq = self._compute_frequencies(word_sent, customStopWords)
        if n < 0:
            n = len(self._freq)
        return nlargest(n, self._freq, key = self._freq.get)

    def extractRawFrequencies(self, article):
        """Return a dict of raw (un-normalised) counts of the non-stop-words
        in the article text."""
        # The text is element 0 of the tuple, as in extractFeatures; reading
        # element 1 counted the words of the title instead.
        text = article[0]
        _, word_sent = self._tokenize(text)
        freq = defaultdict(int)
        for s in word_sent:
            for word in s:
                if word not in self._stopwords:
                    freq[word] += 1
        return freq

    def summarize(self, article, n):
        """Return the ``n`` highest-scoring sentences of the article text.

        A sentence's score is the sum of the normalised frequencies of its
        words.  Sentences are returned in descending score order.
        """
        text = article[0]
        sentences, word_sent = self._tokenize(text)
        self._freq = self._compute_frequencies(word_sent)
        ranking = defaultdict(int)
        for i, sentence in enumerate(word_sent):
            for word in sentence:
                if word in self._freq:
                    ranking[i] += self._freq[word]
        sentences_index = nlargest(n, ranking, key = ranking.get)
        # Previously referenced the undefined names `sents` / `sents_index`.
        return [sentences[j] for j in sentences_index]

In [None]:
# Section front pages to crawl, grouped by newspaper.
urlWashingtonPostTech = "https://www.washingtonpost.com/business/technology"
urlWashingtonPostNonTech = "https://www.washingtonpost.com/sports"
urlNewYorkTimesTech = "https://www.nytimes.com/section/technology"
urlNewYorkTimesNonTech = "https://www.nytimes.com/section/sports"

# Follow only links containing '2017'.  Each call hits the network and
# returns a dict of link -> (text, title).
washingtonPostTechArticles = scrapeSource(
    urlWashingtonPostTech,
    magicFrag='2017',
    scraperFunction=getWashPostText,
    token='article')
washingtonPostNonTechArticles = scrapeSource(
    urlWashingtonPostNonTech,
    magicFrag='2017',
    scraperFunction=getWashPostText,
    token='article')
newYorkTimesTechArticles = scrapeSource(
    urlNewYorkTimesTech,
    magicFrag='2017',
    scraperFunction=getNYText,
    token=None)
newYorkTimesNonTechArticles = scrapeSource(
    urlNewYorkTimesNonTech,
    magicFrag='2017',
    scraperFunction=getNYText,
    token=None)

In [None]:
# Build the labelled training set: for every scraped article with text, keep
# its 25 most frequent words as the feature vector together with its label.
articleSummaries = {}
labelledSources = [
    ('Tech', [washingtonPostTechArticles, newYorkTimesTechArticles]),
    ('Non-Tech', [washingtonPostNonTechArticles, newYorkTimesNonTechArticles]),
]

for label, urlDictionaries in labelledSources:
    for urlDictionary in urlDictionaries:
        for articleUrl, article in urlDictionary.items():
            # Failed scrapes are stored as (None, None); calling len() on the
            # missing text raised TypeError, so test truthiness instead.
            if not article or not article[0]:
                continue
            try:
                fs = FrequencySummarizer()
                summary = fs.extractFeatures(article, 25)
                articleSummaries[articleUrl] = {
                    'feature-vector': summary,
                    'label': label
                }
            except Exception as e:
                # Report the URL; the previous message interpolated the whole
                # article text.
                print("Error while computing {}: {}".format(articleUrl, e))

In [None]:
# The training data is now set up. Next, we fetch the test instance.

def getDoxyDonkeyText(testUrl, token):
    """Download the page at ``testUrl`` and return ``(text, title)``.

    testUrl -- address of the page to fetch.
    token   -- CSS class of the <div> elements whose text is collected.

    The result has the same ``(text, title)`` shape as the training data.
    The title is an empty string when the page has no <title> tag.
    """
    response = requests.get(testUrl)
    # Name the parser explicitly, as every other BeautifulSoup call in this
    # notebook does, instead of relying on whichever parser gets picked.
    soup = BeautifulSoup(response.content, "lxml")
    titleTag = soup.find('title')
    title = titleTag.text if titleTag is not None else ""
    mydivs = soup.findAll("div", {"class": token})
    text = ''.join(div.text for div in mydivs)
    return text, title

# Fetch the blog front page; post text lives in <div class="post-body"> elements.
testUrl = "http://doxydonkey.blogspot.in"
testArticle = getDoxyDonkeyText(testUrl=testUrl, token="post-body")

In [None]:
# Feature vector of the test article: its 25 most frequent words.
fs = FrequencySummarizer()
testArticleSummary = fs.extractFeatures(testArticle, n=25)

In [None]:
# Now we begin the actual classification. First, we'll use KNN:
# we find the 5 nearest (most similar) articles and then take
# a majority vote of their labels.

def doKNN(testArticleSummary, articleSummaries, k=5):
    """Classify an article by majority vote of its ``k`` most similar neighbours.

    testArticleSummary -- iterable of feature words for the article to classify.
    articleSummaries   -- dict of url -> {'feature-vector': [...], 'label': ...}.
    k                  -- number of neighbours that vote (default 5).

    Similarity is the number of feature words two articles share.  Returns the
    winning label, or ``None`` when there is nothing to vote with (empty
    ``articleSummaries`` or ``k`` <= 0).
    """
    # Build the test word set once instead of once per training article.
    testWords = set(testArticleSummary)

    similarities = {}
    for articleUrl in articleSummaries:
        featureVector = articleSummaries[articleUrl]['feature-vector']
        similarities[articleUrl] = len(testWords.intersection(featureVector))

    labels = defaultdict(int)
    for oneNeighbour in nlargest(k, similarities, key=similarities.get):
        labels[articleSummaries[oneNeighbour]['label']] += 1

    if not labels:
        # No neighbours voted; indexing the empty result raised IndexError.
        return None
    return nlargest(1, labels, key=labels.get)[0]

In [None]:
# Now, we'll do classification with a Naive Bayes classifier

# Per-label training articles.  Only the New York Times dictionaries are used here.
trainingData = {
    'Tech': newYorkTimesTechArticles,
    'Non-Tech': newYorkTimesNonTechArticles
}

# label -> word -> total raw count over all training articles with that label.
cumulativeRawFrequencies = dict(
    (label, defaultdict(int)) for label in ('Tech', 'Non-Tech'))

for label, articles in trainingData.items():
    labelCounts = cumulativeRawFrequencies[label]
    for article in articles.values():
        # Skip articles whose first tuple element is empty.
        if len(article[0]) == 0:
            continue
        fs = FrequencySummarizer()
        for word, count in fs.extractRawFrequencies(article).items():
            labelCounts[word] += count

In [None]:
def doNaiveBayes(testArticleSummary, cumulativeRawFrequencies):
    """Classify an article as 'Tech' or 'Non-Tech' with a Naive Bayes score.

    testArticleSummary       -- iterable of feature words of the article.
    cumulativeRawFrequencies -- {'Tech': {word: count}, 'Non-Tech': {word: count}}
                                raw word counts per label.

    For each label the score is the log of the class prior (that label's share
    of all counted words) plus, for every feature word, the log of its scaled
    relative frequency under the label.  A word never seen for a label is
    penalised by the same scale factor.  Returns 'Tech' only when its score is
    strictly greater; ties, and the case of no training data, give 'Non-Tech'.
    """
    # Seen words contribute scale * p(word | label); unseen words contribute
    # 1 / scale.  Both labels are treated identically: previously only 'Tech'
    # was scaled, and unseen words *multiplied* the score by 1e3.
    scale = 1e3
    labelNames = ('Tech', 'Non-Tech')

    # Sum each label's counts once instead of once per feature word.
    totals = {}
    for name in labelNames:
        totals[name] = float(sum(cumulativeRawFrequencies[name].values()))
    grandTotal = totals['Tech'] + totals['Non-Tech']

    scores = {}
    for name in labelNames:
        if totals[name] <= 0:
            # A label with no training words has zero prior probability; this
            # also avoids dividing by zero when there is no training data.
            scores[name] = float('-inf')
            continue
        counts = cumulativeRawFrequencies[name]
        # Work in log space so long feature lists cannot under- or overflow.
        score = log(totals[name] / grandTotal)
        for word in testArticleSummary:
            # .get() avoids inserting missing words into a defaultdict.
            count = counts.get(word, 0)
            if count > 0:
                score += log(scale * count / totals[name])
            else:
                score -= log(scale)
        scores[name] = score

    if scores['Tech'] > scores['Non-Tech']:
        label = 'Tech'
    else:
        label = 'Non-Tech'
    return label

In [19]:
# Show the label predicted by each classifier for the test article.
knnLabel = doKNN(testArticleSummary, articleSummaries)
print(knnLabel)
naiveBayesLabel = doNaiveBayes(testArticleSummary, cumulativeRawFrequencies)
print(naiveBayesLabel)

Tech
Tech
