In [35]:
import requests
import urllib.request
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from collections import defaultdict
from string import punctuation
from heapq import nlargest
#from math import log
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

In [25]:
class FrequencySummarizer:
    def __init__(self,min_cut=0.1,max_cut=0.9):
        self._min_cut = min_cut
        self._max_cut = max_cut
        self._stopwords = set(stopwords.words('english') +
                              list(punctuation) +
                              [u"'s",'"'])
    
    def _compute_frequencies(self,word_sent,customStopWords=None):
        freq = defaultdict(int)
        if customStopWords is None:
            stopwords = set(self._stopwords)
        else:
            stopwords = set(customStopWords).union(self._stopwords)
        for sentence in word_sent:
            for word in sentence:
                if word not in stopwords:
                    freq[word] += 1
        m = float(max(freq.values()))
        for word in list(freq.keys()):
            freq[word] = freq[word]/m
            if freq[word] >= self._max_cut or freq[word] <= self._min_cut:
                del freq[word]
        return freq
    
    def extractFeatures(self,article,n,customStopWords=None):
        text = article[0]
        title = article[1]
        sentences = sent_tokenize(text)
        word_sent = [word_tokenize(s.lower()) for s in sentences]
        self._freq = self._compute_frequencies(word_sent,customStopWords)
        if n < 0:
            return nlargest(len(self._freq_keys()),self._freq,key=self._freq.get)
        else:
            return nlargest(n,self._freq,key=self._freq.get)
    
    def extractRawFrequencies(self, article):
        text = article[0]
        title = article[1]
        sentences = sent_tokenize(text)
        word_sent = [word_tokenize(s.lower()) for s in sentences]
        freq = defaultdict(int)
        for s in word_sent:
            for word in s:
                if word not in self._stopwords:
                    freq[word] += 1
        return freq
    
    def summarize(self, article,n):
        text = article[0]
        title = article[1]
        sentences = sent_tokenize(text)
        word_sent = [word_tokenize(s.lower()) for s in sentences]
        self._freq = self._compute_frequencies(word_sent)
        ranking = defaultdict(int)
        for i,sentence in enumerate(word_sent):
            for word in sentence:
                if word in self._freq:
                    ranking[i] += self._freq[word]
        sentences_index = nlargest(n,ranking,key=ranking.get)

        return [sentences[j] for j in sentences_index]

In [20]:
def getDoxyDonkeyText(testUrl,token):
    response = requests.get(testUrl)
    soup = BeautifulSoup(response.content, "html5lib")
    page = str(soup)
    title = soup.find("title").text
    mydivs = soup.findAll("div", {"class":token})
    text = ''.join(map(lambda p:p.text,mydivs))
    return text,title

def getAllDoxyDonkeyPosts(url,links):
    request = urllib.request.Request(url)
    response = urllib.request.urlopen(request)
    soup = BeautifulSoup(response, "html5lib")
    for a in soup.findAll('a'):
        try:
            url = a['href']
            title = a['title']
            if title == "Older Posts":
                print(title, url)
                links.append(url)
                getAllDoxyDonkeyPosts(url,links)
        except:
            title = ""
    return

In [21]:
blogUrl = "http://doxydonkey.blogspot.in"
links = []
getAllDoxyDonkeyPosts(blogUrl,links)
doxyDonkeyPosts = {}
for link in links:
    doxyDonkeyPosts[link] = getDoxyDonkeyText(link,'post-body')


documentCorpus = []
for onePost in doxyDonkeyPosts.values():
    documentCorpus.append(onePost[0])


Older Posts http://doxydonkey.blogspot.sg/search?updated-max=2017-05-23T19:53:00-07:00&max-results=7
Older Posts http://doxydonkey.blogspot.sg/search?updated-max=2017-05-14T19:02:00-07:00&max-results=7&start=7&by-date=false
Older Posts http://doxydonkey.blogspot.sg/search?updated-max=2017-05-02T19:43:00-07:00&max-results=7&start=14&by-date=false
Older Posts http://doxydonkey.blogspot.sg/search?updated-max=2017-04-17T19:26:00-07:00&max-results=7&start=21&by-date=false
Older Posts http://doxydonkey.blogspot.sg/search?updated-max=2017-04-10T18:56:00-07:00&max-results=7&start=28&by-date=false
Older Posts http://doxydonkey.blogspot.sg/search?updated-max=2017-03-30T19:57:00-07:00&max-results=7&start=35&by-date=false
Older Posts http://doxydonkey.blogspot.sg/search?updated-max=2017-03-20T19:47:00-07:00&max-results=7&start=42&by-date=false
Older Posts http://doxydonkey.blogspot.sg/search?updated-max=2017-03-02T17:42:00-08:00&max-results=7&start=49&by-date=false
Older Posts http://doxydonkey.bl

Older Posts http://doxydonkey.blogspot.sg/search?updated-max=2015-04-23T20:19:00-07:00&max-results=7&start=462&by-date=false
Older Posts http://doxydonkey.blogspot.sg/search?updated-max=2015-04-14T19:40:00-07:00&max-results=7&start=469&by-date=false
Older Posts http://doxydonkey.blogspot.sg/search?updated-max=2015-04-05T20:22:00-07:00&max-results=7&start=476&by-date=false
Older Posts http://doxydonkey.blogspot.sg/search?updated-max=2015-03-24T20:12:00-07:00&max-results=7&start=483&by-date=false
Older Posts http://doxydonkey.blogspot.sg/search?updated-max=2015-03-15T20:41:00-07:00&max-results=7&start=490&by-date=false
Older Posts http://doxydonkey.blogspot.sg/search?updated-max=2015-03-03T19:30:00-08:00&max-results=7&start=497&by-date=false
Older Posts http://doxydonkey.blogspot.sg/search?updated-max=2015-02-22T19:55:00-08:00&max-results=7&start=504&by-date=false
Older Posts http://doxydonkey.blogspot.sg/search?updated-max=2015-02-11T20:02:00-08:00&max-results=7&start=511&by-date=false


In [40]:
vectorizer = TfidfVectorizer(max_df=0.5,min_df=2,stop_words='english')
X = vectorizer.fit_transform(documentCorpus)
km = KMeans(n_clusters = 5, init = 'k-means++', max_iter = 100, n_init = 1, verbose = True)
km.fit(X)

keywords = {}
for i,cluster in enumerate(km.labels_):
    oneDocument = documentCorpus[i]
    fs = FrequencySummarizer()
    summary = fs.extractFeatures((oneDocument,""),
                                100,
                                [u"according",u"also",u"billion",u"like",u"new", u"one",u"year",u"first",u"last"])
    if cluster not in keywords:
        keywords[cluster] = set(summary)
    else:
        keywords[cluster] = keywords[cluster].intersection(set(summary))

for cluster in keywords.keys():
    print ("Cluster", cluster, "\nKeywords",keywords[cluster])

Initialization complete
Iteration  0, inertia 137.600
Iteration  1, inertia 71.707
Converged at iteration 1: center shift 0.000000e+00 within tolerance 7.273563e-09
Cluster 0 
Keywords {'company', '”', 'business', 'million', 'percent', '“', 'said'}
Cluster 1 
Keywords {'would', 'said', 'million'}
Cluster 2 
Keywords {'“', 'company', '”', 'said', 'million'}
Cluster 3 
Keywords {'percent', 'said'}
Cluster 4 
Keywords {'percent', '“', 'company', '”', 'said'}
