# Overview
* Feature extraction using the bag of words model
* Use K-Means clustering to identify a set of topics
* Implement silhouette analysis to select number of clusters [link](https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html)
* Using the K-Nearest neighbors model for classifying text into those topics

In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from collections import defaultdict
from string import punctuation
from heapq import nlargest
import nltk

In [2]:
page = requests.get("https://www.kdnuggets.com/2019/index.html")
soup = BeautifulSoup(page.content, "lxml")
soup.findAll('a')

[<a href="/"></a>,
 <a href="/news/subscribe.html" target="_blank"><b>Subscribe to KDnuggets News</b></a>,
 <a href="https://twitter.com/kdnuggets" target="_blank"><img alt="Twitter" height="48" src="/images/tw_c48.png" style="vertical-align: bottom" width="48"/></a>,
 <a href="https://www.facebook.com/kdnuggets" target="_blank"><img alt="Facebook" height="48" src="/images/fb_c48.png" style="vertical-align: bottom" width="48"/></a>,
 <a href="https://www.linkedin.com/groups/54257/" target="_blank"><img alt="LinkedIn" height="48" src="/images/in_c48.png" style="vertical-align: bottom" width="48"/></a>,
 <a href="/contact.html"><b>Contact</b></a>,
 <a href="/"></a>,
 <a href="/software/index.html" title="Data Science Software">SOFTWARE</a>,
 <a href="/news/index.html" title="News">News/Blog</a>,
 <a href="/news/top-stories.html">Top stories</a>,
 <a href="https://www.kdnuggets.com/opinions/index.html" title="Opinions">Opinions</a>,
 <a href="https://www.kdnuggets.com/tutorials/index.html

In [7]:
 list_of_url = ["https://www.kdnuggets.com/2019/05/index.html",
                "https://www.kdnuggets.com/2019/04/index.html",
                "https://www.kdnuggets.com/2019/03/index.html",
                "https://www.kdnuggets.com/2019/02/index.html",
                "https://www.kdnuggets.com/2019/01/index.html"]

In [81]:
def getArticleText(list_of_url):
    links = []
    posts = []
    # Create a list of links to all articles
    for link in list_of_url:
        req = requests.get("https://www.kdnuggets.com/2019/05/index.html")
        soup = BeautifulSoup(req.content)
        bullets = soup.findAll("ul", {"class": "three_ul"})
        for bullet in bullets:
            for b in bullet.findAll("li"):
                links.append((b.find('a', href=True))['href'])
    # Create a list of the body text from all articles
    for article in links:
        page = requests.get(article)
        art_soup = BeautifulSoup(page.content)
        s = ' '.join(map(lambda p: p.text, art_soup.find_all(id="post-"))).replace('\n', '')
        s = s.replace('\xa0', '')
        posts+=[s]
        
    return posts
    # Load results into a dataframe
    #df = pd.DataFrame({'Article Text':posts})
    #pd.options.display.max_colwidth = 100
    #return df

doc = getArticleText(list_of_url)

In [146]:
def getCluster(doc, n, url_for_pred):
    vectorizer = TfidfVectorizer(max_df=0.5, min_df=2, stop_words='english')
    X = vectorizer.fit_transform(doc)
    km = KMeans(n_clusters=n, init='k-means++', max_iter=100, n_init=1, verbose =True)
    km.fit(X)
    print(np.unique(km.labels_, return_counts=True))
    text={}
    for i, cluster in enumerate(km.labels_):
        oneDocument = doc[i]
        if cluster not in text.keys():
            text[cluster] = oneDocument
        else:
            text[cluster] += oneDocument
    _stopwords = set(stopwords.words('english') 
                     + list(punctuation) 
                     + ['“','”', "2019", "2019.", "word", "words", '--', "'s"] 
                     + [str(x) for x in range(10)])
    keywords ={}
    counts={}
    for cluster in range(n):
        word_sent = word_tokenize(text[cluster].lower())
        word_sent = [word for word in word_sent if word not in _stopwords]
        freq = FreqDist(word_sent)
        # Pick top 100 words
        keywords[cluster] = nlargest(100, freq, key=freq.get)
        counts[cluster]=freq
    unique_keys = {}
    for cluster in range(n):
        other_clusters= list(set(range(n)) - set([cluster]))
        keys_other_clusters=set(keywords[other_clusters[0]]).union(set(keywords[other_clusters[1]]))
        # Find the list of words present in other clusters and remove them from this cluster
        unique=set(keywords[cluster])-keys_other_clusters
        # Pick the top 10 words unique to this cluster
        unique_keys[cluster] = nlargest(10, unique, key=counts[cluster].get)
    print(unique_keys)
    # Load text that we want to predict
    page = requests.get(url_for_pred)
    soup = BeautifulSoup(page.content)
    s = ' '.join(map(lambda p: p.text, soup.find_all(id="post-"))).replace('\n', '')
    s = s.replace('\xa0', '')
    classifier = KNeighborsClassifier(n_neighbors=20)
    classifier.fit(X, km.labels_)
    test = vectorizer.transform([s])
    return classifier.predict(test)

In [147]:
getCluster(doc, 4, "https://www.kdnuggets.com/2018/12/solve-image-classification-problem-quickly-easily.html")

Initialization complete
Iteration  0, inertia 759.358
Iteration  1, inertia 387.856
Iteration  2, inertia 387.502
Converged at iteration 2: center shift 0.000000e+00 within tolerance 9.262319e-09
(array([0, 1, 2, 3]), array([105, 110,  50, 165], dtype=int64))
{0: ['training', 'test', 'images', 'dataset', 'neural', 'network', 'vision', 'image', 'labels', 'train'], 1: ['customer', 'ai', 'churn', 'customers', 'algorithms', 'business', 'decision', 'tensorflow', 'systems', 'companies'], 2: ['df', 'text', 'name', 'import', 'sentiment', 'tweets', 'tweet', '1254', 'animation', 'key'], 3: ['program', 'scientist', 'job', 'graph', 'probability', 'visualization', 'sql', 'cs', 'skills', 'rank']}


array([0])

# Summarize an article

In [4]:
# Encapsulating the parsing logic into a function
def getText(url):
    # download page
    page = requests.get(url)
    # Instantiate soup object
    soup = BeautifulSoup(page.content)
    # Find all elements with a article tag, extract the text and join into one single string
    text = ' '.join(map(lambda p: p.text, soup.find_all(id="post-"))).replace('\n', '')
    # Remove all \xa0
    text = text.replace('\xa0', ' ')
    return text

In [5]:
url = "https://www.kdnuggets.com/2019/05/lady-tasting-tea-science.html"
text = getText(url)

In [6]:
# Encapsulate summary logic into function
def summarize(text, n):
    '''
    This function takes a string of text and the desired number of sentences in the ouput summary as inputs.
    '''
    # Get list of individual sentences in the text
    sents = sent_tokenize(text)
    
    # The full text must be longer than the desired summary
    assert n <= len(sents)
    # Get list of individual words in the text
    word_sent = word_tokenize(text.lower())
    # Create a set of stopwords
    _stopwords = set(stopwords.words('english') + list(punctuation) + ['“','”'])
    
    # Remove stopwords from our text
    word_sent = [word for word in word_sent if word not in _stopwords]
    # Construct a frequency distribution of words
    freq = FreqDist(word_sent)
    
    # Compute the significance score of each sentence by adding up the word frequencies. Add each ranking to a dictionary for lookup.
    ranking = defaultdict(int)
    for i, sent in enumerate(sents):
        for w in word_tokenize(sent.lower()):
            if w in freq:
                ranking[i] += freq[w]
    
    #  Pick most important sentences based on signifiance score
    sents_idx = nlargest(n, ranking, key=ranking.get)
    # Reorder sentences based on order in original text
    return " ".join([sents[j] for j in sorted(sents_idx)])

In [8]:
summarize(text, 4)

'Less than a century old, DOE has made possible scientific advances and is a standard part of not only scientific experiments but experiments conducted out in the world: surveys, marketing studies, credit decisions are all conducted (or should be) using basic experimental design principles.Yet not many people know that a experiment arising from a casual conversation at an English university about tea-drinking is one of the first examples of an experiment designed using statistical ideas, by a geneticist named Ronald Aylmer (R.A.) Fisher.The Lady Tasting TeaBy the 1900’s, scientific experiments had been conducted for hundreds of years. Fisher and others scoffed at this and a colleague, William Roach, suggested a test.Fisher then quickly constructed a test, presenting Ms. Bristol with 8 cups of tea, 4 of which had milk poured in first, and 4 of which had milked added after the tea, but which otherwise were the same in terms of appearance, temperature, etc. In our tea example, Fisher used