In [129]:
# article retrieval
import requests
from bs4 import BeautifulSoup

In [130]:
home_url = 'https://arstechnica.com/'
# page_text = requests.get(pageUrl).text
# parsed_page = BeautifulSoup(page_text, 'lxml')

In [131]:
# TODO: use scrapy instead
def retrieve_parsed_page(url):
    page_text = requests.get(url).text
    return BeautifulSoup(page_text, 'lxml')

def get_article_links(parsed_page):
    anchor_list = parsed_page.select('h2 a')
    return [a['href'] for a in anchor_list]

def get_article_text(parsed_page):
    page_paragraphs = parsed_page.find('section', {'class': 'article-guts'}).findAll('p')
    paragraph_text = [paragraph.text for paragraph in page_paragraphs]
    return '\n'.join(paragraph_text)

In [132]:
parsed_homepage = retrieve_parsed_page(home_url)
article_urls = get_article_links(parsed_homepage)

article_list = []
for url in article_urls:
    print(f'retrieving article: {url}')
    parsed_page = retrieve_parsed_page(url)
    article_text = get_article_text(parsed_page)
    article_list.append(article_text)
print(f'done parsing {len(article_list)} articles')

retrieving article: https://arstechnica.com/gadgets/2017/12/guidemaster-everything-amazons-alexa-can-do-plus-the-best-skills-to-enable/
retrieving article: https://arstechnica.com/gaming/2017/12/ars-technicas-best-video-games-of-2017/
retrieving article: https://arstechnica.com/science/2017/12/9-himalayan-yeti-samples-turn-out-to-be-bears-dog/
retrieving article: https://arstechnica.com/tech-policy/2017/12/library-of-congress-to-get-selective-about-the-tweets-it-keeps/
retrieving article: https://arstechnica.com/information-technology/2017/12/mozilla-squashes-critical-thunderbird-bug/
retrieving article: https://arstechnica.com/gaming/2017/12/big-titles-must-wait-as-nintendo-pushes-back-64gb-switch-game-card-rollout/
retrieving article: https://arstechnica.com/tech-policy/2017/12/these-experts-figured-out-why-so-many-bogus-patents-get-approved/
retrieving article: https://arstechnica.com/gadgets/2017/12/whatsapp-to-drop-blackberry-windows-phone-8-0-support-after-new-years-eve/
retrievi

In [133]:
# TF-IDF vectorization

In [191]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer_instance = TfidfVectorizer(
    max_df=0.65, # exclude words that appear in more than 50% of the corpus
    min_df=2, # exclude words that appear less than 2 times total. (cut )
    stop_words='english', 
    strip_accents='unicode'
)

# from sklearn.feature_extraction.text import CountVectorizer
# count_v = CountVectorizer()
# print(count_v.fit_transform(article_list))

In [192]:
X = vectorizer_instance.fit_transform(article_list)

In [193]:
# print(X[0].toarray())
X

<31x1619 sparse matrix of type '<class 'numpy.float64'>'
	with 5482 stored elements in Compressed Sparse Row format>

In [194]:
# Clustering

In [195]:
from sklearn.cluster import KMeans
import numpy as np

In [196]:
km_fitter = KMeans(
    n_clusters = 3,
    init='k-means++',
    max_iter=100,
    n_init=1,
    verbose=True
)

In [197]:
km_fitter.fit(X)

Initialization complete
Iteration  0, inertia 48.582
Iteration  1, inertia 25.441
Converged at iteration 1: center shift 0.000000e+00 within tolerance 5.606854e-08


KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
    n_clusters=3, n_init=1, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=True)

In [198]:
print(np.unique(km_fitter.labels_, return_counts=True))

(array([0, 1, 2], dtype=int32), array([ 6, 19,  6]))


In [211]:
grouped_articles = {}
for group, article in zip(km_fitter.labels_, article_list):
    try:
         grouped_articles[group].append(article)
    except KeyError:
        grouped_articles[group] = [article]

In [200]:
# cluster analysis

In [239]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation
from nltk.probability import FreqDist
from heapq import nlargest

In [293]:
stopword_list = stopwords.words('english') + list(punctuation) + ['’', '“', '”', '\'s']
newline_char = '\n'

def tokenize_words(text):
    tokenized_words = word_tokenize(text.lower())
    return [word for word in tokenized_words if word not in stopword_list]

def top_occurring_words(word_list, top_n=10):
    word_frequency = FreqDist(word_list)
    top_words = nlargest(top_n, word_frequency, key=word_frequency.get)
    return [(word, word_frequency[word]) for word in top_words]

def get_exclusion_set(group_name, grouped_articles):
    other_articles = '\n'.join(
        '\n'.join(articles)
        for group, articles
        in grouped_articles.items()
        if group != group_name
    )
    return set(tokenize_words(other_articles))

def get_word_list(articles, group_name, grouped_articles):
    merged_list = '\n'.join(article_list)
    tokenized_words = tokenize_words(merged_list)
    exclusion_set = get_exclusion_set(group_name, grouped_articles)
    return [
        word
        for word
        in tokenized_words
        if word not in exclusion_set
    ]


def parse_article_list(articles, group_name, grouped_articles, top_n=40):
    word_list = get_word_list(articles, group_name, grouped_articles)
    return top_occurring_words(word_list, top_n)

    
def display_top_unique_words_by_group(grouped_articles, n_words):
    """
    Display the n most used words unique to each group (i.e. not used in any other group)
    """
    merged_articles = {
        group: parse_article_list(articles, group, grouped_articles, n_words)
        for group, articles 
        in grouped_articles.items()
    }
    
    for group, word_list in merged_articles.items():
        tuple_joiner = lambda tup: f'{tup[0]} - {tup[1]}'
        print('{group}: {word}\n'.format(
            group=group,
            word=' | '.join((tuple_joiner(word) for word in word_list))
        ))

In [294]:
display_top_unique_words_by_group(grouped_articles, 10)

1: alexa - 50 | apple - 37 | patent - 32 | waymo - 32 | cars - 28 | 230 - 24 | driverless - 23 | iphone - 18 | id - 18 | patents - 16

0: vr - 24 | lunar - 24 | module - 18 | destiny - 12 | nintendo - 12 | bungie - 11 | o'donnell - 9 | orbit - 9 | storage - 7 | htc - 7

2: sous-vide - 32 | water - 27 | food - 25 | mellow - 18 | mantle - 18 | rock - 13 | fda - 13 | mars - 12 | beef - 11 | eating - 11

