In [401]:
# article retrieval
import requests
from bs4 import BeautifulSoup

In [402]:
url_list = [
    'https://arstechnica.com/',
    'https://arstechnica.com/page/2/',
    'https://arstechnica.com/page/3/',
    'https://arstechnica.com/page/4/',
    'https://arstechnica.com/page/5/',
]

In [403]:
# TODO: use scrapy instead
def retrieve_parsed_page(url):
    page_text = requests.get(url).text
    return BeautifulSoup(page_text, 'lxml')

def get_article_links(parsed_page):
    anchor_list = parsed_page.select('h2 a')
    return [a['href'] for a in anchor_list]

def get_article_text(parsed_page):
    page_paragraphs = parsed_page.find('section', {'class': 'article-guts'}).findAll('p')
    paragraph_text = [paragraph.text for paragraph in page_paragraphs]
    return '\n'.join(paragraph_text)

In [407]:
page_bodies = (retrieve_parsed_page(url) for url in url_list)
article_urls = []
for body in page_bodies:
    article_urls += get_article_links(body)

print(f'starting parse of {len(article_urls)} articles')

article_list = []
for url in article_urls:
    print(f'retrieving article: {url}')
    parsed_page = retrieve_parsed_page(url)
    article_text = get_article_text(parsed_page)
    article_list.append(article_text)
print(f'done parsing {len(article_list)} articles')

starting parse of 155
retrieving article: https://arstechnica.com/gadgets/2017/12/guidemaster-everything-amazons-alexa-can-do-plus-the-best-skills-to-enable/
retrieving article: https://arstechnica.com/gaming/2017/12/ars-technicas-best-video-games-of-2017/
retrieving article: https://arstechnica.com/gaming/2017/12/hello-neighbor-review-an-all-around-bad-time-in-surreal-suburbia/
retrieving article: https://arstechnica.com/cars/2017/12/elon-musk-promises-rain-sensing-wipers-tesla-pickup-truck/
retrieving article: https://arstechnica.com/science/2017/12/9-himalayan-yeti-samples-turn-out-to-be-bears-dog/
retrieving article: https://arstechnica.com/tech-policy/2017/12/library-of-congress-to-get-selective-about-the-tweets-it-keeps/
retrieving article: https://arstechnica.com/information-technology/2017/12/mozilla-squashes-critical-thunderbird-bug/
retrieving article: https://arstechnica.com/gaming/2017/12/big-titles-must-wait-as-nintendo-pushes-back-64gb-switch-game-card-rollout/
retrieving

retrieving article: https://arstechnica.com/cars/2017/12/driverless-car-from-gms-cruise-and-motorcycle-collide-in-san-francisco/
retrieving article: https://arstechnica.com/tech-policy/2017/12/north-korea-suspected-in-latest-bitcoin-heist-bankrupting-youbit-exchange/
retrieving article: https://arstechnica.com/gadgets/2017/12/neither-microsoft-nor-google-look-good-in-this-chrome-installer-squabble/
retrieving article: https://arstechnica.com/science/2017/12/video-astronaut-scott-kelly-teaches-orbital-mechanics-with-kerbal-space-program/
retrieving article: https://arstechnica.com/science/2017/12/interstellar-visitor-might-be-a-comet-covered-in-carbonaceous-crud/
retrieving article: https://arstechnica.com/gaming/2017/12/magic-leap-finally-announces-a-headset-but-its-vague-rendered-in-photoshop/
retrieving article: https://arstechnica.com/tech-policy/2017/12/new-york-tried-end-run-around-fcc-preemption-with-net-neutrality-law/
retrieving article: https://arstechnica.com/gadgets/2017/12/

retrieving article: https://arstechnica.com/gadgets/2017/12/youtube-launched-a-vr-app-on-steam-but-its-broken/
retrieving article: https://arstechnica.com/tech-policy/2017/12/feds-charge-new-york-woman-with-sending-bitcoins-to-support-isis/
retrieving article: https://arstechnica.com/gaming/2017/12/sorry-playerunknown-you-probably-cant-stop-battlegrounds-copycats/
retrieving article: https://arstechnica.com/science/2017/12/who-report-data-on-marijuana-compound-does-not-justify-dea-scheduling/
retrieving article: https://arstechnica.com/gadgets/2017/12/alphabet-wants-to-deliver-internet-access-via-laser-beams/
retrieving article: https://arstechnica.com/gaming/2017/12/a-toy-story-for-nerds-like-us-netflixs-the-toys-that-made-us-doc-drops-next-week/
retrieving article: https://arstechnica.com/gaming/2017/12/its-happening-aliens-are-shooting-up-human-space-stations-in-elite-dangerous/
retrieving article: https://arstechnica.com/science/2017/12/apollo-triumph/
retrieving article: https://a

In [408]:
# TF-IDF vectorization

In [463]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer_instance = TfidfVectorizer(
    max_df=0.5, # exclude words that appear in more than 50% of the corpus
    min_df=3, # exclude words that appear less than 2 times total. (cut)
    stop_words='english', 
    strip_accents='unicode'
)

# from sklearn.feature_extraction.text import CountVectorizer
# count_v = CountVectorizer()
# print(count_v.fit_transform(article_list))

In [464]:
X = vectorizer_instance.fit_transform(article_list)

In [465]:
# print(X[0].toarray())
X

<155x3578 sparse matrix of type '<class 'numpy.float64'>'
	with 30328 stored elements in Compressed Sparse Row format>

In [466]:
# Clustering

In [467]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
import numpy as np

In [475]:
# use Silhouette score to find a good value for number of clusters
cluster_scores = {}
for n_cluster in range(2,20):
    km_fitter = KMeans(
        n_clusters = n_cluster,
        init='k-means++',
        max_iter=20,
        n_init=1,
        verbose=False
    )
    km_fitter.fit(X)
    cluster_scores[n_cluster] = silhouette_score(X, km_fitter.labels_)

In [476]:
sorted_cluster_scores = sorted(cluster_scores, key=cluster_scores.get, reverse=True)
print(f'sorted values: { {key: cluster_scores[key] for key in sorted_cluster_scores} }\n\nbest value: {sorted_cluster_scores[0]}')

sorted values: {15: 0.072699075364547366, 19: 0.072360495579434067, 13: 0.069781529955573018, 12: 0.068921836814582971, 18: 0.067196668687603947, 17: 0.065416735021016109, 14: 0.062763902475867453, 6: 0.05605791697209362, 9: 0.055569852668344175, 16: 0.05496508654556239, 11: 0.050521712525642394, 10: 0.046868183415826291, 8: 0.041997257067273482, 7: 0.037064864842116768, 5: 0.033822887501055714, 4: 0.032801365854935302, 3: 0.025582054748249736, 2: 0.013865257179117955}

best value: 15


In [477]:
km_fitter = KMeans(
    n_clusters = sorted_cluster_scores[0],
    init='k-means++',
    max_iter=100,
    n_init=1,
    verbose=True
)

In [478]:
km_fitter.fit(X)

Initialization complete
Iteration  0, inertia 213.811
Iteration  1, inertia 115.944
Iteration  2, inertia 115.350
Converged at iteration 2: center shift 0.000000e+00 within tolerance 2.656964e-08


KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
    n_clusters=15, n_init=1, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=True)

In [479]:
print(np.unique(km_fitter.labels_, return_counts=True))

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14], dtype=int32), array([10, 12, 17,  7, 17, 14,  8,  4, 20,  8,  6, 13,  9,  5,  5]))


In [480]:
grouped_articles = {}
for group, article in zip(km_fitter.labels_, article_list):
    try:
         grouped_articles[group].append(article)
    except KeyError:
        grouped_articles[group] = [article]

In [481]:
# cluster analysis

In [482]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation
from nltk.probability import FreqDist
from heapq import nlargest

In [484]:
stopword_list = stopwords.words('english') + list(punctuation) + ['’', '“', '”', '\'s']
newline_char = '\n'

def tokenize_words(text):
    tokenized_words = word_tokenize(text.lower())
    return [word for word in tokenized_words if word not in stopword_list]

def top_occurring_words(word_list, top_n=10):
    word_frequency = FreqDist(word_list)
    top_words = nlargest(top_n, word_frequency, key=word_frequency.get)
    return [(word, word_frequency[word]) for word in top_words]

def get_exclusion_set(group_name, grouped_articles):
    other_articles = '\n'.join(
        '\n'.join(articles)
        for group, articles
        in grouped_articles.items()
        if group != group_name
    )
    return set(tokenize_words(other_articles))

def get_word_list(articles, group_name, grouped_articles):
    merged_list = '\n'.join(article_list)
    tokenized_words = tokenize_words(merged_list)
    exclusion_set = get_exclusion_set(group_name, grouped_articles)
    return [
        word
        for word
        in tokenized_words
        if word not in exclusion_set
    ]


def parse_article_list(articles, group_name, grouped_articles, top_n=40):
    print(f'parsing articles for group {group_name}')
    word_list = get_word_list(articles, group_name, grouped_articles)
    return top_occurring_words(word_list, top_n)

    
def display_top_unique_words_by_group(grouped_articles, n_words):
    """
    Display the n most used words unique to each group (i.e. not used in any other group)
    """
    merged_articles = {
        group: parse_article_list(articles, group, grouped_articles, n_words)
        for group, articles 
        in grouped_articles.items()
    }
    
    for group, word_list in merged_articles.items():
        tuple_joiner = lambda tup: f'{tup[0]} - {tup[1]}'
        print('{group}: {word}\n'.format(
            group=group,
            word=' | '.join((tuple_joiner(word) for word in word_list))
        ))

In [485]:
display_top_unique_words_by_group(grouped_articles, 10)

parsing articles for group 8
parsing articles for group 4
parsing articles for group 5
parsing articles for group 12
parsing articles for group 11
parsing articles for group 2
parsing articles for group 10
parsing articles for group 1
parsing articles for group 13
parsing articles for group 3
parsing articles for group 0
parsing articles for group 7
parsing articles for group 6
parsing articles for group 14
parsing articles for group 9
8: alexa - 50 | samsung - 17 | e-ink - 17 | mofs - 16 | smartwatch - 14 | smartwatches - 14 | tango - 11 | headset - 10 | bixby - 9 | mof - 9

4: neighbor - 15 | loot - 14 | adana - 13 | nintendo - 12 | arg - 12 | puzzles - 11 | bungie - 11 | vi - 10 | elite - 10 | o'donnell - 9

5: driverless - 23 | stelvio - 23 | alfa - 20 | bmw - 20 | cruise - 18 | tesla - 16 | phoenix - 14 | toyota - 13 | hyperloop - 12 | lane - 11

12: backpage - 16 | jacobs - 13 | spca - 13 | knightscope - 12 | comicmix - 11 | immunity - 10 | robots - 10 | samples - 9 | professor -