In [401]:
# article retrieval
import requests
from bs4 import BeautifulSoup

In [402]:
url_list = [
    'https://arstechnica.com/',
    'https://arstechnica.com/page/2/',
    'https://arstechnica.com/page/3/',
    'https://arstechnica.com/page/4/',
    'https://arstechnica.com/page/5/',
]

In [403]:
# TODO: use scrapy instead
def retrieve_parsed_page(url):
    page_text = requests.get(url).text
    return BeautifulSoup(page_text, 'lxml')

def get_article_links(parsed_page):
    anchor_list = parsed_page.select('h2 a')
    return [a['href'] for a in anchor_list]

def get_article_text(parsed_page):
    page_paragraphs = parsed_page.find('section', {'class': 'article-guts'}).findAll('p')
    paragraph_text = [paragraph.text for paragraph in page_paragraphs]
    return '\n'.join(paragraph_text)

In [407]:
page_bodies = (retrieve_parsed_page(url) for url in url_list)
article_urls = []
for body in page_bodies:
    article_urls += get_article_links(body)

print(f'starting parse of {len(article_urls)} articles')

article_list = []
for url in article_urls:
    print(f'retrieving article: {url}')
    parsed_page = retrieve_parsed_page(url)
    article_text = get_article_text(parsed_page)
    article_list.append(article_text)
print(f'done parsing {len(article_list)} articles')

starting parse of 155
retrieving article: https://arstechnica.com/gadgets/2017/12/guidemaster-everything-amazons-alexa-can-do-plus-the-best-skills-to-enable/
retrieving article: https://arstechnica.com/gaming/2017/12/ars-technicas-best-video-games-of-2017/
retrieving article: https://arstechnica.com/gaming/2017/12/hello-neighbor-review-an-all-around-bad-time-in-surreal-suburbia/
retrieving article: https://arstechnica.com/cars/2017/12/elon-musk-promises-rain-sensing-wipers-tesla-pickup-truck/
retrieving article: https://arstechnica.com/science/2017/12/9-himalayan-yeti-samples-turn-out-to-be-bears-dog/
retrieving article: https://arstechnica.com/tech-policy/2017/12/library-of-congress-to-get-selective-about-the-tweets-it-keeps/
retrieving article: https://arstechnica.com/information-technology/2017/12/mozilla-squashes-critical-thunderbird-bug/
retrieving article: https://arstechnica.com/gaming/2017/12/big-titles-must-wait-as-nintendo-pushes-back-64gb-switch-game-card-rollout/
retrieving

retrieving article: https://arstechnica.com/cars/2017/12/driverless-car-from-gms-cruise-and-motorcycle-collide-in-san-francisco/
retrieving article: https://arstechnica.com/tech-policy/2017/12/north-korea-suspected-in-latest-bitcoin-heist-bankrupting-youbit-exchange/
retrieving article: https://arstechnica.com/gadgets/2017/12/neither-microsoft-nor-google-look-good-in-this-chrome-installer-squabble/
retrieving article: https://arstechnica.com/science/2017/12/video-astronaut-scott-kelly-teaches-orbital-mechanics-with-kerbal-space-program/
retrieving article: https://arstechnica.com/science/2017/12/interstellar-visitor-might-be-a-comet-covered-in-carbonaceous-crud/
retrieving article: https://arstechnica.com/gaming/2017/12/magic-leap-finally-announces-a-headset-but-its-vague-rendered-in-photoshop/
retrieving article: https://arstechnica.com/tech-policy/2017/12/new-york-tried-end-run-around-fcc-preemption-with-net-neutrality-law/
retrieving article: https://arstechnica.com/gadgets/2017/12/

retrieving article: https://arstechnica.com/gadgets/2017/12/youtube-launched-a-vr-app-on-steam-but-its-broken/
retrieving article: https://arstechnica.com/tech-policy/2017/12/feds-charge-new-york-woman-with-sending-bitcoins-to-support-isis/
retrieving article: https://arstechnica.com/gaming/2017/12/sorry-playerunknown-you-probably-cant-stop-battlegrounds-copycats/
retrieving article: https://arstechnica.com/science/2017/12/who-report-data-on-marijuana-compound-does-not-justify-dea-scheduling/
retrieving article: https://arstechnica.com/gadgets/2017/12/alphabet-wants-to-deliver-internet-access-via-laser-beams/
retrieving article: https://arstechnica.com/gaming/2017/12/a-toy-story-for-nerds-like-us-netflixs-the-toys-that-made-us-doc-drops-next-week/
retrieving article: https://arstechnica.com/gaming/2017/12/its-happening-aliens-are-shooting-up-human-space-stations-in-elite-dangerous/
retrieving article: https://arstechnica.com/science/2017/12/apollo-triumph/
retrieving article: https://a

In [408]:
# TF-IDF vectorization

In [463]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer_instance = TfidfVectorizer(
    max_df=0.5, # exclude words that appear in more than 50% of the corpus
    min_df=3, # exclude words that appear less than 2 times total. (cut)
    stop_words='english', 
    strip_accents='unicode'
)

# from sklearn.feature_extraction.text import CountVectorizer
# count_v = CountVectorizer()
# print(count_v.fit_transform(article_list))

In [464]:
X = vectorizer_instance.fit_transform(article_list)

In [465]:
# print(X[0].toarray())
X

<155x3578 sparse matrix of type '<class 'numpy.float64'>'
	with 30328 stored elements in Compressed Sparse Row format>

In [466]:
# Clustering

In [467]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
import numpy as np

In [475]:
# use Silhouette score to find a good value for number of clusters
cluster_scores = {}
for n_cluster in range(2,20):
    km_fitter = KMeans(
        n_clusters = n_cluster,
        init='k-means++',
        max_iter=20,
        n_init=1,
        verbose=False
    )
    km_fitter.fit(X)
    cluster_scores[n_cluster] = silhouette_score(X, km_fitter.labels_)

In [476]:
sorted_cluster_scores = sorted(cluster_scores, key=cluster_scores.get, reverse=True)
print(f'sorted values: { {key: cluster_scores[key] for key in sorted_cluster_scores} }\n\nbest value: {sorted_cluster_scores[0]}')

sorted values: {15: 0.072699075364547366, 19: 0.072360495579434067, 13: 0.069781529955573018, 12: 0.068921836814582971, 18: 0.067196668687603947, 17: 0.065416735021016109, 14: 0.062763902475867453, 6: 0.05605791697209362, 9: 0.055569852668344175, 16: 0.05496508654556239, 11: 0.050521712525642394, 10: 0.046868183415826291, 8: 0.041997257067273482, 7: 0.037064864842116768, 5: 0.033822887501055714, 4: 0.032801365854935302, 3: 0.025582054748249736, 2: 0.013865257179117955}

best value: 15


In [477]:
km_fitter = KMeans(
    n_clusters = sorted_cluster_scores[0],
    init='k-means++',
    max_iter=100,
    n_init=1,
    verbose=True
)

In [478]:
km_fitter.fit(X)

Initialization complete
Iteration  0, inertia 213.811
Iteration  1, inertia 115.944
Iteration  2, inertia 115.350
Converged at iteration 2: center shift 0.000000e+00 within tolerance 2.656964e-08


KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
    n_clusters=15, n_init=1, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=True)

In [479]:
print(np.unique(km_fitter.labels_, return_counts=True))

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14], dtype=int32), array([10, 12, 17,  7, 17, 14,  8,  4, 20,  8,  6, 13,  9,  5,  5]))


In [480]:
grouped_articles = {}
for group, article in zip(km_fitter.labels_, article_list):
    try:
         grouped_articles[group].append(article)
    except KeyError:
        grouped_articles[group] = [article]

In [481]:
# cluster analysis

In [482]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation
from nltk.probability import FreqDist
from heapq import nlargest

In [499]:
stopword_list = stopwords.words('english') + list(punctuation) + ['’', '“', '”', '\'s']
newline_char = '\n'

def tokenize_words(text):
    tokenized_words = word_tokenize(text.lower())
    return [word for word in tokenized_words if word not in stopword_list]

def top_occurring_words(word_list, top_n=10):
    word_frequency = FreqDist(word_list)
    top_words = nlargest(top_n, word_frequency, key=word_frequency.get)
    return [(word, word_frequency[word]) for word in top_words]

def get_exclusion_set(group_name, grouped_articles):
    other_articles = '\n'.join(
        '\n'.join(articles)
        for group, articles
        in grouped_articles.items()
        if group != group_name
    )
    return set(tokenize_words(other_articles))

def get_word_list(articles, group_name, grouped_articles):
    merged_list = '\n'.join(article_list)
    tokenized_words = tokenize_words(merged_list)
    exclusion_set = get_exclusion_set(group_name, grouped_articles)
    return [
        word
        for word
        in tokenized_words
        if word not in exclusion_set
    ]

def parse_article_list(articles, group_name, grouped_articles, top_n=40):
    print(f'parsing articles for group {group_name}')
    word_list = get_word_list(articles, group_name, grouped_articles)
    return top_occurring_words(word_list, top_n)

def top_unique_words_by_group(grouped_articles, n_words):
    """
    Display the n most used words unique to each group (i.e. not used in any other group)
    """
    return {
        group: parse_article_list(articles, group, grouped_articles, n_words)
        for group, articles 
        in grouped_articles.items()
    }
    
    
def display_results(merged_articles):
    for group, word_list in merged_articles.items():
        print('GROUP NUMBER {group}: {word}\n\n'.format(
            group=group,
            word=' | '.join(f'{word[0]} - {word[1]}' for word in word_list)
        ))

In [500]:
classified_article_groups = top_unique_words_by_group(grouped_articles, 10)

parsing articles for group 8
parsing articles for group 4
parsing articles for group 5
parsing articles for group 12
parsing articles for group 11
parsing articles for group 2
parsing articles for group 10
parsing articles for group 1
parsing articles for group 13
parsing articles for group 3
parsing articles for group 0
parsing articles for group 7
parsing articles for group 6
parsing articles for group 14
parsing articles for group 9


In [502]:
display_results(classified_article_groups)

GROUP NUMBER 8: alexa - 50 | samsung - 17 | e-ink - 17 | mofs - 16 | smartwatch - 14 | smartwatches - 14 | tango - 11 | headset - 10 | bixby - 9 | mof - 9


GROUP NUMBER 4: neighbor - 15 | loot - 14 | adana - 13 | nintendo - 12 | arg - 12 | puzzles - 11 | bungie - 11 | vi - 10 | elite - 10 | o'donnell - 9


GROUP NUMBER 5: driverless - 23 | stelvio - 23 | alfa - 20 | bmw - 20 | cruise - 18 | tesla - 16 | phoenix - 14 | toyota - 13 | hyperloop - 12 | lane - 11


GROUP NUMBER 12: backpage - 16 | jacobs - 13 | spca - 13 | knightscope - 12 | comicmix - 11 | immunity - 10 | robots - 10 | samples - 9 | professor - 9 | pink - 9


GROUP NUMBER 11: sous-vide - 32 | food - 31 | fda - 29 | mellow - 18 | cbd - 18 | homeopathic - 17 | rainfall - 12 | beef - 11 | cassell - 11 | cook - 10


GROUP NUMBER 2: api - 42 | chrome - 33 | keeper - 16 | 32-bit - 14 | centennial - 11 | installer - 10 | plugin - 8 | mining - 7 | renewal - 6 | tinder - 6


GROUP NUMBER 10: lockheed - 14 | elliot - 12 | aircraft 

In [486]:
# K-nearest classification

In [487]:
from sklearn.neighbors import KNeighborsClassifier

In [491]:
new_article = 'A new cryptocurrency called Bitcoin Gold is now live on the Internet. It aims to correct what its backers see as a serious flaw in the design of the original Bitcoin.  There are hundreds of cryptocurrencies on the Internet and many of them are derived from Bitcoin in one way or another. But Bitcoin Gold—like Bitcoin Cash another Bitcoin spinoff that was created in August—is different in two important ways.  Bitcoin Gold is branding itself as a version of Bitcoin rather than merely new platforms derived from Bitcoins source code. It has also chosen to retain Bitcoins transaction history which means that if you owned bitcoins before the fork you now own an equal amount of gold bitcoins.  While Bitcoin Cash was designed to resolve Bitcoins capacity crunch with larger blocks Bitcoin Gold aims to tackle another of Bitcoins perceived flaws: the increasing centralization of the mining industry that verifies and secures Bitcoin transactions.  The original vision for Bitcoin was that anyone would be able to participate in Bitcoin mining with their personal PCs earning a bit of extra cash as they helped to support the network. But as Bitcoin became more valuable people discovered that Bitcoin mining could be done much more efficiently with custom-built application-specific integrated circuits (ASICs).  As a result Bitcoin mining became a specialized and highly concentrated industry. The leading companies in this new industry wield a disproportionate amount of power over the Bitcoin network.  Bitcoin Gold aims to dethrone these mining companies by introducing an alternative mining algorithm thats much less susceptible to ASIC-based optimization. In theory that will allow ordinary Bitcoin Gold users to earn extra cash with their spare computing cycles just as people could do in the early days of Bitcoin.  How Bitcoin mining became centralized At the core of the Bitcoin network is the blockchain a distributed ledger that records every transaction that has ever occurred. Every 10 minutes on average one computer in Bitcoins peer-to-peer network adds a block to the end of the blockchain and collects a reward of 12.5 bitcoins—currently worth around $75000.  Miners compete for the privilege of adding a block to the blockchain by racing to solve a difficult mathematical problem. Heres how it works: the miner takes a list of transactions and adds a random string called a nonce to the end. Then it computes a SHA-256 hash function of this entire block.  A hash function is designed to produce an essentially random string of bits that uniquely represents the data that was hashed. The miner wins if these random bits begin with a certain number of zeros. Most of the time this doesnt happen so the miner chooses another nonce and repeats the calculation until a winning block comes up.  A miner who discovers a winning block announces it to the rest of the network; everyone else on the network verifies that it meets all the requirements of the Bitcoin rules and then adds the block to their copies of the blockchain. Then the race begins again.  The point of this rather elaborate process is to give the network a way to reach a consensus without a central authority to count votes. If two blocks are announced around the same time producing a disagreement about which one is the official block the dispute is settled by running another round of the race. Whoever wins the next round gets to choose which block in the preceding round becomes official.  In practice this means that a nodes influence over the network is proportional to the amount of computing power it has. In Bitcoins early years this gave Bitcoin mining a democratic character. Almost everyone in the community had spare computing power sitting on their desks that they could devote to Bitcoin mining earning virtual currency in the process. There were few professional bitcoin miners with dedicated hardware because somebody buying special hardware couldnt compete with a bunch of guys using spare computing cycles they already had.  But that changed when people began building custom Bitcoin-mining ASICs. These chips could compute SHA-256 hashes so much more efficiently than a PC that PC miners soon couldnt even produce enough bitcoins to cover their electricity bills. Mining became an ASIC-only operation heavily concentrated in places with low electricity costs.  Bitcoin Gold hopes to make mining democratic again This strategy of making influence over a network proportional to computing power is known as proof-of-work. The more work you do—in this case computing SHA-256 hashes billions of times—the more likely you are to win a chance to add a block to the blockchain.  Bitcoin Gold is identical to vanilla Bitcoin in most respects but it uses an alternative proof-of-work algorithm called Equihash that supporters believe is impervious to being sped up with custom hardware. Equihash has also been adopted by a Bitcoin rival called Zcash for the same reason.  The key idea behind Equihash is that the algorithm is constrained more by memory than by computing power. Heres a simplified summary of how Equihash works (you can get all the gory details in the Equihash white paper):  Equihash starts with a list of pseudorandom bit strings derived from the block the miner wants to add to the blockchain. The miner tries to find a subset of n strings (out of the ones generated in step 1) that XOR to zero. The bit strings chosen in step 2 are concatenated together and hashed with the goal (as in the original Bitcoin) of finding a value below some pre-defined value. Step 2 is the hard part of this process—the first and third steps are relatively trivial. And the most efficient algorithm for completing step 2 requires a lot of memory. Trying to solve the problem with less than the optimal amount of memory imposes drastic computational penalties. In one example presented in the Equihash paper solving a version of the problem with 700 megabytes took about 15 seconds while solving the same problem with 250 megabytes took 1000 times as long.  The reason this matters the creators of Equihash say is that its not really feasible to optimize memory-intensive algorithms with custom silicon the way you can optimize compute-intensive algorithms. Bitcoin mining hardware is blazingly fast because a chip custom-designed for computing SHA-256 hashes can compute vastly more hashes per second than a conventional CPU with the same number of transistors. But 1GB of memory takes up as much space on a chip whether its being used for custom mining hardware or a general purpose PC.  The result supporters hope is that Bitcoin Gold will always be accessible to ordinary users who want to mine cryptocurrency with their PCs. That could give Bitcoin a more democratic character and reduce the influence of the big mining pools that are so powerful within the mainstream Bitcoin network.  Bitcoin Gold is still a fringe cryptocurrency Bitcoin Golds vision of democratizing Bitcoin mining appeals to a lot of people in the mainstream Bitcoin world. But the currency still faces a ton of skepticism in the broader Bitcoin community.  Critics have objected to the unusual way that Bitcoin Gold launched the currency. After forking the main Bitcoin blockchain a few weeks ago the Bitcoin Gold team operated the new network privately allowing them to mine a bunch of gold bitcoins without competition from the rest of the Bitcoin world. Critics say this leaves fewer bitcoins available for anyone else to mine.  The broader objection though is that many bitcoiners look with suspicion on any effort to split the Bitcoin community. They worry that having multiple competing versions of Bitcoin will confuse the public. Opponents argue that Bitcoin Gold is unfairly capitalizing on the Bitcoin name.  But the Bitcoin Gold team insists that their project will be good for Bitcoin in the long run. They say their ultimate goal is to prove the viability of Equihash as an alternative proof-of-work algorithm and eventually convince the mainstream Bitcoin network to make a similar move. That seems like an uphill battle however given the millions of dollars Bitcoin miners have invested in their existing hashing hardware.  The market values Bitcoin Gold much less than vanilla bitcoin or even Bitcoin Cash. On Monday afternoon one unit of Bitcoin Gold was worth around $250 compared to $1400 for Bitcoin Cash and $6300 for normal Bitcoin. Still with more than 16 million bitcoins in circulation the creation of Bitcoin Gold created $4 billion in new cryptocurrency value—at least on paper.'
classifier = KNeighborsClassifier(
    n_neighbors=10,
    n_jobs=-1
)

classifier.fit(X, km_fitter.labels_)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=10, p=2,
           weights='uniform')

In [515]:
vectorized_article = vectorizer_instance.transform([new_article])
assigned_cluster = classifier.predict(vectorized_article)[0]

In [518]:
print(f'cluster n {assigned_cluster}: {classified_article_groups[assigned_cluster]}')


cluster n 6: [('bitcoins', 31), ('payments', 17), ('coinbase', 13), ('bitpay', 13), ('small-block', 7), ('bubble', 6), ('signatures', 6), ('alice', 6), ('mastercard', 5), ('nakamoto', 5)]
