In [1]:
import warnings
warnings.filterwarnings('ignore')

from __future__ import division, print_function, unicode_literals

import os
import numpy as np
import pandas as pd 
from math import log

from itertools import combinations

from IPython.display import display, HTML

In [2]:
# Get the data 
path = '/home/pratikshasahu/Documents/EIDETIA/env/data/HN_posts_year_to_Sep_26_2016.csv'
hn = pd.read_csv(path)

hn

Unnamed: 0,id,title,url,num_points,num_comments,author,created_at
0,12579008,You have two days to comment if you want stem ...,http://www.regulations.gov/document?D=FDA-2015...,1,0,altstar,9/26/2016 3:26
1,12579005,SQLAR the SQLite Archiver,https://www.sqlite.org/sqlar/doc/trunk/README.md,1,0,blacksqr,9/26/2016 3:24
2,12578997,What if we just printed a flatscreen televisio...,https://medium.com/vanmoof/our-secrets-out-f21...,1,0,pavel_lishin,9/26/2016 3:19
3,12578989,algorithmic music,http://cacm.acm.org/magazines/2011/7/109891-al...,1,0,poindontcare,9/26/2016 3:16
4,12578979,How the Data Vault Enables the Next-Gen Data W...,https://www.talend.com/blog/2016/05/12/talend-...,1,0,markgainor1,9/26/2016 3:14
5,12578975,Saving the Hassle of Shopping,https://blog.menswr.com/2016/09/07/whats-new-w...,1,1,bdoux,9/26/2016 3:13
6,12578954,Macalifa A new open-source music app for UWP ...,http://forums.windowscentral.com/windows-phone...,1,0,thecodrr,9/26/2016 3:06
7,12578942,GitHub theweavrs/Macalifa: A music player wri...,https://github.com/theweavrs/Macalifa,1,0,thecodrr,9/26/2016 3:04
8,12578919,Google Allo first Impression,http://prodissues.com/2016/09/google-allo-firs...,3,0,jandll,9/26/2016 2:57
9,12578918,Advanced Multimedia on the Linux Command Line,https://avi.alkalay.net/2016/09/multimedia-lin...,1,0,mynameislegion,9/26/2016 2:56


In [3]:
# Preprocess titles from HN posts

from string import punctuation
punctrans = str.maketrans(dict.fromkeys(punctuation)) 

def tokenize(title):
    x = title.lower() # Lowercase
    x = x.encode('ascii', 'ignore').decode() # Keep only ascii chars.
    x = x.translate(punctrans) # Remove punctuation
    return x.split() # Return tokenized.

texts_tokenized = hn['title'].apply(tokenize)

from nltk.corpus import stopwords
sw = set(stopwords.words('english'))

for i in range(0, 5):
    for text in texts_tokenized:
        for x in text:
            if x in sw:
                text.remove(x)

print(texts_tokenized[:20])

0     [two, days, comment, want, stem, cells, classi...
1                             [sqlar, sqlite, archiver]
2        [printed, flatscreen, television, side, boxes]
3                                  [algorithmic, music]
4     [data, vault, enables, nextgen, data, warehous...
5                            [saving, hassle, shopping]
6     [macalifa, new, opensource, music, app, uwp, w...
7     [github, theweavrsmacalifa, music, player, wri...
8                     [google, allo, first, impression]
9          [advanced, multimedia, linux, command, line]
10              [ask, hn, tld, use, local, development]
11                                        [muroc, maru]
12                   [companies, make, products, worse]
13                           [tuning, aws, sqs, queues]
14                                    [promise, github]
15                              [joint, rd, ups, downs]
16    [ibm, announces, next, implementation, apples,...
17       [amazons, algorithms, dont, find, best,

In [4]:
# Compute unigram and bigram counts

from collections import Counter
cx = Counter()
cxy = Counter()

for text in texts_tokenized:
    for x in text:
        cx[x] += 1
    
    for x, y in set(map(tuple, map(sorted, combinations(text, 2)))):
        cxy[(x,y)] += 1
        
print(len(cx))
print('\nMost common: ', cx.most_common()[:20])
print('\nLeast common: ', cx.most_common()[(len(cx)-20):])

99044

Most common:  [('hn', 20237), ('show', 10753), ('new', 10080), ('ask', 9582), ('data', 6628), ('google', 5532), ('app', 5124), ('using', 4613), ('us', 4189), ('web', 4134), ('startup', 3849), ('open', 3828), ('first', 3730), ('code', 3705), ('apple', 3695), ('pdf', 3659), ('software', 3558), ('video', 3462), ('tech', 3410), ('free', 3180)]

Least common:  [('codenewbie', 1), ('makefileinspired', 1), ('managerbootstrapper', 1), ('reduxrouting', 1), ('libraryagnostic', 1), ('mocktheclock', 1), ('uncompromising', 1), ('appypaper', 1), ('ringcx', 1), ('getawesomeness', 1), ('keck', 1), ('developerfounderceo', 1), ('integeration', 1), ('gayford', 1), ('superweed', 1), ('prewwii', 1), ('microserivces', 1), ('interdependency', 1), ('tempted', 1), ('isare', 1)]


In [5]:
# Remove infrequent unigrams.

print('%d tokens before' % len(cx))

min_count = (1 / 2000) * len(hn) # = 146.5595

for x in list(cx.keys()):
    if cx[x] < min_count:
        del cx[x]

# Remove infrequent bigrams.

for x, y in list(cxy.keys()):
    if x not in cx or y not in cx:
        del cxy[(x, y)]

print('%d tokens after' % len(cx))
print('\nMost common:', cx.most_common()[:20])
print('\nLeast common:', cx.most_common()[(len(cx)-20):])

99044 tokens before
2022 tokens after

Most common: [('hn', 20237), ('show', 10753), ('new', 10080), ('ask', 9582), ('data', 6628), ('google', 5532), ('app', 5124), ('using', 4613), ('us', 4189), ('web', 4134), ('startup', 3849), ('open', 3828), ('first', 3730), ('code', 3705), ('apple', 3695), ('pdf', 3659), ('software', 3558), ('video', 3462), ('tech', 3410), ('free', 3180)]

Least common: [('views', 148), ('emulator', 148), ('directory', 148), ('director', 148), ('amiga', 148), ('bigger', 147), ('hold', 147), ('depression', 147), ('philosophy', 147), ('parts', 147), ('infographic', 147), ('average', 147), ('scam', 147), ('generating', 147), ('targets', 147), ('volkswagen', 147), ('investor', 147), ('classes', 147), ('match', 147), ('timeline', 147)]


In [6]:
# Build unigram <-> index lookup.

x2i, i2x = {}, {}
for i, x in enumerate(cx.keys()):
    x2i[x] = i
    i2x[i] = x
    
# Sum unigram and bigram counts for computing probabilities.
# i.e. p(x) = count(x) / sum(all counts).

sx = sum(cx.values())
sxy = sum(cxy.values())

In [7]:
# Accumulate data, rows, and cols to build sparse PMI matrix
from scipy.sparse import csc_matrix
from pprint import pformat

pmi_samples = Counter()
data, rows, cols = [], [], []

for (x, y), n in cxy.items():
    rows.append(x2i[x])
    cols.append(x2i[y])
    data.append(log((n / sxy) / (cx[x] / sx) / (cx[y] / sx)))
    pmi_samples[(x, y)] = data[-1]

PMI = csc_matrix((data, (rows, cols)))

print('%d non-zero elements' % PMI.count_nonzero())
print('\nSample PMI values\n', pformat(pmi_samples.most_common()[:15]))

559498 non-zero elements

Sample PMI values
 [(('cheat', 'sheet'), 7.741703919492517),
 (('gravitational', 'waves'), 7.594274561632172),
 (('peter', 'thiel'), 7.501114832399731),
 (('oculus', 'rift'), 7.444931552717988),
 (('nobel', 'prize'), 7.430011693174276),
 (('cook', 'tim'), 7.387140034347794),
 (('virus', 'zika'), 7.218029192769757),
 (('edward', 'snowden'), 7.154950416854075),
 (('clinton', 'hillary'), 7.109859339780078),
 (('area', 'bay'), 7.103617893940743),
 (('boot', 'spring'), 7.096688700497847),
 (('states', 'united'), 7.069464109429888),
 (('korea', 'north'), 7.054418293068599),
 (('panama', 'papers'), 7.0542136375644935),
 (('elon', 'musk'), 6.952284458229978)]


In [8]:
# Factorize the PMI matrix using sparse SVD
from scipy.sparse.linalg import svds

U, S, V = svds(PMI, k=20) 
norms = np.sqrt(np.sum(np.square(U), axis=1, keepdims=True))
U /= np.maximum(norms, 1e-7)

In [9]:
# Show some nearest neighbor samples

k = 5

word = input('Enter search word: ')
nearest_neighbours = {}

for x in cx:
    if x == word:
        dd = np.dot(U, U[x2i[x]])
    
        for i in np.argpartition(-1 * dd, k + 1)[:k + 1]:
            if i2x[i] == x: 
                continue
                        
            nearest_neighbours[i2x[i]] = dd[i]

for nn in nearest_neighbours:
    print(nn)
    similar_articles = hn[hn['title'].str.contains(nn)]
    display(similar_articles.sort_values(by='num_points',ascending=False)[:5])

Enter search word: journal
leads


Unnamed: 0,id,title,url,num_points,num_comments,author,created_at
125886,11479422,Namecheap live chat social engineering leads t...,http://www.postphp.com/namecheap-livechat-soci...,695,408,Casseres,4/12/2016 13:38
12430,12464179,VW engineer pleads guilty to diesel emissions ...,http://www.detroitnews.com/story/business/auto...,187,151,oxryly1,9/9/2016 17:14
146017,11306520,Recording of aerospace executive's speech lead...,http://qz.com/641738/this-rocket-executive-pis...,142,96,prostoalex,3/17/2016 18:36
53111,12106462,Misuse of Creative Commons-licensed photo lead...,https://blog.wikimedia.org/2016/07/12/free-lic...,86,56,edward,7/16/2016 14:39
86317,11816846,Googling yourself now leads to personal privac...,http://www.mercurynews.com/ci_29964677/googlin...,70,32,smaili,6/1/2016 18:21


links


Unnamed: 0,id,title,url,num_points,num_comments,author,created_at
129824,11446965,"Internet hyperlinks do not infringe copyright,...",http://in.reuters.com/article/internet-copyrig...,523,107,jonbaer,4/7/2016 13:17
227898,10655318,WhatsApp is blocking Telegram links,https://orat.io/blog/as-of-today-whatsapp-is-b...,433,227,bmaeser,12/1/2015 14:01
117495,11548469,Document 17 Declassified 9/11 attackers may h...,https://28pagesdotorg.files.wordpress.com/2016...,240,146,agjmills,4/22/2016 11:59
138457,11373841,"Unable to open links in Safari, Mail or Messag...",https://bencollier.net/2016/03/unable-to-open-...,231,96,nardras,3/28/2016 12:33
116016,11561711,"Most popular links in Hacker News comments, 20...",https://github.com/antontarasenko/smq/blob/mas...,210,68,anton_tarasenko,4/24/2016 22:45


link


Unnamed: 0,id,title,url,num_points,num_comments,author,created_at
129824,11446965,"Internet hyperlinks do not infringe copyright,...",http://in.reuters.com/article/internet-copyrig...,523,107,jonbaer,4/7/2016 13:17
239811,10562207,Beware of ads that use inaudible sound to link...,http://arstechnica.com/tech-policy/2015/11/bew...,473,230,ivank,11/13/2015 20:29
227898,10655318,WhatsApp is blocking Telegram links,https://orat.io/blog/as-of-today-whatsapp-is-b...,433,227,bmaeser,12/1/2015 14:01
244984,10524717,The European Commission is preparing an attack...,https://juliareda.eu/2015/11/ancillary-copyrig...,421,192,jsnathan,11/7/2015 13:26
78366,11887652,Snowden reveals GCHQ spy programme with link t...,http://www.thenational.scot/news/us-whistleblo...,245,63,ghosh,6/12/2016 11:41


favorite


Unnamed: 0,id,title,url,num_points,num_comments,author,created_at
9004,12496558,Ask HN: What's your favorite HN post?,,691,138,rkhraishi,9/14/2016 13:20
229370,10644518,Ask HN: What's your favorite online course?,,395,159,sidcool,11/29/2015 13:22
281407,10259549,My favorite interview question,https://www.nczonline.net/blog/2015/09/my-favo...,227,195,antouank,9/22/2015 16:13
30599,12301055,My favorite day of the month is bank statement...,https://medium.com/@yanismydj/the-worlds-most-...,199,107,ylhert,8/16/2016 22:17
19680,12396420,My favorite Erlang Program (2013),http://joearms.github.io/2013/11/21/My-favorit...,189,38,Tomte,8/31/2016 6:48


importance


Unnamed: 0,id,title,url,num_points,num_comments,author,created_at
147069,11298583,Stargate Physics 101: A comedy about the impor...,https://archiveofourown.org/works/3673335,334,67,gwern,3/16/2016 16:32
105606,11650967,Tesla crash after flying 82 feet in the air sh...,http://electrek.co/2016/05/06/tesla-model-s-cr...,296,181,vinnyglennon,5/7/2016 19:24
286530,10221751,Edward Gibbon and the importance of great writ...,https://www.commentarymagazine.com/articles/be...,43,19,Thevet,9/15/2015 17:08
25229,12348990,I conducted an experiment on the importance of...,https://www.reddit.com/r/muacjdiscussion/comme...,18,4,exolymph,8/24/2016 1:15
193421,10934632,The importance of gold standard studies for co...,http://blog.givewell.org/2016/01/19/the-import...,17,0,apsec112,1/19/2016 22:33
