In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from glob import glob
import html2text

## Loading Data

In [2]:
# load data
filenames = glob("data/interim/df_*.csv")
dataframes = [pd.read_csv(f, encoding='latin-1') for f in filenames]

In [3]:
for i in range(17):
    dataframes[i] = dataframes[i].drop('Unnamed: 0', axis=1)

## Some useful functions

In [4]:
# takes a post id and returns a dataframe with that post
def getPost(id):
    
    # define dataframe structure in memory
    df = pd.DataFrame(columns=['by', 'score', 'time', 'timestamp', 'title', 'type', 'url', 'text',
       'parent', 'deleted', 'dead', 'descendants', 'id', 'ranking'])
    
    # find the post and add it to the dataframe in memory
    for dataframe in dataframes:
        df = pd.concat([dataframe[dataframe["id"] == id], df])
        
    return df

In [5]:
# takes a list of post ids and returns the comments on those posts
def getComments(parents, family=None):
    
    # define dataframe structure in memory
    children = pd.DataFrame(columns=['by', 'score', 'time', 'timestamp', 'title', 'type', 'url', 'text',
       'parent', 'deleted', 'dead', 'descendants', 'id', 'ranking'])
    
    # store family
    fam = family
    
    # base case: if no parents provided, return the family
    if not parents:
        return fam
    
    # recursive case: if parents provided, add children to family and check next generation
    else:
        
        # define the next generation
        nextgen = []
        
        # for each dataframe
        for dataframe in dataframes:

            # add the parent's children to the next generation
            children = pd.concat([dataframe[dataframe["parent"].isin(parents)], children]) 
            nextgen += list(children["id"])

            # add the parent's children to the family
            fam = pd.concat([children, fam])

            # reset children for the next parent
            children = pd.DataFrame(columns=['by', 'score', 'time', 'timestamp', 'title', 'type', 'url', 'text',
   'parent', 'deleted', 'dead', 'descendants', 'id', 'ranking'])

        # return getThread on the next generation and current family
        return getComments(nextgen, fam)        

In [6]:
def getThread(id):
    return pd.concat([getPost(id), getComments([id])])

In [7]:
def makeDocument(id):
    return html2text.html2text(" ".join(list(getThread(id)["text"].dropna())))

In [101]:
def imputeDescendants(thread):
    thread = thread.reset_index(drop=True)
    
    thread["count"] = 1
    
    sums = thread.groupby("parent").sum().reset_index()[["parent", "count"]]
    
    descendants = pd.merge(thread, sums, how="left", left_on="id", right_on="parent")
    
    descendants = descendants["count_y"].fillna(0).astype(int)
    
    thread["descendants"] = descendants
    
    return thread

In [230]:
def getMostRecentPostsBy(user):
    url = "https://hacker-news.firebaseio.com/v0/user/" + user + ".json"
    response = requests.get(url, data=data)
    comments = response.json()["submitted"][0:3]
    
    posts = []
    
    for comment in comments:
        url = "https://hacker-news.firebaseio.com/v0/item/" + str(comment) + ".json"
        response = requests.get(url, data=data)
        posts += [response.json()["text"]]
        
    return posts

## Testing the waters with TF IDF

In [102]:
tf = TfidfVectorizer(analyzer='word', ngram_range=(1,1), min_df=0, stop_words='english')

In [103]:
# define test documents
sexism = [14988086, 13784509, 14967505]
selfDriving = [15644680, 13718586, 14338328]
augmentedReality = [15124809, 14490239, 14713679]

In [104]:
corpus = [makeDocument(id) for id in sexism + selfDriving + augmentedReality]

In [105]:
# find our most descriptive words!
tfidf_matrix = tf.fit_transform(corpus)
feature_names = tf.get_feature_names()
dense_matrix = tfidf_matrix.todense()
num_documents = len(corpus)
for i in np.arange(num_documents):
    document_words = dense_matrix[i].tolist()[0]
    document_scores = [pair for pair in zip(range(0, len(document_words)), document_words) if pair[1] > 0]
    
    sorted_scores = sorted(document_scores, key=lambda t: t[1] * -1)[0:4] #3 words
    
    print("Top words in document {}".format(i + 1))
    for scores_tuple in sorted_scores:
        print("\tWord: {}, TF-IDF: {}".format(feature_names[scores_tuple[0]], round(scores_tuple[1], 5)))

Top words in document 1
	Word: women, TF-IDF: 0.54592
	Word: men, TF-IDF: 0.25394
	Word: people, TF-IDF: 0.19514
	Word: gender, TF-IDF: 0.15832
Top words in document 2
	Word: sexism, TF-IDF: 0.2909
	Word: manager, TF-IDF: 0.25308
	Word: tank, TF-IDF: 0.23651
	Word: dress, TF-IDF: 0.23221
Top words in document 3
	Word: women, TF-IDF: 0.3793
	Word: people, TF-IDF: 0.24035
	Word: salary, TF-IDF: 0.20347
	Word: memo, TF-IDF: 0.17917
Top words in document 4
	Word: car, TF-IDF: 0.52935
	Word: driving, TF-IDF: 0.29048
	Word: cars, TF-IDF: 0.24951
	Word: waymo, TF-IDF: 0.20291
Top words in document 5
	Word: google, TF-IDF: 0.34588
	Word: uber, TF-IDF: 0.27355
	Word: laptop, TF-IDF: 0.20365
	Word: waymo, TF-IDF: 0.19234
Top words in document 6
	Word: lyft, TF-IDF: 0.51379
	Word: uber, TF-IDF: 0.47412
	Word: driving, TF-IDF: 0.18652
	Word: drivers, TF-IDF: 0.17458
Top words in document 7
	Word: apple, TF-IDF: 0.33966
	Word: ar, TF-IDF: 0.33961
	Word: android, TF-IDF: 0.33788
	Word: ios, TF-IDF: 

In [114]:
imputeDescendants(getThread(14988086)).groupby("by").sum().sort_values(by="descendants", ascending=False).head()

Unnamed: 0_level_0,score,time,descendants,ranking,count
by,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
florianmari,321.0,1502446000.0,40,,1
imartin2k,,6009886000.0,19,,4
foldr,,19532360000.0,12,,13
rayiner,,1502451000.0,9,,1
scandox,,1502455000.0,9,,1


In [173]:
getRecentCommentsBy("imartin2k")["text"].dropna()[0]

Unnamed: 0,by,score,time,timestamp,title,type,url,text,parent,deleted,dead,descendants,id,ranking
1175,imartin2k,1.0,1.521186e+09,2018-03-16 07:42:18 UTC,Can Cambodia stay competitive with so many pub...,story,http://www.phnompenhpost.com/business/can-camb...,,0,,,0.0,16599209,
1178,imartin2k,1.0,1.521105e+09,2018-03-15 09:07:36 UTC,Spotify â?? The Impossible Success Story,story,http://northzone.com/spotify-impossible-succes...,,0,,,0.0,16591317,
359,imartin2k,2.0,1.521100e+09,2018-03-15 07:41:15 UTC,Lessons of History: summary in bullets,story,https://medium.com/@yanda/lessons-of-history-s...,,0,,,0.0,16591039,
173,imartin2k,3.0,1.521098e+09,2018-03-15 07:20:31 UTC,Icelandâ??s male circumcision ban,story,http://www.independent.co.uk/news/world/europe...,,0,,,0.0,16590979,
703,imartin2k,1.0,1.521019e+09,2018-03-14 09:20:38 UTC,Why Blockchain Liquidity Is a Bad Thingâ?¦,story,https://hackernoon.com/liquidity-kind-of-sucks...,,0,,,0.0,16583391,
708,imartin2k,1.0,1.521019e+09,2018-03-14 09:11:48 UTC,The Questions to Ask About the News,story,http://scottberkun.com/2018/the-questions-to-a...,,0,,,0.0,16583352,
1228,imartin2k,1.0,1.520923e+09,2018-03-13 06:39:53 UTC,What happened to 90â??s optimism? Did Silicon ...,story,https://roadlesstravelled.me/2018/03/13/what-h...,,0,,,0.0,16574217,
362,imartin2k,1.0,1.520829e+09,2018-03-12 04:36:40 UTC,You Think You Want Media Literacyâ?¦ Do You?,story,https://points.datasociety.net/you-think-you-w...,,0,,,0.0,16566240,
1173,imartin2k,3.0,1.520820e+09,2018-03-12 01:56:57 UTC,The Man Who Knew Too Little,story,https://www.nytimes.com/2018/03/10/style/the-m...,,0,,,0.0,16565744,
1182,imartin2k,97.0,1.520057e+09,2018-03-03 05:56:38 UTC,Hacking the Brain with Adversarial Images,story,https://spectrum.ieee.org/the-human-os/robotic...,,0,,,29.0,16508139,


In [174]:
im = getRecentCommentsBy("imartin2k")

In [189]:
im[im["type"] == "comment"]

Unnamed: 0,by,score,time,timestamp,title,type,url,text,parent,deleted,dead,descendants,id,ranking
242,imartin2k,,1.519230e+09,2018-02-21 16:16:21 UTC,,comment,,"Fittingly, from the recent NYT WeWork feature:...",16430259,,,,16430512,
249,imartin2k,,1.518987e+09,2018-02-18 20:43:16 UTC,,comment,,There is a so called Gender Inequality Index b...,16408353,,,,16408434,
92,imartin2k,,1.518866e+09,2018-02-17 11:09:05 UTC,,comment,,Itâ??s actually a bit more complicated. The Is...,16400101,,,,16400171,
1126,imartin2k,,1.518708e+09,2018-02-15 15:19:34 UTC,,comment,,"""Yes, neither car mechanics nor pre-school tea...",16383132,,,,16384665,
1139,imartin2k,,1.518689e+09,2018-02-15 10:08:18 UTC,,comment,,Neither. I only was critizing what I think is ...,16382916,,,,16382934,
317,imartin2k,,1.518689e+09,2018-02-15 09:59:01 UTC,,comment,,Itâ??s important to investigate this topic fro...,16382605,,,,16382894,
667,imartin2k,,1.518114e+09,2018-02-08 18:25:02 UTC,,comment,,Very nice one @ exolymph.\r\n\r\n,16334181,,,,16334297,
309,imartin2k,,1.518107e+09,2018-02-08 16:30:07 UTC,,comment,,Systematic outrage and Trump can be successful...,16331886,,,,16333322,
669,imartin2k,,1.517996e+09,2018-02-07 09:29:39 UTC,,comment,,"In my eyes, this is a case where no matter fro...",16322704,,,,16323250,
781,imartin2k,,1.517861e+09,2018-02-05 19:59:05 UTC,,comment,,I guess this situation will only permanently c...,16311192,,,,16311546,


nan

In [129]:
def getRecentCommentsBy(user):
    empty = pd.DataFrame(columns=['by', 'score', 'time', 'timestamp', 'title', 'type', 'url', 'text',
       'parent', 'deleted', 'dead', 'descendants', 'id', 'ranking'])
    for dataframe in dataframes:
        empty = pd.concat([dataframe[dataframe["by"] == user], empty])  
    
    return empty.reset_index(drop=True).sort_values(by="timestamp", ascending=False)

In [234]:
getMostRecentPostsBy("raymondgh")

['Perfect. I&#x27;ll send this with my cover letter next time :D',
 'What does it do? I see benefits but I don&#x27;t understand how the app will help me achieve those.',
 'Our language is evolving back to hieroglyphics']

In [224]:
url = "https://hacker-news.firebaseio.com/v0/item/" + str(16651855) + ".json"
response = requests.get(url, data=data)
response.json()["text"]

'Perfect. I&#x27;ll send this with my cover letter next time :D'

In [227]:
a + response.json()["text"]

TypeError: can only concatenate list (not "str") to list

In [205]:
a.append(a)

In [206]:
a

[1, 2, 3, [...]]

In [210]:
a = [1, 2, 3]

In [197]:
import requests
url = 'https://hacker-news.firebaseio.com/v0/item/16430512.json'
data = 0
response = requests.get(url, data=data)

[16651855, 16580264, 16571490]

In [195]:
from html2text import html2text

In [196]:
html2text(response.json()['text'])

'Fittingly, from the recent NYT WeWork feature: "More than most companies,\nWeWork promotes the consumption of alcohol as an inherent virtue. "\n[https://www.nytimes.com/2018/02/17/business/the-wework-\nmanif...](https://www.nytimes.com/2018/02/17/business/the-wework-manifesto-\nfirst-office-space-next-the-world.html)\n\n'

In [171]:
import html2text
h = html2text.HTML2Text()
h.ignore_links = True
print( h.handle("<p>Hello, <a href='http://earth.google.com/'>world</a>!"))

Hello, world!


