In [19]:
from google.cloud import bigquery
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import bq_helper
from sklearn.feature_extraction.text import TfidfVectorizer


In [2]:
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]="gcauth.json"
client = bigquery.Client()

In [3]:
hn = bq_helper.BigQueryHelper(active_project= "bigquery-public-data", 
                                       dataset_name = "hacker_news")

In [4]:
hn

<bq_helper.BigQueryHelper at 0x17efb2ad128>

In [5]:
hn.list_tables()

['comments', 'full', 'full_201510', 'stories']

In [6]:
hn.table_schema("full_201510")

[SchemaField('by', 'string', 'NULLABLE', 'Username of commenter or submitter', ()),
 SchemaField('score', 'integer', 'NULLABLE', 'Story score', ()),
 SchemaField('time', 'integer', 'NULLABLE', 'Unix time', ()),
 SchemaField('title', 'string', 'NULLABLE', 'Story title', ()),
 SchemaField('type', 'string', 'NULLABLE', 'Type of details (comment, comment_ranking, poll, story, job, pollopt)', ()),
 SchemaField('url', 'string', 'NULLABLE', 'Story url', ()),
 SchemaField('text', 'string', 'NULLABLE', 'Story or comment text', ()),
 SchemaField('parent', 'integer', 'NULLABLE', 'Parent comment ID', ()),
 SchemaField('deleted', 'boolean', 'NULLABLE', 'Is deleted?', ()),
 SchemaField('dead', 'boolean', 'NULLABLE', 'Is dead?', ()),
 SchemaField('descendants', 'integer', 'NULLABLE', 'Number of story or poll descendants', ()),
 SchemaField('id', 'integer', 'NULLABLE', 'Unique type ID', ()),
 SchemaField('ranking', 'integer', 'NULLABLE', 'Comment ranking', ())]

In [7]:
hn.table_schema("full")

[SchemaField('by', 'string', 'NULLABLE', "The username of the item's author.", ()),
 SchemaField('score', 'integer', 'NULLABLE', 'Story score', ()),
 SchemaField('time', 'integer', 'NULLABLE', 'Unix time', ()),
 SchemaField('timestamp', 'timestamp', 'NULLABLE', 'Timestamp for the unix time', ()),
 SchemaField('title', 'string', 'NULLABLE', 'Story title', ()),
 SchemaField('type', 'string', 'NULLABLE', 'Type of details (comment, comment_ranking, poll, story, job, pollopt)', ()),
 SchemaField('url', 'string', 'NULLABLE', 'Story url', ()),
 SchemaField('text', 'string', 'NULLABLE', 'Story or comment text', ()),
 SchemaField('parent', 'integer', 'NULLABLE', 'Parent comment ID', ()),
 SchemaField('deleted', 'boolean', 'NULLABLE', 'Is deleted?', ()),
 SchemaField('dead', 'boolean', 'NULLABLE', 'Is dead?', ()),
 SchemaField('descendants', 'integer', 'NULLABLE', 'Number of story or poll descendants', ()),
 SchemaField('id', 'integer', 'NULLABLE', "The item's unique id.", ()),
 SchemaField('ran

In [8]:
hn.head("comments")

Unnamed: 0,id,by,author,time,time_ts,text,parent,deleted,dead,ranking
0,2701393,5l,5l,1309184881,2011-06-27 14:28:01+00:00,And the glazier who fixed all the broken windo...,2701243,,,0
1,5811403,99,99,1370234048,2013-06-03 04:34:08+00:00,Does canada have the equivalent of H1B/Green c...,5804452,,,0
2,21623,AF,AF,1178992400,2007-05-12 17:53:20+00:00,"Speaking of Rails, there are other options in ...",21611,,,0
3,10159727,EA,EA,1441206574,2015-09-02 15:09:34+00:00,Humans and large livestock (and maybe even pet...,10159396,,,0
4,2988424,Iv,Iv,1315853580,2011-09-12 18:53:00+00:00,I must say I reacted in the same way when I re...,2988179,,,0


In [9]:
hn.head("full_201510")

Unnamed: 0,by,score,time,title,type,url,text,parent,deleted,dead,descendants,id,ranking
0,danmaz74,,1438616833,,comment,,Do those analyses also account for the energy ...,9996591,,,,9996887,
1,gbraad,,1438616954,,comment,,Full stack engineer who&#x27;s proficient with...,9996335,,,,9996912,
2,qeorge,,1260997308,,comment,,<i>It seems that IE has been a consistent impe...,999251,,,,999709,
3,Flammy,,1438617965,,comment,,"Hey all,<p>Ivy Softworks is looking for talent...",9996333,,,,9997096,
4,scottliquid15,,1438618127,,comment,,LiquidTalent is an exclusive marketplace for d...,9996333,,,,9997137,


In [10]:
hn.head("full")

Unnamed: 0,by,score,time,timestamp,title,type,url,text,parent,deleted,dead,descendants,id,ranking
0,roland-s,,1515171923,2018-01-05 17:05:23+00:00,,comment,,"Man, I typed a whole reply saying basically th...",16079456,,,,16079814,
1,tptacek,,1292178197,2010-12-12 18:23:17+00:00,,comment,,When modification happens often relative to ac...,1997519,,,,1997540,
2,the_d00d,,1483808474,2017-01-07 17:01:14+00:00,,comment,,Did he post derogatory platitudes about Columb...,13344935,,,,13345115,
3,mkull,,1246632234,2009-07-03 14:43:54+00:00,,comment,,Twitter is saying it is a fire -<p>tjeastmond:...,686046,,,,686098,
4,jonahx,,1425400984,2015-03-03 16:43:04+00:00,,comment,,Naming is incredibly important. I don&#x27;t ...,9138301,,,,9138638,


In [20]:
hn.head("stories")

Unnamed: 0,id,by,score,time,time_ts,title,url,text,deleted,dead,descendants,author
0,6940813,sarath237,0,1387536270,2013-12-20 10:44:30+00:00,Sheryl Brindo Hot Pics,http://www.youtube.com/watch?v=ym1cyxneB0Y,Sheryl Brindo Hot Pics,,True,,sarath237
1,6991401,123123321321,0,1388508751,2013-12-31 16:52:31+00:00,Are you people also put off by the culture of ...,,They&#x27;re pretty explicitly &#x27;startup f...,,True,,123123321321
2,1531556,ssn,0,1279617234,2010-07-20 09:13:54+00:00,New UI for Google Image Search,http://googlesystem.blogspot.com/2010/07/googl...,Again following on Bing's lead.,,,0.0,ssn
3,5012398,hoju,0,1357387877,2013-01-05 12:11:17+00:00,Historic website screenshots,http://webscraping.com/blog/Generate-website-s...,Python script to generate historic screenshots...,,,0.0,hoju
4,7214182,kogir,0,1401561740,2014-05-31 18:42:20+00:00,Placeholder,,Mind the gap.,,,0.0,kogir


In [11]:
query = """SELECT REGEXP_EXTRACT(url , '//([^/]*)/?') domain, COUNT(*) c
           FROM `bigquery-public-data.hacker_news.full`
           WHERE url != '' AND EXTRACT(YEAR FROM timestamp) = 2017
           GROUP BY domain ORDER BY c DESC LIMIT 10"""

hn.estimate_query_size(query)

0.3272185381501913

In [12]:
query = """SELECT REGEXP_EXTRACT(url , '//([^/]*)/?') domain, COUNT(*) c
           FROM `bigquery-public-data.hacker_news.full`
           WHERE url != '' AND EXTRACT(YEAR FROM timestamp) = 2016
           GROUP BY domain ORDER BY c DESC LIMIT 10"""

hn.estimate_query_size(query)

0.3272185381501913

In [13]:
top_10_websites = hn.query_to_pandas_safe(query)

In [14]:
top_10_websites

Unnamed: 0,domain,c
0,medium.com,18451
1,github.com,15029
2,www.youtube.com,9433
3,www.nytimes.com,6316
4,techcrunch.com,4074
5,www.theguardian.com,3536
6,www.bloomberg.com,3210
7,arstechnica.com,3021
8,www.bbc.com,2239
9,en.wikipedia.org,2199


In [15]:
query = """SELECT timestamp
           FROM `bigquery-public-data.hacker_news.full`
           ORDER BY timestamp DESC LIMIT 10"""

hn.estimate_query_size(query)

0.1234043762087822

In [16]:
recent_comments = hn.query_to_pandas_safe(query)

In [17]:
recent_comments

Unnamed: 0,timestamp
0,2018-03-15 09:14:00+00:00
1,2018-03-15 09:13:33+00:00
2,2018-03-15 09:13:10+00:00
3,2018-03-15 09:13:01+00:00
4,2018-03-15 09:12:58+00:00
5,2018-03-15 09:12:49+00:00
6,2018-03-15 09:12:29+00:00
7,2018-03-15 09:11:51+00:00
8,2018-03-15 09:11:41+00:00
9,2018-03-15 09:11:14+00:00


In [23]:
query = """SELECT *
           FROM `bigquery-public-data.hacker_news.full`
           ORDER BY timestamp DESC"""

hn.estimate_query_size(query)

6.134854851290584

In [30]:
query = """SELECT *
           FROM `bigquery-public-data.hacker_news.stories`
           WHERE descendants > 20
           ORDER BY descendants DESC LIMIT 1000"""

hn.estimate_query_size(query)

0.3932693460956216

In [31]:
stories = hn.query_to_pandas_safe(query)

In [35]:
stories.head()

Unnamed: 0,id,by,score,time,time_ts,title,url,text,deleted,dead,descendants,author
0,363,pg,262,1172085729,2007-02-21 19:22:09+00:00,Please tell us what features you'd like in new...,,,,,1585,pg
1,9172373,NickSarath,921,1425922757,2015-03-09 17:39:17+00:00,The new MacBook,https://www.apple.com/macbook/,,,,1270,NickSarath
2,9784470,imd23,1905,1435328035,2015-06-26 14:13:55+00:00,"Same-Sex Marriage Is a Right, Supreme Court Rules",http://www.nytimes.com/2015/06/27/us/supreme-c...,,,,1216,imd23
3,7525198,platz,982,1396551773,2014-04-03 19:02:53+00:00,Brendan Eich Steps Down as Mozilla CEO,https://blog.mozilla.org/blog/2014/04/03/brend...,,,,1171,platz
4,9812245,whoishiring,673,1435762922,2015-07-01 15:02:02+00:00,Ask HN: Who is hiring? (July 2015),,Please lead with the location of the position ...,,,1097,whoishiring


In [None]:
#chunksize = 10 ** 6
#for chunk in pd.read_csv(filename, chunksize=chunksize):
#    process(chunk)

In [37]:
stories.to_csv("data/interim/stories.csv")

In [42]:
stories = pd.read_csv("data/interim/stories.csv", encoding="latin-1").drop("Unnamed: 0", axis=1)

In [43]:
stories.head()

Unnamed: 0,id,by,score,time,time_ts,title,url,text,deleted,dead,descendants,author
0,363,pg,262,1172085729,2007-02-21 19:22:09+00:00,Please tell us what features you'd like in new...,,,,,1585,pg
1,9172373,NickSarath,921,1425922757,2015-03-09 17:39:17+00:00,The new MacBook,https://www.apple.com/macbook/,,,,1270,NickSarath
2,9784470,imd23,1905,1435328035,2015-06-26 14:13:55+00:00,"Same-Sex Marriage Is a Right, Supreme Court Rules",http://www.nytimes.com/2015/06/27/us/supreme-c...,,,,1216,imd23
3,7525198,platz,982,1396551773,2014-04-03 19:02:53+00:00,Brendan Eich Steps Down as Mozilla CEO,https://blog.mozilla.org/blog/2014/04/03/brend...,,,,1171,platz
4,9812245,whoishiring,673,1435762922,2015-07-01 15:02:02+00:00,Ask HN: Who is hiring? (July 2015),,Please lead with the location of the position ...,,,1097,whoishiring


In [45]:
hn.head("comments")

Unnamed: 0,id,by,author,time,time_ts,text,parent,deleted,dead,ranking
0,2701393,5l,5l,1309184881,2011-06-27 14:28:01+00:00,And the glazier who fixed all the broken windo...,2701243,,,0
1,5811403,99,99,1370234048,2013-06-03 04:34:08+00:00,Does canada have the equivalent of H1B/Green c...,5804452,,,0
2,21623,AF,AF,1178992400,2007-05-12 17:53:20+00:00,"Speaking of Rails, there are other options in ...",21611,,,0
3,10159727,EA,EA,1441206574,2015-09-02 15:09:34+00:00,Humans and large livestock (and maybe even pet...,10159396,,,0
4,2988424,Iv,Iv,1315853580,2011-09-12 18:53:00+00:00,I must say I reacted in the same way when I re...,2988179,,,0


In [50]:
query = """SELECT *
           FROM `bigquery-public-data.hacker_news.comments`
           """

hn.estimate_query_size(query)

3.407075739465654

In [52]:
# https://cloud.google.com/bigquery/docs/running-queries

In [51]:
comments = hn.query_to_pandas_safe(query, max_gb_scanned=3.6)

TimeoutError: Operation did not complete within the designated timeout.

In [None]:
tf = TfidfVectorizer(analyzer='word', ngram_range=(1,1), min_df=0, stop_words='english')
tf
corpus = [document1, document2, document3]
tfidf_matrix = tf.fit_transform(corpus)
feature_names = tf.get_feature_names()

len(feature_names)
dense_matrix = tfidf_matrix.todense()
dense_matrix
num_documents = len(corpus)

for i in np.arange(num_documents):
    document_words = dense_matrix[i].tolist()[0]
    document_scores = [pair for pair in zip(range(0, len(document_words)), document_words) if pair[1] > 0]
    
    sorted_scores = sorted(document_scores, key=lambda t: t[1] * -1)[0:3] #3 words
    
    print("Top words in document {}".format(i + 1))
    for scores_tuple in sorted_scores:
        print("\tWord: {}, TF-IDF: {}".format(feature_names[scores_tuple[0]], round(scores_tuple[1], 5)))
