In [13]:
from google.cloud import bigquery
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import bq_helper

In [14]:
# Tell the Google Cloud client libraries which service-account key file to use.
# A GOOGLE_APPLICATION_CREDENTIALS value that is already set (shell, container,
# CI) is respected; the local "gcauth.json" is only a fallback, so the notebook
# no longer clobbers credentials configured outside of it.
if not os.environ.get("GOOGLE_APPLICATION_CREDENTIALS"):
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "gcauth.json"

# Raw BigQuery client built from the credentials resolved above.
client = bigquery.Client()

In [5]:
# Helper object bound to the public Hacker News dataset; every later cell in
# this notebook goes through it for schema inspection, previews and queries.
hacker_news = bq_helper.BigQueryHelper(
    active_project="bigquery-public-data",
    dataset_name="hacker_news",
)

In [6]:
# Sanity check: display the helper's repr to confirm the object was created.
hacker_news

<bq_helper.BigQueryHelper at 0x179b538c710>

In [7]:
# Show the names of the tables available in the dataset.
hacker_news.list_tables()

['comments', 'full', 'full_201510', 'stories']

In [27]:
# Inspect the column names, types and descriptions of the `full_201510` table.
hacker_news.table_schema("full_201510")

[SchemaField('by', 'string', 'NULLABLE', 'Username of commenter or submitter', ()),
 SchemaField('score', 'integer', 'NULLABLE', 'Story score', ()),
 SchemaField('time', 'integer', 'NULLABLE', 'Unix time', ()),
 SchemaField('title', 'string', 'NULLABLE', 'Story title', ()),
 SchemaField('type', 'string', 'NULLABLE', 'Type of details (comment, comment_ranking, poll, story, job, pollopt)', ()),
 SchemaField('url', 'string', 'NULLABLE', 'Story url', ()),
 SchemaField('text', 'string', 'NULLABLE', 'Story or comment text', ()),
 SchemaField('parent', 'integer', 'NULLABLE', 'Parent comment ID', ()),
 SchemaField('deleted', 'boolean', 'NULLABLE', 'Is deleted?', ()),
 SchemaField('dead', 'boolean', 'NULLABLE', 'Is dead?', ()),
 SchemaField('descendants', 'integer', 'NULLABLE', 'Number of story or poll descendants', ()),
 SchemaField('id', 'integer', 'NULLABLE', 'Unique type ID', ()),
 SchemaField('ranking', 'integer', 'NULLABLE', 'Comment ranking', ())]

In [8]:
# Inspect the column names, types and descriptions of the `full` table
# (the table the queries further down read from).
hacker_news.table_schema("full")

[SchemaField('by', 'string', 'NULLABLE', "The username of the item's author.", ()),
 SchemaField('score', 'integer', 'NULLABLE', 'Story score', ()),
 SchemaField('time', 'integer', 'NULLABLE', 'Unix time', ()),
 SchemaField('timestamp', 'timestamp', 'NULLABLE', 'Timestamp for the unix time', ()),
 SchemaField('title', 'string', 'NULLABLE', 'Story title', ()),
 SchemaField('type', 'string', 'NULLABLE', 'Type of details (comment, comment_ranking, poll, story, job, pollopt)', ()),
 SchemaField('url', 'string', 'NULLABLE', 'Story url', ()),
 SchemaField('text', 'string', 'NULLABLE', 'Story or comment text', ()),
 SchemaField('parent', 'integer', 'NULLABLE', 'Parent comment ID', ()),
 SchemaField('deleted', 'boolean', 'NULLABLE', 'Is deleted?', ()),
 SchemaField('dead', 'boolean', 'NULLABLE', 'Is dead?', ()),
 SchemaField('descendants', 'integer', 'NULLABLE', 'Number of story or poll descendants', ()),
 SchemaField('id', 'integer', 'NULLABLE', "The item's unique id.", ()),
 SchemaField('ran

In [26]:
# Preview the first few rows of the `comments` table.
hacker_news.head("comments")

Unnamed: 0,id,by,author,time,time_ts,text,parent,deleted,dead,ranking
0,2701393,5l,5l,1309184881,2011-06-27 14:28:01+00:00,And the glazier who fixed all the broken windo...,2701243,,,0
1,5811403,99,99,1370234048,2013-06-03 04:34:08+00:00,Does canada have the equivalent of H1B/Green c...,5804452,,,0
2,21623,AF,AF,1178992400,2007-05-12 17:53:20+00:00,"Speaking of Rails, there are other options in ...",21611,,,0
3,10159727,EA,EA,1441206574,2015-09-02 15:09:34+00:00,Humans and large livestock (and maybe even pet...,10159396,,,0
4,2988424,Iv,Iv,1315853580,2011-09-12 18:53:00+00:00,I must say I reacted in the same way when I re...,2988179,,,0


In [25]:
# Preview the first few rows of the `full_201510` table.
hacker_news.head("full_201510")

Unnamed: 0,by,score,time,title,type,url,text,parent,deleted,dead,descendants,id,ranking
0,danmaz74,,1438616833,,comment,,Do those analyses also account for the energy ...,9996591,,,,9996887,
1,gbraad,,1438616954,,comment,,Full stack engineer who&#x27;s proficient with...,9996335,,,,9996912,
2,qeorge,,1260997308,,comment,,<i>It seems that IE has been a consistent impe...,999251,,,,999709,
3,Flammy,,1438617965,,comment,,"Hey all,<p>Ivy Softworks is looking for talent...",9996333,,,,9997096,
4,scottliquid15,,1438618127,,comment,,LiquidTalent is an exclusive marketplace for d...,9996333,,,,9997137,


In [9]:
# Preview the first few rows of the `full` table.
hacker_news.head("full")

Unnamed: 0,by,score,time,timestamp,title,type,url,text,parent,deleted,dead,descendants,id,ranking
0,thephyber,,1498591140,2017-06-27 19:19:00+00:00,,comment,,"Not an accountant, but the article states is a...",14648377,,,,14648408,
1,leephillips,,1367364826,2013-04-30 23:33:46+00:00,,comment,,A stupid population makes stupid things popular.,5635548,,,,5635574,
2,dogma1138,,1461516417,2016-04-24 16:46:57+00:00,,comment,,That&#x27;s why i said blimps are the way to g...,11560183,,,,11560307,
3,TeMPOraL,,1464082961,2016-05-24 09:42:41+00:00,,comment,,&gt; <i>I know this sounds kind of &quot;well ...,11759278,,,,11760085,
4,davidf18,,1490218212,2017-03-22 21:30:12+00:00,,comment,,Mobile VR or Augmented VR could be cool.,13932571,,,,13935174,


In [10]:
# Top 10 linked domains among 2017 items: pull the host part out of `url`
# (the text between "//" and the next "/"), skip rows whose url is empty,
# count rows per domain and keep the 10 largest counts.
query = """SELECT REGEXP_EXTRACT(url , '//([^/]*)/?') domain, COUNT(*) c
           FROM `bigquery-public-data.hacker_news.full`
           WHERE url != '' AND EXTRACT(YEAR FROM timestamp) = 2017
           GROUP BY domain ORDER BY c DESC LIMIT 10"""

# Only estimate how much data the query would scan; nothing is executed here.
# NOTE(review): the returned number is presumably in GB - confirm in bq_helper docs.
hacker_news.estimate_query_size(query)

0.3270744448527694

In [22]:
# Top 10 linked domains among 2016 items: pull the host part out of `url`
# (the text between "//" and the next "/"), skip rows whose url is empty,
# count rows per domain and keep the 10 largest counts.
# This rebinds the notebook-global `query`, so later cells that read `query`
# run this 2016 variant.
query = """SELECT REGEXP_EXTRACT(url , '//([^/]*)/?') domain, COUNT(*) c
           FROM `bigquery-public-data.hacker_news.full`
           WHERE url != '' AND EXTRACT(YEAR FROM timestamp) = 2016
           GROUP BY domain ORDER BY c DESC LIMIT 10"""

# Only estimate how much data the query would scan; nothing is executed here.
# NOTE(review): the returned number is presumably in GB - confirm in bq_helper docs.
hacker_news.estimate_query_size(query)

0.3270744448527694

In [23]:
# Execute whatever SQL is currently bound to the global `query` and keep the
# result. The outcome depends on which query cell ran last; the variable name
# does not record the year that was queried.
# NOTE(review): the "_safe" variant presumably refuses to run queries above a
# scan-size limit - confirm the default limit in the bq_helper docs.
top_10_websites = hacker_news.query_to_pandas_safe(query)

In [24]:
# Display the domain/count table returned by the query.
top_10_websites

Unnamed: 0,domain,c
0,medium.com,18451
1,github.com,15029
2,www.youtube.com,9433
3,www.nytimes.com,6316
4,techcrunch.com,4074
5,www.theguardian.com,3536
6,www.bloomberg.com,3210
7,arstechnica.com,3021
8,www.bbc.com,2239
9,en.wikipedia.org,2199


In [30]:
# Fetch the 10 most recent `timestamp` values in the `full` table, newest
# first. There is no filter on `type`, so every item kind is included.
query = """SELECT timestamp
           FROM `bigquery-public-data.hacker_news.full`
           ORDER BY timestamp DESC LIMIT 10"""

# Only estimate how much data the query would scan; nothing is executed here.
# NOTE(review): the returned number is presumably in GB - confirm in bq_helper docs.
hacker_news.estimate_query_size(query)

0.1233450323343277

In [31]:
# Execute whatever SQL is currently bound to the global `query` and keep the
# result.
# NOTE(review): despite the name, the latest-timestamps query defined just
# before this cell does not filter on item type, so the result is not limited
# to comments - consider a rename or a type filter.
recent_comments = hacker_news.query_to_pandas_safe(query)

In [32]:
# Display the timestamps returned by the query.
recent_comments

Unnamed: 0,timestamp
0,2018-03-14 09:15:23+00:00
1,2018-03-14 09:15:12+00:00
2,2018-03-14 09:15:06+00:00
3,2018-03-14 09:14:59+00:00
4,2018-03-14 09:14:59+00:00
5,2018-03-14 09:14:49+00:00
6,2018-03-14 09:14:20+00:00
7,2018-03-14 09:14:08+00:00
8,2018-03-14 09:14:00+00:00
9,2018-03-14 09:13:40+00:00


In [33]:
# Select every column of every row in the `full` table, newest first. There is
# no LIMIT, so this is by far the largest query in the notebook.
query = """SELECT *
           FROM `bigquery-public-data.hacker_news.full`
           ORDER BY timestamp DESC"""

# Only estimate how much data the query would scan; the query itself is not
# executed in this cell.
# NOTE(review): the returned number is presumably in GB - confirm in bq_helper docs.
hacker_news.estimate_query_size(query)

6.131798793561757

In [None]:
# Stream a CSV file in pieces of one million rows and hand each piece to
# `process`, instead of loading the whole file at once.
# NOTE(review): neither `filename` nor `process` is defined in any cell visible
# in this notebook, and this cell has never been executed - it will raise
# NameError as written. Define both (or remove the cell) before Run All.
chunksize = 10 ** 6
for chunk in pd.read_csv(filename, chunksize=chunksize):
    process(chunk)