In [1]:
import csv
import json
import sqlite3
from pathlib import Path

#from itertools import islice, count
import itertools
from datetime import datetime
import numpy as np
import pandas as pd
from hn import search_by_date

In [2]:
def get_type(post):
    """Return the post-type label derived from a post's ``_tags`` list.

    The label is chosen purely by the number and position of the tags:

    * three tags, or four tags ending in ``'front_page'`` -> the first tag;
    * any other four-tag list -> the last tag;
    * every other length -> the second-to-last tag.
    """
    tags = post['_tags']
    n_tags = len(tags)
    ends_on_front_page = n_tags == 4 and tags[-1] == 'front_page'

    if n_tags == 3 or ends_on_front_page:
        return tags[0]
    if n_tags == 4:
        return tags[-1]
    return tags[-2]

# Debug logging setup for the 'hn' logger, switched off by wrapping it in a
# bare string literal; none of the lines inside it are executed.
"""
import logging

logger = logging.getLogger('hn')

logger.setLevel(logging.DEBUG)
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
ch.setFormatter(formatter)
logger.addHandler(ch)
"""
# Final expression of the cell is None, so the cell displays no output.
None

##### Update with the latest posts

In [3]:
# Column names of the posts table, kept both as one comma-separated string
# and as the list obtained by splitting that string.
COLUMNS_STR = (
    "Title,Post Type,Author,Object ID,"
    "Created At,URL,Points,Number of Comments"
)
COLUMNS = COLUMNS_STR.split(sep=',')
COLUMNS

['Title',
 'Post Type',
 'Author',
 'Object ID',
 'Created At',
 'URL',
 'Points',
 'Number of Comments']

In [4]:
# Folder that holds the Hacker News data files used by this notebook.
BASE_DATA_DIRECTORY = Path('hn_data/')
# Fail early and say where we looked: the path is relative, so it is resolved
# against the current working directory.
assert BASE_DATA_DIRECTORY.exists(), (
    f"Data directory '{BASE_DATA_DIRECTORY}' not found (cwd: {Path.cwd()})"
)

In [5]:
# Path of the main dataset CSV inside the base data directory.
MAIN_DATA_FILE = BASE_DATA_DIRECTORY / 'hn.csv'  # new data will be appended to this file

In [6]:
# Cap on the number of chunk iterations performed by the download loop.
LIMIT = 5_000
# Buffer size (in bytes) passed to open() for the output file: 100 MiB.
BUFFER_SIZE = 100 * 1024 ** 2

In [7]:
# Load only the first data row to check the file's header and column layout.
pd.read_csv(MAIN_DATA_FILE, nrows=1)

Unnamed: 0,Title,Post Type,Author,Object ID,Created At,URL,Points,Number of Comments
0,The real connection between interview performa...,story,leeny,19256573,1551203302,http://blog.interviewing.io/there-is-a-real-co...,1,0.0


In [8]:
# New rows are appended to the END of the main file, so the newest post is not
# guaranteed to be the first data row. Take the maximum of the whole
# 'Created At' column instead of trusting row 0.
last_timestamp = int(
    pd.read_csv(MAIN_DATA_FILE, usecols=['Created At'])['Created At'].max()
)

# 'Created At' holds epoch seconds; convert to a naive UTC datetime so it can
# be formatted for the search query.
last_datetime = datetime.utcfromtimestamp(last_timestamp)
last_datetime

datetime.datetime(2019, 2, 26, 17, 48, 22)

In [9]:
# Ask the hn client for stories, Ask HN, Show HN and polls created after
# `last_datetime`.
# NOTE(review): the `__gt` suffix presumably means "strictly greater than" —
# confirm against the hn client.
created_after = last_datetime.strftime('%Y-%m-%d %H:%M:%S')
search_options = dict(
    stories=True,
    ask_hn=True,
    show_hn=True,
    polls=True,
    hits_per_page=1000,
    created_at__gt=created_after,
)
results = search_by_date(**search_options)

In [10]:
# Alternative kept for reference: a distinct, timestamped file per download run.
#NEW_FILE_NAME = f"partial_hn_{int(datetime.utcnow().timestamp())}.csv"
# Plain string: there is nothing to interpolate, so no f-prefix is needed.
NEW_FILE_NAME = "partial_hn.csv"
NEW_FILE_PATH = BASE_DATA_DIRECTORY / NEW_FILE_NAME
print(f"New data will be saved into '{NEW_FILE_NAME}'")

New data will be saved into 'partial_hn.csv'


In [11]:
#assert False, "WARNING: Comment me out"

# Number of posts pulled from `results` and written per loop iteration.
CHUNK_SIZE = 1000

# newline='' is what the csv module expects of files it writes to (otherwise
# line endings can be translated a second time); utf-8 keeps non-ASCII titles
# independent of the platform's default encoding.
with NEW_FILE_PATH.open('w', buffering=BUFFER_SIZE, newline='', encoding='utf-8') as fp:
    writer = csv.writer(fp)
    saved = 0  # rows actually written so far
    for step in itertools.count():
        # Check the cap BEFORE pulling another chunk, so no fetched posts are
        # thrown away and at most LIMIT chunks are written.
        if step >= LIMIT:
            print("Limit %s exceeded. Quitting." % LIMIT)
            break

        chunk = itertools.islice(results, CHUNK_SIZE)
        # One CSV row per post; no header row is written to this file.
        posts = [(
            post['title'],
            get_type(post),
            post['author'],
            post['objectID'],
            post['created_at_i'],
            post['url'],
            post['points'],
            post['num_comments'],
        ) for post in chunk]
        if not posts:
            print("DONE. No more posts to process.")
            break

        writer.writerows(posts)
        saved += len(posts)
        # Report progress every 10 chunks, using the real row count.
        if (step + 1) % 10 == 0:
            print("Saved %s records" % saved)

Saved 10000 records
Saved 20000 records
Saved 30000 records
Saved 40000 records
Saved 50000 records
Saved 60000 records
Saved 70000 records
Saved 80000 records
Saved 90000 records
DONE. No more posts to process.


Newly downloaded partial file:

In [12]:
# Line count of the partial file written by the download loop.
!wc -l hn_data/partial_hn.csv

   90856 hn_data/partial_hn.csv


Previous `hn.csv` file:

In [13]:
# Line count of the main file, for comparison with the count taken after the append.
!wc -l hn_data/hn.csv

 2645700 hn_data/hn.csv


In [88]:
!cat hn_data/partial_hn.csv >> hn_data/hn.csv

In [89]:
# Line count of the main file after the append.
# NOTE(review): the saved output equals the earlier count, and execution
# counts jump from 13 to 88, so one of the two outputs looks stale — re-run the
# cells in order to confirm.
!wc -l hn_data/hn.csv

 2645700 hn_data/hn.csv


---

## Cleaning downloaded posts

In [90]:
import pandas as pd

In [91]:
# Load the full dataset; 'Post Type' is parsed as a pandas categorical.
df = pd.read_csv(
    'hn_data/hn.csv',
    dtype={'Post Type': 'category'},
)

In [92]:
# 'Created At' is read as epoch seconds; replace it with pandas datetimes.
epoch_seconds = df['Created At']
df['Created At'] = pd.to_datetime(epoch_seconds, unit='s')

In [93]:
# Summarise the frame: row count, per-column dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2645334 entries, 0 to 2645333
Data columns (total 8 columns):
Title                 object
Post Type             category
Author                object
Object ID             int64
Created At            datetime64[ns]
URL                   object
Points                int64
Number of Comments    float64
dtypes: category(1), datetime64[ns](1), float64(1), int64(2), object(3)
memory usage: 143.8+ MB


In [94]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,Title,Post Type,Author,Object ID,Created At,URL,Points,Number of Comments
0,The real connection between interview performa...,story,leeny,19256573,2019-02-26 17:48:22,http://blog.interviewing.io/there-is-a-real-co...,1,0.0
1,Disco: Modern Session Encryption,story,baby,19256570,2019-02-26 17:48:05,https://eprint.iacr.org/2019/180,2,0.0
2,Selenized: A CIELAB color space theme for term...,story,pera,19256558,2019-02-26 17:47:03,https://github.com/jan-warchol/selenized,2,0.0
3,What Do Humanitarian Groups Do with Surplus Su...,story,tshannon,19256554,2019-02-26 17:46:45,https://www.npr.org/sections/goatsandsoda/2019...,2,0.0
4,New TLS Padding Oracles,story,jakobdabo,19256553,2019-02-26 17:46:43,https://github.com/RUB-NDS/TLS-Padding-Oracles,2,0.0


In [95]:
# np.int was only an alias for the builtin int and has been removed from
# NumPy, so referencing it raises AttributeError on current versions. Check
# for an integer dtype explicitly instead.
assert pd.api.types.is_integer_dtype(df['Object ID']), (
    f"'Object ID' should be an integer column, got {df['Object ID'].dtype}"
)

In [96]:
# (rows, columns) of the loaded frame.
df.shape

(2645334, 8)

In [98]:
# Count rows whose 'Object ID' repeats an earlier row (0 means every ID is unique).
df['Object ID'].duplicated().sum()

0