In [14]:
import csv
import json
import sqlite3
from pathlib import Path

#from itertools import islice, count
import itertools
from datetime import datetime
import numpy as np
import pandas as pd
from hn import search_by_date

In [15]:
def get_type(post):
    """Pick the post-type label out of a post's ``_tags`` list.

    The choice depends only on how many tags there are:

    * 3 tags, or 4 tags ending in ``'front_page'``: the first tag.
    * any other 4-tag list: the last tag.
    * everything else: the second-to-last tag.

    Parameters
    ----------
    post : mapping
        A post record exposing a ``'_tags'`` sequence.

    Returns
    -------
    The selected element of ``post['_tags']``.
    """
    tags = post['_tags']
    tag_count = len(tags)
    ends_with_front_page = tag_count == 4 and tags[-1] == 'front_page'

    if tag_count == 3 or ends_with_front_page:
        return tags[0]
    if tag_count == 4:
        return tags[-1]
    return tags[-2]


##### Updated with latest posts

In [16]:
# Column names of the dataset, in file order; COLUMNS_STR is the same list
# joined into a single comma-separated line.
COLUMNS = [
    'Object ID',
    'Title',
    'Post Type',
    'Author',
    'Created At',
    'URL',
    'Points',
    'Number of Comments',
]
COLUMNS_STR = ','.join(COLUMNS)
COLUMNS

['Object ID',
 'Title',
 'Post Type',
 'Author',
 'Created At',
 'URL',
 'Points',
 'Number of Comments']

In [17]:
# Directory holding the Hacker News CSV files. The path is relative, so the
# assert stops the notebook early when it is run from a working directory
# that does not contain this folder.
BASE_DATA_DIRECTORY = Path('hn_data/')
assert BASE_DATA_DIRECTORY.exists()

In [18]:
# Path of the main dataset CSV inside BASE_DATA_DIRECTORY.
MAIN_DATA_FILE = BASE_DATA_DIRECTORY / 'hn.csv'  # new data will be appended to this file

In [19]:
# Upper bound for the download loop. It is compared against the loop's chunk
# counter (`step`), so it caps the number of chunks, not individual posts.
LIMIT = 5000
# Passed as the `buffering` argument when the partial CSV is opened for
# writing: 1024 * 1024 * 100 bytes = 100 MiB.
BUFFER_SIZE = 1024 * 1024 * 100

First record:

In [20]:
# Peek at a single row (nrows=1) of the main file instead of loading all of it.
pd.read_csv(MAIN_DATA_FILE, nrows=1)

Unnamed: 0,Object ID,Title,Post Type,Author,Created At,URL,Points,Number of Comments
0,1,Y Combinator,story,pg,2006-10-09 18:21:51,http://ycombinator.com,61,18.0


Parse posts already downloaded:

In [34]:
# Load the posts that are already on disk.
# - MAIN_DATA_FILE is used instead of repeating the literal path, so this cell
#   always reads the file the rest of the notebook refers to.
# - 'Post Type' is loaded as a categorical column.
# - 'Created At' is parsed into datetimes; the following cells read the latest
#   timestamp from it.
df = pd.read_csv(
    MAIN_DATA_FILE,
    dtype={'Post Type': 'category'},
    parse_dates=['Created At'],
)

Make sure the file is sorted by `Created At` (checked by confirming that the last row holds the latest timestamp):

In [38]:
# Proxy for "sorted": the last row must carry the latest 'Created At' value.
# This does not verify the ordering of the earlier rows.
assert df['Created At'].max() == df.iloc[-1]['Created At']

In [64]:
# 'Created At' of the final stored row; used below as the lower bound of the search.
last_datetime = df['Created At'].iloc[-1]
last_datetime

Timestamp('2019-03-31 22:07:47')

Fetch every post created after the last stored timestamp (note: the query below uses `last_datetime` as-is; no extra time padding is applied):

In [81]:
# Query for posts created after `last_datetime`, formatted as
# 'YYYY-MM-DD HH:MM:SS'.
# NOTE(review): `search_by_date` comes from the external `hn` package. Judging
# by their names, the flags select stories, Ask HN, Show HN and polls —
# confirm the flag semantics and the accepted date format against that package.
# `results` is consumed incrementally (itertools.islice) by the download cell.
results = search_by_date(
    stories=True, ask_hn=True, show_hn=True, polls=True, hits_per_page=1000,
    created_at__gt=last_datetime.strftime('%Y-%m-%d %H:%M:%S'))

In [82]:
# The partial download always goes to the same file name. A timestamped
# alternative, kept for reference:
#   f"partial_hn_{int(datetime.utcnow().timestamp())}.csv"
NEW_FILE_NAME = "partial_hn.csv"
NEW_FILE_PATH = BASE_DATA_DIRECTORY.joinpath(NEW_FILE_NAME)
print("New data will be saved into '%s'" % NEW_FILE_NAME)

New data will be saved into 'partial_hn.csv'


In [83]:
#assert False, "WARNING: Comment me out"

# Number of posts pulled from `results` for each write to disk.
CHUNK_SIZE = 1000

# Stream `results` into the partial CSV, one chunk at a time.
# - newline='' is how the csv module expects its output file to be opened;
#   without it, platforms that translate line endings get blank rows.
# - encoding='utf-8' makes the file independent of the platform default and
#   matches what pd.read_csv assumes when the file is read back.
with NEW_FILE_PATH.open('w', buffering=BUFFER_SIZE, newline='', encoding='utf-8') as fp:
    writer = csv.writer(fp)
    saved = 0  # rows actually written so far
    for step in itertools.count():
        # islice advances the shared `results` iterator, so each pass picks
        # up where the previous one stopped.
        chunk = itertools.islice(results, CHUNK_SIZE)
        posts = [(
            post['objectID'],
            post['title'],
            get_type(post),
            post['author'],
            post['created_at_i'],
            post['url'],
            post['points'],
            post['num_comments'],
        ) for post in chunk]
        if not posts:
            print("DONE. No more posts to process.")
            break
        # LIMIT is compared against the chunk counter, not the row count.
        if step > LIMIT:
            print("Limit %s exceeded. Quitting." % LIMIT)
            break

        writer.writerows(posts)
        saved += len(posts)
        # Report every 10 chunks, using the real row count so that a short
        # chunk is not over-reported.
        if (step + 1) % 10 == 0:
            print("Saved %s records" % saved)

Saved 10000 records
Saved 20000 records
Saved 30000 records
Saved 40000 records
Saved 50000 records
Saved 60000 records
DONE. No more posts to process.


Newly downloaded partial file:

In [84]:
!wc -l hn_data/partial_hn.csv

   63497 hn_data/partial_hn.csv


### Concatenate both parts

In [85]:
# Column names of the main frame; they are reused as the header of the partial file below.
df.columns.values

array(['Object ID', 'Title', 'Post Type', 'Author', 'Created At', 'URL',
       'Points', 'Number of Comments'], dtype=object)

In [86]:
# Read the partial download back.
# - NEW_FILE_PATH is used instead of a literal path, so this reads exactly the
#   file the download cell wrote, even if NEW_FILE_NAME is changed.
# - The partial file is written without a header row, so the column names are
#   supplied explicitly, taken from the main frame to keep both frames aligned.
partial = pd.read_csv(
    NEW_FILE_PATH,
    dtype={'Post Type': 'category'},
    names=df.columns.values,
)

In [87]:
# The download cell stored `created_at_i` in this column; convert those
# numbers to datetimes, interpreting them as seconds (unit='s').
partial['Created At'] = pd.to_datetime(partial['Created At'], unit='s')

In [88]:
# Is the downloaded data already in ascending 'Created At' order?
# `is_monotonic_increasing` is used because `Series.is_monotonic` was
# deprecated and later removed from pandas.
partial['Created At'].is_monotonic_increasing

False

In [89]:
# Put the new posts in chronological order.
partial = partial.sort_values(by='Created At')

In [90]:
# Show the first rows of the partial frame after sorting.
partial.head()

Unnamed: 0,Object ID,Title,Post Type,Author,Created At,URL,Points,Number of Comments
63496,19538773,Innovations in Scientific Publishing,story,juretriglav,2019-03-31 22:13:42,https://juretriglav.si/innovations-in-scientif...,2,0
63495,19538779,WordPress theme provider Pipdig using customer...,story,JamieF1,2019-03-31 22:14:40,https://www.jemjabella.co.uk/2019/security-ale...,325,81
63494,19538814,"The big picture: Scott Walker, 1943-2019",story,drugme,2019-03-31 22:21:27,https://www.theguardian.com/music/2019/mar/31/...,2,0
63493,19538839,Succulent Poaching in California,story,mykowebhn,2019-03-31 22:28:36,https://www.sfgate.com/science/article/poachin...,66,15
63492,19538845,Intel’s First Confirmed Xe GPU Product: The Au...,story,rbanffy,2019-03-31 22:30:02,https://wccftech.com/intels-first-confirmed-xe...,6,0


In [99]:
# Stack the already-stored posts and the newly downloaded ones.
# pd.concat is used because DataFrame.append was deprecated and then removed
# in pandas 2.0; like append, it keeps each frame's original index labels.
new_df = pd.concat([df, partial])

In [101]:
# The concatenation must neither gain nor lose rows.
len(new_df) == len(df) + len(partial)

True

In [103]:
# True if any 'Object ID' occurs more than once in the combined frame.
new_df['Object ID'].duplicated().any()

False

In [104]:
# Number of rows whose 'Object ID' repeats an earlier row's value.
new_df['Object ID'].duplicated().sum()

0

In [105]:
# Persist the combined frame (stored posts plus the newly downloaded ones).
# Writing `df` here would only rewrite the rows that were already on disk and
# silently drop everything fetched above. index=False keeps the index labels
# out of the file.
new_df.to_csv(MAIN_DATA_FILE, index=False)