/
01_fetch_hn_data.py
117 lines (97 loc) · 3.4 KB
/
01_fetch_hn_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
from tqdm import tqdm
import asyncio
import aiohttp
import sqlite3
import random
# Maximum number of attempts made per URL before giving up
RETRIES = 5
# Base delay in seconds for the exponential backoff between attempts
DELAY = 1


async def fetch(session, url, progress, *, retries=RETRIES, delay=DELAY):
    """Fetch ``url`` and return its decoded JSON body, retrying on failure.

    Args:
        session: Object whose ``get(url)`` returns an async context manager
            yielding a response with ``raise_for_status()`` and an awaitable
            ``json()`` (this script passes an ``aiohttp.ClientSession``).
        url: Address to request.
        progress: Progress bar; ``update(1)`` is called once on success.
        retries: Maximum number of attempts. Defaults to ``RETRIES``.
        delay: Base backoff delay in seconds. Defaults to ``DELAY``.

    Returns:
        The decoded JSON payload, or ``None`` when every attempt failed
        (a failure message is printed in that case).
    """
    for attempt in range(retries):
        try:
            async with session.get(url) as response:
                # Treat HTTP error statuses as failures so they are retried
                # instead of having their error body returned as data.
                response.raise_for_status()
                data = await response.json()
        except Exception:
            # Best-effort fetch: any failure leads to another attempt.
            # Exponential backoff with jitter, skipped after the final
            # attempt because nothing is left to wait for.
            if attempt < retries - 1:
                await asyncio.sleep(delay * 2**attempt + random.uniform(0.1, 0.3))
        else:
            progress.update(1)  # Count the successfully fetched item
            return data
    print(f"Failed to fetch {url} after {retries} retries")
    return None
async def fetch_stories(session, story_ids, progress, c):
    """Fetch every item in ``story_ids`` that is not yet in the database.

    Args:
        session: HTTP session forwarded unchanged to ``fetch``.
        story_ids: Iterable of item ids to consider.
        progress: Progress bar. It is advanced by one for each id that is
            already stored, and is also handed to ``fetch`` for the ids
            that are downloaded.
        c: Database cursor used to look ids up in the ``stories`` table.

    Returns:
        A list holding the ``fetch`` result for each id that was missing
        from the table, in the order the ids were given.
    """
    pending = []
    for story_id in story_ids:
        # Check if the story already exists in the database
        c.execute("SELECT 1 FROM stories WHERE id = ?", (story_id,))
        if c.fetchone() is not None:
            # Already stored: still count it, otherwise the bar can never
            # reach its total on a resumed run.
            progress.update(1)
            continue
        story_url = f"https://hacker-news.firebaseio.com/v0/item/{story_id}.json"
        pending.append(fetch(session, story_url, progress))
    # Run all downloads for this batch concurrently
    return await asyncio.gather(*pending)
# Open (or create) the SQLite database file and make sure the table exists
conn = sqlite3.connect("./hn.db")
c = conn.cursor()
c.execute(
    """
    CREATE TABLE IF NOT EXISTS stories
    (id INT PRIMARY KEY NOT NULL,
    deleted BOOLEAN,
    type TEXT,
    by TEXT,
    time INT,
    text TEXT,
    dead BOOLEAN,
    parent INT,
    poll INT,
    kids TEXT,
    url TEXT,
    score INT,
    title TEXT,
    parts TEXT,
    descendants INT);
    """
)
conn.commit()

# Item ids to crawl (described by the original author as the stories of the
# last year). Kept as a range instead of a list: len() and slicing behave the
# same, but the ~3.57 million ints are never materialised in memory.
story_ids = range(34209496, 37778496)
# Number of ids processed (and fetched concurrently) per batch
chunk_size = 500
# One tick per id in story_ids
progress = tqdm(total=len(story_ids))
def _story_row(story):
    """Return the column values for one item payload, in table column order."""
    return (
        story["id"],
        story.get("deleted"),
        story.get("type"),
        story.get("by"),
        story.get("time"),
        story.get("text"),
        story.get("dead"),
        story.get("parent"),
        story.get("poll"),
        # NOTE(review): str() stores the literal text "None" (not NULL) when
        # the key is absent; kept unchanged because readers of the database
        # are not visible from here and may rely on it. Same for "parts".
        str(story.get("kids")),
        story.get("url"),
        story.get("score"),
        story.get("title"),
        str(story.get("parts")),
        story.get("descendants"),
    )


async def main():
    """Download all missing items in ``story_ids`` and store them in SQLite.

    Walks the module-level ``story_ids`` in slices of ``chunk_size``, fetches
    each slice through ``fetch_stories``, inserts the returned payloads into
    the ``stories`` table and commits after every slice. ``None`` results are
    skipped; payloads that are not a mapping with an ``"id"`` key are
    reported and skipped instead of aborting the run with a ``KeyError``.
    """
    insert_sql = """
        INSERT OR IGNORE INTO stories (id, deleted, type, by, time, text, dead, parent, poll, kids, url, score, title, parts, descendants)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    """
    async with aiohttp.ClientSession() as session:
        for start in range(0, len(story_ids), chunk_size):
            chunk_ids = story_ids[start : start + chunk_size]
            # Fetch individual stories
            stories = await fetch_stories(session, chunk_ids, progress, c)
            rows = []
            for story in stories:
                if story is None:
                    continue
                if not (isinstance(story, dict) and "id" in story):
                    print(f"Skipping payload without an id: {story!r}")
                    continue
                rows.append(_story_row(story))
            # Insert the whole batch, then persist it so an interrupted run
            # only loses the batch that was in flight.
            c.executemany(insert_sql, rows)
            conn.commit()
# Run the main function, then release the progress bar and the database
# connection even when the run fails or is interrupted.
try:
    asyncio.run(main())
finally:
    progress.close()
    conn.close()