-
Notifications
You must be signed in to change notification settings - Fork 116
/
fetch.js
394 lines (352 loc) · 12.2 KB
/
fetch.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
// @flow
/**
* Class for retrieving data from the Discourse API.
*
* The Discourse API implements the JSON endpoints for all functionality of the actual site.
* As a result, it tends to return a lot of information that we don't care about (in contrast
* to a GraphQL API, which would give us only what we ask for). We therefore implement a simple
* interface over it, which both abstracts over calling the API and does some post-processing
* on the results to reduce them to the data that is relevant for us.
*/
import fetch from "isomorphic-fetch";
import Bottleneck from "bottleneck";
import * as NullUtil from "../../util/null";
// Numeric id aliases. They are plain numbers at runtime; the distinct names
// only document which kind of id a field or parameter is expected to hold.
export type UserId = number;
export type PostId = number;
export type TopicId = number;
export type CategoryId = number;
/**
* The "view" received from the Discourse API
* when getting a topic by ID.
*
* This view omits some relevant data, such as bumpedMs (see TopicLatest);
* keeping it as a separate type makes that distinction clear.
*/
export type TopicView = {|
+id: TopicId,
+categoryId: CategoryId,
+title: string,
// Creation time of the topic, in milliseconds since the epoch.
+timestampMs: number,
+authorUsername: string,
|};
/**
* The "latest" format Topic from the Discourse API
* when getting a list of sorted topics.
*
* This format omits some relevant data, such as authorUsername (see TopicView);
* keeping it as a separate type makes that distinction clear.
*/
export type TopicLatest = {|
+id: TopicId,
+categoryId: CategoryId,
+title: string,
// Creation time of the topic, in milliseconds since the epoch.
+timestampMs: number,
// Time the topic was last bumped, in milliseconds since the epoch.
+bumpedMs: number,
|};
/**
* A complete Topic object: it carries every field of both TopicView and
* TopicLatest (i.e. both authorUsername and bumpedMs).
*/
export type Topic = {|
...TopicView,
...TopicLatest,
|};
/**
* A single post within a topic, reduced to the fields this module exposes.
*/
export type Post = {|
+id: PostId,
+topicId: TopicId,
// Which number post this was within the topic (starts at 1)
+indexWithinTopic: number,
// The indexWithinTopic of the post within the same topic that this post was a
// reply to. Will be `null` if this post was the first post, or if it was a
// reply to the first post.
+replyToPostIndex: number | null,
// Creation time of the post, in milliseconds since the epoch.
+timestampMs: number,
+authorUsername: string,
// The post HTML for rendering.
+cooked: string,
|};
/**
* A topic (in its "view" form) bundled with the posts it contains.
*/
export type TopicWithPosts = {|
+topic: TopicView,
// Guaranteed to contain all the Posts in the topic.
+posts: $ReadOnlyArray<Post>,
|};
/**
* A single "like" action: one user liking one post at a point in time.
*/
export type LikeAction = {|
// The user who liked something
+username: string,
// The post being liked
+postId: PostId,
// Time of the like action, in milliseconds since the epoch.
+timestampMs: number,
|};
/**
* Interface over the external Discourse API, structured to suit our particular needs.
* We have an interface (as opposed to just an implementation) to enable easy mocking and
* testing.
*/
export interface Discourse {
// Retrieve the Topic with Posts for a given id.
// Will resolve to null if the response status is 403 or 404. 403 because the
// topic may be hidden from the API user; 404 because we sometimes see
// 404s in prod and want to ignore those topic ids. (Not sure why it happens.)
// The Fetcher implementation in this file also resolves to null on 410.
// May reject if the status is not OK and is not one of the statuses above.
topicWithPosts(id: TopicId): Promise<TopicWithPosts | null>;
/**
* Retrieves the like actions that were initiated by the target user.
* `offset` is forwarded as the `offset` query parameter of the request.
* May be 404 on the server, which will return a null here.
*/
likesByUser(
targetUsername: string,
offset: number
): Promise<LikeAction[] | null>;
// Gets the topic IDs for every "about-x-category" topic.
// Discourse calls this a "definition" topic.
categoryDefinitionTopicIds(): Promise<Set<TopicId>>;
/**
* Fetches Topics that have been bumped to a higher timestamp than `sinceMs`.
*
* Note: this will not be able to find "about-x-category" category definition topics,
* due to a hard-coded filter in the API.
* https://github.com/discourse/discourse/blob/594925b8965a26c512665371092fec3383320b58/app/controllers/list_controller.rb#L66
*
* Use categoryDefinitionTopicIds() to find those topics.
*/
topicsBumpedSince(sinceMs: number): Promise<TopicLatest[]>;
}
// Request budget per minute. The Fetcher constructor divides one minute by this
// value to derive the default minimum spacing between requests.
const MAX_API_REQUESTS_PER_MINUTE = 55;
export class Fetcher implements Discourse {
// We limit the rate of API requests, as documented here:
// https://meta.discourse.org/t/global-rate-limits-and-throttling-in-discourse/78612
// Note this limit is for admin API keys. If we change to user user API keys
// (would be convenient as the keys would be less sensitive), we will need to lower
// this rate limit by a factor of 3
// TODO: I've set the max requests per minute to 55 (below the stated limit
// of 60) to be a bit conservative, and avoid getting limited by the server.
// We could improve our throughput by increasing the requests per minute to the
// stated limit, and incorporating retry logic to account for the occasional 529.
+options: DiscourseFetchOptions;
+_fetchImplementation: typeof fetch;
constructor(
options: DiscourseFetchOptions,
// fetchImplementation shouldn't be provided by clients, but is convenient for testing.
fetchImplementation?: typeof fetch,
// Used to avoid going over the Discourse API rate limit
minTimeMs?: number
) {
this.options = options;
const minTime = NullUtil.orElse(
minTimeMs,
(1000 * 60) / MAX_API_REQUESTS_PER_MINUTE
);
// n.b. the rate limiting isn't programmatically tested. However, it's easy
// to tell when it's broken: try to load a nontrivial Discourse server, and see
// if you get a 429 failure.
const limiter = new Bottleneck({minTime});
const unlimitedFetch = NullUtil.orElse(fetchImplementation, fetch);
this._fetchImplementation = limiter.wrap(unlimitedFetch);
}
_fetch(endpoint: string): Promise<Response> {
const {serverUrl} = this.options;
if (!endpoint.startsWith("/")) {
throw new Error(`invalid endpoint: ${endpoint}`);
}
if (!serverUrl.startsWith("http") || serverUrl.endsWith("/")) {
throw new Error(`invalid server url: ${serverUrl}`);
}
const fetchOptions = {
method: "GET",
headers: {
Accept: "application/json",
},
};
const fullUrl = `${serverUrl}${endpoint}`;
return this._fetchImplementation(fullUrl, fetchOptions);
}
async categoryDefinitionTopicIds(): Promise<Set<TopicId>> {
const topicIdRE = new RegExp("/t/[\\w-]+/(\\d+)$");
const urls: string[] = [];
const categoriesWithSubcategories: CategoryId[] = [];
// Root categories
const response = await this._fetch(
`/categories.json?show_subcategory_list=true`
);
failIfMissing(response);
failForNotOk(response);
const {categories: rootCategories} = (await response.json()).category_list;
for (const cat of rootCategories) {
if (cat.topic_url != null) {
urls.push(cat.topic_url);
}
if (cat.subcategory_ids) {
categoriesWithSubcategories.push(cat.id);
}
}
// Subcategories
for (const rootCatId of categoriesWithSubcategories) {
const subResponse = await this._fetch(
`/categories.json?show_subcategory_list=true&parent_category_id=${rootCatId}`
);
failIfMissing(subResponse);
failForNotOk(subResponse);
const {categories: subCategories} = (
await subResponse.json()
).category_list;
for (const cat of subCategories) {
if (cat.topic_url != null) {
urls.push(cat.topic_url);
}
}
}
const ids = urls.map((url) => {
const match = topicIdRE.exec(url);
if (match == null) {
throw new Error(
`Encountered topic URL we failed to parse it's TopicId from: ${url}`
);
}
return Number(match[1]);
});
return new Set(ids);
}
async topicWithPosts(id: TopicId): Promise<TopicWithPosts | null> {
const response = await this._fetch(`/t/${id}.json`);
const {status} = response;
if (status === 403 || status === 404 || status === 410) {
// The topic is hidden, deleted, or otherwise missing.
// Example of a 404 topic: https://discourse.sourcecred.io/t/116
return null;
}
failForNotOk(response);
const json = await response.json();
const {posts_count: postCount} = json;
let posts = json.post_stream.posts.map(parsePost);
const topic: TopicView = {
id: json.id,
categoryId: json.category_id,
title: json.title,
timestampMs: Date.parse(json.created_at),
authorUsername: json.details.created_by.username,
};
// This shouldn't could cause infinite loops when the API is weird.
// As requesting pages beyond the last page will produce a 404.
// Pagination here is 1-based, and we already had page 1.
let page = 2;
while (postCount > posts.length) {
const resNext = await this._fetch(`/t/${id}.json?page=${page}`);
failForNotOk(resNext);
const subPosts = (await resNext.json()).post_stream.posts.map(parsePost);
posts = [...posts, ...subPosts];
page++;
}
return {topic, posts};
}
async likesByUser(
username: string,
offset: number
): Promise<LikeAction[] | null> {
const response = await this._fetch(
`/user_actions.json?username=${username}&filter=1&offset=${offset}`
);
const {status} = response;
if (status === 404) {
// The user probably no longer exists. This is expected, see #1440.
return null;
}
failIfMissing(response);
failForNotOk(response);
const json = await response.json();
return json.user_actions.map(parseLike);
}
async topicsBumpedSince(sinceMs: number): Promise<TopicLatest[]> {
const topics: TopicLatest[] = [];
let lastUnpinnedTimestamp: number = Infinity;
let morePages: boolean = true;
let page: number = 0;
// Keep going till we've found timestamps older than sinceMs.
while (lastUnpinnedTimestamp >= sinceMs && morePages) {
const response = await this._fetch(
`/latest.json?order=activity&ascending=false&page=${page}`
);
failIfMissing(response);
failForNotOk(response);
const {topic_list: topicList} = await response.json();
// Having the same amount of results as expected by pagination, assume there's another page.
morePages = topicList.per_page == topicList.topics.length;
for (const jsonTopic of topicList.topics) {
const topic = parseLatestTopic(jsonTopic);
// Due to how pinning works, we may have some topics in here that weren't bumped past `sinceMs`.
// Filter those out now.
if (topic.bumpedMs > sinceMs) {
topics.push(topic);
}
// Make sure we ignore pinned topics for this value, as pinned topics move to the top,
// and are unhelpful in knowing whether we should fetch another page.
if (!jsonTopic.pinned) {
lastUnpinnedTimestamp = Math.min(
lastUnpinnedTimestamp,
topic.bumpedMs
);
}
}
page++;
}
return topics;
}
}
function failIfMissing(response: Response) {
if (response.status === 404) {
throw new Error(`404 Not Found on: ${response.url}; maybe bad serverUrl?`);
}
if (response.status === 403) {
throw new Error(`403 Forbidden: bad API username or key?`);
}
if (response.status === 410) {
throw new Error(`410 Gone`);
}
}
function failForNotOk(response: Response) {
if (!response.ok) {
throw new Error(`not OK status ${response.status} on ${response.url}`);
}
}
/**
* Parses a "latest" topic.
*
* A "latest" topic, is a topic as returned by the /latest.json API call,
* and has a distinct assumptions:
* - bumped_at is always present.
*
* usernamesById map used to resolve these IDs to usernames.
*/
function parseLatestTopic(json: any): TopicLatest {
if (json.bumped_at == null) {
throw new Error(
`Unexpected missing bumped_at field for /latest.json request for topic ID ${json.id}.`
);
}
return {
id: json.id,
categoryId: json.category_id,
title: json.title,
timestampMs: Date.parse(json.created_at),
bumpedMs: Date.parse(json.bumped_at),
};
}
function parsePost(json: any): Post {
return {
id: json.id,
timestampMs: Date.parse(json.created_at),
indexWithinTopic: json.post_number,
replyToPostIndex: json.reply_to_post_number,
topicId: json.topic_id,
authorUsername: json.username,
cooked: json.cooked,
};
}
function parseLike(json: any): LikeAction {
return {
username: json.target_username,
postId: json.post_id,
timestampMs: Date.parse(json.created_at),
};
}
/**
* Configuration for a Fetcher.
*/
export type DiscourseFetchOptions = {|
// Base url of the Discourse server. Fetcher._fetch rejects values that do not
// start with "http" or that end with a trailing "/".
serverUrl: string,
|};