/
index.ts
335 lines (287 loc) · 10.1 KB
/
index.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
/**
* Redirects the user to a URL based on the `url` query parameter, with optional
* Wayback Machine support.
*
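* If the `referer` header or the `url` query parameter is missing, the index
* page is served instead. If the referer is present but not approved, the user
* is redirected straight to the requested URL (or receives a 204 when
* `allowUnapprovedToFollow` is disabled).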
*
* If the target URL is not reachable, the user is redirected to the Wayback
* Machine version of the URL (if available).
*
* If the Wayback Machine version is not available, the user is redirected to the
* original URL with a 302 status code.
**/
import type { Config, Context } from 'https://edge.netlify.com';
// Path this edge function is mounted on (consumed by `config`).
const defaultEndpoint = '/';

// Page served when a request arrives without a referer or without a `url`.
const defaultRedirectPage = '/index.html';

// Default time budget (ms) for the request made to the target URL.
const defaultTimeout = 2000;

// When true, requests from unapproved referers are still redirected to the
// URL they asked for; set to false if self hosting.
const allowUnapprovedToFollow = true;

// Truthy when the NETLIFY_DEV environment variable is set; enables verbose
// logging in the handler.
const debug = Deno.env.get('NETLIFY_DEV');

// Referer domains allowed to use the link checking; `localhost` is only
// included when NETLIFY_DEV is set.
const preApproved = [
  'remysharp.com',
  'unrot.link',
  ...(debug ? ['localhost'] : []),
];
/**
 * Netlify edge function configuration: only declares the path this function
 * responds on.
 */
export const config: Config = { path: defaultEndpoint };
/**
 * Builds the URL of the default landing page, resolved against the incoming
 * request's URL.
 *
 * If you're running your own version of this, you may not want this fallback.
 *
 * @param req - the incoming request, used as the base for resolution
 * @returns the absolute URL of `defaultRedirectPage` on the request's origin
 */
function index(req: Request) {
  const landingPage = new URL(defaultRedirectPage, req.url);
  return landingPage;
}
/**
 * Decides whether a referer belongs to one of the pre-approved domains.
 *
 * The referer is parsed as a URL and only its hostname is compared, so an
 * approved domain appearing in the path, query string, or as a mere substring
 * of another hostname (e.g. `notremysharp.com`) does not count as a match.
 * Subdomains of an approved domain are accepted.
 *
 * @param referer - the raw `referer` header value
 * @returns true when the referer's hostname equals, or is a subdomain of, an
 *   entry in `preApproved`; false otherwise (including unparsable referers)
 */
function approved(referer: string) {
  let host: string;
  try {
    host = new URL(referer).hostname.toLowerCase();
  } catch (_) {
    // not a valid absolute URL, so it cannot be attributed to any domain
    return false;
  }

  return preApproved.some(
    (domain) => host === domain || host.endsWith('.' + domain)
  );
}
/**
 * Edge function entry point.
 *
 * Flow:
 *  1. No (or unparsable) referer, or no `url` query parameter: return the
 *     index page URL.
 *  2. Referer not approved: redirect straight to `url` (302), or answer 204
 *     when `allowUnapprovedToFollow` is false.
 *  3. Otherwise probe the target (TCP connect, then a HEAD request). If it
 *     answers 200 the visitor is redirected there with a 301.
 *  4. If the target looks dead (or `wayback` was requested) the Wayback
 *     Machine CDX API is queried; a hit yields a 301 to the archived copy,
 *     a miss yields a 302 to the original URL.
 *  5. Any unexpected error sends the visitor back to the referring page with
 *     `source=unrot.link-failed` in the query string.
 *
 * Supported query parameters: `url` (required), `wayback`, `origin-match`,
 * `timeout` (ms), `date`.
 *
 * Clients that do not accept `text/html` receive a JSON description of the
 * redirect instead of an actual redirect.
 *
 * @param req - the incoming request
 * @param context - Netlify edge context; only `next` is used
 */
export default async function (req: Request, { next }: Context) {
  const referer: string = req.headers.get('referer') || '';
  const startTime = performance.now();
  let waybackResponseStart = 0;

  // if you landed on this url without a referer, then we'll send you to
  // the defaultRedirectPage
  if (!referer) {
    return index(req);
  }

  // a referer that isn't a valid absolute URL is treated like a missing one,
  // rather than letting the URL constructor throw out of the handler
  let root: URL;
  try {
    root = new URL(referer);
  } catch (_) {
    return index(req);
  }

  // this lets me work out if you're testing an XHR request (useful for debugging)
  const acceptsHTML = req.headers.get('accept')?.includes('text/html');

  // A helper to generate a response depending on the request accept header:
  // HTML clients get a real redirect, anything else (i.e. XHR) gets JSON
  // describing the redirect plus timing information.
  const redirect = (url: string, status: number) => {
    if (acceptsHTML) {
      return Response.redirect(url, status);
    } else {
      const now = performance.now();
      return new Response(
        JSON.stringify({
          status,
          url,
          ms: now - startTime,
          wayback: waybackResponseStart ? now - waybackResponseStart : -1,
        }),
        {
          headers: {
            'content-type': 'application/json',
          },
        }
      );
    }
  };

  // Builds a URL pointing back at the referring page with one extra query
  // parameter, properly encoded and without clobbering an existing query.
  const backToReferer = (key: string, value: string) => {
    const back = new URL(root.toString());
    back.searchParams.set(key, value);
    return back.toString();
  };

  // Get the URL from the query string parameter 'url'
  const url = new URL(req.url);

  // urlParam will be converted into a "real" url later on and stored in targetUrl
  const urlParam = url.searchParams.get('url');

  // require a query `url` param, otherwise we fall back to the index page
  if (urlParam === null) {
    return index(req);
  }

  // if there's a query (i.e. we passed the above) but the referer is not
  // approved then we'll redirect the visitor to the page they intended to
  // visit. Previously this would send to the access page, but I'm wary of
  // breaking people's pages and having an unexpected result.
  if (!approved(referer)) {
    if (allowUnapprovedToFollow) {
      try {
        return Response.redirect(urlParam, 302);
      } catch (_) {
        // Response.redirect throws when `url` is not a valid absolute URL
        return new Response(null, { status: 400 });
      }
    } else {
      // if you're not approved, then we'll just return a 204 (no content)
      return new Response(null, { status: 204 });
    }
  }

  try {
    const res = await next({ sendConditionalRequest: true });
    if (res.status === 304) {
      // if the client (browser) has a cached version, just let them use it
      return res;
    }

    let useWayback = !!url.searchParams.get('wayback');
    const originMatch = !!url.searchParams.get('origin-match');

    // a non-numeric `timeout` would otherwise become NaN and abort the
    // target request immediately
    const requestedTimeout = parseInt(
      url.searchParams.get('timeout') || defaultTimeout + '',
      10
    );
    const timeout = Number.isNaN(requestedTimeout)
      ? defaultTimeout
      : requestedTimeout;

    let targetUrl: URL;
    try {
      targetUrl = new URL(urlParam);
    } catch (_) {
      // if the URL can't be parsed properly, redirect the user _back_ to the
      // site they came from adding the query string for later debugging.
      return redirect(backToReferer('bad-url', urlParam), 302);
    }

    if (debug) {
      console.log('request settings', {
        targetUrl: targetUrl.toString(),
        useWayback,
        originMatch,
        timeout,
      });
    }

    // now we'll try to _quickly_ connect to the origin, allowing for a 200ms
    // timeout (which _should_ be enough). If the connection times out, then
    // it's very likely the host is down, so we'll use the wayback machine.
    if (!useWayback) {
      try {
        // URL#port is empty for a scheme's default port, so pick the default
        // that matches the scheme instead of always probing port 80
        const fallbackPort = targetUrl.protocol === 'https:' ? 443 : 80;
        const connectPromise = Deno.connect({
          hostname: targetUrl.hostname,
          port: parseInt(targetUrl.port, 10) || fallbackPort,
        });
        await Promise.race([
          connectPromise.then((conn) => conn.close()),
          timeoutPromise(200),
        ]);
        if (debug) {
          console.log('connected to origin');
        }
      } catch (err) {
        // if connect to origin fails/times out, then we'll use the wayback machine
        if (debug) {
          console.log(
            'failed to connect to origin - switching to wayback',
            err instanceof Error ? err.message : String(err)
          );
        }
        useWayback = true;
      }
    }

    let status = 0;

    // if we're not using the wayback machine, then just try to request the URL
    if (!useWayback) {
      try {
        const response = await fetchWithTimeout(
          targetUrl,
          {
            method: 'HEAD',
            headers: {
              'user-agent':
                'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
              accept: '*/*',
            },
          },
          timeout
        );
        if (debug) {
          console.log('fetch response', response);
        }
        status = response.status;

        // if we've been redirected, then try to detect for zombie pages
        if (response.redirected) {
          const redirectUrl = new URL(response.url);

          // first, if the origin completely changed, and we don't want origin
          // swaps, let's use the archive
          if (redirectUrl.hostname !== targetUrl.hostname) {
            // origin redirect
            if (originMatch) {
              status = 404;
            }
          }

          // if we redirected and the path is the root (and we weren't looking
          // for the root) then it's a 404/zombie, even if it's a 200
          if (redirectUrl.pathname === '/' && targetUrl.pathname !== '/') {
            if (debug) {
              console.log('path redirected to root, flagging as 404');
            }
            status = 404;
          }
        }
      } catch (err) {
        const message = err instanceof Error ? err.message : String(err);
        status = 400;

        // fetchWithTimeout reports HTTP failures as "<status>: <statusText>",
        // so match on the prefix; a substring match could be fooled by digits
        // appearing elsewhere in a network error message (e.g. inside a URL)
        if (message.startsWith('429')) {
          // too many requests, origin is telling unrot.link to back off
          // assume they'll let the client reach the origin though
          status = 200;
        }

        if (debug) {
          console.log(
            'target request error (incl 404)',
            targetUrl.toString(),
            message
          );
        } else if (!message.startsWith('404')) {
          console.log('target request error', targetUrl.toString(), message);
        }
      }
    }

    // Check the status code of the response
    if (status === 200) {
      // the target page is fine, so redirect to it as a perma-redirect
      return redirect(urlParam, 301);
    } else {
      // If the status code is not 200, fetch the Wayback Machine CDX API
      let waybackUrl = `https://web.archive.org/cdx/search/cdx?output=json&filter=statuscode:200&url=${encodeURIComponent(
        urlParam
      )}&`;

      // then add the date of the blog post (if we can from that) to get a
      // good representative of the page at the time
      const date = url.searchParams.get('date');
      if (date) {
        waybackUrl += `from=${date.replace(/\D/g, '')}&`;
      }

      const headers = {
        'user-agent': 'unrot.link',
      };

      waybackResponseStart = performance.now();

      // -1 should work to get the latest result, but doesn't always…
      const waybackResponse = await fetch(waybackUrl + `limit=-1`, {
        headers,
      });
      let waybackData = (await waybackResponse.json()) as
        | [string[], string[]]
        | [];

      if (waybackData.length === 0) {
        // do it again, but this time with a limit of 1
        const retryResponse = await fetch(waybackUrl + `limit=1`, {
          headers,
        });
        waybackData = (await retryResponse.json()) as [string[], string[]];
      }

      if (debug) {
        console.log('wayback data', waybackData[1]);
      }

      // The first row of the CDX result is the header; a second row means a
      // snapshot with status 200 exists
      if (waybackData && waybackData.length > 1 && waybackData[1]) {
        // Redirect to the URL from Wayback Machine
        const archivedUrl = new URL(
          `https://web.archive.org/web/${waybackData[1][1]}/${waybackData[1][2]}`
        );
        return redirect(archivedUrl.toString(), 301);
      } else {
        // fail: sending 302 to original URL, there's no wayback data available
        return redirect(targetUrl.toString(), 302);
      }
    }
  } catch (error) {
    // Handle any errors that occur during the execution
    const message = error instanceof Error ? error.message : String(error);
    console.log('[fail] errored: ' + message);
    return redirect(backToReferer('source', 'unrot.link-failed'), 302);
  }
}
/**
 * `fetch` with a time budget and a HEAD -> GET fallback.
 *
 * The request is aborted once `time` milliseconds have elapsed. A 405
 * (method not allowed) answer is retried once with `GET`, unless the request
 * already was a `GET`, so a server that answers 405 to everything cannot
 * cause endless retries. The abort timer is always cleared once the request
 * settles.
 *
 * @param uri - the URL to request
 * @param options - fetch options (method, headers, ...); `signal` is overridden
 * @param time - time budget in milliseconds for each attempt
 * @returns the response, only when its status is in the 2xx range
 * @throws Error with message `"<status>: <statusText>"` for non-2xx answers,
 *   `"Response timed out"` when the budget elapses, or the underlying fetch
 *   error message otherwise
 */
async function fetchWithTimeout(
  uri: URL,
  options: RequestInit = {},
  time = 5000
): Promise<Response> {
  const controller = new AbortController();
  const config = { ...options, signal: controller.signal };
  const timer = setTimeout(() => controller.abort(), time);

  try {
    const response = await fetch(uri, config);

    if (!response.ok) {
      const method = (options.method ?? 'GET').toUpperCase();
      if (response.status === 405 && method !== 'GET') {
        // some servers reject HEAD; try again with a plain GET (only once)
        return fetchWithTimeout(uri, { ...options, method: 'GET' }, time);
      }
      throw new Error(`${response.status}: ${response.statusText}`);
    }

    return response;
  } catch (error) {
    const failure = error as { name?: string; message?: string } | null;

    // When we abort our `fetch`, the controller conveniently throws
    // a named error, allowing us to handle them separately from
    // other errors.
    if (failure?.name === 'AbortError') {
      throw new Error('Response timed out');
    }
    throw new Error(failure?.message ?? String(error));
  } finally {
    // don't leave the abort timer running after the request has settled
    clearTimeout(timer);
  }
}
/**
 * Creates a promise that never resolves and rejects with `Error('Timeout')`
 * once `ms` milliseconds have passed. Intended to be raced against another
 * promise to bound its duration.
 *
 * @param ms - delay in milliseconds before the rejection
 */
function timeoutPromise(ms: number) {
  return new Promise((_resolve, reject) => {
    const expire = () => reject(new Error('Timeout'));
    setTimeout(expire, ms);
  });
}