-
-
Notifications
You must be signed in to change notification settings - Fork 0
/
local-crawler.js
360 lines (328 loc) · 13.2 KB
/
local-crawler.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
const { Builder, By, until } = require("selenium-webdriver");
const firefox = require("selenium-webdriver/firefox");
const fs = require("fs");
const { parse } = require("csv-parse");
const { LOCATION_VALUES, ONE_MINUTE_IN_MS, FOREVER } = require("./constants");
// Default crawl location. For the time being, the extension needs these values
// fed into it at start-up, otherwise it will not work.
let TARGET_LAT = 41.5569;
let TARGET_LONG = -72.6652;
let TARGET_ZIP = "06457";
// Run-mode switches and crawl parameters; all can be overridden from the command line.
let TEST_MODE = false;
let DEBUG_MODE = false;
let WAIT_TIME = ONE_MINUTE_IN_MS;
let START_INDEX = 0;

// Argument processing.
// Only user-supplied arguments are inspected: process.argv[0] and [1] are the
// node executable and the script path, and a path that merely contains "site"
// must not be mistaken for a starting-index argument.
for (const arg of process.argv.slice(2)) {
  // Test mode: wait on each page indefinitely and open devtools
  if (arg === "test") {
    TEST_MODE = true;
    WAIT_TIME = FOREVER;
    console.log("Testing mode enabled!");
  }
  // Debug mode: load the debug build of the extension
  if (arg === "debug") {
    DEBUG_MODE = true;
    console.log("Debugging enabled!");
  }
  // Location check: any key of LOCATION_VALUES (case-insensitive) selects that location
  const location_arg = arg.toUpperCase();
  if (Object.keys(LOCATION_VALUES).includes(location_arg)) {
    console.log("You are crawling from: ", location_arg);
    TARGET_LAT = LOCATION_VALUES[location_arg].lat;
    TARGET_LONG = LOCATION_VALUES[location_arg].long;
    TARGET_ZIP = LOCATION_VALUES[location_arg].zip;
  }
  // Starting point: start from the specified index (e.g. "site=12").
  // Helpful if you need to restart a crawl after a crash.
  if (arg.includes("site")) {
    const requested_index = Number.parseInt(arg.split("=")[1], 10);
    if (Number.isInteger(requested_index) && requested_index >= 0) {
      START_INDEX = requested_index;
      console.log("Starting from site ID: ", START_INDEX);
    } else {
      // A NaN index would make the crawl loop exit immediately without any message,
      // so keep the default and say why.
      console.log(
        "Ignoring invalid starting index argument: " +
          arg +
          " (expected site=<non-negative integer>)"
      );
    }
  }
}
// Timestamp (ms) taken at start-up; the per-site log line reports total elapsed time against it.
const total_begin = Date.now();
// Maps an error label to the list of sites that produced that error.
const err_obj = {};

// Sites to crawl: the first column of every row in the CSV list, with parsing
// starting at line 2 of the file (from_line: 2).
// Other lists that have been used here:
//   "../test_crawl_lists/us-ca_test_list.csv"
//   "sites.csv"
//   "val_set_sites1.csv"
const sites = [];
const site_rows = fs
  .createReadStream("./validation-lists/test-list3.csv")
  .pipe(parse({ delimiter: ",", from_line: 2 }));
site_rows.on("data", (row) => {
  sites.push(row[0]);
});
site_rows.on("error", (error) => {
  console.log(error.message);
});

// Firefox options and WebDriver handle; both are (re)assigned by setup().
let options;
let driver;
/**
 * Custom error thrown when a visited page looks like a human check / block
 * page (detected from the page title or from an iframe's contents).
 * Having a dedicated error name lets us identify the sites that we can't
 * crawl with the VPN on.
 */
class HumanCheckError extends Error {
  // Own property, so it shadows the inherited "Error" name on every instance.
  name = "HumanCheckError";

  /**
   * @param {string} message - description of the detected check
   */
  constructor(message) {
    super(message);
  }
}
/**
 * Builds a fresh Firefox WebDriver session with the extension installed and
 * walks through the extension's start-up prompts (initial alert, latitude,
 * longitude, zip code, "skip tour"), then returns focus to the original window.
 *
 * Assigns the module-level `options` and `driver` variables. If any prompt
 * step throws, the failed browser session is quit and setup is retried from
 * scratch.
 *
 * @returns {Promise<void>} resolves once the browser is ready to crawl
 */
async function setup() {
  // Resolve after `ms` milliseconds.
  const pause = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

  await pause(3000);
  options = new firefox.Options()
    .setBinary(firefox.Channel.NIGHTLY)
    // NOTE(review): this second setBinary call presumably replaces the channel
    // chosen on the previous line — confirm which of the two is intended.
    .setBinary("C:/Program Files/Firefox Nightly/firefox.exe")
    //.setBinary("/Applications/Firefox Nightly.app/Contents/MacOS/firefox-bin")
    .setPreference("xpinstall.signatures.required", false)
    .setPreference("geo.enabled", true)
    .setPreference("geo.provider.use_corelocation", true)
    .setPreference("geo.prompt.testing", true)
    .setPreference(
      "geo.provider.network.url",
      "https://www.googleapis.com/geolocation/v1/geolocate?key=%GOOGLE_LOCATION_SERVICE_API_KEY%"
    )
    .setPreference(
      "browser.region.network.url",
      "https://location.services.mozilla.com/v1/country?key=%MOZILLA_API_KEY%"
    )
    .setPreference("geo.prompt.testing.allow", true)
    .setPreference("privacy.trackingprotection.cryptomining.enabled", false)
    .setPreference("privacy.trackingprotection.enabled", false)
    .setPreference("privacy.partition.network_state", false)
    .setPreference("privacy.antitracking.enableWebcompat", false)
    .setPreference(
      "privacy.trackingprotection.emailtracking.pbmode.enabled",
      false
    )
    .setPreference("privacy.trackingprotection.fingerprinting.enabled", false)
    .setPreference("privacy.trackingprotection.pbmode.enabled", false)
    .setPreference("network.cookie.cookieBehavior", 0)
    .setPreference("privacy.fingerprintingProtection.pbmode", false);

  // Debug mode loads the debug build of the extension.
  options.addExtensions(DEBUG_MODE ? "extDebug.xpi" : "ext.xpi");
  options.addArguments("--headful");
  if (TEST_MODE) {
    options.addArguments("-devtools");
  }

  driver = new Builder()
    .forBrowser("firefox")
    .setFirefoxOptions(options)
    .build();

  // Page loads and scripts time out after WAIT_TIME ms
  await driver
    .manage()
    .setTimeouts({ implicit: 0, pageLoad: WAIT_TIME, script: WAIT_TIME });
  console.log("built");

  //const privacyPioneerWindow = await driver.getWindowHandle();
  await pause(2000);
  const windows = await driver.getAllWindowHandles();
  const originalWindow = windows[0];
  const privacyPioneerWindow = windows[1]; // we know that PP will open up to the second window
  await driver.switchTo().window(privacyPioneerWindow);
  console.log("all windows: ");
  console.log(windows);
  console.log("PP window:" + privacyPioneerWindow);
  console.log("switch to window");

  try {
    // first, close the initial alert ("Privacy Pioneer does not collect your data...")
    await pause(7000);
    await driver.switchTo().alert().accept(); //close the alert
    console.log("closed alert");

    // next, for each prompt that pops up, we need to switch to it, provide the appropriate value, and accept it
    await pause(3000);
    await driver.switchTo().alert().sendKeys(TARGET_LAT.toString());
    await driver.switchTo().alert().accept();
    await pause(3000);
    await driver.switchTo().alert().sendKeys(TARGET_LONG.toString());
    await driver.switchTo().alert().accept();
    await pause(3000);
    await driver.switchTo().alert().sendKeys(TARGET_ZIP);
    console.log("input zip code:", TARGET_ZIP);
    await driver.switchTo().alert().accept();
    await pause(3000);

    // now, we click skip tour button
    await driver
      .findElement(By.xpath("/html/body/div[3]/div/div/div/div[2]/div/button"))
      .click();
    console.log("clicked alert");
    await pause(2000);
    console.log("alert closed/tour skipped");

    await driver.close(); //close pp window
    await pause(4000);
    await driver.switchTo().window(originalWindow);
    await pause(3000);
    console.log("setup complete");
  } catch (e) {
    console.log("Error: " + e);
    console.log("Error occurred during setup. Restarting driver...");
    // Quit the failed session before retrying; otherwise every retry leaves
    // another browser instance running.
    try {
      await driver.quit();
    } catch (quit_error) {
      console.log(
        "Could not quit driver before retrying setup: " + quit_error
      );
    }
    await setup();
  }
}
// Page-title patterns that indicate a block page or human check.
// Each entry is a group of patterns that must ALL match the title; the title
// counts as blocked when ANY group matches.
const BLOCKED_TITLE_PATTERNS = [
  [/Access/i, /Denied/i],
  [/error/i],
  [/service/i, /unavailable/i],
  [/Just a moment.../i],
  [/you have been blocked/i],
  [/site not available/i],
  [/attention required/i],
  [/access to this page has been blocked/i],
  [/site/i, /temporarily unavailable/i],
  [/site/i, /temporarily down/i],
  [/403 forbidden/i],
  [/pardon our interruption/i],
  [/robot or human/i],
  [/are you a robot/i],
  [/block -/i],
  [/Human Verification/i],
];

// Returns true when the page title matches one of the known block/human-check patterns.
function is_blocked_title(title) {
  return BLOCKED_TITLE_PATTERNS.some((group) =>
    group.every((pattern) => pattern.test(title))
  );
}

// Adds `site` to the list stored under `key` in err_obj, then writes the whole
// err_obj to error-logging.json so errors don't have to be sorted through manually.
function record_site_error(key, site) {
  if (Object.prototype.hasOwnProperty.call(err_obj, key)) {
    err_obj[key].push(site);
  } else {
    err_obj[key] = [site];
  }
  console.log(err_obj);

  // converting the JSON object to a string and writing it to a file
  const err_data = JSON.stringify(err_obj);
  fs.writeFile("error-logging.json", err_data, (error) => {
    if (error) {
      // in case of a writing problem, log it and note which site we were at
      console.log("Failed to write error at error-logging.json!");
      console.error(error);
      console.log("Writing failed at site: " + site);
      console.log("-------------------");
      console.log(err_data + " @ " + site);
      return;
    }
    console.log("error-logging.json written correctly");
  });
}

/**
 * Loads one site in the browser, waits WAIT_TIME ms on the page, and checks
 * whether the page is a block page / human check (via the first iframe's
 * contents and the page title).
 *
 * Any error is recorded in err_obj and error-logging.json. For every error
 * other than HumanCheckError the driver is restarted through setup().
 *
 * @param {string[]} sites - list of URLs to crawl
 * @param {number} site_id - index into `sites` of the URL to visit
 * @returns {Promise<string>} "no_error", or the name of the last error recorded for this site
 */
async function visit_site(sites, site_id) {
  let error_value = "no_error";
  const site = sites[site_id];
  console.log(site_id, ": ", site);
  try {
    await driver.get(site);
    // console.log(Date.now()); to compare to site loading time in debug table
    await new Promise((resolve) => setTimeout(resolve, WAIT_TIME));

    // check if access is denied
    // if so, throw an error so it gets tagged as a human check site
    const title = await driver.getTitle();
    const iframes = await driver.findElements(By.xpath("//iframe")); // check for the existence of an iframe element...
    if (iframes.length > 0) {
      try {
        await driver.switchTo().frame(iframes[0]);
        // check if the iframe is one that indicates a human-check error.
        const robo_check = await driver.findElements(
          By.xpath(
            '//*[contains(text(), "You are browsing and clicking at a speed much faster than expected of a human being.")]' // A common phrase on sites that block crawlers
          )
        );
        const captcha_elements = await driver.findElements(
          By.xpath('//*[contains(@class, "captcha")]')
        );
        if (robo_check.length > 0 || captcha_elements.length > 0) {
          // if the site has this phrase within an iframe, throw the HumanCheck error
          throw new HumanCheckError("Human Check");
        }
      } catch (e) {
        // Errors raised while inspecting the iframe are recorded under the
        // plain error name; the crawl of this site continues with the title check.
        record_site_error(e.name, site);
        error_value = e.name; // update error value
      }
      await driver.switchTo().defaultContent();
    }

    if (is_blocked_title(title)) {
      throw new HumanCheckError("Human Check");
    }
    // console.log("GRAB THE HAR FILE NOW!");
    // await new Promise((resolve) => setTimeout(resolve, 15000));
  } catch (e) {
    console.log(e);
    // we want to separate the reaching an error page from other webdriver errors
    const msg = e.message.match(/reached error page/i)
      ? ": Reached Error Page"
      : "";
    record_site_error(e.name + msg, site);
    error_value = e.name; // update error value

    // if it's just a human check site, we don't need to restart
    if (e.name !== "HumanCheckError") {
      if (e.message.match(/Failed to decode response from marionette/i)) {
        console.log(
          e.name + ": " + e.message + "-- driver should already have quit "
        );
      }
      console.log("------restarting driver------");
      // Now, check if the error has been handled gracefully, or whether there has been a crash of the entire browser. This can happen on some sites when using Firefox Nightly.
      let driver_running = true;
      try {
        await driver.getTitle(); // try grabbing the title - if it fails, you know that the driver has crashed
      } catch {
        driver_running = false;
        console.log("Looks like " + site + " caused the browser to crash");
      }
      if (driver_running) {
        await driver.quit(); // only try to quit once it has been confirmed that the driver still exists and hasn't crashed
      }
      await new Promise((resolve) => setTimeout(resolve, 30000)); // Necessary to allow the driver to quit properly, otherwise errors that can't be handled are thrown.
      await setup();
    }
  }
  return error_value;
}
// Entry point: set up the browser, visit every site from START_INDEX onwards,
// and always shut the browser down at the end, even if the crawl aborts.
(async () => {
  try {
    await setup();
    for (
      let current_site = START_INDEX;
      current_site < sites.length;
      current_site++
    ) {
      const begin_site = Date.now(); // for timing
      await new Promise((resolve) => setTimeout(resolve, 3500));
      await visit_site(sites, current_site);
      const end_site = Date.now();
      const timeSpent_site = (end_site - begin_site) / 1000;
      console.log(
        "time spent: ",
        timeSpent_site,
        "total elapsed: ",
        (end_site - total_begin) / 1000
      );
    }
  } catch (e) {
    // Without this handler a failure in setup()/visit_site() would surface as
    // an unhandled promise rejection.
    console.log("Crawl aborted by an unexpected error:");
    console.error(e);
    process.exitCode = 1;
  } finally {
    // driver is still undefined if setup() failed before building it
    if (driver) {
      try {
        await driver.quit();
      } catch (quit_error) {
        console.log("Could not quit driver at end of crawl: " + quit_error);
      }
    }
  }
})();