-
Notifications
You must be signed in to change notification settings - Fork 0
/
stackoverflow.js
151 lines (128 loc) · 3.43 KB
/
stackoverflow.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
var async = require('async');
var _ = require('underscore');
var Xray = require('x-ray');
var xray = Xray();
var MongoClient = require('mongodb').MongoClient;
// TODO: move to config.
// Host name of the MongoDB server; saveCompanies() connects to it on port 27017.
var mongoDbAddr = 'localhost';
// Outbound HTTP proxy settings used by Crawler.getHtml: `user` and `password`
// are encoded into the Proxy-Authorization header, `url` is passed as the
// request's `proxy` option.
var proxy = {
user: 'proxy_user',
password: 'proxy_password',
url: 'http://proxy.com:8080',
};
// Prefix that Crawler.saveCompanies hands to makeJobRecords(), which prepends
// it to every scraped job link.
var stackoverflowBaseUrl = 'http://careers.stackoverflow.com';
/**
 * Scrapes job ads from a search results page and hands them over for storage.
 *
 * @param searchBaseUrl URL prefix the search keyword is appended to.
 * @param http object exposing request(options, callback); whatever it passes
 *     to the callback after the error argument is forwarded to parseHtml.
 */
function Crawler(searchBaseUrl, http) {
  // Keep the instance in a local variable. Assigning to an undeclared `self`
  // would create a global shared by every Crawler (and throws in strict mode).
  var self = this;

  this.searchBaseUrl = searchBaseUrl;

  /**
   * Retrieve HTML page by the specified URI.
   *
   * @param uri address of the page to download.
   * @param onResponse callback handed to http.request as is.
   */
  this.getHtml = function(uri, onResponse) {
    http.request({
      uri: uri,
      headers: {
        'Proxy-Authorization':
            makeProxyBasicAuthHeader(proxy.user, proxy.password),
        'User-Agent':
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36'
      },
      proxy: proxy.url,
    }, onResponse);
  };

  /**
   * Extracts one record per '.-job' element found in the page body.
   *
   * @param resp response object; not used here.
   * @param body HTML to scrape.
   * @param onParsed callback given to x-ray to receive the scraped records.
   */
  this.parseHtml = function(resp, body, onParsed) {
    xray(body, '.-job', [{
      company: '.employer',
      // Sometimes class names start with '-'.
      company_: '.-employer',
      link: 'a.job-link@href',
    }])(onParsed);
  };

  /**
   * Converts scraped jobs into job records and passes them to the
   * module-level saveCompanies().
   *
   * @param scrapedJobs records produced by parseHtml.
   * @param callback invoked with null right after the save was started.
   */
  this.saveCompanies = function(scrapedJobs, callback) {
    // Declared locally: the undeclared assignment used to leak a global.
    var jobs = makeJobRecords(scrapedJobs, stackoverflowBaseUrl);
    console.log('Job ads:', jobs);
    // Fire-and-forget: saveCompanies() reports its own outcome via the log,
    // so success is signalled here without waiting for the database.
    saveCompanies(jobs);
    // success
    callback(null);
  };

  /**
   * Final waterfall callback: logs the outcome of a scraping run.
   *
   * @param err error raised by any of the steps, falsy on success.
   */
  this.onDone = function(err) {
    if (err) {
      console.log('Failed to scrape companies:', err);
    } else {
      console.log('Successfully scraped companies.');
    }
  };

  /**
   * Executes the scraper.
   *
   * @param searchKeyword search stackoverflow careers with this keyword.
   */
  this.exec = function(searchKeyword) {
    var uri = makeSearchUrl(self.searchBaseUrl, searchKeyword);
    async.waterfall([
      // Seeds the waterfall with the URI expected by getHtml.
      function(callback) {
        callback(null, uri);
      },
      self.getHtml,
      self.parseHtml,
      self.saveCompanies,
    ], self.onDone);
  };
}
exports.Crawler = Crawler;
/**
 * Returns an array of job records with time when job ad was found appended.
 *
 * Every record is a shallow copy of the scraped job with these adjustments:
 * - `company` is trimmed; when it is absent, the trimmed `company_` value is
 *   stored as `company` and `company_` is dropped;
 * - `link`, when present, is prefixed with baseLinkUrl;
 * - `found_at` is set to the current date.
 *
 * @param scrapedJobs array of scraped jobs. Neither the array nor the job
 *     objects in it are mutated.
 * @param baseLinkUrl prefix prepended to every job link.
 * @return new array of job ads.
 */
function makeJobRecords(scrapedJobs, baseLinkUrl) {
  return scrapedJobs.map(function(scrapedJob) {
    // Copy first: Array#slice() alone would still share the job objects with
    // the caller, so editing them would break the "not mutated" contract.
    var record = {};
    Object.keys(scrapedJob).forEach(function(key) {
      record[key] = scrapedJob[key];
    });

    if ('company' in record) {
      record.company = record.company.trim();
    } else if ('company_' in record) {
      record.company = record.company_.trim();
      delete record.company_;
    }
    if ('link' in record) {
      record.link = baseLinkUrl + record.link;
    }
    record.found_at = new Date();
    return record;
  });
}
exports.makeJobRecords = makeJobRecords;
/**
 * Builds the address of the search results page for a keyword.
 *
 * @param searchBaseUrl URL prefix of the search page.
 * @param keyword search term; appended verbatim, without any escaping.
 * @return searchBaseUrl immediately followed by keyword.
 */
function makeSearchUrl(searchBaseUrl, keyword) {
  var searchUrl = searchBaseUrl + keyword;
  return searchUrl;
}
/**
 * Builds the value of a Basic authentication header.
 *
 * @param username proxy account name.
 * @param password proxy account password.
 * @return 'Basic ' followed by the base64 encoding of 'username:password'.
 */
function makeProxyBasicAuthHeader(username, password) {
  // Buffer.from() replaces the deprecated `new Buffer(string)` constructor.
  var credentials = Buffer.from(username + ':' + password);
  return 'Basic ' + credentials.toString('base64');
}
/**
 * Stores scraped companies to mongo db.
 *
 * Connects to the `scraping_companies` database, inserts the records into the
 * `companies` collection and closes the connection. The outcome is only
 * logged; an error with code 11000 is not reported as a failure.
 *
 * @param companies array of records to insert. When it is empty or missing,
 *     nothing is stored and no connection is opened.
 */
function saveCompanies(companies) {
  if (!companies || companies.length === 0) {
    console.log('No companies to save.');
    return;
  }

  var url = 'mongodb://' + mongoDbAddr + ':27017/scraping_companies';
  async.waterfall([
    function(callback) {
      MongoClient.connect(url, callback);
    },
    function(db, callback) {
      db.collection('companies').insert(
        companies,
        { keepGoing: true },
        function(err, insertedRecords) {
          // Pass db along even on failure so the connection can be closed.
          callback(err, db, insertedRecords);
        }
      );
    }
  ], function(err, db) {
    if (err && err.code !== 11000) {
      console.log('Failed to store scraped companies to DB:',
          err);
    } else {
      console.log('Companies saved in DB');
    }
    // No db handle is available when connecting itself failed.
    if (db) {
      db.close();
    }
  });
}