job51Spider.py
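"""Spider for 51job (search.51job.com) search listings.

Requests search-result pages through free proxies scraped from kuaidaili,
parses the window.__SEARCH_RESULT__ JSON embedded in each page, extracts
job and company records, and stores them in MongoDB via Store.mongoStore.
"""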
import requests
import json
import re
import fake_useragent
from bs4 import BeautifulSoup
import os
import sys
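# Make the project root importable so the local Proxy and Store packages resolve.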
sys.path.append('d:\\Projects\\Python\\job-analyse\\')
import Proxy.ipProxy
import random
import Store.mongoStore
class Job51Spider(object):
    def __init__(self):
        self.session = requests.session()
        self.proxyUrl = 'https://www.kuaidaili.com/free/'
    def extractCity(self, cityName=None):
        # Resolve a city name to its code, using the cached bossCity.json when available.
        if os.path.exists('bossCity.json') and cityName is not None:
            with open('bossCity.json', 'rt', encoding='utf-8') as fp:
                cityList = json.load(fp)
            for city in cityList:
                if city['name'] == cityName:
                    return city['code']
        else:
            # NOTE: self.cityUrl is never assigned in this class; the city-list
            # endpoint must be configured before this branch can run.
            response = requests.get(self.cityUrl)
            response.raise_for_status()
            json_data = response.json()
            if json_data['code'] == 0 and json_data['zpData'] is not None:
                cityList = []
                for level in json_data['zpData']['cityList']:
                    cityList.extend(self.unfoldLevel(level))
                with open('bossCity.json', 'wt', encoding='utf-8') as fp:
                    json.dump(cityList, fp)
                if cityName is not None:
                    for city in cityList:
                        if city['name'] == cityName:
                            return city['code']
                else:
                    return json_data['zpData']['locationCity']['code']
    def unfoldLevel(self, level):
        result = []
        result.append({'name': level['name'], 'code': level['code']})
        if level['subLevelModelList'] is None:
            return result
        else:
            for subLevel in level['subLevelModelList']:
                result.extend(self.unfoldLevel(subLevel))
            return result
    def extractCompany(self, detail):
        companyItem = {}
        # Company id
        companyItem['cId'] = detail['coid']
        companyItem['title'] = detail['company_name']
        # Industry, company type and size
        companyItem['industry'] = detail['companyind_text']
        companyItem['finance'] = detail['companytype_text']
        companyItem['size'] = detail['companysize_text']
        # Welfare tags
        companyItem['tags'] = detail['jobwelf_list']
        companyItem['href'] = detail['company_href']
        return companyItem
    def extractJob(self, detail):
        jobItem = {}
        # Job id
        jobItem['jId'] = detail['jobid']
        # Job title
        jobItem['title'] = detail['job_title']
        # Work area
        jobItem['area'] = detail['workarea_text']
        # Experience and education level are parsed from attribute_text.
        for attr in detail['attribute_text']:
            if attr in ['不限', '经验不限'] or '年' in attr:
                jobItem['exps'] = attr
            if attr in ['不限', '高中', '大专', '硕士', '本科', '博士']:
                jobItem['eduInfo'] = attr
        # Salary
        jobItem['salary'] = detail['providesalary_text']
        # Tags (fall back to the welfare list when empty)
        jobItem['tags'] = detail['tags']
        if len(jobItem['tags']) <= 0:
            jobItem['tags'] = detail['jobwelf_list']
        # Description (left empty here)
        jobItem['desc'] = ''
        jobItem['href'] = detail['job_href']
        jobItem['industry'] = detail['companyind_text']
        jobItem['company'] = detail['company_name']
        return jobItem
    def makeRequest(self, url):
        try:
            ua = fake_useragent.UserAgent(verify_ssl=False)
            headers = {
                'User-Agent': ua.random,
                'TE': 'Trailers',
            }
            # Pick a random free proxy from kuaidaili.
            proxy = Proxy.ipProxy.IpProxy(self.proxyUrl)
            ipList = proxy.getIpList()
            ip = random.choice(ipList)
            proxies = {
                'http': ip,
                'https': ip,  # the original only set 'http'; https requests would bypass the proxy otherwise
            }
            response = self.session.get(url, timeout=30, headers=headers, proxies=proxies, verify=False)
            response.raise_for_status()
            # 51job pages are served as GBK.
            return response.content.decode("gbk")
        except Exception as ex:
            print(ex)
            return None
    def searchJobs(self, cityName, query, page=1):
        # NOTE: query is accepted but not inserted into the URL; the '+' segment is the keyword slot.
        cityCode = cityName
        if cityCode is not None:
            # The original hardcoded city code 200200; use the passed-in code instead.
            searchUrl = ('https://search.51job.com/list/{city},000000,0000,00,9,99,+,2,{page}.html'
                         '?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99'
                         '&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=').format(city=cityCode, page=str(page))
            html = self.makeRequest(searchUrl)
            if html is None:
                # Request failed; return empty results so callers can stop cleanly.
                return ([], [])
            # The result set is embedded in the page as window.__SEARCH_RESULT__.
            data = re.findall('window.__SEARCH_RESULT__ =(.+)}</script>', str(html))[0] + "}"
            details = json.loads(data)['engine_search_result']
            jobItems = []
            companyItems = []
            for detail in details:
                jobItem = self.extractJob(detail)
                if jobItem is None:
                    continue
                jobItems.append(jobItem)
                companyItem = self.extractCompany(detail)
                if companyItem is None:
                    continue
                companyItems.append(companyItem)
            return (jobItems, companyItems)
if __name__ == '__main__':
    spider = Job51Spider()
    store = Store.mongoStore.MongoStore('default')
    # spider.extractCity('西宁')
    # spider.extractCity()
    # spider.extractJob('https://www.zhipin.com/job_detail/a046e72a1336b3820nd-0t6-EFc~.html')
    page = 1
    result = spider.searchJobs('854', '', page)
    while len(result[0]) > 0 and len(result[1]) > 0:
        print('Crawling page ' + str(page))
        store.insert('job', result[0])
        store.insert('company', result[1])
        page += 1
        result = spider.searchJobs('854', '', page)