In [1]:
import scrapy

In [12]:
from scrapy import Item, Field

In [6]:
from scrapy.loader import ItemLoader

In [9]:
from scrapy.selector import Selector

In [46]:
from scrapy.loader.processors import Join, MapCompose, TakeFirst, Compose

In [36]:
from w3lib.html import remove_tags

In [2]:
import os

In [3]:
# Names of the entries in the directory of saved job pages. os.listdir returns
# them in arbitrary order, so files[0] is not a stable choice between runs.
files = os.listdir('job_info/job_page_htmls/')

In [211]:
# Read one saved page to prototype the CSS queries against.
# The encoding is given explicitly so decoding does not depend on the
# platform's locale default. UTF-8 is assumed for the saved pages --
# TODO confirm against the script that wrote them.
with open('job_info/job_page_htmls/' + files[0], 'r', encoding='utf-8') as f:
    body = f.read()

In [212]:
# Wrap the page text in a Selector so CSS queries can be run against it.
selector = Selector(text=body)

# Information to collect
1. Title
2. Responsibility
3. Requirements
4. Company information
5. Salary
6. Industry
7. Location
8. Number of people to be recruited

In [13]:
class JobSummary(Item):
    """Item for the headline facts of one job posting.

    Every field is a bare ``Field()``, so no per-field processors are
    declared here; processing is left to whichever loader fills the item.
    """
    title = Field()
    salary = Field()
    company = Field()
    location = Field()
    experience = Field()
    degree = Field()
    number_to_recruit = Field()

In [83]:
class JobSummaryLoader(ItemLoader):
    """Loader that reduces every field to a single cleaned string.

    The default output chain runs three steps in order: keep the first
    collected value, remove HTML tags from it, then trim the whitespace
    around what is left.
    """

    default_output_processor = Compose(
        TakeFirst(),
        remove_tags,
        lambda text: text.strip(),
    )

In [84]:
# Bind an empty JobSummary item to the page selector; the add_css calls that
# follow pull their values from this page.
loader = JobSummaryLoader(item=JobSummary(), selector=selector)

In [87]:
# (item field, CSS query) pairs, registered in this order. The last three
# queries have no ::text suffix, so they collect the whole <span> element;
# the loader's output processor removes the tags afterwards.
summary_css = (
    ('title', 'li.info-h3::text'),
    ('salary', '.info-money strong::text'),
    ('company', '.company a::text'),
    ('location', '.info-three a::text'),
    ('experience', '.info-three span:nth-child(2)'),
    ('degree', '.info-three span:nth-child(3)'),
    ('number_to_recruit', '.info-three span:nth-child(4)'),
)
for field_name, css_query in summary_css:
    loader.add_css(field_name, css_query)

In [88]:
# Run the output processors and display the populated item.
loader.load_item()

{'company': '满金坝（深圳）科技有限公司',
 'degree': '大专',
 'experience': '5-10年',
 'location': '深圳',
 'number_to_recruit': '招1人',
 'salary': '18000-25000元/月',
 'title': '.net WPF高级开发工程师'}

In [173]:
class JobDescription(Item):
    """Item with a single ``description`` field for the job's description text."""
    description = Field()

In [175]:
def filter_empty(value):
    """Pass a truthy value through unchanged; turn any falsy value into None.

    Args:
        value: Any object; only its truthiness is inspected.

    Returns:
        ``value`` itself when it is truthy, otherwise ``None``.
    """
    if not value:
        return None
    return value

In [176]:
class JobDescriptionLoader(ItemLoader):
    """Loader that cleans each collected value separately and keeps a list.

    MapCompose applies the chain to every collected value in turn: remove
    HTML tags, trim surrounding whitespace, then hand the result to
    ``filter_empty``, which turns empty strings into None.
    """

    default_output_processor = MapCompose(
        remove_tags,
        lambda fragment: fragment.strip(),
        filter_empty,
    )

In [194]:
# Loader for the description item, reading from the same page selector.
job_desc_loader = JobDescriptionLoader(JobDescription(), selector=selector)

In [195]:
# `*::text` selects the text nodes of every element nested inside
# div.responsibility.pos-common, so the field collects many fragments.
job_desc_loader.add_css('description', 'div.responsibility.pos-common *::text')

In [196]:
# Build and display the item; 'description' is a list of cleaned text fragments.
job_desc_loader.load_item()

{'description': ['技能要求:',
                 'C#/.NET',
                 '职位描述',
                 '岗位职责：',
                 '1、参与系统平台和项目的设计、开发、维护和管理；',
                 '2、制定开发计划，独立完成模块级的设计及规划,负责核心及模块源代码的编写',
                 '3. 主导PC端即时通讯聊天系统的整体设计、任务安排、WPF界面开发、核心代码重构',
                 '任职要求：',
                 '1. 6年以上.Net开发工作经验，有即时通讯平台研发经验',
                 '2. 对.Net技术有深入研究，熟练掌握TCP/UDP, 多线程及异步编程。',
                 '3. '
                 '精通WPF框架，擅长WPF控件的定制开发，在数据传输较大，操作密集的情况下能保证用户界面的展示与操作的流畅性，能解决多线程并发造成的线程阻塞，界面卡顿等问题',
                 '4. 具有较强的面向对象的分析、设计、开发能力，熟悉多种设计模式，能遵循良好的编程规范；',
                 '5.熟悉SQLite数据库，能开发高性能本地数据库；',
                 '6. 良好的职业素质、团队合作精神和团队沟通能力；',
                 '查看全部內容']}

In [216]:
class CompanyInfo(Item):
    """Item for the company details shown on a job page.

    All fields are bare ``Field()`` declarations with no per-field
    processors.
    """
    industry = Field()
    register_type = Field()
    company_size = Field()
    company_url = Field()
    company_location = Field()

In [225]:
class CompanyInfoLoader(ItemLoader):
    """Loader that reduces every company field to a single cleaned string.

    The default output chain keeps the first collected value, removes HTML
    tags from it, and trims the surrounding whitespace.
    """

    default_output_processor = Compose(
        TakeFirst(),
        remove_tags,
        lambda text: text.strip(),
    )

In [226]:
# Loader for the company item, reading from the same page selector.
company_loader = CompanyInfoLoader(CompanyInfo(), selector=selector)

In [227]:
# (item field, CSS query) pairs, registered in this order. Each query starts
# from an icon element and takes text from the <strong> immediately after it
# (or from the link inside that <strong>).
company_css = (
    ('industry', '.icon-promulgator-person+ strong a::text'),
    ('register_type', '.icon-promulgator-type+ strong::text'),
    ('company_size', '.icon-promulgator-link+ strong::text'),
    ('company_url', '.icon-promulgator-url+ strong a::text'),
    ('company_location', '.icon-promulgator-addres+ strong::text'),
)
for field_name, css_query in company_css:
    company_loader.add_css(field_name, css_query)

In [228]:
# Run the output processors and display the populated company item.
company_loader.load_item()

{'company_location': '深圳市福田区华强北路群星广场A座24楼',
 'company_size': '100-499人',
 'company_url': 'https://www.manjd.com/home',
 'industry': '互联网/电子商务',
 'register_type': '民营'}

# Actual loading

In [188]:
# Print the summary item field by field, in the order the fields were loaded.
# This reads `loader` (populated by the load_item() call earlier in the
# notebook) rather than `summary_loader`, which is first assigned inside the
# batch loop further down and therefore does not exist yet on a fresh
# top-to-bottom run.
for (k, v) in loader.item.items():
    print(k, v)

title .net WPF高级开发工程师
salary 18000-25000元/月
company 满金坝（深圳）科技有限公司
location 深圳
experience 5-10年
degree 大专
number_to_recruit 招1人


In [229]:
# Build one flat dict per saved job page and collect them in `all_info`.
#
# For every file name in `files`: read the page, run the three item loaders
# against it, and merge their fields (plus the file name under 'job_link')
# into a single dict.

# (item field, CSS query) pairs for each loader, applied in this order.
summary_css = (
    ('title', 'li.info-h3::text'),
    ('salary', '.info-money strong::text'),
    ('company', '.company a::text'),
    ('location', '.info-three a::text'),
    ('experience', '.info-three span:nth-child(2)'),
    ('degree', '.info-three span:nth-child(3)'),
    ('number_to_recruit', '.info-three span:nth-child(4)'),
)
description_css = (
    ('description', 'div.responsibility.pos-common *::text'),
)
company_css = (
    ('industry', '.icon-promulgator-person+ strong a::text'),
    ('register_type', '.icon-promulgator-type+ strong::text'),
    ('company_size', '.icon-promulgator-link+ strong::text'),
    ('company_url', '.icon-promulgator-url+ strong a::text'),
    ('company_location', '.icon-promulgator-addres+ strong::text'),
)

all_info = []

for i, file_name in enumerate(files):
    print('Processing file ', i)

    # The file handle gets its own name so it no longer overwrites the loop
    # variable. The encoding is explicit so decoding does not depend on the
    # platform's locale default; UTF-8 is assumed for the saved pages --
    # TODO confirm against the script that wrote them.
    with open('job_info/job_page_htmls/' + file_name, 'r', encoding='utf-8') as page_file:
        body = page_file.read()

    selector = Selector(text=body)

    summary_loader = JobSummaryLoader(item=JobSummary(), selector=selector)
    for field_name, css_query in summary_css:
        summary_loader.add_css(field_name, css_query)

    desc_loader = JobDescriptionLoader(item=JobDescription(), selector=selector)
    for field_name, css_query in description_css:
        desc_loader.add_css(field_name, css_query)

    company_loader = CompanyInfoLoader(CompanyInfo(), selector=selector)
    for field_name, css_query in company_css:
        company_loader.add_css(field_name, css_query)

    # 'job_link' goes in first, then summary, description and company fields,
    # so the key order matches the original hand-written copy loops.
    info = {'job_link': file_name}
    for item_loader in (summary_loader, desc_loader, company_loader):
        info.update(item_loader.load_item())

    all_info.append(info)

Processing file  0
Processing file  1
Processing file  2
Processing file  3
Processing file  4
Processing file  5
Processing file  6
Processing file  7
Processing file  8
Processing file  9
Processing file  10
Processing file  11
Processing file  12
Processing file  13
Processing file  14
Processing file  15
Processing file  16
Processing file  17
Processing file  18
Processing file  19
Processing file  20
Processing file  21
Processing file  22
Processing file  23
Processing file  24
Processing file  25
Processing file  26
Processing file  27
Processing file  28
Processing file  29
Processing file  30
Processing file  31
Processing file  32
Processing file  33
Processing file  34
Processing file  35
Processing file  36
Processing file  37
Processing file  38
Processing file  39
Processing file  40
Processing file  41
Processing file  42
Processing file  43
Processing file  44
Processing file  45
Processing file  46
Processing file  47
Processing file  48
Processing file  49
Processing

Processing file  418
Processing file  419
Processing file  420
Processing file  421
Processing file  422
Processing file  423
Processing file  424
Processing file  425
Processing file  426
Processing file  427
Processing file  428
Processing file  429
Processing file  430
Processing file  431
Processing file  432
Processing file  433
Processing file  434
Processing file  435
Processing file  436
Processing file  437
Processing file  438
Processing file  439
Processing file  440
Processing file  441
Processing file  442
Processing file  443
Processing file  444
Processing file  445
Processing file  446
Processing file  447
Processing file  448
Processing file  449
Processing file  450
Processing file  451
Processing file  452
Processing file  453
Processing file  454
Processing file  455
Processing file  456
Processing file  457
Processing file  458
Processing file  459
Processing file  460
Processing file  461
Processing file  462
Processing file  463
Processing file  464
Processing fi

Processing file  812
Processing file  813
Processing file  814
Processing file  815
Processing file  816
Processing file  817
Processing file  818
Processing file  819
Processing file  820
Processing file  821
Processing file  822
Processing file  823
Processing file  824
Processing file  825
Processing file  826
Processing file  827
Processing file  828
Processing file  829
Processing file  830
Processing file  831
Processing file  832
Processing file  833
Processing file  834
Processing file  835
Processing file  836
Processing file  837
Processing file  838
Processing file  839
Processing file  840
Processing file  841
Processing file  842
Processing file  843
Processing file  844
Processing file  845
Processing file  846
Processing file  847
Processing file  848
Processing file  849
Processing file  850
Processing file  851
Processing file  852
Processing file  853
Processing file  854
Processing file  855
Processing file  856
Processing file  857
Processing file  858
Processing fi

In [198]:
import pandas as pd

In [230]:
# One row per processed page; columns come from the keys of the per-page
# dicts, and a key absent from a page's dict becomes a missing value.
job_info = pd.DataFrame(all_info)

In [233]:
# Preview only the first ten rows instead of rendering the whole frame.
job_info.head(10)

Unnamed: 0,company,company_location,company_size,company_url,degree,description,experience,industry,job_link,location,number_to_recruit,register_type,salary,title
0,满金坝（深圳）科技有限公司,深圳市福田区华强北路群星广场A座24楼,100-499人,https://www.manjd.com/home,大专,"[技能要求:, C#/.NET, 职位描述, 岗位职责：, 1、参与系统平台和项目的设计、开...",5-10年,互联网/电子商务,CC535117221J00259153403.html,深圳,招1人,民营,18000-25000元/月,.net WPF高级开发工程师
1,深圳途阳电子商务有限公司,龙华区龙观路与清泉路交汇处中执NEX ONE三楼325室,20-99人,,中专,"[技能要求:, ﻿销售，售前支持，售后支持, 职位描述, 1., 打字速度快，, 专业能力过...",1-3年,互联网/电子商务,CZ819993820J00286207307.html,深圳,招1人,民营,6001-8000元/月,淘宝客服（高底薪高提成+弹性工作）
2,上海兔子玲商贸有限公司,上海市自由贸易试验区世博大道1368号第G层4G02-25室,20人以下,,学历不限,"[职位描述, 工作时间：早, 10:00~晚22：00，做一休一，可调整。, 吃苦耐劳, 诚...",经验不限,零售/批发,CZ718757780J00116407303.html,上海,招2人,民营,4000-8000元/月,世博源 母婴店店员
3,浙江链家房地产经纪有限公司,杭州市萧山区广孚联合国际中心九层,10000人以上,,大专,"[职位描述, 链家集团2001年创立于北京，是一家集地产经纪、互联网科技、地产金融、资产管理...",经验不限,中介服务,CC513429627J00108606803.html,杭州,招10人,民营,8001-10000元/月,链家聘销售代表储备经理大客户销售实习生
4,深圳市环球易购电子商务有限公司,深圳市南山区东滨路与南光路交汇处，永新汇3号楼,1000-9999人,http://www.globalegrow.com,本科,"[职位描述, 岗位职责:, 1. 负责法语平台（CD/PM/AmazoneBay/Fnac等...",经验不限,互联网/电子商务,426256413251086.html,深圳,招10人,上市公司,8001-10000元/月,法语运营助理/专员
5,深圳市华利民科技开发有限公司,深圳市福田区华强北路现代之窗大厦B座17G,20-99人,http://www.szhlm.com,大专,"[职位描述, 1、与供应商和客户对接，核对好公司销售及采购单据，催收货款, 2、负责公司银行...",1-3年,通信/电信运营、增值服务,CZ258905910J00209072508.html,深圳,招1人,合资,4001-6000元/月,会计
6,深圳市众祥安全科技有限公司,深圳市龙岗区坂田街道坂田高新技术工业园一号楼微谷120,100-499人,http://www.volks-safety.com/,本科,"[职位描述, 职位描述, 职责：, 1、负责公司产品拍摄、, 图片处理；, 2、宣传册设计、...",1-3年,电气/电力/水利,CZ717859740J00188708807.html,深圳,招1人,民营,6000-8000元/月,平面设计
7,三千茶农茶业集团股份有限公司,济南市槐荫区南辛庄泉景同润商务大厦4楼408室,100-499人,http://www.3000chanong.com,学历不限,"[技能要求:, ﻿销售，客户代表，营销，销售顾问, 职位描述, 岗位职责：, 1.对咨询客户...",经验不限,快速消费品（食品/饮料/烟酒/日化）,544180926250395.html,济南,招5人,民营,5000-10000元/月,无责底薪3000无加班高提成聘销售
8,杭州助拍信息技术有限公司,杭州市西湖区通普路中天MCC 2幢601室,20-99人,http://hangzhou.51zhupai.com/,大专,"[职位描述, 岗位职责：, 1、 根据国家法律和公司财务制度，进行资金处理；, 2、 负责公...",1-3年,互联网/电子商务,CC419077337J00217144706.html,杭州,招1人,民营,4001-6000元/月,出纳
9,成都不赚酒业销售有限公司,光华东三路486号中铁西城写字楼5栋707,500-999人,,学历不限,"[职位描述, 岗位描述：, 1、负责客户关系沟通、网上订单处理及跟踪；, 2、负责接听客服热...",经验不限,互联网/电子商务,CC841799800J00293176605.html,成都,招5人,民营,4001-6000元/月,呼叫中心客服
