In [106]:
import pandas as pd
import requests
import lxml.html
from lxml.html.clean import Cleaner
import re
#import json
from time import sleep
from random import randint
import csv


# Matches the "(N views)" counter after thousands separators have been removed.
# Compiled once at import time; a raw string avoids invalid-escape warnings.
_VIEWS_PATTERN = re.compile(r'\((\d*)\sviews\)')


def _extract_listing_fields(doc):
    """Pull the job-listing fields out of a parsed listing page.

    Args:
        doc: The lxml HTML element returned by ``lxml.html.fromstring``.

    Returns:
        A dict with the keys ``job_title``, ``company``, ``location``,
        ``post_date``, ``views``, ``body_raw``, ``body_p`` and ``body_ul``,
        plus ``alert_message`` when the page carries a
        ``div.message-inside`` element.

    Raises:
        IndexError: If a required element (title, submission date, views
            counter or body) is missing from the page.
    """
    fields = {}

    # warning message, check if exists or contains expired
    alerts = doc.cssselect('div.message-inside')
    if alerts:
        fields['alert_message'] = alerts[0].text

    # title
    fields['job_title'] = doc.cssselect('div.title h1')[0].text
    fields['company'] = doc.cssselect('div.title h2')[0].text
    fields['location'] = doc.cssselect('div.title h3')[0].text

    # posted date
    fields['post_date'] = doc.cssselect('p.submission-date span')[0].get('title')
    # Strip thousands separators so "(1,841 views)" yields "1841".
    submission_text = doc.cssselect('p.submission-date')[0].text_content().replace(',', '')
    fields['views'] = _VIEWS_PATTERN.findall(submission_text)[0]

    # body
    fields['body_raw'] = doc.cssselect('div.jobs-board-post-content')[0].text_content()
    fields['body_p'] = len(doc.cssselect('div.jobs-board-post-content p'))
    fields['body_ul'] = len(doc.cssselect('div.jobs-board-post-content ul'))

    return fields


def parse_job_listings(start=17100, end=17101, step=100, timeout=30):
    """Scrape Kaggle job listing pages for a range of listing ids.

    Listing ids are taken from ``range(start, end, step)``. Each page is
    fetched after a random 0-5 second pause, and its fields are collected
    into a dict. A listing that cannot be fetched or parsed is reported on
    stdout and skipped, so one bad page does not abort the run.

    Args:
        start: First listing id to request.
        end: Exclusive upper bound of the listing ids.
        step: Stride between listing ids. Defaults to 100, the stride that
            was previously hard-coded; pass 1 to visit every id.
        timeout: Seconds to wait for the HTTP response before giving up on
            a listing.

    Returns:
        A list of dicts, one per successfully parsed listing, holding
        ``job_id``, ``status_code`` and the fields produced by
        ``_extract_listing_fields``.
    """
    listing_base_url = 'https://www.kaggle.com/jobs/'
    headers = {'User-agent':'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1'}

    main_results = []

    for _listing in range(start, end, step):

        # NOTE(review): every URL carries the same fixed slug; presumably the
        # site resolves the page from the numeric id alone - confirm.
        url = ''.join([listing_base_url, str(_listing), '/datarobot-data-scientist-japan'])

        print('Processing listing page %s' % ''.join([listing_base_url, str(_listing)]))

        #random time delay for scraping
        sleep(randint(0, 5))

        try:
            r = requests.get(url, headers=headers, timeout=timeout)
            doc = lxml.html.fromstring(r.content)

            # request status
            _dat = {'job_id': _listing, 'status_code': r.status_code}
            _dat.update(_extract_listing_fields(doc))
        except Exception as exc:
            # Per-listing best effort: report the reason and move on.
            # Unlike a bare ``except:``, Ctrl-C and SystemExit still propagate.
            print('Error getting listing details for: %s (%r)' % (url, exc))
            continue

        main_results.append(_dat)

    print('Done processing job listings')
    return main_results


if __name__ == '__main__':
    # Script entry point: scrape a range of listings, preview them as a
    # DataFrame, then dump every record to a CSV file.

    #get results
    listings = parse_job_listings(start=16785, end=17128)

    # listings is a list of dicts, so the plain constructor is the right entry point
    df = pd.DataFrame(listings)

    print(df.head())

    #create filename
    filename = 'job-listings.csv'

    #get all keys (records may differ, e.g. 'alert_message' is optional)
    headings = sorted(set().union(*(d.keys() for d in listings)))

    #write to csv file
    # newline='' is required by the csv module so that embedded '\r\n' in
    # fields is preserved and no blank rows appear on Windows; utf-8 keeps
    # non-ASCII job text from failing under a narrow locale encoding.
    with open(filename, 'w', newline='', encoding='utf-8') as output_file:
        dict_writer = csv.DictWriter(output_file, headings)
        dict_writer.writeheader()
        dict_writer.writerows(listings)



Processing listing page https://www.kaggle.com/jobs/16785
Processing listing page https://www.kaggle.com/jobs/16885
Processing listing page https://www.kaggle.com/jobs/16985
Processing listing page https://www.kaggle.com/jobs/17085
Error getting listing details for: https://www.kaggle.com/jobs/17085/datarobot-data-scientist-japan
Done processing job listings
                                       alert_message  body_p  \
0  This job post has expired (either the position...       5   
1  This job post has expired (either the position...      20   
2  This job post has expired (either the position...      19   

                                            body_raw  body_ul      company  \
0  \r\n    Posted 6 months ago (1,841 views)\r\n ...        2   DataRobot    
1  \r\n    Posted 4 months ago (551 views)\r\n   ...        0  Capital One   
2  \r\n    Posted 2 months ago (1,499 views)\r\n ...        0       msg.ai   

   job_id                                          job_title       lo

In [99]:
df

Unnamed: 0,alert_message,body_raw,company,job_title,location,post_date,status_code,views
0,This job post has expired (either the position...,"\r\n Posted 6 months ago (1,841 views)\r\n ...",DataRobot,Data Scientist,Japan,10/9/2015 9:16:55 AM UTC,200,1841
1,This job post has expired (either the position...,\r\n Posted 4 months ago (551 views)\r\n ...,Capital One,Digital Data Analysis Manager,"Richmond, VA",12/1/2015 6:07:46 PM UTC,200,551
2,This job post has expired (either the position...,"\r\n Posted 2 months ago (1,499 views)\r\n ...",msg.ai,Data Scientist / Machine Learning / NLP at Y-C...,San Francisco,2/3/2016 3:33:27 AM UTC,200,1499
