In [1]:
# Source : https://blog.nycdatascience.com/student-works/project-3-web-scraping-company-data-from-indeed-com-and-dice-com/

# load the library
from bs4 import BeautifulSoup as Soup
import urllib, requests, re, pandas as pd
import lxml, time


import urllib2 # Website connections
from collections import Counter # Keep track of our term counts
from nltk.corpus import stopwords # Filter out stopwords, such as 'the', 'or', 'and'



# indeed.com url
#base_url = 'http://www.indeed.com/jobs?q=data+scientist&jt=fulltime&sort='
#base_url = 'http://www.indeed.fr/jobs?q=data+scientist&jt=fulltime&sort='

base_url = (
    "http://www.indeed.fr/emplois?as_and=data&as_any=science+scientist+analytics+"
    "analyst+analyste+visualisation+visualization+mining+dataming+dataminer+miner+learning+apprentissage"
    "&sort="
 )

#base_url = (
#    "http://www.indeed.fr/emplois?q=r+and+tableau+and+sas+and+spss"
#    "&sort="
# )

sort_by = 'date'          # sort by data
start_from = '&start='    # start page number

pd.set_option('max_colwidth',500)    # to remove column limit (Otherwise, we'll lose some info)


start_ts = time.strftime("%Y%m%d-%H%M%S")

In [2]:
# Taken from http://nbviewer.jupyter.org/github/jmsteinw/Notebooks/blob/master/IndeedJobs.ipynb#
def text_cleaner(website):
    '''
    This function just cleans up the raw html so that I can look at it.
    Inputs: a URL to investigate
    Outputs: Cleaned text only
    '''
    try:
        site = urllib2.urlopen(website).read() # Connect to the job posting
    except: 
        return   # Need this in case the website isn't there anymore or some other weird connection problem 
    
    soup_obj = Soup(site) # Get the html from the site
    
    if len(soup_obj) == 0: # In case the default parser lxml doesn't work, try another one
        soup_obj = Soup(site, 'html5lib')
    
    
    for script in soup_obj(["script", "style"]):
        script.extract() # Remove these two elements from the BS4 object
    
    

    text = soup_obj.get_text() # Get the text from this

    

    lines = (line.strip() for line in text.splitlines()) # break into lines

    
    
    chunks = (phrase.strip() for line in lines for phrase in line.split("  ")) # break multi-headlines into a line each
    

    text = ''.join(chunk for chunk in chunks if chunk).encode('utf-8') # Get rid of all blank lines and ends of line
    
    # Now clean out all of the unicode junk (this line works great!!!)
    
    try:
        text = text.decode('unicode_escape').encode('ascii', 'ignore') # Need this as some websites aren't formatted
    except:                                                            # in a way that this works, can occasionally throw
        return                                                         # an exception
   
    
    text = re.sub("[^a-zA-Z+3]"," ", text)  # Now get rid of any terms that aren't words (include 3 for d3.js)
                                             # Also include + for C++
    text = re.sub(r"([a-z])([A-Z])", r"\1 \2", text) # Fix spacing issue from merged words
    
    text = text.lower().split()  # Go to lower case and split them apart
    
    
    stop_words = set(stopwords.words("english")) # Filter out any stop words
    text = [w for w in text if not w in stop_words]
    
    
    
    text = list(set(text)) # Last, just get the set of these. Ignore counts (we are just looking at whether a term existed
                           # or not on the website)
    
    return text

In [4]:
#%debug
# 10 (unsponsored) listings / page
# TODO : change moronic for loop to stop when no more pages



# Get the number of jobs
url = "%s%s" % (base_url, sort_by) # get full url 
target = Soup(urllib.urlopen(url), "lxml")

# Now find out how many jobs there were
num_jobs_area = target.find(id = 'searchCount').string.encode('utf-8') # Now extract the total number of jobs found
                                                                     # The 'searchCount' object has this

job_numbers = re.findall('\d+', num_jobs_area) # Extract the total jobs found from the search result


if len(job_numbers) > 3: # Have a total number of jobs greater than 1000
    total_num_jobs = (int(job_numbers[2])*1000) + int(job_numbers[3])
else:
    total_num_jobs = int(job_numbers[2]) 

print 'There were', total_num_jobs, 'jobs found' # Display how many jobs were found

num_pages = total_num_jobs/10 # This will be how we know the number of times we need to iterate over each new
                              # search result page

df = pd.DataFrame()   # create a new data frame
job_descriptions = []

#for page in range(1,101): # page from 1 to 100 (last page we can scrape is 100)
for page_nb in range(1,num_pages + 1):
    print("page %d" % (page_nb))
    
    page = (page_nb-1) * 10  
    url = "%s%s%s%d" % (base_url, sort_by, start_from, page) # get full url 
    target = Soup(urllib.urlopen(url), "lxml")     
    
    targetElements = target.findAll('div', attrs={'class' : '  row  result'}) # we're interested in each row (= each job)
    
    job_nb = 1
    # trying to get each specific job information (such as company name, job title, urls, ...)
    for elem in targetElements: 
        print('job nb %d' %(job_nb))
        
        #comp_name = elem.find('span', attrs={'itemprop':'name'}).getText().strip()
        #job_title = elem.find('a', attrs={'class':'turnstileLink'}).attrs['title']
        #job_addr = elem.find('span', attrs={'itemprop':'addressLocality'}).getText()
        
        found_elem = elem.find('span', attrs={'itemprop':'name'})
        if found_elem is not None:            
            comp_name = found_elem.getText().strip()
            comp_link_overall = found_elem.find('a')
        
        found_elem = elem.find('a', attrs={'class':'turnstileLink'})
        if found_elem is not None:            
            job_title = found_elem.attrs['title']
        
        found_elem = elem.find('span', attrs={'itemprop':'addressLocality'})
        if found_elem is not None:            
            job_addr = found_elem.getText()
        
        home_url = "http://www.indeed.fr"
        job_link = "%s%s" % (home_url,elem.find('a').get('href'))        
        job_posted = elem.find('span', attrs={'class': 'date'}).getText()

        
        if comp_link_overall != None: # if company link exists, access it. Otherwise, skip.
            comp_link_overall = "%s%s" % (home_url, comp_link_overall.attrs['href'])
        else: comp_link_overall = None

        # add a job info to our data frame
        df = df.append({'page_nb' : page_nb, 'job_nb' : job_nb,
                        'comp_name': comp_name, 'job_title': job_title, 
                        'job_link': job_link, 'job_posted': job_posted,
                        'overall_link': comp_link_overall, 'job_location': job_addr,
                        'overall_rating': None, 'wl_bal_rating': None, 
                        'benefit_rating': None, 'jsecurity_rating': None, 
                        'mgmt_rating': None, 'culture_rating': None
                       }, ignore_index=True)
    
        # Get job description
        final_description = text_cleaner(job_link)
        if final_description: # So that we only append when the website was accessed correctly
            #print("job description OK")
            job_descriptions.append("page_nb%d, job_nb%d, %s" % (page_nb, job_nb, final_description))
        else:
            job_descriptions.append("page_nb%d, job_nb%d, NA" % (page_nb, job_nb))
        
        
        job_nb = job_nb + 1
    # Sleep to avoid spamming the server and getting banned
    time.sleep(0.1)

df

#print(job_descriptions[:2])


There were 20 jobs found
page 1
job nb 1
job nb 2
job nb 3
job nb 4
job nb 5
job nb 6
job nb 7
job nb 8
job nb 9
page 2
job nb 1
job nb 2
job nb 3
job nb 4
job nb 5
job nb 6
job nb 7


Unnamed: 0,benefit_rating,comp_name,culture_rating,job_link,job_location,job_nb,job_posted,job_title,jsecurity_rating,mgmt_rating,overall_link,overall_rating,page_nb,wl_bal_rating
0,,KPMG Audit,,http://www.indeed.fr/rc/clk?jk=27b981c1217c5aae&fccid=bab3d06b147e0dc2,La Défense (92),1.0,il y a 3 jours,Senior consultant en data analytics,,,http://www.indeed.fr/cmp/Kpmg-Audit,,1.0,
1,,Altaide,,http://www.indeed.fr/rc/clk?jk=ffb82923782ebfcd&fccid=9e2a49e69460b045,Boulogne-Billancourt (92),2.0,il y a 4 jours,Data Scientist confirmé(e) / Incubateur Groupe de premier plan (H/F),,,,,1.0,
2,,Altaide,,http://www.indeed.fr/rc/clk?jk=161c0944b517a619&fccid=ce783271e420d14a,Hauts-de-Seine,3.0,il y a 5 jours,Consultant BI / Big Data H/F,,,,,1.0,
3,,Capgemini,,http://www.indeed.fr/rc/clk?jk=e6320f38af2307df&fccid=105ecfd0283f415f,Rennes (35),4.0,il y a 13 jours,Consultant BI/BIGDATA H/F,,,http://www.indeed.fr/cmp/Capgemini,,1.0,
4,,Churchill Frank,,http://www.indeed.fr/rc/clk?jk=becf532267edf04d&fccid=ed5c68ee55ba553e,Paris (75),5.0,il y a 12 jours,Data Scientist,,,,,1.0,
5,,Churchill Frank,,http://www.indeed.fr/rc/clk?jk=c413a9267eec4a3d&fccid=ce783271e420d14a,Ille-et-Vilaine,6.0,il y a 11 jours,Consultant BI/BIGDATA H/F,,,,,1.0,
6,,Churchill Frank,,http://www.indeed.fr/rc/clk?jk=e44eef8290b8ba72&fccid=ce783271e420d14a,Hauts-de-Seine,7.0,il y a 23 jours,Data Scientist confirméH/F,,,,,1.0,
7,,Churchill Frank,,http://www.indeed.fr/rc/clk?jk=b355208068c3de38&fccid=ce783271e420d14a,Hauts-de-Seine,8.0,il y a 23 jours,Data Scientist expérimenté H/F,,,,,1.0,
8,,CGI,,http://www.indeed.fr/rc/clk?jk=945bccb3401c14ce&fccid=d2841a5c0380b93d,Montpellier (34),9.0,il y a 29 jours,Expert technique Datastage PX H/F,,,http://www.indeed.fr/cmp/CGI-Group,,1.0,
9,,KPMG,,http://www.indeed.fr/rc/clk?jk=0f7ae1d6260adb41&fccid=2dd390c3a48a7ed0,Paris (75),1.0,il y a 30+ jours,Senior Consultant en Data Analytics - Technology Transformation,,,http://www.indeed.fr/cmp/Kpmg,,2.0,


In [6]:
#len(df)

#job_descriptions

#t = text_cleaner('http://www.indeed.com/rc/clk?jk=8e31da42970f8397&fccid=43c5b67847a9226c')
#print(job_descriptions)

#len(df)
#len(job_descriptions)

job_descriptions
#len(df)

#'1' + str(job_descriptions[3])



['page_nb1, job_nb1, NA',
 'page_nb1, job_nb2, NA',
 "page_nb1, job_nb3, ['postule', 'annuaire', 'mission', 'hadoop', 'recruteurs', 'travaux', 'vidos', 'signataire', 'contactez', 'cole', 'hauts', 'tlcommunications', 'industries', 'mahout', 'transformer', 'le', 'anglais', 'euros', 'analytics', 'esprit', 'ssis', 'contribuer', 'mtiers', 'tm', 'mentions', 'besoins', 'maitrisez', 'aux', 'cls', 'informatique', 'pleine', 'march', 'finance', 'veuillez', 'big', 'de', 'handi', 'projet', 'monde', 'talend', 'sont', 'dposez', 'voluerez', 'passionnant', 'dt', 'du', 'organiser', 'recrutementen', 'matrise', 'nos', 'cloudera', 'vos', 'offres', 'jobin', 'h', 'ecosystme', 'l', 'rigueur', 'server', 'teradata', 'excellence', 'bac', 'partager', 'prparer', 'mettre', 'page', 'cognos', 'www', 'affaires', 'leaders', 'sommes', 'recrutement', 'dvelopper', 'bnficiez', 'faites', 'fonctionnelles', 'chef', 'consultants', 'service', 'modlisation', 'premire', 'tlcoms', 'et', 'aider', 'tes', 'collaborateurs', 'est', 'pr

In [7]:
df_received = df

for i in range(0,len(df_received)):  # get all the company details (
    target_comp_name = df_received.iloc[i]['comp_name']

    url_2nd = df.iloc[i]['overall_link'] 
    if url_2nd != None:
        target_2nd = Soup(urllib.urlopen(url_2nd), "lxml")
        
        comp_logo = target_2nd.find("div", {"id": "cmp-header-logo"}).find('img')
        if comp_logo != None:
            comp_logo = target_2nd.find("div", {"id": "cmp-header-logo"}).find('img').attrs['src']
        else: comp_logo = None
          
        # total 6 ratings: overall rating, work-life balance rating, compensation / benefit rating, job security rating, management rating, company culture rating
        comp_rating_overall = target_2nd.find("span", {"class": "cmp-star-large-on"}).attrs['style']
        wl_bal_rating = target_2nd.find("dl", {"id": "cmp-reviews-attributes"}).find_all("span", {"class": "cmp-star-on"})[0].attrs['style'] 
        benefit_rating = target_2nd.find("dl", {"id": "cmp-reviews-attributes"}).find_all("span", {"class": "cmp-star-on"})[1].attrs['style'] 
        jsecurity_rating = target_2nd.find("dl", {"id": "cmp-reviews-attributes"}).find_all("span", {"class": "cmp-star-on"})[2].attrs['style'] 
        mgmt_rating =  target_2nd.find("dl", {"id": "cmp-reviews-attributes"}).find_all("span", {"class": "cmp-star-on"})[3].attrs['style'] 
        culture_rating = target_2nd.find("dl", {"id": "cmp-reviews-attributes"}).find_all("span", {"class": "cmp-star-on"})[4].attrs['style'] 

        # Some regular expression stuffs to remove unnecessary characters
        comp_rating_overall = re.sub('[width: ]', '', comp_rating_overall)
        comp_rating_overall = re.sub('[px;]', '', comp_rating_overall)
        comp_rating_overall = round((float(comp_rating_overall)*5.0)/120, 1)

        wl_bal_rating = re.sub('[width: ]', '', wl_bal_rating)
        wl_bal_rating = re.sub('[px]', '', wl_bal_rating)
        wl_bal_rating = round((float(wl_bal_rating)*5.0)/86, 1) # 86 pixel

        benefit_rating = re.sub('[width: ]', '', benefit_rating)
        benefit_rating = re.sub('[px]', '', benefit_rating)
        benefit_rating = round((float(benefit_rating)*5.0)/86, 1)

        jsecurity_rating = re.sub('[width: ]', '', jsecurity_rating)
        jsecurity_rating = re.sub('[px]', '', jsecurity_rating)
        jsecurity_rating = round((float(jsecurity_rating)*5.0)/86, 1)

        mgmt_rating = re.sub('[width: ]', '', mgmt_rating)
        mgmt_rating = re.sub('[px]', '', mgmt_rating)
        mgmt_rating = round((float(mgmt_rating)*5.0)/86, 1)

        culture_rating = re.sub('[width: ]', '', culture_rating)
        culture_rating = re.sub('[px]', '', culture_rating)
        culture_rating = round((float(culture_rating)*5.0)/86, 1)
    
        # Store cleaned characters into data frame
        df_received.loc[ df_received['comp_name'] == target_comp_name, 'overall_rating'] = comp_rating_overall
        df_received.loc[ df_received['comp_name'] == target_comp_name, 'wl_bal_rating'] = wl_bal_rating
        df_received.loc[ df_received['comp_name'] == target_comp_name, 'benefit_rating'] = benefit_rating
        df_received.loc[ df_received['comp_name'] == target_comp_name, 'jsecurity_rating'] = jsecurity_rating
        df_received.loc[ df_received['comp_name'] == target_comp_name, 'mgmt_rating'] = mgmt_rating
        df_received.loc[ df_received['comp_name'] == target_comp_name, 'culture_rating'] = culture_rating

In [8]:
# Save the result to CSV
fname_out = './data/indeed_companies_data_%s.csv' % (start_ts)
df_received.to_csv(fname_out, encoding='utf-8')




In [9]:
# Save jop description to file
fname_job = './data/indeed_job_descr_%s.csv' % (start_ts)
f_job = open(fname_job, 'w')

for item in job_descriptions:
  print>>f_job, item