In [16]:
# Source : https://blog.nycdatascience.com/student-works/project-3-web-scraping-company-data-from-indeed-com-and-dice-com/

# load the library
from bs4 import BeautifulSoup as Soup
import urllib, requests, re, pandas as pd
import lxml, time


import urllib2 # Website connections
from collections import Counter # Keep track of our term counts
from nltk.corpus import stopwords # Filter out stopwords, such as 'the', 'or', 'and'



# indeed.com url
#base_url = 'http://www.indeed.com/jobs?q=data+scientist&jt=fulltime&sort='
#base_url = 'http://www.indeed.fr/jobs?q=data+scientist&jt=fulltime&sort='

base_url = (
    "http://www.indeed.fr/emplois?as_and=data&as_any=science+scientist+analytics+"
    "analyst+analyste+visualisation+visualization+mining+dataming+dataminer+miner+learning+apprentissage"
    "&jt=fulltime&sort="
 )

sort_by = 'date'          # sort by data
start_from = '&start='    # start page number

pd.set_option('max_colwidth',500)    # to remove column limit (Otherwise, we'll lose some info)
df = pd.DataFrame()   # create a new data frame

start_ts = time.strftime("%Y%m%d-%H%M%S")

In [17]:
# Taken from http://nbviewer.jupyter.org/github/jmsteinw/Notebooks/blob/master/IndeedJobs.ipynb#
def text_cleaner(website):
    '''
    This function just cleans up the raw html so that I can look at it.
    Inputs: a URL to investigate
    Outputs: Cleaned text only
    '''
    try:
        site = urllib2.urlopen(website).read() # Connect to the job posting
    except: 
        return   # Need this in case the website isn't there anymore or some other weird connection problem 
    
    soup_obj = Soup(site) # Get the html from the site
    
    if len(soup_obj) == 0: # In case the default parser lxml doesn't work, try another one
        soup_obj = Soup(site, 'html5lib')
    
    
    for script in soup_obj(["script", "style"]):
        script.extract() # Remove these two elements from the BS4 object
    
    

    text = soup_obj.get_text() # Get the text from this

    

    lines = (line.strip() for line in text.splitlines()) # break into lines

    
    
    chunks = (phrase.strip() for line in lines for phrase in line.split("  ")) # break multi-headlines into a line each
    

    text = ''.join(chunk for chunk in chunks if chunk).encode('utf-8') # Get rid of all blank lines and ends of line
    
    # Now clean out all of the unicode junk (this line works great!!!)
    
    try:
        text = text.decode('unicode_escape').encode('ascii', 'ignore') # Need this as some websites aren't formatted
    except:                                                            # in a way that this works, can occasionally throw
        return                                                         # an exception
   
    
    text = re.sub("[^a-zA-Z+3]"," ", text)  # Now get rid of any terms that aren't words (include 3 for d3.js)
                                             # Also include + for C++
    text = re.sub(r"([a-z])([A-Z])", r"\1 \2", text) # Fix spacing issue from merged words
    
    text = text.lower().split()  # Go to lower case and split them apart
    
    
    stop_words = set(stopwords.words("english")) # Filter out any stop words
    text = [w for w in text if not w in stop_words]
    
    
    
    text = list(set(text)) # Last, just get the set of these. Ignore counts (we are just looking at whether a term existed
                           # or not on the website)
    
    return text

In [18]:
#%debug
# 10 (unsponsored) listings / page
# TODO : change moronic for loop to stop when no more pages

job_descriptions = []

#for page in range(1,101): # page from 1 to 100 (last page we can scrape is 100)
for page in range(1,2): # page from 1 to 100 (last page we can scrape is 100)
    page = (page-1) * 10  
    url = "%s%s%s%d" % (base_url, sort_by, start_from, page) # get full url 
    target = Soup(urllib.urlopen(url), "lxml")     
    
    targetElements = target.findAll('div', attrs={'class' : '  row  result'}) # we're interested in each row (= each job)
    
    # trying to get each specific job information (such as company name, job title, urls, ...)
    for elem in targetElements: 
        comp_name = elem.find('span', attrs={'itemprop':'name'}).getText().strip()
        job_title = elem.find('a', attrs={'class':'turnstileLink'}).attrs['title']
        home_url = "http://www.indeed.fr"
        job_link = "%s%s" % (home_url,elem.find('a').get('href'))
        job_addr = elem.find('span', attrs={'itemprop':'addressLocality'}).getText()
        job_posted = elem.find('span', attrs={'class': 'date'}).getText()

        comp_link_overall = elem.find('span', attrs={'itemprop':'name'}).find('a')
        if comp_link_overall != None: # if company link exists, access it. Otherwise, skip.
            comp_link_overall = "%s%s" % (home_url, comp_link_overall.attrs['href'])
        else: comp_link_overall = None

        # add a job info to our data frame
        df = df.append({'comp_name': comp_name, 'job_title': job_title, 
                        'job_link': job_link, 'job_posted': job_posted,
                        'overall_link': comp_link_overall, 'job_location': job_addr,
                        'overall_rating': None, 'wl_bal_rating': None, 
                        'benefit_rating': None, 'jsecurity_rating': None, 
                        'mgmt_rating': None, 'culture_rating': None
                       }, ignore_index=True)
    
        # Get job description
        final_description = text_cleaner(job_link)
        if final_description: # So that we only append when the website was accessed correctly
            job_descriptions.append(final_description)
        
    
    # Sleep to avoid spamming the server and getting banned
    time.sleep(0.5)

df

#print(job_descriptions[:2])


Unnamed: 0,benefit_rating,comp_name,culture_rating,job_link,job_location,job_posted,job_title,jsecurity_rating,mgmt_rating,overall_link,overall_rating,wl_bal_rating
0,,DATAWORDS,,http://www.indeed.com/company/DATAWORDS/jobs/Display-Campaign-Analyst-a88eb7a9a35f3cad?fccid=a24cd78bcc10acd9,Levallois-Perret (92),Publiée à l'instant,Display Campaign Analyst,,,,,
1,,World Health Organization,,http://www.indeed.com/rc/clk?jk=f69d6cf72f665b0e&fccid=02615ec3cf4fde2c,Lyon (69),Publiée à l'instant,Assistant (Web Development) - Local recruitment,,,,,
2,,Big Cloud,,http://www.indeed.com/company/Big-Cloud-Recruitment/jobs/Deep-Learning-Researcher-Hot-Geolocation-Startup-968bcebf0676cbda?fccid=a22e33158fb49b51,Paris (75),Publiée à l'instant,Deep Learning Researcher – Hot Geolocation Startup,,,,,
3,,Sublime Skinz,,http://www.indeed.com/rc/clk?jk=1fe1d9276c26e243&fccid=65878a1a3cc89839,Paris (75),Publiée à l'instant,Lead Développeur,,,,,
4,,Institut Curie,,http://www.indeed.com/rc/clk?jk=8ea029e95fe66b5b&fccid=93077f683e849963,Paris (75),Publiée à l'instant,Scientific developer position,,,http://www.indeed.com/cmp/Institut-Curie,,
5,,Gilead,,http://www.indeed.com/rc/clk?jk=39fb602f0a9ca492&fccid=0aed3f67f6a631df,Paris (75),Publiée à l'instant,Customer Services & Sales Administration Assistant,,,,,
6,,Fed Supply,,http://www.indeed.com/rc/clk?jk=5e7bb5020c2ac404&fccid=0893f5955ccafaf1,Paris (75),Publiée à l'instant,Chef de projet Supply Chain et opérations H/F,,,,,
7,,Total,,http://www.indeed.com/rc/clk?jk=8e31da42970f8397&fccid=43c5b67847a9226c,Palaiseau (91),Publiée à l'instant,Post Doc : Caractérisation avancée et stabilité des cellules solaires perovskite de nouvelle génération,,,http://www.indeed.com/cmp/Total,,
8,,Total,,http://www.indeed.com/rc/clk?jk=c7b29e9d537b9816&fccid=43c5b67847a9226c,Toulouse (31),Publiée à l'instant,Thèse : Caractérisation avancée de matériaux innovants pour cellules solaires,,,http://www.indeed.com/cmp/Total,,


In [21]:
#len(df)

#job_descriptions

#t = text_cleaner('http://www.indeed.com/rc/clk?jk=8e31da42970f8397&fccid=43c5b67847a9226c')
print(job_descriptions)

len(job_descriptions)

len(df)


[['cdd', 'semantic', 'skip', 'global', 'dynamic', 'results', 'month', 'manager', 'managed', 'issues', 'looking', 'terms', 'paris', 'young', 'languages', 'finally', 'asking', 'spain', 'teaching', 'advantage', 'tagging', 'tickets', 'skills', 'philippines', 'upload', 'suite', 'team', 'cookies', 'work', 'dealing', 'sign', 'click', 'established', 'companycity', 'essence', 'business', 'localization', 'international', 'increasing', 'full', 'transportation', 'degree', 'french', 'understanding', 'hours', 'pioneered', 'deliverables', 'active', 'strong', 'mindset', 'search', 'technical', 'involved', 'employees', 'experience', 'advertising', 'studies', 'reprocessing', 'campaign', 'apply', 'tools', 'environment', 'eye', 'synchronized', 'working', 'positive', 'objectives', 'visit', 'france', 'type', 'today', 'company', 'flag', 'excellent', 'must', 'account', 'clients', 'learn', 'growing', 'meet', 'figure', 'process', 'high', 'tag', 'needs', 'united', 'monitoring', 'hoc', 'provide', 'tableau', 'locat

9

In [None]:
t2 = Soup(urllib.urlopen(url), "lxml")

In [14]:
df_received = df

for i in range(0,len(df_received)):  # get all the company details (
    target_comp_name = df_received.iloc[i]['comp_name']

    url_2nd = df.iloc[i]['overall_link'] 
    if url_2nd != None:
        target_2nd = Soup(urllib.urlopen(url_2nd), "lxml")
        
        comp_logo = target_2nd.find("div", {"id": "cmp-header-logo"}).find('img')
        if comp_logo != None:
            comp_logo = target_2nd.find("div", {"id": "cmp-header-logo"}).find('img').attrs['src']
        else: comp_logo = None
          
        # total 6 ratings: overall rating, work-life balance rating, compensation / benefit rating, job security rating, management rating, company culture rating
        comp_rating_overall = target_2nd.find("span", {"class": "cmp-star-large-on"}).attrs['style']
        wl_bal_rating = target_2nd.find("dl", {"id": "cmp-reviews-attributes"}).find_all("span", {"class": "cmp-star-on"})[0].attrs['style'] 
        benefit_rating = target_2nd.find("dl", {"id": "cmp-reviews-attributes"}).find_all("span", {"class": "cmp-star-on"})[1].attrs['style'] 
        jsecurity_rating = target_2nd.find("dl", {"id": "cmp-reviews-attributes"}).find_all("span", {"class": "cmp-star-on"})[2].attrs['style'] 
        mgmt_rating =  target_2nd.find("dl", {"id": "cmp-reviews-attributes"}).find_all("span", {"class": "cmp-star-on"})[3].attrs['style'] 
        culture_rating = target_2nd.find("dl", {"id": "cmp-reviews-attributes"}).find_all("span", {"class": "cmp-star-on"})[4].attrs['style'] 

        # Some regular expression stuffs to remove unnecessary characters
        comp_rating_overall = re.sub('[width: ]', '', comp_rating_overall)
        comp_rating_overall = re.sub('[px;]', '', comp_rating_overall)
        comp_rating_overall = round((float(comp_rating_overall)*5.0)/120, 1)

        wl_bal_rating = re.sub('[width: ]', '', wl_bal_rating)
        wl_bal_rating = re.sub('[px]', '', wl_bal_rating)
        wl_bal_rating = round((float(wl_bal_rating)*5.0)/86, 1) # 86 pixel

        benefit_rating = re.sub('[width: ]', '', benefit_rating)
        benefit_rating = re.sub('[px]', '', benefit_rating)
        benefit_rating = round((float(benefit_rating)*5.0)/86, 1)

        jsecurity_rating = re.sub('[width: ]', '', jsecurity_rating)
        jsecurity_rating = re.sub('[px]', '', jsecurity_rating)
        jsecurity_rating = round((float(jsecurity_rating)*5.0)/86, 1)

        mgmt_rating = re.sub('[width: ]', '', mgmt_rating)
        mgmt_rating = re.sub('[px]', '', mgmt_rating)
        mgmt_rating = round((float(mgmt_rating)*5.0)/86, 1)

        culture_rating = re.sub('[width: ]', '', culture_rating)
        culture_rating = re.sub('[px]', '', culture_rating)
        culture_rating = round((float(culture_rating)*5.0)/86, 1)
    
        # Store cleaned characters into data frame
        df_received.loc[ df_received['comp_name'] == target_comp_name, 'overall_rating'] = comp_rating_overall
        df_received.loc[ df_received['comp_name'] == target_comp_name, 'wl_bal_rating'] = wl_bal_rating
        df_received.loc[ df_received['comp_name'] == target_comp_name, 'benefit_rating'] = benefit_rating
        df_received.loc[ df_received['comp_name'] == target_comp_name, 'jsecurity_rating'] = jsecurity_rating
        df_received.loc[ df_received['comp_name'] == target_comp_name, 'mgmt_rating'] = mgmt_rating
        df_received.loc[ df_received['comp_name'] == target_comp_name, 'culture_rating'] = culture_rating

In [16]:
# Save the result to CSV
fname_out = './data/indeed_companies_data_%s.csv' % (start_ts)
df_received.to_csv(fname_out, encoding='utf-8')