In [15]:
# Source : https://blog.nycdatascience.com/student-works/project-3-web-scraping-company-data-from-indeed-com-and-dice-com/

# load the library
import os
from bs4 import BeautifulSoup as Soup
import urllib, requests, re, pandas as pd
import lxml, time

# indeed.com url
#base_url = 'http://www.indeed.com/jobs?q=data+scientist&jt=fulltime&sort='
#base_url = 'http://www.indeed.fr/jobs?q=data+scientist&jt=fulltime&sort='

# Display setup: allow up to 500 characters per column so long values
# (URLs, job titles) are not cut off when a frame is shown.
pd.set_option('max_colwidth', 500)

# Timestamp taken once, up front; it is used to name the CSV written at the end.
start_ts = time.strftime("%Y%m%d-%H%M%S")

# Empty frame; the scraping cell fills it with one row per job posting.
df = pd.DataFrame()

# indeed.fr search URL: "data" is required (as_and), any of the listed terms
# may match (as_any), full-time jobs only (jt). The sort key and the result
# offset are appended to it when each page URL is built.
base_url = "".join((
    "http://www.indeed.fr/emplois?as_and=data&as_any=science+scientist+analytics+",
    "analyst+analyste+visualisation+visualization+mining+dataming+dataminer+miner+learning+apprentissage",
    "&jt=fulltime&sort=",
))

sort_by = 'date'          # sort by date
start_from = '&start='    # query parameter carrying the result offset

In [13]:
# Scrape the search-result pages: 10 (unsponsored) listings per page, and the
# last page that can be scraped is page 100, i.e. offsets 0, 10, ..., 990.

# Column layout of the resulting frame (alphabetical order).
JOB_COLUMNS = [
    'benefit_rating', 'comp_name', 'culture_rating', 'job_link',
    'job_location', 'job_posted', 'job_title', 'jsecurity_rating',
    'mgmt_rating', 'overall_link', 'overall_rating', 'wl_bal_rating',
]

# Prefix for the relative links found in the result rows.
# NOTE(review): the search itself runs on indeed.fr while links are prefixed
# with indeed.com; the value is kept unchanged -- confirm which host is intended.
home_url = "http://www.indeed.com"


def _text_or_none(tag, strip=False):
    """Return the text of a BeautifulSoup tag, or None when the tag is missing.

    tag   -- result of a ``find`` call (a tag, or None when nothing matched)
    strip -- when True, surrounding whitespace is removed from the text
    """
    if tag is None:
        return None
    text = tag.getText()
    return text.strip() if strip else text


def _prefixed_href(anchor):
    """Return home_url + the anchor's href, or None if the anchor or href is missing."""
    href = anchor.get('href') if anchor is not None else None
    if href is None:
        return None
    return "%s%s" % (home_url, href)


# Rows are collected in a plain list and turned into a frame once at the end:
# appending to a DataFrame row by row copies the whole frame on every call.
job_records = []

for offset in range(0, 1000, 10):
    url = "%s%s%s%d" % (base_url, sort_by, start_from, offset) # get full url
    # The timeout keeps a stalled connection from hanging the cell forever.
    target = Soup(requests.get(url, timeout=30).content, "lxml")

    # we're interested in each row (= each job)
    targetElements = target.findAll('div', attrs={'class' : '  row  result'})

    # Extract the job information from each row; a field whose tag is not
    # present in the row is stored as None instead of aborting the whole scrape.
    for elem in targetElements:
        name_span = elem.find('span', attrs={'itemprop': 'name'})
        title_anchor = elem.find('a', attrs={'class': 'turnstileLink'})
        # The company link, when there is one, is the anchor inside the name span.
        company_anchor = name_span.find('a') if name_span is not None else None

        job_records.append({
            'comp_name': _text_or_none(name_span, strip=True),
            'job_title': title_anchor.get('title') if title_anchor is not None else None,
            'job_link': _prefixed_href(elem.find('a')),
            'job_posted': _text_or_none(elem.find('span', attrs={'class': 'date'})),
            'overall_link': _prefixed_href(company_anchor),
            'job_location': _text_or_none(elem.find('span', attrs={'itemprop': 'addressLocality'})),
            # Rating columns start empty; they are filled from the company pages later.
            'overall_rating': None, 'wl_bal_rating': None,
            'benefit_rating': None, 'jsecurity_rating': None,
            'mgmt_rating': None, 'culture_rating': None,
        })

    # Throttling is currently disabled; enable the sleep below to avoid
    # spamming the server and getting banned.
    #time.sleep(0.5)

# Build the frame in one go. Re-running the cell rebuilds it from scratch
# instead of stacking duplicate rows on top of a previous run.
df = pd.DataFrame(job_records, columns=JOB_COLUMNS)

df


Unnamed: 0,benefit_rating,comp_name,culture_rating,job_link,job_location,job_posted,job_title,jsecurity_rating,mgmt_rating,overall_link,overall_rating,wl_bal_rating
0,,Umanis,,http://www.indeed.com/rc/clk?jk=a4bbc171f9a129f0&fccid=7f5a14d62fba631c,Levallois-Perret (92),Aujourd'hui,consultant informatique BI (H/F),,,http://www.indeed.com/cmp/Umanis,,
1,,Umanis,,http://www.indeed.com/rc/clk?jk=1a41d3ab2eed56d8&fccid=7f5a14d62fba631c,Levallois-Perret (92),Aujourd'hui,consultant informatique BIG DATA H/F,,,http://www.indeed.com/cmp/Umanis,,
2,,Digitalent,,http://www.indeed.com/rc/clk?jk=4fa97b2d49ab8890&fccid=bad09fa1d074b123,Châtillon (92),Aujourd'hui,Data analyste informatique - Big Data,,,,,
3,,Atos,,http://www.indeed.com/rc/clk?jk=c2576c54559d4c47&fccid=ee6a99db163236c2,Bezons (95),Aujourd'hui,Architecte Java H/F,,,http://www.indeed.com/cmp/Atos,,
4,,Umanis,,http://www.indeed.com/rc/clk?jk=8d293825df7d2e30&fccid=7f5a14d62fba631c,Levallois-Perret (92),Aujourd'hui,AMOA recette informatique ISTQB H/F IDF,,,http://www.indeed.com/cmp/Umanis,,
5,,Synthesio,,http://www.indeed.com/rc/clk?jk=5da4c8af1d9a1aab&fccid=a6d91dc9e1dab558,Paris (75),Aujourd'hui,"Account Executive, Paris Office",,,,,
6,,Atos,,http://www.indeed.com/rc/clk?jk=167e2ef050051698&fccid=ee6a99db163236c2,Nice (06),Aujourd'hui,Stage de développement dans le domaine de l'e-Education (H/F),,,http://www.indeed.com/cmp/Atos,,
7,,Umanis,,http://www.indeed.com/rc/clk?jk=944f144b14df4e60&fccid=7f5a14d62fba631c,Levallois-Perret (92),Aujourd'hui,IED .NET WEB (H/F),,,http://www.indeed.com/cmp/Umanis,,
8,,AGILICIO,,http://www.indeed.com/rc/clk?jk=07ef68ab9ce1d2cf&fccid=f3ce1bc7b2a4fcc1,Évry (91),Aujourd'hui,Analyste Développeur MSBI - MDX- EVRY,,,,,
9,,eFounders,,http://www.indeed.com/rc/clk?jk=e6810132631453b7&fccid=e31f0c5f57371493,Paris (75),il y a 1 jour,Frontend hacker - Hivy YC W17,,,,,


In [14]:
# Enrich every job row with the ratings shown on its company page.
# df_received is a second name for df (no copy is made), so the ratings
# written below are visible through both names.
df_received = df

# The star bars encode a rating as a pixel width; these are the divisors used
# to scale a width to the 0-5 range (width * 5 / divisor).
OVERALL_STAR_WIDTH_PX = 120
SUB_STAR_WIDTH_PX = 86  # 86 pixel

# Order of the sub-rating bars on the page: work-life balance, compensation /
# benefit, job security, management, company culture.
SUB_RATING_COLUMNS = [
    'wl_bal_rating', 'benefit_rating', 'jsecurity_rating',
    'mgmt_rating', 'culture_rating',
]

# Captures the number in "width: 86px" / "width:43.0px;" style declarations.
# The look-behind keeps properties such as "max-width" from matching.
_WIDTH_RE = re.compile(r'(?<![-\w])width:\s*([0-9]*\.?[0-9]+)')


def _stars_from_style(style, full_width_px):
    """Convert an inline "width: NNpx" style into a star rating.

    style         -- value of a star element's ``style`` attribute (may be None)
    full_width_px -- divisor used to scale the width to the 0-5 range
    Returns the rating rounded to one decimal, or None when no width is found.
    """
    match = _WIDTH_RE.search(style or '')
    if match is None:
        return None
    return round((float(match.group(1)) * 5.0) / full_width_px, 1)


def _company_ratings(company_url):
    """Download a company page and return {rating column: stars}.

    Only ratings that are present on the page and parse successfully are
    included, so a page without reviews yields an empty dict.
    """
    # The timeout keeps a stalled connection from hanging the cell forever.
    page = Soup(requests.get(company_url, timeout=30).content, "lxml")
    ratings = {}

    overall_star = page.find("span", {"class": "cmp-star-large-on"})
    if overall_star is not None:
        ratings['overall_rating'] = _stars_from_style(
            overall_star.get('style'), OVERALL_STAR_WIDTH_PX)

    attributes = page.find("dl", {"id": "cmp-reviews-attributes"})
    sub_stars = []
    if attributes is not None:
        sub_stars = attributes.find_all("span", {"class": "cmp-star-on"})
    # The bars are matched to columns by position, so they are only trusted
    # when all five are present; otherwise the sub-ratings are left empty.
    if len(sub_stars) >= len(SUB_RATING_COLUMNS):
        for column, star in zip(SUB_RATING_COLUMNS, sub_stars):
            ratings[column] = _stars_from_style(star.get('style'), SUB_STAR_WIDTH_PX)

    return dict((column, stars) for column, stars in ratings.items()
                if stars is not None)


# Several jobs usually share one company, so each company page is downloaded
# only once and its ratings are reused for the following rows.
ratings_by_url = {}

for i in range(len(df_received)):
    target_comp_name = df_received.iloc[i]['comp_name']
    url_2nd = df_received.iloc[i]['overall_link']

    # Rows without a company link (None or NaN) have nothing to look up.
    if pd.isnull(url_2nd):
        continue

    if url_2nd not in ratings_by_url:
        ratings_by_url[url_2nd] = _company_ratings(url_2nd)

    # Store the ratings on every row that carries this company name.
    same_company = df_received['comp_name'] == target_comp_name
    for column, stars in ratings_by_url[url_2nd].items():
        df_received.loc[same_company, column] = stars

In [16]:
# Save the result to CSV; the file name carries the timestamp taken at start-up.
fname_out = './data/indeed_companies_data_%s.csv' % (start_ts)

# to_csv does not create missing folders, so make sure the target folder
# exists first; otherwise the whole scrape would be lost at the very last step.
out_dir = os.path.dirname(fname_out)
if not os.path.isdir(out_dir):
    os.makedirs(out_dir)

df_received.to_csv(fname_out, encoding='utf-8')