

Source: https://medium.com/@msalmon00/web-scraping-job-postings-from-indeed-96bd588dcb4b

In [1]:
import requests
import bs4
from bs4 import BeautifulSoup
import pandas as pd
import time

In [4]:
URL = "https://www.indeed.com/jobs?q=data+scientist+%2420%2C000&l=New+York&start=10"
#conducting a request of the stated URL above:
page = requests.get(URL)
#specifying a desired format of “page” using the html parser - this allows python to read the various components of the page, rather than treating it as one long string.
soup = BeautifulSoup(page.text, "html.parser")
#printing soup in a more structured tree format that makes for easier reading
print(soup.prettify())

<!DOCTYPE html>
<html dir="ltr" lang="en">
 <head>
  <meta content="text/html;charset=utf-8" http-equiv="content-type"/>
  <script src="//d3fw5vlhllyvee.cloudfront.net/s/767009c/en_US.js" type="text/javascript">
  </script>
  <script>
   window.indeed=window.indeed||{},indeed.crashtext=indeed.crashtext||{},indeed.crashtext.populate=indeed.crashtext.populate||function(e,n){function t(e){for(var n=a,t=e.length;t;)n=33*n^e.charCodeAt(--t);return n>>>0}var r=window[n];if(r){var i=r[""]||{},a=i.salt;if(i.hasOwnProperty("salt"))for(var d in e)r[function(e){var n=t(e);return i.hasOwnProperty("id_length")&&(n=String(n).substring(0,i.id_length)),n}(d)]=e[d];else for(var d in e)r[d]=[null].concat(e[d])}};indeed.crashtext.populate({"serptitle_v0_l_alpha\u0004Jobs, Employment in {0} {2} | {4}":["Jobs in {0} - {2} | {4}"],"serptitle_v0_q_alpha\u0004{0} Jobs, Employment {2} | {4}":["{0} Jobs - {2} | {4}"],"serptitle_v0_ql_alpha\u0004{0} Jobs, Employment in {1} {3} | {4}":["{0} Jobs in {1} - {3} | {4

In [6]:
def extract_job_title_from_result(soup): 
    jobs = []
    for div in soup.find_all(name="div", attrs={"class":"row"}):
        for a in div.find_all(name="a", attrs={"data-tn-element":"jobTitle"}):
            jobs.append(a["title"])
    return(jobs)
extract_job_title_from_result(soup)

['Data Scientist, Product',
 'Junior Data Scientist',
 'Data Scientist 1',
 'Data Scientist',
 'Marketing Data Scientist',
 'Sr. Business Data Analyst',
 'Data Scientist',
 'Junior Data Scientist',
 'Data Scientist',
 'Data Scientist',
 'Data Scientist',
 'Data Scientist',
 'Data Scientist',
 'Data Scientist',
 'Data Scientist']

In [8]:
def extract_company_from_result(soup): 
    companies = []
    for div in soup.find_all(name="div", attrs={"class":"row"}):
        company = div.find_all(name="span", attrs={"class":"company"})
    if len(company) > 0:
        for b in company:
            companies.append(b.text.strip())
    else:
        sec_try = div.find_all(name="span", attrs={"class":"result-link-source"})
        for span in sec_try:
            companies.append(span.text.strip())
    return(companies)
 
extract_company_from_result(soup)

['Sharecare Inc']

In [9]:
def extract_location_from_result(soup): 
    locations = []
    spans = soup.findAll('span', attrs={'class': 'location'})
    for span in spans:
        locations.append(span.text)
    return(locations)
extract_location_from_result(soup)

['New York State',
 'New York, NY',
 'New York, NY 10018 (Clinton area)',
 'Saratoga Springs, NY 12866',
 'New York, NY 10005 (Financial District area)',
 'Bronx, NY 10464',
 'New York, NY',
 'New York, NY',
 'New York, NY 10119 (Chelsea area)',
 'New York, NY',
 'New York State',
 'New York, NY 10036 (Garment District area)',
 'New York State',
 'New York State',
 'New York, NY 10013 (SoHo area)']

In [14]:
def extract_salary_from_result(soup): 
    salaries = []
    for div in soup.find_all(name="div", attrs={"class":"row"}):
        try:
            salaries.append(div.find('nobr').text)
        except:
            try:
                div_two = div.find(name="div", attrs={"class":"sjcl"})
                div_three = div_two.find("div")
                salaries.append(div_three.text.strip())
            except:
                salaries.append("Nothing_found")
    return(salaries)
extract_salary_from_result(soup)

['Splice.com',
 'Viacom\n\n\n\n4.0',
 'PayPal\n\n\n\n3.9',
 'Fingerpaint Marketing Inc',
 'Emerald\n\n\n\n3.6',
 'UCA services',
 'Comet.ml',
 'Decode_M',
 'Biz2Credit Inc',
 'Source Enterprises\n\n\n\n4.3',
 'Phreesia\n\n\n\n2.7',
 'Booz Allen Hamilton\n\n\n\n3.9',
 'Roivant Sciences\n\n\n\n4.5',
 'Vic.ai',
 'Sharecare Inc\n\n\n\n3.3']

In [15]:
def extract_summary_from_result(soup): 
    summaries = []
    spans = soup.findAll('span', attrs={'class': 'summary'})
    for span in spans:
        summaries.append(span.text.strip())
    return(summaries)
extract_summary_from_result(soup)

[]

In [17]:
max_results_per_city = 100
city_set = ['New+York','Chicago','San+Francisco', 'Austin', 'Seattle', 'Los+Angeles', 'Philadelphia', 'Atlanta', 'Dallas', 'Pittsburgh', 'Portland', 'Phoenix', 'Denver', 'Houston', 'Miami', 'Washington+DC', 'Boulder']
columns = ["city", "job_title", "company_name", "location", "summary", "salary"]
sample_df = pd.DataFrame(columns = columns)

In [18]:
sample_df

Unnamed: 0,city,job_title,company_name,location,summary,salary


In [None]:
#scraping code:
for city in city_set:
    for start in range(0, max_results_per_city, 10):
    page = requests.get(‘http://www.indeed.com/jobs?q=data+scientist+%2420%2C000&l=' + str(city) + ‘&start=’ + str(start))
    time.sleep(1)  #ensuring at least 1 second between page grabs
    soup = BeautifulSoup(page.text, “lxml”, from_encoding=”utf-8")
    for div in soup.find_all(name=”div”, attrs={“class”:”row”}): 
        #specifying row num for index of job posting in dataframe
        num = (len(sample_df) + 1) 
        #creating an empty list to hold the data for each posting
        job_post = [] 
        #append city name
        job_post.append(city) 
    #grabbing job title
    for a in div.find_all(name=”a”, attrs={“data-tn-element”:”jobTitle”}):
        job_post.append(a[“title”]) 
        #grabbing company name
        company = div.find_all(name=”span”, attrs={“class”:”company”}) 
    if len(company) > 0:
        for b in company:
        job_post.append(b.text.strip()) 
    else: 
      sec_try = div.find_all(name=”span”, attrs={“class”:”result-link-source”})
      for span in sec_try:
        job_post.append(span.text) 
    #grabbing location name
    c = div.findAll(‘span’, attrs={‘class’: ‘location’}) 
    for span in c: 
      job_post.append(span.text) 
    #grabbing summary text
    d = div.findAll(‘span’, attrs={‘class’: ‘summary’}) 
      for span in d:
        job_post.append(span.text.strip()) 
    #grabbing salary
    try:
      job_post.append(div.find(‘nobr’).text) 
    except:
      try:
        div_two = div.find(name=”div”, attrs={“class”:”sjcl”}) 
        div_three = div_two.find(“div”) 
        job_post.append(div_three.text.strip())
      except:
        job_post.append(“Nothing_found”) 
    #appending list of job post info to dataframe at index num
    sample_df.loc[num] = job_post

#saving sample_df as a local csv file — define your own local path to save contents 
sample_df.to_csv(“[filepath].csv”, encoding=’utf-8')