In [31]:
import requests
import bs4
from bs4 import BeautifulSoup
import pandas as pd
import time

In [33]:
# Search-results page that the example extractor cells below operate on.
URL = "https://www.indeed.com/jobs?q=software+engineer"
# Fetch the page. requests waits indefinitely on a stalled connection unless a
# timeout is given, so bound the wait.
page = requests.get(URL, timeout=30)
# Fail loudly on an HTTP error status; otherwise an error page would be parsed
# as if it were a page of results.
page.raise_for_status()
# Parse the HTML into a navigable tree instead of treating it as one long string.
soup = BeautifulSoup(page.text, "html.parser")
# Uncomment to print the tree in an indented, easier-to-read form:
#print(soup.prettify())

In [34]:
def extract_job_title_from_result(soup):
    """Collect the ``title`` attribute of every job-title link in the results.

    Searches each ``<div class="row">`` for ``<a>`` tags whose
    ``data-tn-element`` attribute equals ``"jobTitle"``.

    Args:
        soup: parsed page (or element) exposing BeautifulSoup's ``find_all``.

    Returns:
        list of title strings, in the order ``find_all`` yields them.

    Raises:
        KeyError: if a matching link lacks a ``title`` attribute.
    """
    row_filter = {"class": "row"}
    link_filter = {"data-tn-element": "jobTitle"}
    return [
        link["title"]
        for row in soup.find_all(name="div", attrs=row_filter)
        for link in row.find_all(name="a", attrs=link_filter)
    ]

# Run the title extractor on the page parsed above; the returned list is the cell's output.
extract_job_title_from_result(soup)

[u'Software Engineer',
 u'Software Engineer (full stack node.js)',
 u'Embedded Software Developer',
 u'Software API Engineer',
 u'Software Engineering Internship (Summer 2018)',
 u'Software Development Engineer I',
 u'Software Dev Engineer I',
 u'Software Engineer',
 u'Java Systems Engineer',
 u'Factory Software Integration Support Engineer',
 u'Jr. Software Engineer (REF7034T)',
 u'Software Engineer',
 u'Software Engineer - New College Grad',
 u'Software Engineer',
 u'Software Developer',
 u'Software Engineer (node.js)']

In [35]:
def extract_company_from_result(soup):
    """Collect company names from every ``<div class="row">`` in the results.

    For each row the text of all ``<span class="company">`` tags is used.
    When a row has none, the text of its ``<span class="result-link-source">``
    tags is used instead. A row with neither contributes nothing, so the
    returned list can be shorter than the number of rows.

    Args:
        soup: parsed page (or element) exposing BeautifulSoup's ``find_all``.

    Returns:
        list of whitespace-stripped company names.
    """
    names = []
    for row in soup.find_all(name="div", attrs={"class": "row"}):
        spans = row.find_all(name="span", attrs={"class": "company"})
        if len(spans) == 0:
            # No primary company tag in this row: fall back to the link source.
            spans = row.find_all(name="span", attrs={"class": "result-link-source"})
        names.extend(span.text.strip() for span in spans)
    return names
 
# Run the company extractor on the page parsed above; the returned list is the cell's output.
extract_company_from_result(soup)

[u'Indeed Prime',
 u'Pluralsight',
 u'Alveo Technologies',
 u'Nevro Corporation',
 u'WePay',
 u'Amazon.com',
 u'Oath Inc',
 u'Tesla',
 u'Wells Fargo',
 u'Tesla',
 u'Visa',
 u'Microsoft',
 u'Proofpoint',
 u'Cisco',
 u'BeaconMD',
 u'Pluralsight']

In [36]:
def extract_location_from_result(soup):
    """Collect the text of every ``<span class="location">`` in ``soup``.

    Args:
        soup: parsed page (or element) exposing BeautifulSoup's ``find_all``.

    Returns:
        list of location strings, exactly as they appear (not stripped).
    """
    # ``find_all`` is the current Beautiful Soup name and what the rest of this
    # notebook uses; ``findAll`` is only a deprecated backwards-compatibility alias.
    return [span.text for span in soup.find_all("span", attrs={"class": "location"})]

# Run the location extractor on the page parsed above; the returned list is the cell's output.
extract_location_from_result(soup)

[u'San Francisco, CA',
 u'South Jordan, UT',
 u'Alameda, CA 94501',
 u'Redwood City, CA 94065',
 u'Redwood City, CA',
 u'Palo Alto, CA',
 u'Sunnyvale, CA',
 u'Fremont, CA',
 u'Fremont, CA 94537 (Cabrillo area)',
 u'Fremont, CA',
 u'Foster City, CA',
 u'Palo Alto, CA',
 u'Sunnyvale, CA 94089',
 u'Milpitas, CA',
 u'Guaynabo, PR 00968',
 u'Boston, MA']

In [37]:
def extract_salary_from_result(soup):
    """Collect one salary string per ``<div class="row">`` in ``soup``.

    For each row, in order of preference:
      1. the text of its first ``<nobr>`` tag (returned as-is);
      2. the stripped text of the first ``<div>`` nested inside its
         ``<div class="sjcl">``;
      3. the placeholder ``"Nothing_found"``.

    Missing tags are detected with explicit ``None`` checks rather than a bare
    ``except:``, so unrelated errors (and KeyboardInterrupt) are no longer
    silently turned into ``"Nothing_found"``.

    Args:
        soup: parsed page (or element) exposing BeautifulSoup's ``find_all``.

    Returns:
        list with exactly one entry per row.
    """
    salaries = []
    for row in soup.find_all(name="div", attrs={"class": "row"}):
        nobr = row.find("nobr")
        if nobr is not None:
            salaries.append(nobr.text)
            continue
        # NOTE(review): whether the first div under "sjcl" really holds a salary
        # cannot be confirmed from this notebook; the lookup is kept as written.
        container = row.find(name="div", attrs={"class": "sjcl"})
        inner = container.find("div") if container is not None else None
        salaries.append(inner.text.strip() if inner is not None else "Nothing_found")
    return salaries
# Run the salary extractor on the page parsed above; the returned list is the cell's output.
extract_salary_from_result(soup)

['Nothing_found',
 'Nothing_found',
 'Nothing_found',
 'Nothing_found',
 'Nothing_found',
 'Nothing_found',
 'Nothing_found',
 'Nothing_found',
 'Nothing_found',
 'Nothing_found',
 'Nothing_found',
 'Nothing_found',
 'Nothing_found',
 'Nothing_found',
 'Nothing_found',
 'Nothing_found']

In [38]:
def extract_summary_from_result(soup):
    """Collect the stripped text of every ``<span class="summary">`` in ``soup``.

    Args:
        soup: parsed page (or element) exposing BeautifulSoup's ``find_all``.

    Returns:
        list of summary strings with surrounding whitespace removed.
    """
    # ``find_all`` is the current Beautiful Soup name and what the rest of this
    # notebook uses; ``findAll`` is only a deprecated backwards-compatibility alias.
    return [span.text.strip() for span in soup.find_all("span", attrs={"class": "summary"})]
# Run the summary extractor on the page parsed above; the returned list is the cell's output.
extract_summary_from_result(soup)

[u'How Indeed Prime Works Apply to Prime in 5 minutes. Apply to 100+ top companies with 1 simple application to Indeed Prime....',
 u"Continuous Delivery - teams independently ship code to prod every day. We do this through the tech industry's leading learning platform for serious Developer,...",
 u'Your primary responsibility would be building to develop embedded software for new products and prototypes. Define and develop embedded software architecture...',
 u'Develops software in a regulated environment in accordance with internal operating procedures and external standards and regulations....',
 u'Our stack is primarily based on Python and Java. 500 fastest-growing private companies list before its December 2017 acquisition by JPMorgan Chase & Co., and...',
 u"Proficiency in, at least, one modern programming language such as C, C++, Java, or Perl. Join us, and you'll be taking part in changing the future of everyday...",
 u'You will be coding in Java and/or C++ on Unix platform to 

In [39]:
# Exclusive upper bound for the `start` offset requested per city; the
# scraping loop steps through offsets in increments of 10.
max_results_per_city = 100

# City names are written with "+" in place of spaces because the scraping loop
# concatenates them directly into the query string.
city_set = [
    "New+York",
    "Chicago",
    "San+Francisco",
    "Austin",
]

# One column per field recorded for a posting, in the order the scraper collects them.
columns = [
    "city",
    "job_title",
    "company_name",
    "location",
    "summary",
    "salary",
]

# Empty frame with those columns; the scraping cell stores its results in `sample_df`.
sample_df = pd.DataFrame(columns=columns)

In [None]:
#scraping code:
# Placeholder stored for any field a posting does not provide.
_MISSING = "Nothing_found"


def _first_text(tags, strip=True):
    """Return the text of the first tag in `tags`, or the placeholder if empty."""
    if not tags:
        return _MISSING
    text = tags[0].text
    return text.strip() if strip else text


def _parse_job_row(div, city):
    """Turn one result row into [city, job_title, company_name, location, summary, salary].

    Exactly one value is produced per column. Previously every matching tag was
    appended, so a row with a missing or repeated tag yielded a list of the
    wrong length and the DataFrame row assignment raised ValueError.
    """
    # job title: `title` attribute of the first job-title link
    title_links = div.find_all(name="a", attrs={"data-tn-element": "jobTitle"})
    title = title_links[0].get("title", _MISSING) if title_links else _MISSING

    # company name: primary tag, else the link-source fallback (both stripped,
    # matching extract_company_from_result)
    company_spans = div.find_all(name="span", attrs={"class": "company"})
    if not company_spans:
        company_spans = div.find_all(name="span", attrs={"class": "result-link-source"})
    company = _first_text(company_spans)

    # location text is kept unstripped, as before
    location = _first_text(div.find_all("span", attrs={"class": "location"}), strip=False)
    summary = _first_text(div.find_all("span", attrs={"class": "summary"}))

    # salary: <nobr> text, else first div inside div.sjcl, else the placeholder.
    # Explicit None checks replace the bare `except:` clauses that hid every error.
    salary_tag = div.find("nobr")
    if salary_tag is not None:
        salary = salary_tag.text
    else:
        sjcl = div.find(name="div", attrs={"class": "sjcl"})
        inner = sjcl.find("div") if sjcl is not None else None
        salary = inner.text.strip() if inner is not None else _MISSING

    return [city, title, company, location, summary, salary]


# Collect plain lists first and build the frame once at the end; growing a
# DataFrame with .loc one row at a time copies data on every insert.
job_rows = []
for city in city_set:
    for start in range(0, max_results_per_city, 10):
        # A timeout keeps one stalled connection from hanging the whole scrape.
        # The HTTP status is deliberately not checked: a failed page simply
        # yields no rows and the loop moves on.
        page = requests.get(
            "http://www.indeed.com/jobs?q=data+scientist+%2420%2C000&l=" + str(city) + "&start=" + str(start),
            timeout=30,
        )
        time.sleep(1)  #ensuring at least 1 second between page grabs
        # page.text is already-decoded str; Beautiful Soup ignores from_encoding
        # for such input (and warns about it), so the argument is dropped.
        soup = BeautifulSoup(page.text, "lxml")
        for div in soup.find_all(name="div", attrs={"class": "row"}):
            job_rows.append(_parse_job_row(div, city))

# Rebuild sample_df from scratch so re-running this cell does not append
# duplicates; the index starts at 1, matching the row numbers used before.
sample_df = pd.DataFrame(job_rows, columns=columns)
sample_df.index = pd.RangeIndex(1, len(sample_df) + 1)

#saving sample_df as a local csv file — define your own local path to save contents
sample_df.to_csv("scraped-jobs.csv", encoding="utf-8")

In [None]:
def extract_job(jobId):
    """Fetch the search page for one job id and return it parsed.

    The original body built the parsed tree and then discarded it, so the
    function always returned None; the tree is now returned so callers can
    inspect the posting.

    Args:
        jobId: value placed in the ``vjk`` query parameter (converted with ``str``).

    Returns:
        BeautifulSoup tree of the response body, parsed with ``html.parser``.
    """
    URL = "https://www.indeed.com/jobs?q=sw&vjk=" + str(jobId)
    # Bound the wait: requests has no default timeout and would otherwise block
    # indefinitely on a stalled connection.
    page = requests.get(URL, timeout=30)
    # Parse the HTML into a navigable tree instead of treating it as one long string.
    soup = BeautifulSoup(page.text, "html.parser")
    # Uncomment to print the tree in an indented, easier-to-read form:
    #print(soup.prettify())
    return soup

# Example invocation: the id is appended to the URL as the `vjk` query parameter.
extract_job("febf687db2ce1f38")