# Imports and Installs

In [1]:
!pip install selenium
!pip install requests
!pip install BeautifulSoup4
!pip install pandas
!pip install lxml
!pip install bs4
!pip install html5lib



In [2]:
!pip list


Package             Version
------------------- ----------
altair              4.1.0
anyio               2.1.0
argon2-cffi         20.1.0
astor               0.8.1
async-generator     1.10
attrs               20.3.0
Babel               2.9.0
backcall            0.2.0
base58              2.1.0
beautifulsoup4      4.9.3
bleach              3.3.0
blinker             1.4
bs4                 0.0.1
cachetools          4.2.1
certifi             2020.12.5
cffi                1.14.5
chardet             4.0.0
click               7.1.2
colorama            0.4.4
cycler              0.10.0
decorator           4.4.2
defusedxml          0.6.0
entrypoints         0.3
gitdb               4.0.5
GitPython           3.1.13
html5lib            1.1
idna                2.10
ipykernel           5.5.0
ipython             7.20.0
ipython-genutils    0.2.0
ipywidgets          7.6.3
jedi                0.18.0
Jinja2              2.11.3
joblib              1.0.1
json5               0.9.5
jsonschema          3.2.0
j

In [3]:
# imports
import os
import sys
from selenium import webdriver # import web driver
from time import sleep 
import json
import requests
import bs4
from bs4 import BeautifulSoup
import pandas as pd
import time
import lxml

# Globals 
CITY='Sacramento'
INDUSTRY='healthcare'
MAX_DISTANCE_KM = '15'  # [5, 10, 15, 25, 50, 100]
BASE_URL='https://www.indeed.com/'
LEVEL='entry_level'

# Testing Functionality

In [4]:
URL = '{base_url}jobs?q={industry}&l={city}&radius={distance}&explvl={level}'.format(base_url=BASE_URL, city=CITY, industry=INDUSTRY, distance=MAX_DISTANCE_KM, level=LEVEL)

#conducting a request of the stated URL above:
page = requests.get(URL)

#specifying a desired format of “page” using the html parser - this allows python to read the various components of the page, rather than treating it as one long string.
soup = BeautifulSoup(page.text, 'html.parser')

#printing soup in a more structured tree format that makes for easier reading
print(soup.prettify())

<!DOCTYPE html>
<html dir="ltr" lang="en">
 <head>
  <meta content="text/html;charset=utf-8" http-equiv="content-type"/>
  <script src="//d3fw5vlhllyvee.cloudfront.net/s/ffe72ff/en_US.js" type="text/javascript">
  </script>
  <link href="//d3fw5vlhllyvee.cloudfront.net/s/105b986/jobsearch_all.css" rel="stylesheet" type="text/css"/>
  <link href="https://rss.indeed.com/rss?q=healthcare&amp;l=Sacramento&amp;radius=15&amp;explvl=entry_level" rel="alternate" title="Healthcare Jobs, Employment in Sacramento, CA" type="application/rss+xml"/>
  <link href="/m/jobs?q=healthcare&amp;l=Sacramento&amp;radius=15&amp;explvl=entry_level" media="only screen and (max-width: 640px)" rel="alternate"/>
  <script type="text/javascript">
   if (typeof window['closureReadyCallbacks'] == 'undefined') {
window['closureReadyCallbacks'] = [];
}

function call_when_jsall_loaded(cb) {
if (window['closureReady']) {
cb();
} else {
window['closureReadyCallbacks'].push(cb);
}
}
  </script>
  <meta content="1" name="p

In [5]:
def extract_job_title_from_result(soup): 
    jobs = []
    for div in soup.find_all(name='div', attrs={'class':'row'}):
        for a in div.find_all(name='a', attrs={'data-tn-element':'jobTitle'}):
            jobs.append(a['title'])
    return(jobs)
extract_job_title_from_result(soup)

['Administrative Assistant for Medical Front Office',
 'Director of Medical Records',
 'Medical Scheduler',
 'Administrative Assistant II',
 'Unit Secretary - NICU',
 '(Healthcare) Contracts Administrator 3',
 'Administrative Assistant II-Purchasing',
 'Customer Service Representative I',
 'Certified Medical Assistants needed ASAP!',
 'Patient Services Representative',
 'Chiropractic Assistant',
 'Receptionist',
 'Surgery Scheduling Clerk-R',
 'Ambulatory Surgery Center Administrator',
 'Medical Scribe']

In [6]:
def extract_company_from_result(soup): 
    companies = []
    for div in soup.find_all(name='div', attrs={'class':'row'}):
        company = div.find_all(name='span', attrs={'class':'company'})
        if len(company) > 0:
            for b in company:
                companies.append(b.text.strip())
        else:
            sec_try = div.find_all(name='span', attrs={'class':'result-link-source'})
            for span in sec_try:
                companies.append(span.text.strip())
    return(companies)
extract_company_from_result(soup)

['California IVF Fertility Center',
 'Fairmont Rehabilitation Hospital',
 'Sacramento Ear, Nose and Throat',
 'UC Davis Health',
 'Sutter Health',
 'UC Davis Health',
 'UC Davis Health',
 'Delta Dental',
 'visit-healthcare',
 'Sutter Health',
 'Baratta Holistic Center',
 'Western Dental & Orthodontics',
 'Methodist Hospital Sacramento',
 'Clinical Management Consultants',
 'Capital OB/GYN, Inc.']

In [7]:
def extract_location_from_result(soup): 
    locations = []
    spans = soup.findAll('span', attrs={'class': 'location'})
    for span in spans:
        locations.append(span.text)
    return(locations)
extract_location_from_result(soup)

['Sacramento, CA',
 'Sacramento, CA',
 'Sacramento, CA',
 'Sacramento, CA',
 'Rancho Cordova, CA 95670 (Prospect Park area)',
 'Sacramento, CA',
 'Sacramento, CA',
 'Carmichael, CA 95608',
 'Sacramento, CA 95829',
 'Sacramento, CA 95823 (Valley High-North Laguna area)']

In [8]:
def extract_summary_from_result(soup): 
    summaries = []
    divs = soup.findAll('div', attrs={'class': 'summary'})
    for div in divs:
        summaries.append(div.text.strip())
    return(summaries)
extract_summary_from_result(soup)

['Position includes reception, registration, telephones, medical records, and patient scheduling.\nExperience preferred in medical front office.',
 'High school or equivalent (Preferred).\nIt is essential that candidates have medical record experience in a skilled nursing setting or demonstrate excellent…',
 'Open to applicants who do not have a high school diploma/GED.\nOpen to applicants who do not have a college diploma.\nRelays necessary messages to staff.',
 'Responsible for editing/correcting previously signed out cases (Amendments), creating and formatting addendums for signed out cases, and preparing cases for…',
 'High School Graduate, GED or an equivalent combination of education and experience, plus successful completion of medical terminology course is required.',
 'As changes in federal law, state law and University policy occur from time to time, The Administrator monitors contracts to ensure continued compliance.',
 '2 years’ recent experience providing customer service 

In [9]:
def extract_date_posted_from_result(soup): 
    dates = []
    spans = soup.findAll('span', attrs={'class': 'date'})
    for span in spans:
        dates.append(span.text)
    return(dates)
extract_date_posted_from_result(soup)

['30+ days ago',
 '10 days ago',
 '22 days ago',
 '1 day ago',
 '1 day ago',
 '2 days ago',
 '5 days ago',
 '3 days ago',
 '23 days ago',
 '1 day ago',
 '6 days ago',
 '8 days ago',
 '2 days ago',
 '26 days ago',
 '19 days ago']

In [10]:
def extract_summary_from_result_new(soup, current_url=URL): 
    jobs = []
    for div in soup.find_all(name='div', attrs={'class':'row'}):
        for a in div.find_all(name='a', attrs={'data-tn-element':'jobTitle'}):
            print(a)
            print(a['title'])
            print(current_url+a['href'])
            page = requests.get(current_url+a['href'])
            summary_soup = BeautifulSoup(page.text, 'html.parser')
            print(extract_summary_from_result(summary_soup))
            break
    return(jobs)
extract_summary_from_result_new(soup)

<a class="jobtitle turnstileLink" data-tn-element="jobTitle" href="/pagead/clk?mo=r&amp;ad=-6NYlbfkN0C57XTUXzEslpkRU6GhCwIDvJsbjChpRBStfzYmWK420WzX5BopbTU5JdhHvzPoUOpXfPhCp7ProypCwj1V77PpR3behjALz-XoNZwebZA92tYVfHmjMVQ_YRpQSCea4FF7YaPezSZOpBSmFm82iexWxniLSsz4veGz6SXoBBmIrFinqdpawVGUUGH6bkCqvyjvFonuxuOvQZSSZXj7R81hqSt8rqLbUXzBqF2ENepROBBcmTmI6DiD2z_Ac669jtuw8EUwtUA149KJ6sHMj6LOGH-_JdslI8nTX2BcdT3A9Wy-V5QFzQ-30WZhUAuUqEMbG3jpgSjyVHchd3oW7PJ5h9_-7tIZRtHxwhwzxBu3s5iAFAbjgjQcFD5LG7MUeP499o7Y_m09Nld0KPwwtp1znlhu1TYXcZqVtDDJsJ8DQZ80ivq2owzS1LsmfFeHXvy2haEsiJ5LVVeiLVTDMunMj4pE6UIxqYUqyWeEU-DF7Q==&amp;p=0&amp;fvj=1&amp;vjs=3" id="sja0" onclick="setRefineByCookie(['radius', 'explvl']); sjoc('sja0', 0); convCtr('SJ'); rclk(this,jobmap[0],true,0);" onmousedown="sjomd('sja0'); clk('sja0'); rclk(this,jobmap[0],0);" rel="noopener nofollow" target="_blank" title="Administrative Assistant for Medical Front Office">
Administrative Assistant for Medical Front Office</a>
Administrative Assistant for Medic

['We offer full-time positions and benefits such as: medical, dental, vision, life insurance, supplemental and 401(k) with a company match.', 'Coordinate with IT managers, technicians, developers and other contractors to guide the delivery of new systems.\nKnowledge of project management principles.', 'Open to applicants who do not have a high school diploma/GED.\nOpen to applicants who do not have a college diploma.\nRelays necessary messages to staff.', 'Answer phones, scheduling appointments, billing, basic computer work.\nWe are utilizing all safety measures including a plastic protector at the front desk;…', 'Position includes reception, registration, telephones, medical records, and patient scheduling.\nExperience preferred in medical front office.', 'Medical Scribe needed for a busy OB/GYN office to work closely with physician , will be responsible for but not limited to, real time charting, retrieving…', 'Competency in HR Policies including knowledge with CMS, DHS, and OSHA sur

['We offer full-time positions and benefits such as: medical, dental, vision, life insurance, supplemental and 401(k) with a company match.', 'Coordinate with IT managers, technicians, developers and other contractors to guide the delivery of new systems.\nKnowledge of project management principles.', 'Open to applicants who do not have a high school diploma/GED.\nOpen to applicants who do not have a college diploma.\nRelays necessary messages to staff.', 'Answer phones, scheduling appointments, billing, basic computer work.\nWe are utilizing all safety measures including a plastic protector at the front desk;…', 'Position includes reception, registration, telephones, medical records, and patient scheduling.\nExperience preferred in medical front office.', 'Medical Scribe needed for a busy OB/GYN office to work closely with physician , will be responsible for but not limited to, real time charting, retrieving…', 'This is a full-time position and duties include, but are not limited to, 

['We offer full-time positions and benefits such as: medical, dental, vision, life insurance, supplemental and 401(k) with a company match.', 'Coordinate with IT managers, technicians, developers and other contractors to guide the delivery of new systems.\nKnowledge of project management principles.', 'Open to applicants who do not have a high school diploma/GED.\nOpen to applicants who do not have a college diploma.\nRelays necessary messages to staff.', 'Answer phones, scheduling appointments, billing, basic computer work.\nWe are utilizing all safety measures including a plastic protector at the front desk;…', 'Position includes reception, registration, telephones, medical records, and patient scheduling.\nExperience preferred in medical front office.', 'Medical Scribe needed for a busy OB/GYN office to work closely with physician , will be responsible for but not limited to, real time charting, retrieving…', 'This is a full-time position and duties include, but are not limited to, 

['We offer full-time positions and benefits such as: medical, dental, vision, life insurance, supplemental and 401(k) with a company match.', 'Coordinate with IT managers, technicians, developers and other contractors to guide the delivery of new systems.\nKnowledge of project management principles.', 'Open to applicants who do not have a high school diploma/GED.\nOpen to applicants who do not have a college diploma.\nRelays necessary messages to staff.', 'Answer phones, scheduling appointments, billing, basic computer work.\nWe are utilizing all safety measures including a plastic protector at the front desk;…', 'Position includes reception, registration, telephones, medical records, and patient scheduling.\nExperience preferred in medical front office.', 'Medical Scribe needed for a busy OB/GYN office to work closely with physician , will be responsible for but not limited to, real time charting, retrieving…', 'Competency in HR Policies including knowledge with CMS, DHS, and OSHA sur

['We offer full-time positions and benefits such as: medical, dental, vision, life insurance, supplemental and 401(k) with a company match.', 'Coordinate with IT managers, technicians, developers and other contractors to guide the delivery of new systems.\nKnowledge of project management principles.', 'Open to applicants who do not have a high school diploma/GED.\nOpen to applicants who do not have a college diploma.\nRelays necessary messages to staff.', 'Answer phones, scheduling appointments, billing, basic computer work.\nWe are utilizing all safety measures including a plastic protector at the front desk;…', 'Position includes reception, registration, telephones, medical records, and patient scheduling.\nExperience preferred in medical front office.', 'Medical Scribe needed for a busy OB/GYN office to work closely with physician , will be responsible for but not limited to, real time charting, retrieving…', 'Competency in HR Policies including knowledge with CMS, DHS, and OSHA sur

[]

# Run Full Scrape

In [11]:
max_results_per_industry = 100
columns = ['industry', 'job_title', 'company_name', 'location', 'summary', 'date_posted']
industry_set = ['healthcare', 'education', 'teaching', 'construction', 'services', 'hospitality', 'technology']
scrape_df = pd.DataFrame(columns = columns)

In [13]:
#scraping code:
for industry in industry_set:
    
    for start in range(0, max_results_per_industry, 10):
        
        url = '{base_url}jobs?q={industry}&l={city}&radius={distance}&explvl={level}&start={start}'.format(base_url=BASE_URL, city=CITY, industry=industry, distance=MAX_DISTANCE_KM, level=LEVEL, start=start)
        page = requests.get(url)
        
        time.sleep(1)  #ensuring at least 1 second between page grabs
        soup = BeautifulSoup(page.text, features='html.parser', from_encoding='utf-8')
        count=0
        for div in soup.find_all(name='div', attrs={'class':'row'}): 
            #specifying row num for index of job posting in dataframe
            num = (len(scrape_df) + 1) 

            #creating an empty list to hold the data for each posting
            job_post = [] 
            job_post.append(industry) 

            #grabbing job title
            for a in div.find_all(name='a', attrs={'data-tn-element':'jobTitle'}):
                job_post.append(a['title']) 
            if len(job_post) < 2:
                job_post.append('N/A')

            #grabbing company name
            company = div.find_all(name='span', attrs={'class':'company'}) 
            if len(company) > 0: 
                for b in company:
                    job_post.append(b.text.strip()) 
            
            else: 
                sec_try = div.find_all(name='span', attrs={'class':'result-link-source'})
                for span in sec_try:
                    job_post.append(span.text)
            if len(job_post) < 3:
                job_post.append('N/A')

            #grabbing location name
            c = div.findAll('div', attrs={'class': 'location'}) 
            for div in c: 
                job_post.append(div.text)
            if len(job_post) < 4:
                job_post.append('N/A')

            #grabbing summary text
            div = soup.findAll('div', attrs={'class': 'summary'})[count]
            job_post.append(div.text.strip())
            if len(job_post) < 5:
                job_post.append('N/A')
                
            #grabbing date posted
            span = soup.findAll('span', attrs={'class': 'date'})[count]
            job_post.append(span.text)
            if len(job_post) < 6:
                job_post.append('N/A')

            count +=1
            
            #appending list of job post info to dataframe at index num
            if len(job_post) == 6:
                scrape_df.loc[num] = job_post
            else:
                print(job_post)


#saving scrape_df as a local csv file — define your own local path to save contents 
scrape_df.to_csv('indeed_scrape.csv', encoding='utf-8')

In [14]:
scrape_df

Unnamed: 0,industry,job_title,company_name,location,summary,date_posted
1,healthcare,Medical Scheduler,"Sacramento Ear, Nose and Throat","Sacramento, CA 95815 (Johnson Business Park area)",Open to applicants who do not have a high scho...,22 days ago
2,healthcare,Chiropractic Assistant,Baratta Holistic Center,"Carmichael, CA 95608","Answer phones, scheduling appointments, billin...",6 days ago
3,healthcare,Administrative Assistant for Medical Front Office,California IVF Fertility Center,"Sacramento, CA 95833 (Gateway Center area)","Position includes reception, registration, tel...",30+ days ago
4,healthcare,Medical Scribe,"Capital OB/GYN, Inc.","Sacramento, CA 95825 (Campus Commons area)",Medical Scribe needed for a busy OB/GYN office...,19 days ago
5,healthcare,Surgery Scheduler,Northern California Orthopaedic Associates,"Sacramento, CA 95825 (Sierra Oaks area)",High school or equivalent (Preferred).\nPrevio...,30+ days ago
...,...,...,...,...,...,...
1051,technology,Cloud Security Engineer,Cotocus,,The cloud security engineer specializes in pro...,30+ days ago
1052,technology,Jenkins Engineer,Cotocus,,You should have in-depth knowledge of at least...,30+ days ago
1053,technology,GCP DevOps Engineer,Cotocus,,GCP DevOps Engineer is accountable for efficie...,30+ days ago
1054,technology,"ESRI GIS SOLUTIONS ARCHITECT -Sacramento, CA-C...",ASAP Solutions,,Experience working with ESRI Enterprise soluti...,30+ days ago
