In [20]:
# import libraries
import time

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.keys import Keys

## Essential Data and Variables Required
---

In [41]:
# page the browser is sent to before any search is made
indeed_url = "https://www.indeed.com/"

# state names, taken from the "StateName" column of the local CSV file
all_states = (
    pd.read_csv("./usa_all_states.csv")
    ["StateName"]
    .tolist()
)

# path handed to Selenium when the Chrome driver is created
chromedriver_loc = "./chromedriver"

# queries typed into the "What" box, one per role of interest
search_terms = [
    "title:(data engineer)",
    "title:(data scientist)",
    "title:(data analyst)",
]

## Data Extraction from Indeed.com
---

In [25]:
def fill_in_what_field(SEARCH_TERM):
    """
    Replace the contents of the "What" search box with SEARCH_TERM.

    The box is located by its element id ("text-input-what") through the
    notebook-level `driver`. Nothing is returned.
    """
    what_box = driver.find_element(by="id", value="text-input-what")

    # click on the box before typing into it
    what_box.click()

    # select everything, delete the selection, then type the new query
    for keystrokes in (Keys.CONTROL + "a", Keys.DELETE, SEARCH_TERM):
        what_box.send_keys(keystrokes)

In [26]:
def fill_in_where_field(SEARCH_TERM):
    """
    Replace the contents of the "Where" search box with SEARCH_TERM.

    The box is located by its element id ("text-input-where") through the
    notebook-level `driver`. Nothing is returned.
    """
    where_box = driver.find_element(by="id", value="text-input-where")

    # click on the box before typing into it
    where_box.click()

    # select everything, delete the selection, then type the new location
    for keystrokes in (Keys.CONTROL + "a", Keys.DELETE, SEARCH_TERM):
        where_box.send_keys(keystrokes)

In [44]:
def click_on_find_jobs():
    """
    Click the "Find Jobs" button.

    The button is located by its class name through the notebook-level
    `driver`. Nothing is returned.
    """
    # look the button up and click it in one step
    driver.find_element(
        by="class name",
        value="yosegi-InlineWhatWhere-primaryButton",
    ).click()

In [45]:
def extract_data_required():
    """
    Read two pieces of text from the page currently loaded in `driver`.

    Returns a tuple ``(position_location, num_of_jobs)`` holding the text of
    the elements with ids "jobsInLocation" and "searchCountPages", looked up
    in that order.
    """
    # the position/location summary comes first, then the job (or page) count
    summary_text, count_text = (
        driver.find_element(by="id", value=element_id).text
        for element_id in ("jobsInLocation", "searchCountPages")
    )
    return summary_text, count_text

In [None]:
# create a chromedriver instance; the driver binary path is wrapped in a
# Service object because Selenium 4 deprecated (and 4.10 removed) passing the
# executable path directly to webdriver.Chrome
driver = webdriver.Chrome(service=Service(chromedriver_loc))

# go to indeed
driver.get(indeed_url)

In [42]:
# accumulator for the scraped rows (one dict per search performed)
extracted_data = list()

In [48]:
# (state, search term) pairs that already have a row in extracted_data, so that
# re-running this cell after an interruption resumes where it stopped instead
# of re-scraping everything and appending duplicate rows; reset extracted_data
# to an empty list first to force a full re-scrape
already_scraped = {(row["us_state"], row["search_term"]) for row in extracted_data}

# iterate across all states and search terms
for us_state in all_states:
    for search_term in search_terms:
        # skip combinations collected on an earlier (partial) run of this cell
        if (us_state, search_term) in already_scraped:
            continue

        # fill in the search fields, pausing briefly between interactions
        fill_in_what_field(search_term)
        time.sleep(1)
        fill_in_where_field(us_state)
        time.sleep(1)
        click_on_find_jobs()

        # sleep until it loads
        time.sleep(5)

        # extract the data we require
        position_location, num_of_jobs = extract_data_required()

        # store in our master list
        extracted_data.append({
            "us_state": us_state,
            "search_term": search_term,
            "position_location": position_location,
            "num_of_jobs": num_of_jobs
        })
        already_scraped.add((us_state, search_term))

        # export backup after every search so a crash loses at most one row
        pd.DataFrame(extracted_data).to_csv("scrape_backup.csv", index=False)

        # sleep before resuming
        time.sleep(5)

In [79]:
# turn the list of scraped row dicts into a table (one row per search)
df = pd.DataFrame(data=extracted_data)

## Light Data Cleaning and Export
---

In [80]:
# discard rows that exactly repeat an earlier row
df = df.drop_duplicates()

# readable role name: keep what follows the last ":", remove the parentheses,
# trim surrounding whitespace and title-case the result
df["search_string"] = df["search_term"].apply(
    lambda term: term.rsplit(":", 1)[-1]
    .translate(str.maketrans("", "", "()"))
    .strip()
    .title()
)

# job count: the second-to-last space-separated token of the count text,
# with thousands separators removed, first stored as text ...
df["job_count"] = df["num_of_jobs"].apply(
    lambda count_text: count_text.rsplit(" ", 2)[-2].replace(",", "").strip()
)
# ... then converted to integers
df["job_count"] = df["job_count"].astype(int)

In [83]:
# write the cleaned table to disk, leaving out the index column
df.to_csv(path_or_buf="Extracted_From_Indeed_011922.csv", index=False)