<a href="https://colab.research.google.com/github/sumedhajoshi/python-best-practices-cookiecutter/blob/main/bls_ooh_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Web Scraper for BLS.gov

https://www.bls.gov/ maintains web pages describing the outlook for jobs in the US. Each page has a lot of good information, but there is no single place where all of that information can be seen together.

The code below is a web scraper that fetches the web pages, extracts the information of interest, and collects it in a dictionary.

In [None]:
from bs4 import BeautifulSoup
import requests
import json


def get_quick_facts(url1: str) -> dict:
  """Download one page and return the rows of its first table as a dict.

  Args:
    url1: URL of the page to download.

  Returns:
    A dict holding "role" (the page title text before the first ":",
    stripped), "Reference" (``url1`` itself), and one entry for every
    table row after the first that has both a ``th`` and a ``td`` cell,
    mapping the stripped header text to the stripped cell text.

  Raises:
    requests.RequestException: if the download fails, times out, or the
      server answers with an HTTP error status.
    ValueError: if the page has no usable title or contains no table.
  """
  # A timeout keeps one stalled connection from hanging the whole scrape.
  page1 = requests.get(url1, timeout=30)
  # Fail loudly on 4xx/5xx instead of trying to parse an error page.
  page1.raise_for_status()
  soup1 = BeautifulSoup(page1.content, "html.parser")

  title_tag = soup1.title
  if title_tag is None or title_tag.string is None:
    raise ValueError(f"no usable <title> found on {url1}")
  role1 = title_tag.string.split(":")[0].strip()

  facts1 = {"role": role1, "Reference": url1}
  quick_facts = soup1.find('table')
  if quick_facts is None:
    raise ValueError(f"no table found on {url1}")

  # The first row is skipped; the remaining rows are header/value pairs.
  for row in quick_facts.find_all('tr')[1:]:
    header = row.find('th')
    cell = row.find('td')
    if header is None or cell is None:
      # Rows without both cells carry no key/value pair to record.
      continue
    facts1[header.text.strip()] = cell.text.strip()

  return facts1

def parse_pay(pay_str: str) -> int:
  """Return the first amount in a pay string as an integer.

  Args:
    pay_str: Text whose first whitespace-delimited token is a dollar
      amount, optionally prefixed with "$" and grouped with commas.

  Returns:
    The first token with "$" and "," removed, converted to ``int``.

  Raises:
    ValueError: if ``pay_str`` is empty/blank or the first token is not a
      whole number (for example "$48.93").
  """
  # "$101,780     per year\r\n                                        $48.93     per hour"
  # split() with no argument splits on any whitespace run and ignores
  # leading whitespace, so tabs, newlines or indentation around the amount
  # no longer end up glued to the number.
  tokens = pay_str.split()
  if not tokens:
    raise ValueError("pay string is empty")
  amount = tokens[0].replace('$', '').replace(',', '')
  return int(amount)

def parse_nop(x):
  """Pass-through parser: hand the value back exactly as received."""
  return x

def parse_jobs(job_str: str) -> int:
  """Convert a comma-grouped count such as '303,800' into an int."""
  # Dropping the commas leaves a plain digit string for int().
  digits = "".join(job_str.split(','))
  return int(digits)

def parse_outlook(outlook_str: str) -> int:
  """Return the integer that precedes the first '%' in the string.

  Example: '3% (Slower than average)' gives 3.
  """
  percent_text, _, _ = outlook_str.partition('%')
  return int(percent_text)

def parse_facts(facts1: dict) -> dict:
  """Select the fields of interest from raw facts and convert their values.

  Each expected raw key is looked up in ``facts1``, passed through its
  parser, and stored under a short label.

  Args:
    facts1: Raw facts keyed by the header text scraped from the page, plus
      the "role" and "Reference" entries.

  Returns:
    A dict with the keys "role", "ref", "pay2021", "education",
    "experience", "training", "jobs2021" and "growth10".

  Raises:
    KeyError: if an expected key is absent from ``facts1`` (even after
      whitespace normalization).
    ValueError: propagated from a parser when a value cannot be converted.
  """
  parsers = {
    "role":  {"label": "role", "parser": parse_nop},
    "Reference": {"label": "ref", "parser": parse_nop},
    "2021     Median Pay": {"label": "pay2021", "parser": parse_pay},
    "Typical Entry-Level Education": {"label": "education", "parser": parse_nop},
    "Work Experience in a Related Occupation": {"label": "experience", "parser": parse_nop},
    "On-the-job Training": {"label": "training", "parser": parse_nop},
    "Number of Jobs, 2021": {"label": "jobs2021", "parser": parse_jobs},
    "Job Outlook, 2021-31": {"label": "growth10", "parser": parse_outlook},
  }

  def squash(text: str) -> str:
    # Collapse every whitespace run to one space and trim the ends.
    return " ".join(text.split())

  # Scraped header text keeps whatever spacing the page markup had (note the
  # run of spaces in "2021     Median Pay"), so an exact-match lookup is
  # brittle. This index lets a key match regardless of spacing.
  by_squashed = {
    squash(key): value
    for key, value in facts1.items()
    if isinstance(key, str)
  }

  res = {}
  for k, v in parsers.items():
    if k in facts1:
      # An exact match always wins, so inputs that worked before still do.
      raw = facts1[k]
    else:
      try:
        raw = by_squashed[squash(k)]
      except KeyError:
        # Report the key as spelled in the parser table.
        raise KeyError(k) from None
    res[v["label"]] = v["parser"](raw)
  return res

def get_role_url(group_url: str) -> list:
  """Download a group page and return the absolute URLs linked from its first table.

  Args:
    group_url: URL of the page whose first table lists the roles.

  Returns:
    A list of absolute URLs, one per table row after the first whose first
    ``td`` cell contains a link with an ``href``. Rows without such a link
    are skipped.

  Raises:
    requests.RequestException: if the download fails, times out, or the
      server answers with an HTTP error status.
    ValueError: if the page contains no table.
  """
  # Imported locally so this function does not depend on a file-level import.
  from urllib.parse import urljoin

  # A timeout keeps one stalled connection from hanging the whole scrape.
  page1 = requests.get(group_url, timeout=30)
  # Fail loudly on 4xx/5xx instead of trying to parse an error page.
  page1.raise_for_status()
  soup1 = BeautifulSoup(page1.content, "html.parser")

  all_roles = soup1.find('table')
  if all_roles is None:
    raise ValueError(f"no table found on {group_url}")

  base_url = "https://www.bls.gov/"
  roles = []
  for row in all_roles.find_all('tr')[1:]:
    role1 = row.find('td')
    link = role1.find('a', href=True) if role1 is not None else None
    if link is None:
      continue
    # urljoin resolves root-relative hrefs ("/ooh/...") without producing
    # the "https://www.bls.gov//ooh/..." double slash that plain string
    # concatenation gives, and it leaves already-absolute hrefs intact.
    roles.append(urljoin(base_url, link['href']))

  return roles

def get_group_facts(group_url1: str) -> list:
  """Collect the parsed facts for every role URL found on a group page.

  The role URLs are gathered first via ``get_role_url``; each one is then
  fetched with ``get_quick_facts`` and converted with ``parse_facts``, in
  the order the URLs were returned.
  """
  return [
    parse_facts(get_quick_facts(role_url))
    for role_url in get_role_url(group_url1)
  ]

# Disabled example: fetch and parse a single role page.
#url1 = "https://www.bls.gov/ooh/architecture-and-engineering/electrical-and-electronics-engineers.htm"
#fact1 = get_quick_facts(url1)
#parse_facts(fact1)
# Fetch and parse every role linked from the group page below.
# NOTE: this runs network requests as soon as the cell/module executes.
group_url1 = "https://www.bls.gov/ooh/architecture-and-engineering/home.htm"
group_facts = get_group_facts(group_url1)
# Print each parsed record as indented JSON.
for f in group_facts:
  print(json.dumps(f, indent=4))

{
    "role": "Aerospace Engineering and Operations Technologists and Technicians",
    "ref": "https://www.bls.gov//ooh/architecture-and-engineering/aerospace-engineering-and-operations-technicians.htm",
    "pay2021": 73580,
    "education": "Associate's degree",
    "experience": "None",
    "training": "None",
    "jobs2021": 11300,
    "growth10": 6
}
{
    "role": "Aerospace Engineers",
    "ref": "https://www.bls.gov//ooh/architecture-and-engineering/aerospace-engineers.htm",
    "pay2021": 122270,
    "education": "Bachelor's degree",
    "experience": "None",
    "training": "None",
    "jobs2021": 58800,
    "growth10": 6
}
{
    "role": "Agricultural Engineers",
    "ref": "https://www.bls.gov//ooh/architecture-and-engineering/agricultural-engineers.htm",
    "pay2021": 82640,
    "education": "Bachelor's degree",
    "experience": "None",
    "training": "None",
    "jobs2021": 1200,
    "growth10": 1
}
{
    "role": "Architects",
    "ref": "https://www.bls.gov//ooh/archit