In [103]:
from bs4 import BeautifulSoup
import pandas as pd
import requests
import csv

In [104]:
# Site root; relative hrefs scraped from the pages are appended to this.
base_url = "https://nationalcareers.service.gov.uk"
# Page whose links to "/job-categories/" pages are harvested.
careers_url = f"{base_url}/explore-careers/"


In [105]:
# Download the explore-careers page and parse it.
# A timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() fails loudly on a 4xx/5xx reply instead of parsing an
# error page as if it were the careers listing.
response = requests.get(careers_url, timeout=30)
response.raise_for_status()
html = response.content
soup = BeautifulSoup(html, 'html.parser')

In [106]:
# Every href on the landing page that points at a job-category page.
# dict.fromkeys drops repeated hrefs while keeping first-seen order, so a
# category linked more than once is only scraped (and recorded) once.
category_links = list(dict.fromkeys(
    a['href'] for a in soup.find_all('a', href=True) if '/job-categories/' in a['href']
))

# One dict per job found on the category pages; written to careers.csv later.
data = []
for link in category_links:
    category_url = base_url + link
    # Time-box the request and fail loudly on HTTP errors rather than
    # parsing an error page.
    category_response = requests.get(category_url, timeout=30)
    category_response.raise_for_status()
    category_data = BeautifulSoup(category_response.content, "html.parser")

    heading = category_data.find("h1", class_="govuk-heading-xl")
    if heading is None:
        # Without the heading there is no category name to record.
        print(f"Skipping {category_url}: category heading not found")
        continue
    category_name = heading.text

    for career in category_data.find_all("li", class_="job-categories_item"):
        anchor = career.find("a", href=True)
        if anchor is None:
            # No link means no title and no job URL; nothing useful to store.
            continue
        summary = career.find("p")
        data.append({
            "Category": category_name,
            "Job Title": anchor.text,
            # Some items may have no description paragraph; store an empty string.
            "Job Description": summary.text if summary is not None else "",
            "Job URL": base_url + anchor["href"],
        })

In [107]:
# Persist the scraped rows to careers.csv.
# The encoding is pinned to UTF-8 because open() otherwise uses the platform
# locale encoding, while pandas.read_csv (used to load this file afterwards)
# decodes as UTF-8 by default; a mismatch breaks on non-ASCII text such as "£".
fieldnames = ["Category", "Job Title", "Job Description", "Job URL"]
with open("careers.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(data)

In [108]:
# Seconds to wait for each job-profile request before giving up.
REQUEST_TIMEOUT = 30

# Tags whose text is collected from each accordion content block.
_TEXT_TAGS = ["h3", "h4", "p", "li", "a"]


def _clean_text(tag):
    """Return the tag's text with whitespace runs collapsed, or None if tag is None."""
    if tag is None:
        return None
    return " ".join(tag.text.split())


def _is_nested_in_text_tag(element, root):
    """Return True if an ancestor of element, below root, is itself a collected tag."""
    for parent in element.parents:
        if parent is root:
            return False
        if parent.name in _TEXT_TAGS:
            return True
    return False


def _section_text(section):
    """Collect the text of the h3/h4/p/li/a elements in section, in document order.

    Elements nested inside another collected element (for example a link
    inside a paragraph) are skipped, because their text is already part of
    the ancestor's text and would otherwise be emitted twice.
    """
    return [
        element.text
        for element in section.find_all(_TEXT_TAGS)
        if not _is_nested_in_text_tag(element, section)
    ]


def scrape_job_profile(job_url):
    """Download one job-profile page and return its details as a dict.

    The dict holds the three hero-block fields ("Average Salary",
    "Typical Hours(a week)", "You could work") plus one entry per accordion
    section, keyed by the section heading. A hero block that is not found
    on the page yields None instead of raising.

    Raises:
        requests.RequestException: on connection errors, timeouts, or a
            4xx/5xx response.
    """
    job_response = requests.get(job_url, timeout=REQUEST_TIMEOUT)
    job_response.raise_for_status()
    job_data = BeautifulSoup(job_response.content, "html.parser")

    record = {
        "Average Salary": _clean_text(
            job_data.find("div", class_="job-profile-salary job-profile-heroblock-content")),
        "Typical Hours(a week)": _clean_text(
            job_data.find("div", class_="job-profile-hours job-profile-heroblock-content")),
        "You could work": _clean_text(
            job_data.find("div", class_="job-profile-pattern job-profile-heroblock-content")),
    }

    for details in job_data.find_all("div", class_="govuk-accordion__section"):
        heading = details.find("h2", class_="govuk-accordion__section-heading")
        if heading is None:
            # No heading means no column name to store the content under.
            continue
        parts = []
        for section in details.find_all("div", class_="govuk-accordion__section-content"):
            parts.extend(_section_text(section))
        # Join first, normalise once: text from separate elements/blocks always
        # ends up separated by a single space.
        record[heading.text.strip()] = " ".join(" ".join(parts).split())
    return record


df = pd.read_csv("careers.csv")

# Build one record per row and attach them in a single step, instead of
# growing the frame cell by cell with df.loc inside the loop.
records = []
for job_url in df["Job URL"]:
    try:
        records.append(scrape_job_profile(job_url))
    except requests.RequestException as exc:
        # One unreachable page should not discard the profiles already
        # scraped; report it and leave that row's detail columns empty.
        print(f"Skipping {job_url}: {exc}")
        records.append({})

details_df = pd.DataFrame(records, index=df.index)
for column in details_df.columns:
    # Plain column assignment adds new columns and overwrites existing ones,
    # so re-running the cell on an already enriched CSV still works.
    df[column] = details_df[column]

# Write the updated dataframe back to the existing csv file.
df.to_csv("careers.csv", index=False)


https://nationalcareers.service.gov.uk/job-profiles/accounting-technician
https://nationalcareers.service.gov.uk/job-profiles/admin-assistant
https://nationalcareers.service.gov.uk/job-profiles/arts-administrator
https://nationalcareers.service.gov.uk/job-profiles/assistant-immigration-officer
https://nationalcareers.service.gov.uk/job-profiles/auditor
https://nationalcareers.service.gov.uk/job-profiles/bid-writer
https://nationalcareers.service.gov.uk/job-profiles/bilingual-secretary
https://nationalcareers.service.gov.uk/job-profiles/bookkeeper
https://nationalcareers.service.gov.uk/job-profiles/border-force-officer
https://nationalcareers.service.gov.uk/job-profiles/car-rental-agent
https://nationalcareers.service.gov.uk/job-profiles/charity-fundraiser
https://nationalcareers.service.gov.uk/job-profiles/civil-service-administrative-officer
https://nationalcareers.service.gov.uk/job-profiles/civil-service-executive-officer
https://nationalcareers.service.gov.uk/job-profiles/credit-co