In [None]:
# Full Job Board Scraper: Offline + Pagination + CSV/JSON

import csv
import json
import os
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup


# One HTTP session is reused for every request; it sends a desktop-browser
# User-Agent header with each call.
session = requests.Session()
session.headers["User-Agent"] = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/117.0 Safari/537.36"
)

# Landing page of the job board, and the local folder where the raw HTML of
# each listing page is cached (created on first run, reused afterwards).
base_url = "https://realpython.github.io/fake-jobs/"
raw_folder = "raw_html_jobs"
os.makedirs(raw_folder, exist_ok=True)


# Phase 1: crawl the listing pages and cache each page's raw HTML on disk.
# The crawl starts at the landing page and follows "Next" links until a page
# has none. `url` must be initialised here: the loop condition reads it.
url = base_url
page_number = 1

while url:
    response = session.get(url, timeout=10)
    # Fail loudly on HTTP errors rather than caching an error page as job data.
    response.raise_for_status()
    response.encoding = 'utf-8'
    html_content = response.text

    # Cache the raw page so the parsing phase can run without the network.
    file_path = os.path.join(raw_folder, f"page{page_number}.html")
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(html_content)
    print(f"✅ Saved {file_path}")

    # Pagination: look for an anchor whose text is exactly "Next".
    soup = BeautifulSoup(html_content, "html.parser")
    next_btn = soup.find("a", string="Next")
    next_href = next_btn.get("href") if next_btn else None
    if next_href:
        # urljoin resolves relative, root-relative and absolute hrefs against
        # the page that contained the link; plain concatenation only handles
        # the first of those.
        url = urljoin(url, next_href)
        page_number += 1
        time.sleep(2)  # pause between requests to avoid hammering the server
    else:
        # No usable "Next" link: this was the last page.
        url = None

print(f"\n✅ Total pages saved: {page_number}")


# Phase 2: parse the cached HTML pages (no network access) and collect one
# dict per job card into `all_jobs`.
all_jobs = []

# Only files named page<N>.html are cached listing pages. Anything else in the
# folder (editor backups, OS metadata files, ...) is ignored so the numeric
# sort key below cannot raise ValueError. The slice [4:-5] strips the "page"
# prefix and ".html" suffix.
html_files = sorted(
    (
        name
        for name in os.listdir(raw_folder)
        if name.startswith("page")
        and name.endswith(".html")
        and name[4:-5].isdecimal()
    ),
    # Sort numerically so page10 comes after page9, not after page1.
    key=lambda name: int(name[4:-5]),
)

for file_name in html_files:
    file_path = os.path.join(raw_folder, file_name)
    with open(file_path, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f.read(), "html.parser")

    job_cards = soup.find_all("div", class_="card-content")
    for card in job_cards:
        title = card.find("h2", class_="title").text.strip()
        company = card.find("h3", class_="company").text.strip()
        location = card.find("p", class_="location").text.strip()
        # NOTE(review): this takes the FIRST anchor inside the card. If a card
        # carries more than one link, confirm the first is the intended one.
        link = card.find("a")["href"].strip()

        all_jobs.append({
            "title": title,
            "company": company,
            "location": location,
            "link": link
        })

print(f"\n✅ Total jobs extracted: {len(all_jobs)}")


# Export every collected job as one CSV row beneath a header line.
csv_file = "jobs_all_pages.csv"
with open(csv_file, "w", newline="", encoding="utf-8") as csv_out:
    job_writer = csv.writer(csv_out)
    job_writer.writerow(["Title", "Company", "Location", "Link"])
    # Column order matches the header written above.
    job_writer.writerows(
        [entry["title"], entry["company"], entry["location"], entry["link"]]
        for entry in all_jobs
    )
print(f"✅ Jobs saved to {csv_file}")


# Export the same job list as pretty-printed JSON; ensure_ascii=False keeps
# non-ASCII characters readable instead of \u-escaping them.
json_file = "jobs_all_pages.json"
jobs_as_json = json.dumps(all_jobs, indent=4, ensure_ascii=False)
with open(json_file, "w", encoding="utf-8") as json_out:
    json_out.write(jobs_as_json)
print(f"✅ Jobs saved to {json_file}")


✅ Saved raw_html_jobs\page1.html

✅ Total pages saved: 1

✅ Total jobs extracted: 100
✅ Jobs saved to jobs_all_pages.csv
✅ Jobs saved to jobs_all_pages.json
