In [1]:
# BeautifulSoup library is used for parsing HTML and XML documents.
# It helps to extract data from web pages in an easy and structured way.
from bs4 import BeautifulSoup  

# requests library is used to send HTTP requests (GET, POST, etc.)
# It helps to fetch the content of a website (HTML source code).
import requests


In [3]:
# Sending an HTTP GET request to the given URL
# This fetches the HTML content of the fake jobs webpage for scraping
page = requests.get('https://realpython.github.io/fake-jobs/')


In [4]:
# Creating a BeautifulSoup object to parse the HTML content of the webpage
# page.content contains the raw HTML data
# 'html.parser' is used to convert HTML into a searchable and structured format
soup = BeautifulSoup(page.content, 'html.parser')


In [5]:
# Finding all HTML elements that have the class name 'card-content'
# Each 'card-content' block represents a single job card on the webpage
all_cards = soup.find_all(class_='card-content')

# Counting the total number of job cards found on the webpage
len(all_cards)


100

In [6]:
# Selecting the first job card from the list of all job cards
# all_cards[0] represents the first job posting on the webpage
card = all_cards[0]

# Displaying the selected job card (HTML content)
card


<div class="card-content">
<div class="media">
<div class="media-left">
<figure class="image is-48x48">
<img alt="Real Python Logo" src="https://files.realpython.com/media/real-python-logo-thumbnail.7f0db70c2ed2.jpg?__no_cf_polish=1"/>
</figure>
</div>
<div class="media-content">
<h2 class="title is-5">Senior Python Developer</h2>
<h3 class="subtitle is-6 company">Payne, Roberts and Davis</h3>
</div>
</div>
<div class="content">
<p class="location">
        Stewartbury, AA
      </p>
<p class="is-small has-text-grey">
<time datetime="2021-04-08">2021-04-08</time>
</p>
</div>
<footer class="card-footer">
<a class="card-footer-item" href="https://www.realpython.com" target="_blank">Learn</a>
<a class="card-footer-item" href="https://realpython.github.io/fake-jobs/jobs/senior-python-developer-0.html" target="_blank">Apply</a>
</footer>
</div>

In [7]:
# Extracting the job designation/title from the selected job card
# 'title is-5' class contains the job role name
# .text is used to get only the readable text (without HTML tags)
designation = card.find(class_='title is-5').text


In [8]:
# Extracting the company name from the selected job card
# 'subtitle is-6 company' class contains the company name
# .text is used to extract readable text from the HTML element
company_name = card.find(class_='subtitle is-6 company').text

# Displaying the extracted company name
company_name


'Payne, Roberts and Davis'

In [9]:
# Extracting the job location from the selected job card
# 'location' class contains the job location details
# .text extracts readable text from HTML
# .strip('\n') removes unwanted newline characters from the text
location = card.find(class_='location').text.strip('\n')


In [10]:
# Extracting the job posting date from the selected job card
# 'time' HTML tag contains the date when the job was posted
# .text is used to get the readable date text
date_of_post = card.find('time').text


In [11]:
# Extracting the job apply link from the selected job card
# 'card-footer-item' class contains multiple links (e.g., Learn more, Apply)
# [1] is used to select the second link, which is the Apply link
# ['href'] extracts the URL from the anchor tag
apply_link = card.find_all(class_='card-footer-item')[1]['href']


In [12]:
# Creating an empty list to store all job records
records = []

# Looping through each job card extracted from the webpage
for card in all_cards:
    
    # Extracting job designation/title
    designation = card.find(class_='title is-5').text
    
    # Extracting company name
    company_name = card.find(class_='subtitle is-6 company').text
    
    # Extracting job location and removing extra newline characters
    location = card.find(class_='location').text.strip('\n')
    
    # Extracting job posting date
    date_of_post = card.find('time').text
    
    # Extracting the job apply link (second footer link)
    apply_link = card.find_all(class_='card-footer-item')[1]['href']
    
    # Storing extracted job details in a dictionary
    d1 = {
        'designation': designation,
        'company_name': company_name,
        'location': location,
        'date_of_post': date_of_post,
        'apply_link': apply_link
    }
    
    # Appending each job record to the records list
    records.append(d1)


In [13]:
# Adding the current job record dictionary to the records list
records.append(d1)


In [14]:
# Importing pandas library for data manipulation and analysis
import pandas as pd

# Converting the list of job records (records) into a Pandas DataFrame
# Each dictionary in the list becomes a row in the DataFrame
df = pd.DataFrame(records)


In [15]:
# Checking the shape of the DataFrame
# It returns the number of rows (jobs) and columns (features)
df.shape


(101, 5)

In [16]:
# Displaying the first 5 rows of the DataFrame
# This helps to quickly preview the scraped job data
df.head()


Unnamed: 0,designation,company_name,location,date_of_post,apply_link
0,Senior Python Developer,"Payne, Roberts and Davis","Stewartbury, AA\n",2021-04-08,https://realpython.github.io/fake-jobs/jobs/se...
1,Energy engineer,Vasquez-Davidson,"Christopherville, AA\n",2021-04-08,https://realpython.github.io/fake-jobs/jobs/en...
2,Legal executive,"Jackson, Chambers and Levy","Port Ericaburgh, AA\n",2021-04-08,https://realpython.github.io/fake-jobs/jobs/le...
3,Fitness centre manager,Savage-Bradley,"East Seanview, AP\n",2021-04-08,https://realpython.github.io/fake-jobs/jobs/fi...
4,Product manager,Ramirez Inc,"North Jamieview, AP\n",2021-04-08,https://realpython.github.io/fake-jobs/jobs/pr...


In [17]:
# Counting how many times each job designation appears in the DataFrame
# value_counts() gives frequency of unique values in the 'designation' column
df['designation'].value_counts()


Python Programmer (Entry-Level)        3
Ship broker                            2
Legal executive                        2
Manufacturing systems engineer         2
Materials engineer                     2
                                      ..
Broadcast engineer                     1
Neurosurgeon                           1
Immigration officer                    1
Structural engineer                    1
Engineer, broadcasting (operations)    1
Name: designation, Length: 92, dtype: int64

In [19]:
# Importing necessary libraries
import requests  # To send HTTP requests and fetch webpage content
from bs4 import BeautifulSoup  # For parsing HTML content
import pandas as pd  # For structuring data and saving to CSV/JSON
import logging  # For logging info and errors during scraping

# Setting up logging configuration
# Logs will be saved in 'scraper.log' with timestamp, level, and message
logging.basicConfig(
    filename="scraper.log",
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s"
)

# Target URL of the fake jobs webpage
URL = "https://realpython.github.io/fake-jobs/"

# Function to fetch HTML content of a given URL
def fetch_page(url):
    # Sending GET request to the URL
    response = requests.get(url)
    # Raise exception if request fails (status code not 200)
    response.raise_for_status()
    # Return the raw HTML content
    return response.text

# Function to parse job listings from HTML
def parse_jobs(html):
    # Create BeautifulSoup object to parse HTML
    soup = BeautifulSoup(html, "html.parser")
    # Find all job cards on the page
    cards = soup.find_all("div", class_="card-content")
    # Initialize list to store job data
    records = []

    # Loop through each job card to extract details
    for card in cards:
        # Extract job title
        title = card.find("h2", class_="title").text.strip()
        # Extract company name
        company = card.find("h3", class_="company").text.strip()
        # Extract job location
        location = card.find("p", class_="location").text.strip()

        # Store the extracted data in a dictionary
        records.append({
            "title": title,
            "company": company,
            "location": location
        })
    # Return the list of job records
    return records

# Function to save scraped data into CSV and JSON files
def save_data(records):
    # Convert list of dictionaries into a Pandas DataFrame
    df = pd.DataFrame(records)
    # Save DataFrame to CSV file (without index column)
    df.to_csv("jobs_data.csv", index=False)
    # Save DataFrame to JSON file with readable formatting
    df.to_json("jobs_data.json", orient="records", indent=4)

# Main function to run the scraper
def main():
    try:
        # Fetch HTML content from the URL
        html = fetch_page(URL)
        # Parse jobs from the HTML content
        records = parse_jobs(html)
        # Save the parsed job data to CSV and JSON
        save_data(records)
        # Log successful scraping
        logging.info("Scraping completed successfully")
    except Exception as e:
        # Log any errors that occur during scraping
        logging.error(f"Error occurred: {e}")

# Entry point: run main function when script is executed
if __name__ == "__main__":
    main()


In [20]:
# Saving the DataFrame to a CSV file named 'jobs_data.csv'
# index=False ensures that the DataFrame index is not written to the file
df.to_csv("jobs_data.csv", index=False)

# Saving the same DataFrame to a JSON file named 'jobs_data.json'
# orient="records" creates a list of dictionaries (one per row)
# indent=4 makes the JSON file readable with proper indentation
df.to_json("jobs_data.json", orient="records", indent=4)

# Print confirmation message to indicate files have been saved successfully
print("Files saved successfully")


Files saved successfully


In [21]:
# Importing necessary libraries
import requests  # To send HTTP requests and fetch webpage content
from bs4 import BeautifulSoup  # For parsing HTML content
import pandas as pd  # For structuring data and saving to CSV/JSON

# Function to fetch the HTML content of a given URL
def fetch_page(url):
    # Sending a GET request to the URL
    response = requests.get(url)
    # Raise an exception if the request failed (status code not 200)
    response.raise_for_status()
    # Return the HTML content of the page
    return response.text

# Function to parse job postings from the fetched HTML
def parse_jobs(html):
    # Create BeautifulSoup object to parse HTML content
    soup = BeautifulSoup(html, "html.parser")
    # Find all job cards on the page using the class 'card-content'
    cards = soup.find_all("div", class_="card-content")

    # Initialize an empty list to store job records
    records = []
    # Loop through each job card to extract job details
    for card in cards:
        # Extract job title and remove extra spaces
        title = card.find("h2", class_="title").text.strip()
        # Extract company name and remove extra spaces
        company = card.find("h3", class_="company").text.strip()
        # Extract job location and remove extra spaces
        location = card.find("p", class_="location").text.strip()

        # Store extracted details in a dictionary
        records.append({
            "title": title,
            "company": company,
            "location": location
        })
    # Return the list of all job records
    return records

# Function to save the extracted job data to CSV and JSON files
def save_data(records):
    # Convert the list of dictionaries into a Pandas DataFrame
    df = pd.DataFrame(records)
    # Save the DataFrame to a CSV file (without index column)
    df.to_csv("jobs_data.csv", index=False)
    # Save the same DataFrame to a JSON file with readable formatting
    df.to_json("jobs_data.json", orient="records", indent=4)
    # Return the DataFrame for further use if needed
    return df


In [22]:
# URL of the webpage to scrape
URL = "https://realpython.github.io/fake-jobs/"

# Fetch the HTML content of the URL
# Calls the fetch_page() function defined earlier
html = fetch_page(URL)

# Parse the fetched HTML to extract job postings
# Calls the parse_jobs() function
records = parse_jobs(html)

# Save the extracted job data to CSV and JSON files
# Also returns a Pandas DataFrame for immediate use
df = save_data(records)

# Display the first 5 rows of the DataFrame to verify the data
df.head()


Unnamed: 0,title,company,location
0,Senior Python Developer,"Payne, Roberts and Davis","Stewartbury, AA"
1,Energy engineer,Vasquez-Davidson,"Christopherville, AA"
2,Legal executive,"Jackson, Chambers and Levy","Port Ericaburgh, AA"
3,Fitness centre manager,Savage-Bradley,"East Seanview, AP"
4,Product manager,Ramirez Inc,"North Jamieview, AP"


In [23]:
# Importing logging library to track info and errors
import logging

# Setting up logging configuration
# Logs will be written to 'scraper.log' with timestamp, level, and message
logging.basicConfig(
    filename="scraper.log",
    level=logging.INFO,  # INFO level will log info and errors
    format="%(asctime)s - %(levelname)s - %(message)s"
)

# Using try-except block to catch errors during scraping
try:
    # Fetch HTML content from the URL
    html = fetch_page(URL)
    # Parse job postings from the HTML
    records = parse_jobs(html)
    # Save extracted data to CSV and JSON files
    save_data(records)
    # Log a success message if scraping completes without error
    logging.info("Scraping completed successfully")
except Exception as e:
    # Log any error that occurs during scraping
    logging.error(f"Error occurred: {e}")


In [26]:
# Importing necessary libraries
import requests  # To fetch HTML content from web pages
from bs4 import BeautifulSoup  # To parse HTML content
import pandas as pd  # To structure data and save to CSV/JSON
import logging  # To log info and errors during scraping

# Setting up logging configuration
# All logs will be written to 'scraper.log' with timestamp, log level, and message
logging.basicConfig(
    filename="scraper.log",
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s"
)

# URL of the webpage to scrape
URL = "https://realpython.github.io/fake-jobs/"

# Function to fetch HTML content of a webpage
def fetch_page(url):
    # Send GET request to the URL
    response = requests.get(url)
    # Raise an exception if the request failed
    response.raise_for_status()
    # Return the raw HTML content
    return response.text

# Function to parse job postings from HTML
def parse_jobs(html):
    # Create BeautifulSoup object for HTML parsing
    soup = BeautifulSoup(html, "html.parser")
    # Find all job cards using their CSS class
    cards = soup.find_all("div", class_="card-content")
    # Initialize list to store job data
    records = []

    # Loop through each job card to extract details
    for card in cards:
        # Extract job title, company, and location
        title = card.find("h2", class_="title").text.strip()
        company = card.find("h3", class_="company").text.strip()
        location = card.find("p", class_="location").text.strip()

        # Store the extracted details in a dictionary
        records.append({
            "title": title,
            "company": company,
            "location": location
        })

    # Return the list of job records
    return records

# Function to save job data to CSV and JSON
def save_data(records):
    # Convert list of dictionaries into a Pandas DataFrame
    df = pd.DataFrame(records)
    # Save DataFrame to CSV file
    df.to_csv("jobs_data.csv", index=False)
    # Save DataFrame to JSON file with readable formatting
    df.to_json("jobs_data.json", orient="records", indent=4)

# Main function to execute the scraper
def main():
    try:
        # Fetch HTML content
        html = fetch_page(URL)
        # Parse jobs from HTML
        records = parse_jobs(html)
        # Save the extracted job data
        save_data(records)
        # Log success message
        logging.info("Scraping completed successfully")
    except Exception as e:
        # Log any errors that occur during scraping
        logging.error(f"Error occurred: {e}")

# Entry point: run main() if this script is executed
if __name__ == "__main__":
    main()


In [27]:
# Define the required Python packages for this project
requirements = """requests
beautifulsoup4
pandas
"""

# Write the required packages to 'requirements.txt'
# This allows anyone to install dependencies using:
# pip install -r requirements.txt
with open("requirements.txt", "w") as f:
    f.write(requirements)

# Print confirmation message
print("requirements.txt created")


requirements.txt created


In [28]:
# Define the content of the README file
readme = """# Job Listings Web Scraper

This project scrapes job listings from a public website using Python and BeautifulSoup.

## Tools Used
- Python
- BeautifulSoup
- Requests
- Pandas

## Features
- Extracts job title, company, location
- Saves data to CSV and JSON
- Logging and error handling

## How to Run
pip install -r requirements.txt
python scraper.py

## Author
Sanjana Takmoge
"""

# Write the README content to 'README.md' file
with open("README.md", "w", encoding="utf-8") as f:
    f.write(readme)

# Print confirmation message
print("README.md created")


README.md created
