In [None]:
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup

In [None]:
# Read the tutorial's first sample page from disk as UTF-8 text
sample_page_path = 'C:\\Users\\shahania\\Documents\\Phd\\HERSS Summer School\\Tutorial\\Test_HTML_1.html'
with open(sample_page_path, mode='r', encoding='utf-8') as html_file:
    html_content = html_file.read()

In [None]:
# Parse the raw HTML text into a navigable BeautifulSoup tree
soup = BeautifulSoup(html_content, 'html.parser')

# Print the parsed HTML with proper indentation: prettify() puts each tag on
# its own indented line, whereas print(soup) echoes the markup unchanged
print(soup.prettify())

**Handling Malformed HTML**

In [None]:
# Sample markup whose last paragraph never closes its <p> or its <i> tag
malformed_html = '''
<section>
    <h2>Content Section</h2>
    <p class="content">Here is some <b>bold</b> text and <i>italic</i> text.</p>
    <p class="content">Another paragraph with a <a href="http://example.com">link</a>.</p>
    <p class="content">This paragraph is missing a closing tag for <i>italic text.
</section>
'''

# Build a tree from the broken markup anyway, then pretty-print it to see
# where the parser placed the missing closing tags
soup = BeautifulSoup(malformed_html, features='html.parser')
repaired_markup = soup.prettify()
print(repaired_markup)


**Search and Extract Data**

In [None]:
# Parse the sample page again so `soup` refers to it in this cell
soup = BeautifulSoup(html_content, features='html.parser')

# Page title: the text held by the <title> element
title = soup.title.string
print(f"Title: {title}")

# Every <p> element, numbered from 1
paragraphs = soup.find_all('p')
number = 0
for paragraph in paragraphs:
    number += 1
    print(f"Paragraph {number}: {paragraph.get_text()}")

# First <a> element: its visible text and its href attribute
link = soup.find('a')
link_text, link_url = link.get_text(), link['href']
print(f"Link Text: {link_text}, URL: {link_url}")




**HTML/XML parsing + Integration with Parsers**

In [None]:
# Same document, two parser backends. First the third-party lxml parser ...
soup_lxml = BeautifulSoup(html_content, features='lxml')
lxml_title = soup_lxml.title.string
print(f"Title using lxml: {lxml_title}")

# ... then Python's built-in html.parser
soup_html_parser = BeautifulSoup(html_content, features='html.parser')
builtin_title = soup_html_parser.title.string
print(f"Title using html.parser: {builtin_title}")



**Text Manipulation**

In [None]:
# Replace the text inside <title>
soup.title.string = "Updated Sample Web Page Title"

# Create a <p>, give it text, and attach it as the last child of <body>
extra_paragraph = soup.new_tag('p')
extra_paragraph.string = "This is a new paragraph added to the content section."
page_body = soup.body
page_body.append(extra_paragraph)

# Show the tree after both edits
updated_markup = soup.prettify()
print(updated_markup)

**Let's extract different HTML tags**

In [None]:
# Load the HTML content and parse it straight from the open file handle.
# The encoding is passed explicitly, as for the first sample file, so that
# decoding does not depend on the platform's default codec.
# NOTE(review): assumes HTML_TAGS.html is saved as UTF-8 — confirm.
with open('C:\\Users\\shahania\\Documents\\Phd\\HERSS Summer School\\Tutorial\\HTML_TAGS.html', 'r', encoding='utf-8') as file:
    soup = BeautifulSoup(file, 'html.parser')

In [None]:
# 1. Scraping Images
def scrape_images(soup, timeout=10):
    """Download every <img> found in ``soup`` into the current directory.

    Each file is named after the image's ``alt`` text (spaces replaced by
    underscores) with a ``.jpg`` suffix. Images without a ``src`` are
    skipped, and a failed download is reported instead of aborting the
    remaining ones.

    Args:
        soup: Parsed document; only ``find_all('img')`` is used.
        timeout: Seconds to wait for each HTTP response.
    """
    print("=== Scraping Images ===")
    for index, img in enumerate(soup.find_all('img'), start=1):
        img_url = img.get('src')
        if not img_url:
            # img['src'] would raise KeyError on a tag with no src attribute
            print(f"Skipping image {index}: no src attribute")
            continue

        # Fall back to a numbered name when alt is missing or empty
        alt_text = img.get('alt') or f"image_{index}"
        img_name = alt_text.replace(' ', '_') + ".jpg"

        try:
            response = requests.get(img_url, timeout=timeout)
            # Do not save an HTML error page under an image file name
            response.raise_for_status()
        except requests.RequestException as error:
            print(f"Failed to download {img_url}: {error}")
            continue

        with open(img_name, 'wb') as handler:
            handler.write(response.content)
        print(f"Downloaded {img_name}")
    print("\n")

In [None]:
# Run the image scraper defined above on the current `soup`
scrape_images(soup)

In [None]:
# 2. Scraping PDFs
def scrape_pdfs(soup, timeout=10):
    """Download every PDF linked from ``soup`` into the current directory.

    A link counts as a PDF when the path part of its ``href`` ends in
    ``.pdf``, ignoring case and any query string or fragment. The file is
    named after the link text (trimmed, spaces replaced by underscores) with
    a ``.pdf`` suffix; when the link has no text, the last path segment of
    the URL is used instead. A failed download is reported and the loop
    moves on to the next link.

    Args:
        soup: Parsed document; only ``find_all('a', href=True)`` is used.
        timeout: Seconds to wait for each HTTP response.
    """
    print("=== Scraping PDFs ===")
    for link in soup.find_all('a', href=True):
        pdf_url = link['href']
        # Test the path only, so "report.PDF?download=1" is still recognised
        url_path = urlparse(pdf_url).path
        if not url_path.lower().endswith('.pdf'):
            continue

        link_text = link.get_text().strip()
        if link_text:
            pdf_name = link_text.replace(' ', '_') + ".pdf"
        else:
            # No usable link text: fall back to the file name in the URL
            pdf_name = url_path.rsplit('/', 1)[-1]

        try:
            response = requests.get(pdf_url, timeout=timeout)
            # Do not save an HTML error page under a .pdf name
            response.raise_for_status()
        except requests.RequestException as error:
            print(f"Failed to download {pdf_url}: {error}")
            continue

        with open(pdf_name, 'wb') as handler:
            handler.write(response.content)
        print(f"Downloaded {pdf_name}")
    print("\n")


In [None]:
# Run the PDF scraper defined above on the current `soup`
scrape_pdfs(soup)

In [None]:
# 3. Scraping Lists (Unordered and Ordered)
def scrape_lists(soup):
    """Print the text of every <li> found under each <ul>, then under each <ol>.

    Args:
        soup: Parsed document; ``find_all`` is called for 'ul' and for 'ol',
            and ``find_all('li')`` on each list returned.
    """
    print("=== Scraping Unordered Lists (ul) ===")
    bullet_items = (
        li
        for unordered in soup.find_all('ul')
        for li in unordered.find_all('li')
    )
    for bullet in bullet_items:
        print(f"Unordered list item: {bullet.get_text()}")

    print("=== Scraping Ordered Lists (ol) ===")
    numbered_items = (
        li
        for ordered in soup.find_all('ol')
        for li in ordered.find_all('li')
    )
    for numbered in numbered_items:
        print(f"Ordered list step: {numbered.get_text()}")
    print("\n")




In [None]:
# Run the list scraper defined above on the current `soup`
scrape_lists(soup)

In [None]:
# 4. Scraping Table Data (only cells)
def scrape_tables(soup):
    """Print the text of every <td> cell in the first <table> of ``soup``.

    Rows are visited in document order; header (<th>) cells are not printed.

    Args:
        soup: Parsed document; ``find('table')`` must return a table.
    """
    print("=== Scraping Table Data ===")
    first_table = soup.find('table')

    for table_row in first_table.find_all('tr'):
        for data_cell in table_row.find_all('td'):
            print(f"Table Cell: {data_cell.get_text()}")
    print("\n")



In [None]:
# complete
def scrape_tables(soup):
    """Print the headers and the data rows of the first <table> in ``soup``.

    Header text comes from every <th> in the table. Each row that contains
    <td> cells is printed twice: as a list of stripped cell texts and as a
    single ' | '-joined line. Rows without <td> cells (such as a header
    row) are skipped. When the document has no <table>, a short notice is
    printed instead of raising.

    Args:
        soup: Parsed document; only ``find('table')`` is used.
    """
    print("=== Scraping Table Data ===")

    # Find the table in the HTML
    table = soup.find('table')
    if table is None:
        # find() returns None when nothing matches; stop here rather than
        # fail on the table.find_all calls below
        print("No table found.")
        print("\n")
        return

    # Extract the header (if present)
    header_row = [header.get_text().strip() for header in table.find_all('th')]
    print(f"Table Headers: {header_row}")

    # Loop through each row
    for row in table.find_all('tr'):
        # Extract individual cells
        cell_data = [cell.get_text().strip() for cell in row.find_all('td')]

        # A row with no <td> cells has nothing to report
        if not cell_data:
            continue
        print(f"Row Data: {cell_data}")

        # Same row as one line, handy for row-level processing later
        entire_row = ' | '.join(cell_data)
        print(f"Entire Row: {entire_row}")
    print("\n")


In [None]:
# Run scrape_tables on the current `soup`; the "complete" definition above
# has replaced the earlier cells-only one of the same name
scrape_tables(soup)

In [None]:
# 5. Scraping Navigation Links
def scrape_links(soup):
    """Print the text and target URL of every <a> element in ``soup``.

    Anchors without an ``href`` attribute (for example named anchors) are
    listed with the placeholder 'N/A' instead of raising KeyError.

    Args:
        soup: Parsed document; only ``find_all('a')`` is used.
    """
    print("=== Scraping Navigation Links ===")
    for link in soup.find_all('a'):
        # .get() tolerates a missing attribute, unlike link['href']
        href = link.get('href', 'N/A')
        print(f"Link Text: {link.get_text()}, URL: {href}")
    print("\n")



In [None]:
# Run the link scraper defined above on the current `soup`
scrape_links(soup)

In [None]:
# 6. Scraping Headings (h1, h2, etc.)
def scrape_headings(soup, tags=('h1', 'h2')):
    """Print the tag name and text of every heading in ``soup``.

    Args:
        soup: Parsed document; only ``find_all`` is used.
        tags: Heading tag names to look for. Defaults to h1 and h2; pass
            e.g. ``('h1', 'h2', 'h3')`` to include deeper levels.
    """
    print("=== Scraping Headings ===")
    # find_all accepts a list of tag names and matches any of them
    for heading in soup.find_all(list(tags)):
        print(f"{heading.name}: {heading.get_text()}")
    print("\n")


In [None]:
# Run the heading scraper defined above on the current `soup`
scrape_headings(soup)

In [None]:

# 7. Handling Attributes (for instance, class or id)
def scrape_attributes(soup):
    """Print the text of elements selected by attribute.

    First every element whose class is "item", then every element that has
    an ``id`` attribute of any value.

    Args:
        soup: Parsed document; only ``find_all`` is used.
    """
    print("=== Scraping Elements with Class or ID ===")
    for item_element in soup.find_all(class_="item"):
        print(f"Element with class 'item': {item_element.get_text()}")

    # id=True matches any element that carries an id, whatever its value
    for identified_element in soup.find_all(id=True):
        print(f"Element with ID: {identified_element.get_text()}")
    print("\n")

In [None]:
# Run the attribute scraper defined above on the current `soup`
scrape_attributes(soup)

**Let's try with the website**

In [None]:
import requests

# Send a browser-like User-Agent with the request
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36'
}
url = 'https://www.goodreads.com/'

# Bound the wait so an unresponsive server cannot hang the notebook, and stop
# on an HTTP error status instead of parsing an error page in the cells below
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

Beautiful Soup does not make HTTP requests or execute JavaScript; it focuses entirely on parsing HTML/XML documents. You typically pair it with a library such as requests to fetch web pages, then use Beautiful Soup to extract the data you need from those pages.

**Headers, specifically the User-Agent header, play a crucial role in making your HTTP requests appear more like those from a regular browser.** 



In [None]:
# Print the indented markup. A bare `soup.prettify()` would display the
# string's repr, with every line break shown as a literal \n.
print(soup.prettify())

If the content is missing, inspect the source of the Goodreads page in your browser (right-click and select "View Page Source") and compare it with what you get from requests. If they differ, it means the content is being rendered by JavaScript after the initial page load. That's where Selenium comes in!

In [None]:
# 1. find() returns the first tag that matches
# Here: the page's <title> element, whose text is then printed
title_tag = soup.find('title')
page_title = title_tag.get_text()
print(f"Page Title: {page_title}")

In [None]:
# 2. find_all() returns every tag that matches
# Collect the <a> tags carrying the footer-link class and print their text
footer_links = soup.find_all('a', class_='responsiveSiteFooter__link')
print("Footer Links:")
for footer_link in footer_links:
    footer_text = footer_link.get_text()
    print(footer_text)

In [None]:
# 3. Using select() to find elements using CSS selectors
# Find the main promo headline using a CSS selector.
# select() returns a list, which is empty when the page has no match, so
# check it before indexing instead of risking an IndexError.
promo_matches = soup.select('div.promoHeader__promoMastheadContent h2')
promo_headline = promo_matches[0].get_text() if promo_matches else None
if promo_headline is not None:
    print(f"Promo Headline: {promo_headline}")
else:
    print("Promo headline not found in the fetched page.")

In [None]:
# 4. Using parent to navigate to the parent element
# Find the parent of the first footer link.
# find() returns None when no tag carries this class; .parent below would
# then raise AttributeError.
# first_footer_link is reused by the next cell.
first_footer_link = soup.find('a', class_='responsiveSiteFooter__link')
footer_parent = first_footer_link.parent
print(f"Parent of the first footer link: {footer_parent.name}")  # This should print the name of the parent tag, e.g., 'li'

In [None]:
# 5. Using find_next_sibling() to navigate to the next sibling
# Find the next sibling tag after the first footer link (first_footer_link
# comes from the previous cell).
# find_next_sibling() returns None when the link is the only tag inside its
# parent, e.g. when each link sits alone in its own <li>, so check before
# reading its text.
next_sibling = first_footer_link.find_next_sibling()
if next_sibling is not None:
    print(f"Next sibling after the first footer link: {next_sibling.get_text()}")  # Should be the text of the next link
else:
    print("The first footer link has no sibling tag inside its parent.")

In [None]:
# Find the first genre link: the first <a> whose href is exactly "/genres/art".
# first_genre is the starting point for the sibling navigation in the next cells.
first_genre = soup.find('a', href="/genres/art")
print(f"First genre: {first_genre.get_text()}")


In [None]:
# Find the next sibling genre link: the next <a> after first_genre that
# shares the same parent
next_genre = first_genre.find_next_sibling('a')
print(f"Next genre: {next_genre.get_text()}")

In [None]:
# Find the next sibling again: one <a> further along from next_genre
third_genre = next_genre.find_next_sibling('a')
print(f"Third genre: {third_genre.get_text()}")

In [None]:
# 6. Using find_previous_sibling() to navigate to the previous sibling
# Step back one <a> from third_genre; this lands on the second genre link
# (the one stored in next_genre), not the first
previous_sibling = third_genre.find_previous_sibling('a')
print(f"Previous sibling of the next link: {previous_sibling.get_text()}")  # Should be the text of the second genre link

In [None]:
# 6 (continued). Step back one more <a> from the second genre link, which
# brings us back to the first genre link
first_genre_again  = previous_sibling.find_previous_sibling('a')
print(f"Previous sibling of the next link: {first_genre_again.get_text()}")  # Should be the text of the first link again

In [None]:
# 7. find_parent() walks up the tree to the nearest ancestor with a given tag name
# Locate the link pointing at /about/us, then its closest enclosing <ul>
about_link = soup.find('a', href='/about/us')
enclosing_list = about_link.find_parent('ul')
print(f"Specific parent of 'About Us' link: {enclosing_list.name}")  # Should be 'ul'


In [None]:
# 8. find_all() filtered by tag name and class
# Print the src of every <img> carrying the bookImgSimilar class
book_images = soup.find_all('img', class_='bookImgSimilar')
print("Book Images:")
for book_image in book_images:
    image_source = book_image['src']
    print(image_source)

In [None]:
# 9. select() with an attribute selector
# [data-react-class] matches any element that has that attribute; print its value
react_components = soup.select('[data-react-class]')
print("React Components with data-react-class:")
for react_element in react_components:
    react_class_name = react_element['data-react-class']
    print(react_class_name)

In [None]:
# 10. Using find to find an element with a specific text
# Find the <a> whose text is exactly 'Best quotes'.
# find() returns None when no link has that exact text, so check before
# reading the href instead of failing with a TypeError.
best_quotes_link = soup.find('a', string='Best quotes')
if best_quotes_link is not None:
    print(f"Best quotes link: {best_quotes_link['href']}")
else:
    print("No link with the exact text 'Best quotes' was found.")

**When to use what?**

1. find()
When to Use: Use find() when you want to locate the first occurrence of a specific tag or element that matches your criteria.
2. find_all()
When to Use: Use find_all() when you need to retrieve all elements that match a specific tag or criteria.
3. select()
When to Use: Use select() when you need to find elements using complex CSS selectors. It’s very powerful for matching based on classes, IDs, or nested elements.

4. find_parent() and parent
When to Use: Use these methods to navigate upwards in the DOM tree to find the parent of a specific element.
5. find_next_sibling() and find_previous_sibling()
When to Use: Use these methods to navigate sideways in the DOM tree to find the next or previous sibling of an element. It’s useful when dealing with elements at the same level. (Example: Finding the next and previous paragraphs around a specific paragraph)

6. children
When to Use: Use children when you want to iterate over all direct children of an element. This is useful for navigating elements that contain nested elements.
