In [47]:
from bs4 import BeautifulSoup

In [2]:
# Load the HTML file
with open('C:\\Users\\shahania\\Documents\\Phd\\HERSS Summer School\\Tutorial\\Test_HTML_1.html', 'r', encoding='utf-8') as file:
    html_content = file.read()

In [None]:
# Parse the HTML content
soup = BeautifulSoup(html_content, 'html.parser')

# Print the parsed HTML with proper indentation
print(soup)

**Handing Malformed HTML**

In [48]:
malformed_html = '''
<section>
    <h2>Content Section</h2>
    <p class="content">Here is some <b>bold</b> text and <i>italic</i> text.</p>
    <p class="content">Another paragraph with a <a href="http://example.com">link</a>.</p>
    <p class="content">This paragraph is missing a closing tag for <i>italic text.
</section>
'''

soup = BeautifulSoup(malformed_html, 'html.parser')
print(soup.prettify())


<section>
 <h2>
  Content Section
 </h2>
 <p class="content">
  Here is some
  <b>
   bold
  </b>
  text and
  <i>
   italic
  </i>
  text.
 </p>
 <p class="content">
  Another paragraph with a
  <a href="http://example.com">
   link
  </a>
  .
 </p>
 <p class="content">
  This paragraph is missing a closing tag for
  <i>
   italic text.
  </i>
 </p>
</section>



**Search and Extract Data**

In [5]:
soup = BeautifulSoup(html_content, 'html.parser')

# Extract the title
title = soup.title.string
print(f"Title: {title}")

# Find all paragraphs
paragraphs = soup.find_all('p')
for i, p in enumerate(paragraphs, start=1):
    print(f"Paragraph {i}: {p.get_text()}")

# Find and print the link
link = soup.find('a')
print(f"Link Text: {link.get_text()}, URL: {link['href']}")




Title: Sample Web Page for Web Scraping
Paragraph 1: This is a simple page for demonstrating web scraping with Beautiful Soup.
Paragraph 2: This page contains various HTML elements that can be extracted using Beautiful Soup.
Paragraph 3: It also includes some deliberately malformed HTML to demonstrate error handling.
Paragraph 4: Here is some bold text and italic text.
Paragraph 5: Another paragraph with a link.
Paragraph 6: This paragraph is missing a closing tag for italic text.
    
Paragraph 7: © 2024 Sample Web Page
Link Text: link, URL: http://example.com


**HTML/XML parsing + Integration with Parsers**

In [6]:
soup_lxml = BeautifulSoup(html_content, 'lxml')
print(f"Title using lxml: {soup_lxml.title.string}")

# Alternatively, parse the HTML content with the default html.parser
soup_html_parser = BeautifulSoup(html_content, 'html.parser')
print(f"Title using html.parser: {soup_html_parser.title.string}")



Title using lxml: Sample Web Page for Web Scraping
Title using html.parser: Sample Web Page for Web Scraping


**Text Manipulation**

In [None]:
# Modify the title
soup.title.string = "Updated Sample Web Page Title"

# Add a new paragraph
new_paragraph = soup.new_tag('p')
new_paragraph.string = "This is a new paragraph added to the content section."
soup.body.append(new_paragraph)

# Print the modified HTML
print(soup.prettify())

**Let's try with the website**

In [22]:
import requests
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36'
} # 
url = 'https://www.goodreads.com/'
response = requests.get(url,headers=headers)
soup = BeautifulSoup(response.content, 'html.parser')

Beautiful Soup does not handle HTTP requests or handle JavaScript. It focuses entirely on parsing HTML/XML documents. You typically use it in combination with libraries like requests to fetch web pages and then use Beautiful Soup to extract the data you need from those pages.

**Headers, specifically the User-Agent header, play a crucial role in making your HTTP requests appear more like those from a regular browser.** 



In [None]:
soup.prettify()

If the content is missing, inspect the source of the Goodreads page in your browser (right-click and select "View Page Source") and compare it with what you get from requests. If they differ, it means the content is being rendered by JavaScript after the initial page load. That's where Selenium comes in!

In [25]:
# 1. Using find() to find the first occurrence of a specific tag or class
# Let's find the title of the page
page_title = soup.find('title').get_text()
print(f"Page Title: {page_title}")

Page Title: Goodreads | Meet your next favorite book


In [28]:
# 2. Using find_all() to find all occurrences of a tag
# Find all the links in the footer
footer_links = soup.find_all('a', class_='responsiveSiteFooter__link')
print("Footer Links:")
for link in footer_links:
    print(link.get_text())

Footer Links:
About us
Careers
Terms
Privacy
Interest Based Ads
Ad Preferences
Help
Authors
Advertise
Authors & ads blog
API
Mobile version



In [29]:
# 3. Using select() to find elements using CSS selectors
# Find the main promo headline using a CSS selector
promo_headline = soup.select('div.promoHeader__promoMastheadContent h2')[0].get_text()
print(f"Promo Headline: {promo_headline}")

Promo Headline: 
Discover & read more



In [30]:
# 4. Using parent to navigate to the parent element
# Find the parent of the first footer link
first_footer_link = soup.find('a', class_='responsiveSiteFooter__link')
footer_parent = first_footer_link.parent
print(f"Parent of the first footer link: {footer_parent.name}")  # This should print the name of the parent tag, e.g., 'li'

Parent of the first footer link: li


In [32]:
# 5. Using next_sibling to navigate to the next sibling
# Find the next sibling after the first footer link
next_sibling = first_footer_link.find_next_sibling()
print(f"Next sibling after the first footer link: {next_sibling.get_text()}")  # Should be the text of the next link

AttributeError: 'NoneType' object has no attribute 'get_text'

Because we have no sibling here. 

*What is a sibling?*

<div class="family">
    <div class="parent">
        <p>Child 1</p>
        <p>Child 2</p>
        <p>Child 3</p>
   </
</div>viv>
<

Parent:

The element directly above another element in the hierarchy.
In the example, the <div class="parent"> is the parent of <p> elements (Child 1, Child 2d3), Chil

Children:

Elements directly below another element in the hierarchy.
The <p> elements are children of the <div class="parent">.


Siblings:

Elements that share the same parent.
In the example, "Child 1", "Child 2", and "Child 3" are siblings because they all share the same parent <div classnt"> <"pare
/div>



In [33]:
# Find the first genre link
first_genre = soup.find('a', href="/genres/art")
print(f"First genre: {first_genre.get_text()}")


First genre: Art


In [34]:
# Find the next sibling genre link
next_genre = first_genre.find_next_sibling('a')
print(f"Next genre: {next_genre.get_text()}")

Next genre: Biography


In [35]:
# Find the next sibling again
third_genre = next_genre.find_next_sibling('a')
print(f"Third genre: {third_genre.get_text()}")

Third genre: Business


In [41]:
# 6. Using previous_sibling to navigate to the previous sibling
# Find the previous sibling of the next sibling (should bring us back to the first link)
previous_sibling = third_genre.find_previous_sibling('a')
print(f"Previous sibling of the next link: {previous_sibling.get_text()}")  # Should be the text of the first link again

Previous sibling of the next link: Biography


In [42]:
# 6. Using previous_sibling to navigate to the previous sibling
# Find the previous sibling of the next sibling (should bring us back to the first link)
first_genre_again  = previous_sibling.find_previous_sibling('a')
print(f"Previous sibling of the next link: {first_genre_again.get_text()}")  # Should be the text of the first link again

Previous sibling of the next link: Art


In [39]:
# 7. Using find_parent to find a specific parent element
# Find the parent of a specific link using find_parent
specific_link = soup.find('a', href='/about/us')
specific_parent = specific_link.find_parent('ul')
print(f"Specific parent of 'About Us' link: {specific_parent.name}")  # Should be 'ul'


Specific parent of 'About Us' link: ul


In [43]:
# 8. Using find_all to find all images with a specific class
# Find all book images in the feature teaser boxes
book_images = soup.find_all('img', class_='bookImgSimilar')
print("Book Images:")
for img in book_images:
    print(img['src'])

Book Images:
https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1480106986l/33917._SX98_.jpg
https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1631251689l/4214._SX98_.jpg
https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1563042852l/49628._SX98_.jpg
https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1479863624l/1618._SX98_.jpg
https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1409595968l/929._SX98_.jpg
https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1529026760l/39832183._SX98_.jpg
https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1722456144l/43641._SX98_.jpg
https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1622355533l/4667024._SX98_.jpg


In [44]:
# 9. Using select to find elements by attribute
# Use a CSS selector to find an element by its attribute (e.g., data-react-class)
react_components = soup.select('[data-react-class]')
print("React Components with data-react-class:")
for component in react_components:
    print(component['data-react-class'])

React Components with data-react-class:
ReactComponents.StoresInitializer
ReactComponents.GoogleBannerAd
ReactComponents.EditorialBlogThumbnail
ReactComponents.GoogleBannerAd


In [46]:
# 10. Using find to find an element with a specific text
# Find a link containing specific text
best_quotes_link = soup.find('a', string='Best quotes')
print(f"Best quotes link: {best_quotes_link['href']}")

Best quotes link: /quotes


**When to use what**?

1. find()
When to Use: Use find() when you want to locate the first occurrence of a specific tag or element that matches your criteria.
2. find_all()
When to Use: Use find_all() when you need to retrieve all elements that match a specific tag or criteria.
3. select()
When to Use: Use select() when you need to find elements using complex CSS selectors. It’s very powerful for matching based on classes, IDs, or nested elements.

4. find_parent() and parent
When to Use: Use these methods to navigate upwards in the DOM tree to find the parent of a specific element.
5. find_next_sibling() and find_previous_sibling()
When to Use: Use these methods to navigate sideways in the DOM tree to find the next or previous sibling of an element. It’s useful when dealing with elements at the same level. (Example: Finding the next and previous paragraphs around a specific paragraph)

6. children
When to Use: Use children when you want to iterate over all direct children of an element. This is useful for navigating elements that contain nested elements.
