In [1]:
!pip install bs4

Collecting bs4
  Downloading bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Downloading bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Installing collected packages: bs4
Successfully installed bs4-0.0.2


In [3]:
from bs4 import BeautifulSoup

# Sample HTML document
html_doc = """
<html>
<head><title>World's Population</title></head>
<body>
    <h1>Main Heading</h1>
    <p class="xyz">This is a paragraph.</p>
    <p class="content">Another paragraph with <a href="https://example.com">a link</a>.</p>

    <div id="container">
        <ul>
            <li class="item">Item 1</li>
            <li class="item">Item 2</li>
            <li class="item">Item 3</li>
        </ul>
    </div>

    <table>
        <tr><th>Name</th><th>Age</th></tr>
        <tr><td>Alice</td><td>25</td></tr>
        <tr><td>Bob</td><td>30</td></tr>
    </table>
</body>
</html>
"""



In [5]:
soup = BeautifulSoup(html_doc, "html.parser")
print(soup)


<html>
<head><title>World's Population</title></head>
<body>
<h1>Main Heading</h1>
<p class="xyz">This is a paragraph.</p>
<p class="content">Another paragraph with <a href="https://example.com">a link</a>.</p>
<div id="container">
<ul>
<li class="item">Item 1</li>
<li class="item">Item 2</li>
<li class="item">Item 3</li>
</ul>
</div>
<table>
<tr><th>Name</th><th>Age</th></tr>
<tr><td>Alice</td><td>25</td></tr>
<tr><td>Bob</td><td>30</td></tr>
</table>
</body>
</html>



In [7]:
#Finding things via tags

# 1. **Basic Parsing**
print("Page Title:", soup.body.text)  # Extract title
#Limitation: Finds first occurence only

Page Title: 
Main Heading
This is a paragraph.
Another paragraph with a link.


Item 1
Item 2
Item 3



NameAge
Alice25
Bob30




In [9]:
# 10. **Prettifying HTML Output**
x = BeautifulSoup("<html><body><span>This is first</span><p>This is second</p></body></html>","html.parser")
print("\nPrettified HTML:")
print(x.prettify())


Prettified HTML:
<html>
 <body>
  <span>
   This is first
  </span>
  <p>
   This is second
  </p>
 </body>
</html>



In [11]:
# 2. **Finding Elements**
print("\nFind first paragraph:",( soup.find("p").text) ) # First <p> tag
print("Find all paragraphs:", soup.find_all("p"))  # All <p> tags
x = []
for i in soup.find_all("p"):
  x.append(i.text)
print(x)


Find first paragraph: This is a paragraph.
Find all paragraphs: [<p class="xyz">This is a paragraph.</p>, <p class="content">Another paragraph with <a href="https://example.com">a link</a>.</p>]
['This is a paragraph.', 'Another paragraph with a link.']


In [13]:
# 3. **Using CSS Selectors**
print("\nUsing CSS Selectors:")
print(soup.select("p")[0].text)  # Select by tag name
print(soup.select(".xyz")[0].text)  # Select by class
print(soup.select("#container ul li")[1].text)  # Select second list item in <ul>


Using CSS Selectors:
This is a paragraph.
This is a paragraph.
Item 2


In [15]:
table1 = soup.find("table")
rows = table1.find_all("tr")
print("\nTable Data:")
for row in rows:
   cols = row.find_all("td")
   print(row.text.split())
   if cols:
        print(f"Name: {cols[0].text}, Age: {cols[1].text}")


Table Data:
['NameAge']
['Alice25']
Name: Alice, Age: 25
['Bob30']
Name: Bob, Age: 30
