In [1]:
# Import the BeautifulSoup and re libraries
from bs4 import BeautifulSoup
import re

#### IMPORTANT Note: For more details (on any of the code) regarding BeautifulSoup, the official BeautifulSoup documentation is a great resource. 

Here are some links:

1. https://www.crummy.com/software/BeautifulSoup/bs4/doc/
2. https://tedboy.github.io/bs4_doc/

In [2]:
# Reading a file
# The `with` block closes the file handle as soon as the read finishes,
# so no open handle is left behind in the kernel.
with open("Example 3.html") as fhand:
    inp = fhand.read()

In [3]:
print(inp)

<html>
<head>
    <title>The Dormouse's story</title>
</head>
<body>
<h1 id="heading" class = "page error"> Page 1 </h1>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>

</body>
</html>


In [4]:
# Creating a beautifulSoup object
soup = BeautifulSoup(inp, "html.parser")
print(type(soup))

<class 'bs4.BeautifulSoup'>


In [5]:
# find_all() method: find_all ALWAYS returns a list-like ResultSet of matches (empty if nothing matches)

# find_all(name, attrs, recursive, string, limit, keywords)
# There are 6 parameters: name, attrs, recursive, string, limit, keywords
# find_all() method looks through a tag's descendants and retrieves all descendants that matches the filter
# In this example, the variable "tags" will contain a list. Each element in this list is an <a> tag object

In [7]:
# .find("a") finds the first <a> tag and returns a single Tag object
# .find_all("a") finds all <a> tags and returns a list of Tag objects, in document order
tags = soup.find_all("a")
print(tags)

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]


In [8]:
# find_all() - the name parameter

# Pass in a value for name and you’ll tell BeautifulSoup to only consider tags with certain names 
print(soup.find_all("title"))

# example: finds all <title> in the HTML document and includes the tag <title> </title> 

[<title>The Dormouse's story</title>]


In [16]:
# You can pass a list as the name parameter
# Each element in this list will be an <a> tag object and/or <b> tag object

tags = soup.find_all(["b","a"])
print(tags)

# example: find all <b> tags and <a> tags 
# you put them in a PYTHON LIST to search for more than one tag type at the same time 

[<b>The Dormouse's story</b>, <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]


In [9]:
# attrs parameter
# The attrs parameter takes a Python dictionary of attribute filters and matches only tags that
# satisfy every filter in the dictionary (AND logic)
# An example of a single attr-value pair in the "attrs" parameter
print(soup.find_all("a", attrs= {"id":"link1"}))

# example: find all <a> tags whos id attribute equals "link1" 
# output: returns entire <a> tag from start to close 

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]


In [10]:
# attrs parameter
# example of a multiple attr-value pairs in the "attrs" parameter
print(soup.find_all("a", attrs= {"id":"link1", "class":"sister"}))

# example: find all <a> tags whos id attribute = "link1" and class = "sister"
# output: returns entire <a> tag from start to close 

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]


In [22]:
# String parameter
# Use it when you want to search strings (text) instead of tags
print(soup.find_all(string = "Lacie"))

# finds all the text strings in the HTML document that are exactly equal to 'Lacie'

['Lacie']


In [24]:
# String parameter with a list
# Passing a list matches any text string that equals one of the items (OR logic)
print(soup.find_all(string = ["Lacie","Tillie"]))

# finds the strings in the HTML document equal to 'Lacie' or 'Tillie'; when both exist, both are returned

['Lacie', 'Tillie']


In [26]:
# Combining name and string parameters
# You can combine string with parameters that finds tags
print(soup.find_all("a", string="Elsie"))

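# Separate arguments inside find_all(...) are combined with AND logic: every filter must match
# A list [ ] passed as the value of one argument uses OR logic: any item in the list may match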

# return the a tag that has a string exactly = "Elsie"

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]


In [28]:
# recursive parameter
print(soup.body.find_all("b"))

# Revisiting definition of find_all
# looks through a tag's descendants and retrieves all descendants that matches the filter
# By default, find_all will examine all the descendants of a tag: its children , its children’s children 
# and so on

# What does the above code do?
# Find all the <b> tags in all the descendants of body such as h1,p (whose descendant is b),p(whose descendants
# are a, a, a), and p 

# searches inside the <body> tag and finds all the <b> tags anywhere under it 

[<b>The Dormouse's story</b>]


In [30]:
# Let us see the body tag. 
# The <b> tag is a descendant of the body tag. 
print(soup.body)

# searches inside the <body> tag and prints it from start to close and everything inside of it 

<body>
<h1 class="page error" id="heading"> Page 1 </h1>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body>


In [32]:
# What if I want to search ONLY the children? use the recursive parameter
print(soup.body.find_all("b", recursive=False))
# searches for only the DIRECT CHILDREN of <body> for <b> tags 
# <body> and <b>: <b> is not a direct child of body, it is nested, no <b> exists directly inside of the <body>

# Recursive parameter changes the definition of find_all
# looks through a tag's children and retrieves all children that matches the filter
# In other words, If you want it to consider direct children, you can pass in recursive=False

# recursive = True: searches everything inside the tag 
# recursive = False: searches only for direct children, tells beautifulsoup not to search too deep, not nested descendants
# so it returns []

[]


In [34]:
# limit parameter
# If you don’t need all the results, you can pass in a number for limit
# It tells Beautiful Soup to stop gathering results after its found a certain number
print(soup.find_all("a", limit=2))

# sets the limit of results equal to 2 where it finds <a> tags in the HTML document 

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]


In [36]:
# Without Limit parameter - you get all three tags
print(soup.find_all("a"))

# with no limit it returns all the tags in the HTML document 

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]


In [38]:
# Keyword (**kwargs) parameter
# In Python, you can pass extra named options to a function like id="main" or class_="btn"
# **kwargs is Python’s name for “any extra named options you pass.”
# In find_all, those extra options (e.g., id=, class_=) are combined with attrs={...} behind the scenes. They work the same.
print(soup.find_all(id='link2'))

# returns the tag line where the id attribute is equal to link 2 

[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]


In [12]:
# Example: keyword parameters combined with the name parameter. Two or more keywords can be provided
# Here the question is: find all the <a> tags with id="link2" and class="sister"
print(soup.find_all("a", id='link2', class_ = "sister")) # uses kwargs (keyword arguments)

# class_ is used because class is a reserved keyword in Python

print(soup.find_all("a", attrs= {"id":"link2", "class":"sister"})) # same query written with the attrs dictionary


[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]


In [13]:
# Keyword parameter 
# Example: no name parameter and two keyword parameters
# They might produce the same output (as the code above) but the question is different
# Here the question is: find all the tags with id=link2 and class=sister
print(soup.find_all(id='link2', class_ = "sister"))

[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]


In [14]:
# Keyword parameter - finding if an attribute exists
# For example: find all the tags that have an id attribute
print(soup.find_all(id=True))

# find all the tags that have an id attribute regardless of what the value is 
# id = false: find all the tags that do not have an id attribute 

[<h1 class="page error" id="heading"> Page 1 </h1>, <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]


In [46]:
# attributes parameter can do the same
print(soup.find_all(attrs = {'id':True})) # uses attrs dicitonary and produces the same output

[<h1 class="page error" id="heading"> Page 1 </h1>, <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]


In [48]:
# Keywords can be based on patterns -- we can use regular expressions to match patterns
print(soup.find_all(href=re.compile("elsie"), id='link1'))

# finds tags that have an href attribute matching the pattern "elsie" AND an id equal to "link1"
# re.compile("elsie") builds a regular expression; the attribute value only has to contain "elsie" somewhere
# so it would also match values such as elsie123, page-elsie-, elsie.html
# returns the <a> tag that satisfies both conditions

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]


In [15]:
# attributes parameter can do the same
print(soup.find_all(attrs = {'href': re.compile("elsie"), 'id':'link1'})) # uses attrs dicitonary produces same output

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]


In [16]:
# keyword and attrs parameters can do the same thing 
# Keywords = USES EQUAL SIGN WHICH EQUALS
print(soup.find_all(id='link2', class_ = "sister"))

[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]


In [54]:
# attributes parameter = uses : 
print(soup.find_all(attrs={"id":"link2", "class":"sister"}))

[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]


In [56]:
# Keyword parameter: only class needs the trailing underscore (class_), because class is a reserved word in Python
print(soup.find_all(class_=["page", "error"]))  # a list uses OR logic: matches tags that have "page" or "error" among their classes

[<h1 class="page error" id="heading"> Page 1 </h1>]


In [17]:
# attributes parameter: the same search written with attrs={name: value}
print(soup.find_all(attrs={"class": ["page", "error"]})) # a list uses OR logic: matches tags that have "page" or "error" among their classes

[<h1 class="page error" id="heading"> Page 1 </h1>]


In [18]:
# When the keyword parameter fails?
# when attribute names are in this format (for ex: data-cap)
markup1 = '<a data-cap="error">Page 1</a><a name="abc">Page 1</a>'
soup1 = BeautifulSoup(markup1,"html.parser")

In [62]:
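# When using such attributes, the keyword parameter fails:
# Python reads data-cap as the expression "data minus cap", which is not a valid keyword argument name,
# so the call below raises a SyntaxError before BeautifulSoup ever runs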
print(soup1.find_all(data-cap=error))

SyntaxError: expression cannot contain assignment, perhaps you meant "=="? (705312716.py, line 2)

In [23]:
# Instead, use the attrs parameter
print(soup1.find_all(attrs ={"data-cap":"error"})) # this is the one that works
print(soup1.find_all(class_=["data-cap","error"])) # returns [] because class_ filters on the class attribute, and no tag here has a class of "data-cap" or "error"

# the attribute is data-cap="error", not class="data-cap error"

[<a data-cap="error">Page 1</a>]
[]


In [66]:
# When the keyword parameter fails?
# when attributes are named "name"
print(soup1.find_all(name = "abc")) # name= CANNOT be used to filter on an attribute called "name"

# name is find_all's own first parameter (the tag name), so this looks for <abc> tags and finds none
# to filter on the name attribute, use the attrs dictionary instead

[]


In [68]:
print(soup1.find_all(attrs ={"name":"abc"})) # CORRECT

# finds the tag whos attribute name equals "abc"

[<a name="abc">Page 1</a>]


In [70]:
# When the keyword parameter fails?
# when attributes are named "class"
print(soup.find_all(class = "sister"))

# class is a reserved word in Python, so class=... is a SyntaxError; the keyword form must be written class_

SyntaxError: invalid syntax (680757540.py, line 3)

In [24]:
# If you want to use, class with the keywords parameter, use "class_"
print(soup.find_all(class_ = "sister"))

# returns all the tag lines whos class is sister 

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]


In [74]:
# or use the attributes parameter
print(soup.find_all(attrs = {'class':'sister'}))

# returns all the tags whose class attribute has the value "sister": uses the attrs dictionary

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]


In [76]:
# FIND method
# this method is used when you want only one result
# The find_all() method scans the entire document looking for results, but sometimes you only want
# to find one result. If you know a document only has one <body> tag, it’s a waste of time to scan 
# the entire document looking for more. Rather than passing in limit=1 every time you call find_all, 
# you can use the find() method. 

# find() - name,attrs,string,keyword, recursive
# It has NO limit parameter

# This is an example of find with the name parameter
print(soup.find("a"))

# finds the first a tag in the HTML document, prints from start to end 

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>


In [78]:
# Name and attrs parameter
print(soup.find("a", attrs = {"id":"link1"}))

# finds the first a tag whos attrs name is id and value is link1, prints start to end 

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>


In [80]:
# Name and Keywords parameter
print(soup.find("a", id="link1"))

# find a tag line that has an id = link1

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>


In [82]:
# String parameter
print(soup.find(string="Elsie"))

# find ONLY the FIRST text string that is equal to 'elsie'

Elsie


In [84]:
# Recursive parameter
print(soup.body.find("b"))

# inside the body <body> tag find the first <b> tag and return it 

<b>The Dormouse's story</b>


In [86]:
# Let us see the body tag. 
# <b> tag is a descendant of the body tag. 
print(soup.body)

# print the whole body section in the HTML document 

<body>
<h1 class="page error" id="heading"> Page 1 </h1>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body>


In [26]:
# What if I want to search only the children
print(soup.body.find("b", recursive=False))
# Changes the definition of find
# looks through a tag's children and retrieves the first child that matches the filter

# recursive = false searches only the direct children 
# in the body, b IS NOT the direct child of body, it's not so prints none 
# if it was it would print the <b> The Dormouse story </b> line 

None


In [27]:
# recursive=True is the default, so this is the same as soup.body.find("b")
print(soup.body.find("b", recursive=True))
# This is the normal definition of find:
# looks through all of a tag's descendants and retrieves the first one that matches the filter

# recursive=True searches all descendants, not only the direct children

<b>The Dormouse's story</b>


In [90]:
# Can you write a nearly equivalent code of the following code using find_all?
print(soup.find("a"))

# finds and prints the FIRST a tag in the HTML document 

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>


In [92]:
# Answer to the question above.
print(soup.find_all("a", limit=1))

# looks through entire HTML document, only prints the first a tag because the limit is 1 

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]


In [28]:
# Remember: the find method returns a tag. So .name, .attrs, .string that we learnt in lesson 1 applies to
# the output of the find method
# Example:
print(soup.find("a").name)

# prints the tag name of a in the HTML document 

print(soup.find("a").attrs)

# prints all the attributes of the first <a> tag as a dictionary: {'href': 'http://example.com/elsie', 'class': ['sister'], 'id': 'link1'} (class is a list because a tag can have several classes)

print(soup.find("a").string)

# prints only the text string in the a tag: string only prints the direct text no nested text string 

a
{'href': 'http://example.com/elsie', 'class': ['sister'], 'id': 'link1'}
Elsie


In [96]:
# get_text(): prints ALL visible text inside of the tag

# If you only want the human-readable text inside a document or tag, you can use the get_text() method. 
# It returns all the text in a document or beneath a tag, as a single Unicode string:
print(soup.find("a").get_text())

# get_text() is called on a single tag, so it works directly on the result of find()
# find_all() returns a list-like ResultSet of tags, so call .get_text() on each item (index it or use a loop)

Elsie


In [30]:
# get_text()
# Difference between get_text() and .string
markup2='<a href="www.google.com">\nLink to <i>google.com</i>\n</a><b></b>'
soup2 = BeautifulSoup(markup2,'html.parser') 

# Situation 1
# get data within a tag
print(soup2.a)

<a href="www.google.com">
Link to <i>google.com</i>
</a>


In [31]:
# using .string produces None because the <a> tag contains nested elements (the <i> tag, here) 
print(soup2.a.string)

#.string only prints direct text no nested 

None


In [32]:
# using .get_text() works fine
print(soup2.a.get_text())

# prints all text does not care about nested 


Link to google.com



In [33]:
# Situation 2 (in this situation both .string and .get_text() produce the same output)
print(soup2.i) # prints what isndie of the i tag, including the i tag start to end 

<i>google.com</i>


In [34]:
# using .string
print(soup2.i.string)

# prints only the direct string in i

google.com


In [108]:
# using .get_text()
print(soup2.i.get_text())

# prints all text regardless of nested 

google.com


In [110]:
# Situation 3  - here the <b> tag has no data
print(soup2.b)

# prints only empty b tag because there is no data 

<b></b>


In [112]:
# using .string produces None
print(soup2.b.string)

# none because there is no direct text to return 

None


In [114]:
# using .get_text() produces an empty string
soup2.b.get_text()

# get_text() produces an empty string '' instead of None

''

In [116]:
'''
What do these return?
.string → returns NavigableString (acts like a string, but is a BeautifulSoup object)
          We can convert it to a string using str()

.get_text() → returns a string object directly
'''
print(type(soup2.i.string)) # .string returns a NavigableString: <class 'bs4.element.NavigableString'> (bs4.element is the module that defines the class)
print(type(soup2.i.get_text())) # get_text() returns a plain Python string: <class 'str'>

<class 'bs4.element.NavigableString'>
<class 'str'>


In [118]:
#  We can convert NavigableString to a string using str()
# .string before conversion is a NavigableString
a=soup2.i.string
print(type(a))

# .string after conversion is a string
aStr  = str(a) # convert navigable string into direct str and store it in variable astr
print(type(aStr)) # print type of astr 


<class 'bs4.element.NavigableString'>
<class 'str'>


In [35]:
# Prettify Method

print(soup.p) # prints the p tag from start to end 

<p class="title"><b>The Dormouse's story</b></p>


In [36]:
# The prettify() method will turn a Beautiful Soup parse tree into a nicely formatted Unicode string,
# with a separate line for EACH TAG and AND EACH STRING 

print(soup.p.prettify())

<p class="title">
 <b>
  The Dormouse's story
 </b>
</p>



# Difference Between `find` and `find_all` (BeautifulSoup)

## Quick Comparison

| Feature | `find_all` | `find` |
|---|---|---|
| **Purpose** | Get **many** matches | Get **one** (the **first**) match |
| **Return type** | **`ResultSet`** (list-like collection) | **`Tag`** (commonly) or **`NavigableString`** when using `string=...`; `None` if not found |
| **Works with tag methods/attrs directly?** | **No** (on the `ResultSet` itself) | **Yes** (on the returned `Tag`) |
| **No matches** | Empty `ResultSet` | `None` |
| **`limit` parameter** | **Supported** (`find_all(..., limit=N)`) | **Not supported** |
| **Chaining** | **Not on `ResultSet`** → index or loop first | **Yes** (returns a single `Tag`) |

---


# Detailed Notes: `find_all` vs. `find` (BeautifulSoup)

## `find_all`

- **Purpose:** When you want **many** matches.
- **Return type:** A **`ResultSet`** (list-like collection), _not_ a single tag.  
  - Because it’s a collection, **tag methods/attributes do not work on the `ResultSet` itself**:
    - `.find` ❌  `.find_all` ❌  `.get_text` ❌  `.name` ❌  `.attrs` ❌  `.string` ❌
    - `soup.find_all("a").find("p")`        # ❌
    - `soup.find_all("a").find_all("p")`    # ❌
    - `soup.find_all("a").get_text()`       # ❌
    - `soup.find_all("a").name`            # ❌
    - `soup.find_all("a").attrs`            # ❌
    - `soup.find_all("a").string`           # ❌
  - You must **iterate** or **index** to get an individual item first and then apply tag methods/attributes.
- **Item type:** Each item in the collection is usually a **`Tag`**.  
  - **Exception:** If you search with `string=...`, the items are **`NavigableString`** objects (not Tags).
- **Looping:** Ideal for `for` loops; inside the loop you can use tag methods/attributes on each item.
  - **Example:**
    ```python
    for a in soup.find_all("a"):
        print(a.get_text())
    ```
- **Chaining:** **Do not** chain on the `ResultSet` itself. Chain **after indexing** or **inside the loop**.
  - **Examples:**  
    - `soup.find_all("a").find("p")` ❌
    - `soup.find("body").find_all("a").find_all("p")` ❌
    - `soup.find_all("a")[0].find("p")` ✅  
    - 
      ```python
      for a in soup.find_all("a"):  # ✅
          ps = a.find_all("p")
      ```
- **No matches:** Returns an **empty `ResultSet`** (`[]`, falsy in conditionals).
- **`limit` parameter:** **Supported** → `soup.find_all("a", limit=3)`.


---

## `find`

- **Purpose:** When you want **one** match — the **first match** in document order.
- **Return type:** A **`Tag`** (most common), **`NavigableString`** if you searched with `string=...` or **`None`** if not found.
- **Tag methods/attributes:** Work **directly** on the returned object (when it’s a `Tag`).
  - **Examples (assuming a `Tag` is returned):**
    - `soup.find("a").find("p")` ✅
    - `soup.find("a").get_text()` ✅
    - `soup.find("a").find_all("p")` ✅
    - `soup.find("a").name` ✅  
    - `soup.find("a").attrs` ✅  
    - `soup.find("a").string` ✅ *(may be `None` if the tag contains nested elements)*  
    - **Exception:** If you search with `string=...` alone, the returned object is a **`NavigableString`** (not a `Tag`), so tag-specific members such as `.attrs` and `.find_all()` are not available on it.
- **Chaining:** **Yes**—because it returns a single `Tag`, you can chain:
  - `soup.find("body").find("p", class_="story").find_all("a")` ✅
  - `soup.find("body").find("p").find("b")` ✅
- **No matches:** Returns **`None`** (check before chaining or calling methods).
  - **Pattern:**
    ```python
    p = soup.find("p")
    if p:
        print(p.get_text())
    ```
- **`limit` parameter:** **Not supported** (use `find_all(..., limit=1)` if you truly need that style, but `find` is faster/cleaner for the first match).

---

## Quick Takeaways

- **Need many?** → `find_all(...)` → iterate/index → then use tag methods.  
- **Need just the first?** → `find(...)` → use tag methods right away.  
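- **String-search caveat:** Using `string=` on its own returns **`NavigableString`** objects, which lack tag-specific members such as `.attrs` and `.find_all()`. Combined with a tag name (e.g. `find_all("a", string="Elsie")`), the results are `Tag` objects.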
- **Truthiness:** `find_all(...)` always returns a collection, never `None` (an empty one is falsy); `find(...)` returns either a single object or `None` (`None` is falsy).


In [37]:
# Find_all Method
# Accessing Each Item and its components
aTags = soup.find_all("a")

print("Actual Tag Object 1:",aTags[0])

# find_all stores all the <a> tags in a list-like ResultSet; indexing it with [0] returns the first <a> tag

Actual Tag Object 1: <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>


In [38]:
print("Tag Object 1 Name:",aTags[0].name)
print("Tag Object 1 attributes:", aTags[0].attrs)
print("Tag Object 1 .string:", aTags[0].string)
print("Tag Object 1 .get_text():", aTags[0].get_text())

Tag Object 1 Name: a
Tag Object 1 attributes: {'href': 'http://example.com/elsie', 'class': ['sister'], 'id': 'link1'}
Tag Object 1 .string: Elsie
Tag Object 1 .get_text(): Elsie


In [39]:
print("Actual Tag Object 2:",aTags[1])
print("Tag Object 2 Name:",aTags[1].name)
print("Tag Object 2 attributes:", aTags[1].attrs)
print("Tag Object 2 .string:", aTags[1].string)
print("Tag Object 2 .get_text():", aTags[1].get_text())

Actual Tag Object 2: <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
Tag Object 2 Name: a
Tag Object 2 attributes: {'href': 'http://example.com/lacie', 'class': ['sister'], 'id': 'link2'}
Tag Object 2 .string: Lacie
Tag Object 2 .get_text(): Lacie


In [40]:
# Find_all method and For Loop

aTags = soup.find_all("a") # it returns a list-like ResultSet and stores it in aTags
# each item in that list is a Tag object
# enumerate() supplies the index position, so no hand-maintained counter is needed
for position, a_tag in enumerate(aTags):
    print("*******The details of the tag at index position", position)
    print("Actual Tag Object:", a_tag)
    print("Tag name:", a_tag.name)
    print("Tag attributes:", a_tag.attrs)
    print("Tag's .string:", a_tag.string)
    print("Tag's .get_text():", a_tag.get_text())
    print("\n")

# You cannot call find / find_all (or navigate) on the result of find_all itself,
# only on each individual item, e.g. inside the loop

*******The details of the tag at index position 0
Actual Tag Object: <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
Tag name: a
Tag attributes: {'href': 'http://example.com/elsie', 'class': ['sister'], 'id': 'link1'}
Tag's .string: Elsie
Tag's .get_text(): Elsie


*******The details of the tag at index position 1
Actual Tag Object: <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
Tag name: a
Tag attributes: {'href': 'http://example.com/lacie', 'class': ['sister'], 'id': 'link2'}
Tag's .string: Lacie
Tag's .get_text(): Lacie


*******The details of the tag at index position 2
Actual Tag Object: <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
Tag name: a
Tag attributes: {'href': 'http://example.com/tillie', 'class': ['sister'], 'id': 'link3'}
Tag's .string: Tillie
Tag's .get_text(): Tillie




In [41]:
# Navigating/Chaining Find()
# Remember find returns a tag object
print(soup.find("body").find("p").find("b"))

# find (): can chain 
# find_all(): can't chain but can use a loop 

<b>The Dormouse's story</b>


In [136]:
# find_all returns a list-like ResultSet - you cannot search within the list itself,
# so chaining .find() onto it raises AttributeError.
# The error is caught and printed so the notebook still runs top to bottom (Restart & Run All).
try:
    print(soup.find("body").find_all("a").find("p"))
except AttributeError as err:
    print("AttributeError:", err)

# you can't chain after find_all() because you cannot search within a list

AttributeError: ResultSet object has no attribute 'find'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?

In [138]:
# Allowed
print(soup.find("body").find("p", class_="story").find_all("a"))

# valid chain: tag > tag > list 
# invalid list: list > tag 

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]


In [42]:
# Similarly there are other methods to search a tree.
# find_next_siblings() and find_next_sibling() methods example
# find_previous_siblings() and find_previous_sibling() methods example 
# find_all_next() and find_next() methods example
# find_all_previous() and find_previous() methods example
# Please go through the BeautifulSoup documentation (https://www.crummy.com/software/BeautifulSoup/bs4/doc/) 
# and the slides posted on Brightspace to learn more

In [43]:
# CSS classes and CSS selectors can also be used for searching
# find_all(class_=...) filters by CSS class; select(...) takes a full CSS selector. Some examples below
print(soup.find_all(class_ = "sister"))

# find all tags that have class = sister 

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]


In [44]:
print(soup.select(".sister"))

# find all elements with class sister 

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]


In [45]:
print(soup.select("p>b"))

# find all <b> tags that are direct children of <p> tags
# ">" is the CSS child combinator (not a "greater than" comparison): the element on its left is the parent

[<b>The Dormouse's story</b>]


In [148]:
# Matching based on regular Expression
# Regular expressions can be used in the keyword parameters
# They can be used in the name parameter as well
# For example: 
# all the tags whose tag NAME contains the character "t"
tags = soup.find_all(re.compile("t"))
print(tags)

# the pattern is tested against tag names only: "html" and "title" contain a "t", so those two tags match
# each matched tag is printed with everything nested inside it, which is why <html> shows the whole document

[<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<h1 class="page error" id="heading"> Page 1 </h1>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body>
</html>, <title>The Dormouse's story</title>]


In [150]:
# all the tags that have the starting character as "b"
# For example: 
tags = soup.find_all(re.compile("^b"))
print(tags)

# ^ indicates b has to be the starting character for tag names that start with the letter b
# prints <body> </body> and <b> </b>

[<body>
<h1 class="page error" id="heading"> Page 1 </h1>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body>, <b>The Dormouse's story</b>]


In [152]:
# Regular expressions could be used in the string parameter
# For example: all the strings that have the character "i"
print(soup.find_all(string=re.compile("i"))) 

# prints all text string that has an i in it 

['Once upon a time there were three little sisters; and their names were\n', 'Elsie', 'Lacie', 'Tillie', ';\nand they lived at the bottom of a well.']


In [46]:
# The find_all method has a shortcut
# find_all shortcut
print(soup.find_all("a"))

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]


In [47]:
# Because find_all is so prevalent in web scraping scripts, the above code could also be written in the
# following way
print(soup("a"))

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
