# bs4

In [147]:
!pip3 install bs4



In [148]:
from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

soup = BeautifulSoup(html_doc, 'html.parser')

print(soup.prettify())

<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>


In [149]:
soup.title

<title>The Dormouse's story</title>

In [150]:
soup.title.name

'title'

In [151]:
soup.title.string

"The Dormouse's story"

In [152]:
soup.title.parent.name

'head'

In [153]:
soup.p

<p class="title"><b>The Dormouse's story</b></p>

In [154]:
soup.p['class']

['title']

In [155]:
soup.a

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

In [156]:
soup.find_all('a')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [157]:
soup.find(id="link3")

<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

In [158]:
for link in soup.find_all('a'):
    print(link.get('href'))

http://example.com/elsie
http://example.com/lacie
http://example.com/tillie


In [159]:
# Extracting all the text from the page
print(soup.get_text())


The Dormouse's story

The Dormouse's story
Once upon a time there were three little sisters; and their names were
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.
...



### Four types of objects
1. Tag,
2. NavigableString, 
3. BeautifulSoup, and 
4. Comment.

In [160]:
soup = BeautifulSoup('<b class="boldest">Extremely bold</b>', 'html.parser')
tag = soup.b
type(tag)

bs4.element.Tag

In [161]:
tag.name

'b'

In [162]:
# We can get the value of the tag attributes by treating the tag like a dictionary
tag['class']

['boldest']

In [163]:
tag.attrs

{'class': ['boldest']}

In [164]:
tag['id'] = 'verybold'
tag['id']

'verybold'

In [165]:
tag['another-attribute'] = 1
tag.attrs

{'another-attribute': 1, 'class': ['boldest'], 'id': 'verybold'}

In [166]:
del tag['id']

In [167]:
tag.attrs

{'another-attribute': 1, 'class': ['boldest']}

### Multivalued Attributes

1. class
2. rel, 
3. rev, 
4. accept-charset, 
5. headers, and 
6. accesskey

In [168]:
css_soup = BeautifulSoup('<p class="body"></p>', 'html.parser')
css_soup.p['class']

['body']

In [169]:
css_soup = BeautifulSoup('<p class="body strikeout"></p>', 'html.parser')
css_soup.p['class']

['body', 'strikeout']

In [170]:
id_soup = BeautifulSoup('<p id="my id"></p>', 'html.parser')
id_soup.p['id']

'my id'

In [171]:
rel_soup = BeautifulSoup('<p>Back to the <a rel="index">homepage</a></p>', 'html.parser')
rel_soup.a['rel']

['index']

In [172]:
rel_soup.a['rel'] = ['index', 'contents']
print(rel_soup.p)

<p>Back to the <a rel="index contents">homepage</a></p>


In [173]:
# When the markup is parsed as XML there are no multi-valued attributes,
# so `class` comes back as the single string shown below, not a list.
# NOTE(review): the 'xml' feature presumably needs the lxml package installed — confirm.
xml_soup = BeautifulSoup('<p class="body strikeout"></p>', 'xml')
xml_soup.p['class']

'body strikeout'

### Navigable String

A string corresponds to a bit of text within a tag. Beautiful Soup uses the NavigableString class to contain these bits of text.

In [174]:
soup = BeautifulSoup('<b class="boldest">Extremely bold</b>', 'html.parser')
tag = soup.b

In [175]:
tag.string

'Extremely bold'

In [176]:
type(tag.string)

bs4.element.NavigableString

In [177]:
unicode_string = str(tag.string)
unicode_string

'Extremely bold'

In [178]:
type(unicode_string)

str

In [179]:
tag.string.replace_with('no longer bold')
tag

<b class="boldest">no longer bold</b>

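**NOTE - To use a NavigableString outside of Beautiful Soup, convert it to a plain Python string with `str()` (as done for `unicode_string` above). Otherwise the string keeps a reference to the entire parse tree, which wastes memory.**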

### Beautiful Soup

The BeautifulSoup object itself represents the document as a whole. For most purposes, you can treat it as a Tag object. This means it supports most of the methods described in Navigating the tree and Searching the tree.

In [180]:
soup.name

'[document]'

### Comments and other special strings

In [181]:
markup = "<b><!--Hey, buddy. Want to buy a used parser?--></b>"
soup = BeautifulSoup(markup, 'html.parser')
comment = soup.b.string
type(comment)

bs4.element.Comment

Beautiful Soup defines classes for anything else that might show up in an XML document: CData, ProcessingInstruction, Declaration, and Doctype. Just like Comment, these classes are subclasses of NavigableString that add something extra to the string.

In [182]:
from bs4 import CData
cdata = CData("A CDATA block")
comment.replace_with(cdata)

print(soup.b.prettify())

<b>
 <![CDATA[A CDATA block]]>
</b>


### Navigating the tree

In [183]:
html_doc = """<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc, 'html.parser')

In [184]:
soup.body.b

<b>The Dormouse's story</b>

In [185]:
soup.contents

[<html><head><title>The Dormouse's story</title></head>
 <body>
 <p class="title"><b>The Dormouse's story</b></p>
 <p class="story">Once upon a time there were three little sisters; and their names were
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
 and they lived at the bottom of a well.</p>
 <p class="story">...</p>
 </body></html>]

In [186]:
len(soup.contents)

1

In [187]:
soup.contents[0]

<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body></html>

In [188]:
soup.contents[0].name

'html'

In [189]:
text = soup.contents[0].title.contents
print(text)

["The Dormouse's story"]


In [190]:
for child in text:
    print(child)

The Dormouse's story


The **.descendants** attribute lets you iterate over all of a tag’s children, recursively: its direct children, the children of its direct children, and so on.

In [191]:
for child in soup.contents[0].body.descendants:
    print(child)



<p class="title"><b>The Dormouse's story</b></p>
<b>The Dormouse's story</b>
The Dormouse's story


<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
Once upon a time there were three little sisters; and their names were

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
Elsie
,

<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
Lacie
 and

<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
Tillie
;
and they lived at the bottom of a well.


<p class="story">...</p>
...




In [192]:
len(list(soup.contents[0].children))

3

In [193]:
len(list(soup.contents[0].descendants))

25

In [194]:
print(soup.contents[0].body.string)

None


In [195]:
for string in soup.contents[0].body.strings:
    print(string)



The Dormouse's story


Once upon a time there were three little sisters; and their names were

Elsie
,

Lacie
 and

Tillie
;
and they lived at the bottom of a well.


...




In [196]:
# To remove the whitespaces
for string in soup.contents[0].body.stripped_strings:
    print(string)

The Dormouse's story
Once upon a time there were three little sisters; and their names were
Elsie
,
Lacie
and
Tillie
;
and they lived at the bottom of a well.
...


In [197]:
title_tag = soup.title

In [198]:
title_tag.contents

["The Dormouse's story"]

In [199]:
title_tag.string.parent

<title>The Dormouse's story</title>

In [200]:
html_tag = soup.html
type(html_tag.parent)

bs4.BeautifulSoup

In [201]:
# The BeautifulSoup object sits at the top of the tree, so its .parent is None
print(soup.parent)

None


In [202]:
link = soup.a

# .parents walks upward from the tag and stops after yielding the root
# BeautifulSoup object; it never produces None, so no None check is needed.
for parent in link.parents:
    print(parent.name)

p
body
html
[document]


In [203]:
# Bind `link` (not a stray `links`) to the first <a> tag so the displayed value
# comes from this cell rather than from state left behind by an earlier one.
link = soup.a
link

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

In [204]:
link.next_sibling

',\n'

In [205]:
link.next_sibling.next_sibling

<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>

In [206]:
for sibling in soup.a.next_siblings:
    print(repr(sibling))

',\n'
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
' and\n'
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
';\nand they lived at the bottom of a well.'


In [207]:
for sibling in soup.find(id="link3").previous_siblings:
    print(repr(sibling))

' and\n'
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
',\n'
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
'Once upon a time there were three little sisters; and their names were\n'


In [208]:
soup.a.next_element

'Elsie'

In [209]:
soup.a.previous_element.next_element

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

In [210]:
for element in soup.find(id="link3").next_elements:
    print(repr(element))

'Tillie'
';\nand they lived at the bottom of a well.'
'\n'
<p class="story">...</p>
'...'
'\n'


In [211]:
# using regular expressions
import re
for tag in soup.find_all(re.compile("^b")):
    print(tag.name)

body
b


In [212]:
for tag in soup.find_all(re.compile("t")):
    print(tag.name)

html
title


In [213]:
# soup will match any item in the list
soup.find_all(["a", "b"])

[<b>The Dormouse's story</b>,
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [214]:
# finds all the tags in the document but none of the text strings
for tag in soup.find_all(True):
    print(tag.name)

html
head
title
body
p
b
p
a
a
a
p


In [215]:
def has_class_but_no_id(tag):
    """Tag filter: truthy for tags that define ``class`` but not ``id``.

    ``tag`` only needs a ``has_attr(name)`` method. The ``id`` lookup is
    skipped when the tag has no ``class`` attribute.
    """
    carries_class = tag.has_attr('class')
    return carries_class and not tag.has_attr('id')

In [216]:
soup.find_all(has_class_but_no_id)

[<p class="title"><b>The Dormouse's story</b></p>,
 <p class="story">Once upon a time there were three little sisters; and their names were
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
 and they lived at the bottom of a well.</p>,
 <p class="story">...</p>]

In [217]:
# function that finds all a tags whose href attribute does not match a regular expression
def not_lacie(href):
    """Attribute filter: accept an href only when it does not contain "lacie".

    A missing or empty ``href`` is handed back unchanged (falsy); otherwise
    the result is a bool telling whether the pattern is absent.
    """
    return href and re.search("lacie", href) is None
soup.find_all(href=not_lacie)

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [219]:
# function that returns True if a tag is surrounded by string objects
from bs4 import NavigableString
def surrounded_by_strings(tag):
    """Tag filter: True when both neighbouring parse elements are strings.

    Checks ``tag.next_element`` first and only looks at
    ``tag.previous_element`` when the first check passes.
    """
    if not isinstance(tag.next_element, NavigableString):
        return False
    return isinstance(tag.previous_element, NavigableString)


for tag in soup.find_all(surrounded_by_strings):
    print(tag.name)

body
p
a
a
a
p


In [220]:
soup.find_all("title")

[<title>The Dormouse's story</title>]

In [221]:
soup.find_all("p", "title")

[<p class="title"><b>The Dormouse's story</b></p>]

In [222]:
soup.find_all("a")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [223]:
soup.find(id="link2")

<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>

In [224]:
import re
soup.find(string=re.compile("sisters"))

'Once upon a time there were three little sisters; and their names were\n'

In [225]:
soup.find_all(href=re.compile("elsie"))

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

In [226]:
soup.find_all(id=True)

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [227]:
soup.find_all(href=re.compile("elsie"), id='link1')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

In [232]:
# Markup with an HTML5 data-* attribute, used below to demonstrate attribute
# filters. (The stray leading "[" that was in the literal has been removed.)
data_soup = BeautifulSoup('<div data-foo="value">foo!</div>', 'html.parser')

In [233]:
# Keyword arguments must be valid Python identifiers, so an attribute name
# containing a hyphen cannot be passed this way: the cell fails with the
# SyntaxError shown below. Pass such attributes through attrs={...} instead,
# as the next cell does.
data_soup.find_all(data-foo="value")

SyntaxError: keyword can't be an expression (<ipython-input-233-a766c8a0cac6>, line 1)

In [234]:
data_soup.find_all(attrs={"data-foo": "value"})

[<div data-foo="value">foo!</div>]

In [236]:
name_soup = BeautifulSoup('<input name="email"/>', 'html.parser')
# Returns [] (see output): the `name` keyword does not filter on the HTML
# `name` attribute — presumably find_all() reserves it for the tag name (TODO confirm).
# Use attrs={"name": ...} to match the attribute, as the next cell does.
name_soup.find_all(name="email")

[]

In [237]:
name_soup.find_all(attrs={"name": "email"})

[<input name="email"/>]

In [238]:
soup.find_all("a", class_="sister")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [241]:
soup.find_all(class_ = re.compile("itl"))

[<p class="title"><b>The Dormouse's story</b></p>]

In [243]:
def has_six_characters(css_class):
    """Class filter: True when a class value is present and exactly six characters long."""
    if css_class is None:
        return False
    return len(css_class) == 6

soup.find_all(class_=has_six_characters)

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [244]:
css_soup = BeautifulSoup('<p class="body strikeout"></p>', 'html.parser')
# Returns [] (see output): the query lists the classes as "strikeout body" while
# the markup has "body strikeout" — a multi-class string apparently has to match
# the attribute value as written. The CSS selector in the next cell is order-independent.
css_soup.find_all("p", class_="strikeout body")

[]

In [245]:
css_soup.select("p.strikeout.body")

[<p class="body strikeout"></p>]

In [246]:
soup.find_all("a", attrs={"class": "sister"})

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]