In [5]:
# Sample document used throughout the notebook. The adjacent string literals
# are joined by Python into a single string with no newlines between the
# pieces, so the parser sees one continuous line of markup.
html = (
    '<!DOCTYPE html>'
    '<html>'
    '<head>'
    '<title>PrepInsta Website</title>'
    '</head>'
    '<body>'
    '<h1 class="mainHead">Welcome to PrepInsta</h1>'
    '<p>This is a great website for us to prepare</p>'
    '<p class="link">This is a <a href="https://prepinstaprime.com">Link</a> to PrepInsta website</p>'
    '<p>We wish you <b>All the best</b></p>'
    '<h2><a href="https://prepinsta.com/terms-and-conditions/">Terms and Conditions</a></h2>'
    '</body>'
    '</html>'
)

In [6]:
from bs4 import BeautifulSoup
# Parse the sample markup with Python's built-in 'html.parser' backend and
# print an indented, one-node-per-line rendering of the resulting tree.
data = BeautifulSoup(markup=html, features='html.parser')
print(data.prettify())

<!DOCTYPE html>
<html>
 <head>
  <title>
   PrepInsta Website
  </title>
 </head>
 <body>
  <h1 class="mainHead">
   Welcome to PrepInsta
  </h1>
  <p>
   This is a great website for us to prepare
  </p>
  <p class="link">
   This is a
   <a href="https://prepinstaprime.com">
    Link
   </a>
   to PrepInsta website
  </p>
  <p>
   We wish you
   <b>
    All the best
   </b>
  </p>
  <h2>
   <a href="https://prepinsta.com/terms-and-conditions/">
    Terms and Conditions
   </a>
  </h2>
 </body>
</html>


In [7]:
# Attribute-style navigation: `data.a` evaluates to the first <a> tag in the document.
data.a

<a href="https://prepinstaprime.com">Link</a>

In [11]:
# Chain tag names to narrow the lookup: this selects the <a> nested inside
# the <h2>, rather than the first <a> in the document.
data.h2.a

<a href="https://prepinsta.com/terms-and-conditions/">Terms and Conditions</a>

In [12]:
# Same lookup with the path spelled out from <body>; the result is the same
# <a> element that data.h2.a returns.
data.body.h2.a

<a href="https://prepinsta.com/terms-and-conditions/">Terms and Conditions</a>

In [14]:
# .string gives the text held by the tag -- here, the label of the link.
data.body.h2.a.string

'Terms and Conditions'

In [15]:
# Collect every <p> element and print each one with its full markup.
p_list = data.find_all('p')

for paragraph in p_list:
    print(paragraph)

<p>This is a great website for us to prepare</p>
<p class="link">This is a <a href="https://prepinstaprime.com">Link</a> to PrepInsta website</p>
<p>We wish you <b>All the best</b></p>


In [16]:
# .string is only populated for the first <p> here: the two paragraphs that
# mix plain text with a nested tag (<a> or <b>) print None instead.
p_list = data.find_all('p')

for paragraph in p_list:
    print(paragraph.string)

This is a great website for us to prepare
None
None


In [18]:
# stripped_strings yields every text fragment inside a tag -- including text
# nested in child tags -- with surrounding whitespace removed.
p_list = data.find_all('p')

for paragraph in p_list:
    fragments = list(paragraph.stripped_strings)
    print(fragments)

['This is a great website for us to prepare']
['This is a', 'Link', 'to PrepInsta website']
['We wish you', 'All the best']


In [19]:
# find_all(True) matches every tag in the document.
all_data_list = data.find_all(True)

# A set discards duplicates on its own, so there is no need to test for
# membership before adding; build the set of distinct tag names directly.
# (The name `tag_list` is kept as-is even though the object is a set.)
tag_list = {tag_data.name for tag_data in all_data_list}

print(tag_list)
print(len(tag_list))

{'p', 'html', 'b', 'body', 'head', 'title', 'h2', 'a', 'h1'}
9


In [21]:
# .contents holds the direct children of <html> as a list, so it can be
# both iterated over and measured with len().
contents = data.html.contents

for node in contents:
    print(node)

print(len(contents))

<head><title>PrepInsta Website</title></head>
<body><h1 class="mainHead">Welcome to PrepInsta</h1><p>This is a great website for us to prepare</p><p class="link">This is a <a href="https://prepinstaprime.com">Link</a> to PrepInsta website</p><p>We wish you <b>All the best</b></p><h2><a href="https://prepinsta.com/terms-and-conditions/">Terms and Conditions</a></h2></body>
2


In [25]:
# Difference between .contents and .children:
#   .contents -- the tag's direct children, available as a list
#   .children -- the same direct children, exposed for iteration only
children = data.html.children

for node in children:
    print(node)

<head><title>PrepInsta Website</title></head>
<body><h1 class="mainHead">Welcome to PrepInsta</h1><p>This is a great website for us to prepare</p><p class="link">This is a <a href="https://prepinstaprime.com">Link</a> to PrepInsta website</p><p>We wish you <b>All the best</b></p><h2><a href="https://prepinsta.com/terms-and-conditions/">Terms and Conditions</a></h2></body>


In [29]:
# Walk the direct children of <body>, one element per line.
children = data.body.children

for node in children:
    print(node)

# .children is read again and materialised with list() so that its length
# can be taken.
print(len(list(data.body.children)))

<h1 class="mainHead">Welcome to PrepInsta</h1>
<p>This is a great website for us to prepare</p>
<p class="link">This is a <a href="https://prepinstaprime.com">Link</a> to PrepInsta website</p>
<p>We wish you <b>All the best</b></p>
<h2><a href="https://prepinsta.com/terms-and-conditions/">Terms and Conditions</a></h2>
5


In [30]:
# .descendants is lazy: evaluating it shows a generator object, not a list.
data.body.descendants

<generator object Tag.descendants at 0x7fefdbf68f20>

In [33]:
# Materialise the generator so it can be iterated and counted. The walk
# includes nested tags as well as the text nodes inside them.
descendants_list = list(data.body.descendants)

for node in descendants_list:
    # One call emits the node followed by a separator line.
    print(node, "---------", sep="\n")

print(len(descendants_list))

<h1 class="mainHead">Welcome to PrepInsta</h1>
---------
Welcome to PrepInsta
---------
<p>This is a great website for us to prepare</p>
---------
This is a great website for us to prepare
---------
<p class="link">This is a <a href="https://prepinstaprime.com">Link</a> to PrepInsta website</p>
---------
This is a 
---------
<a href="https://prepinstaprime.com">Link</a>
---------
Link
---------
 to PrepInsta website
---------
<p>We wish you <b>All the best</b></p>
---------
We wish you 
---------
<b>All the best</b>
---------
All the best
---------
<h2><a href="https://prepinsta.com/terms-and-conditions/">Terms and Conditions</a></h2>
---------
<a href="https://prepinsta.com/terms-and-conditions/">Terms and Conditions</a>
---------
Terms and Conditions
---------
16


In [34]:
# Display the list itself: text nodes show up as quoted strings, tags as markup.
descendants_list

[<h1 class="mainHead">Welcome to PrepInsta</h1>,
 'Welcome to PrepInsta',
 <p>This is a great website for us to prepare</p>,
 'This is a great website for us to prepare',
 <p class="link">This is a <a href="https://prepinstaprime.com">Link</a> to PrepInsta website</p>,
 'This is a ',
 <a href="https://prepinstaprime.com">Link</a>,
 'Link',
 ' to PrepInsta website',
 <p>We wish you <b>All the best</b></p>,
 'We wish you ',
 <b>All the best</b>,
 'All the best',
 <h2><a href="https://prepinsta.com/terms-and-conditions/">Terms and Conditions</a></h2>,
 <a href="https://prepinsta.com/terms-and-conditions/">Terms and Conditions</a>,
 'Terms and Conditions']

In [36]:
# .parent moves one level up the tree: the parent of <body> is <html>.
parent = data.body.parent
parent

<html><head><title>PrepInsta Website</title></head><body><h1 class="mainHead">Welcome to PrepInsta</h1><p>This is a great website for us to prepare</p><p class="link">This is a <a href="https://prepinstaprime.com">Link</a> to PrepInsta website</p><p>We wish you <b>All the best</b></p><h2><a href="https://prepinsta.com/terms-and-conditions/">Terms and Conditions</a></h2></body></html>

In [38]:
# Parent of the <p> reached through attribute-style navigation.
# NOTE(review): the recorded output below shows a <p> element, yet the markup
# places every <p> directly under <body> -- the output looks stale; re-run
# this cell to confirm.
parent = data.p.parent
parent

<p>We wish you <b>All the best</b></p>

In [39]:
# The <b> tag sits inside the last <p>, so that paragraph is its parent.
parent = data.b.parent
parent

<p>We wish you <b>All the best</b></p>