# Introducing BeautifulSoup

In [1]:
import requests
import webbrowser

import re 
from bs4 import BeautifulSoup

from pprint import pprint

In [2]:
# Small HTML document used as the parsing input for the examples below.
# NOTE(review): all three <a> tags carry id="link1"; HTML ids are meant to be
# unique, so link1/link2/link3 was presumably intended. Left unchanged here
# because the outputs displayed below reflect the current markup.
very_simple_html = """

<html>
    <head>
        <title>The Dormouse's story</title>
    </head>
<body>
<p class="title">
    <b>The Dormouse's story</b>
</p>

<p class="story">
Once upon a time there were three little sisters; and their names were:
    
    <a href ="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
    <a href ="http://example.com/lacie" class="sister" id="link1">Lacie</a> and
    <a href ="http://example.com/tillie" class="sister" id="link1">Tillie</a>;

and they lived at the bottom of a well.
</p>

<p class="story">The story continues</p>
</body>
</html>
"""

In [3]:
# Name the parser explicitly: with no second argument bs4 picks whichever
# parser happens to be installed and emits GuessedAtParserWarning, so the same
# notebook can build different trees on different machines.
# NOTE(review): "lxml" is presumably the parser that was auto-selected when the
# outputs in this notebook were produced (whitespace-only text between tags is
# collapsed in them) — confirm. "html.parser" is the dependency-free
# alternative, at the cost of slightly different whitespace in the outputs.
soup = BeautifulSoup(very_simple_html, "lxml")
soup

<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<p class="title">
<b>The Dormouse's story</b>
</p>
<p class="story">
Once upon a time there were three little sisters; and their names were:
    
    <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
    <a class="sister" href="http://example.com/lacie" id="link1">Lacie</a> and
    <a class="sister" href="http://example.com/tillie" id="link1">Tillie</a>;

and they lived at the bottom of a well.
</p>
<p class="story">The story continues</p>
</body>
</html>

In [4]:
type(soup)

bs4.BeautifulSoup

#### The soup object is the root of our parse tree

#### The HTML we extract from a website may not be clean or well indented. To view it in a prettified format, try the code below:

In [5]:
# prettify() returns a single string with each tag and text node on its own
# indented line; print() renders those newlines, whereas a bare expression
# would display the string's escaped repr.
print(soup.prettify())

<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were:
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link1">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link1">
    Tillie
   </a>
   ;

and they lived at the bottom of a well.
  </p>
  <p class="story">
   The story continues
  </p>
 </body>
</html>



#### To view the `<html>` element of our page (i.e. the whole document)

In [6]:
soup.html

<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<p class="title">
<b>The Dormouse's story</b>
</p>
<p class="story">
Once upon a time there were three little sisters; and their names were:
    
    <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
    <a class="sister" href="http://example.com/lacie" id="link1">Lacie</a> and
    <a class="sister" href="http://example.com/tillie" id="link1">Tillie</a>;

and they lived at the bottom of a well.
</p>
<p class="story">The story continues</p>
</body>
</html>

#### We can access HTML elements directly with the "." operator

In [7]:
soup.head

<head>
<title>The Dormouse's story</title>
</head>

In [8]:
soup.title

<title>The Dormouse's story</title>

#### For accessing the string

In [9]:
soup.title.string

"The Dormouse's story"

#### The `.string` attribute only works when the tag contains exactly one string (directly, or through a single nested child tag). If the tag has several children, `.string` is `None`.

#### For accessing the name of the html element

In [10]:
soup.title.name

'title'

In [11]:
# The <body> element together with everything nested inside it
soup.body

<body>
<p class="title">
<b>The Dormouse's story</b>
</p>
<p class="story">
Once upon a time there were three little sisters; and their names were:
    
    <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
    <a class="sister" href="http://example.com/lacie" id="link1">Lacie</a> and
    <a class="sister" href="http://example.com/tillie" id="link1">Tillie</a>;

and they lived at the bottom of a well.
</p>
<p class="story">The story continues</p>
</body>

#### There are repeating tags in the body, such as paragraphs (`<p>`). The "." operator only returns the first matching element.

In [12]:
soup.body.p

<p class="title">
<b>The Dormouse's story</b>
</p>

#### Bold tag `<b>`

In [14]:
soup.b

<b>The Dormouse's story</b>

#### The anchor tag "a"

In [15]:
# Returns only the first <a> in the document (Elsie); the other two anchors are not included
soup.a 

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

#### Finding parents

In [16]:
soup.a.parent

<p class="story">
Once upon a time there were three little sisters; and their names were:
    
    <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
    <a class="sister" href="http://example.com/lacie" id="link1">Lacie</a> and
    <a class="sister" href="http://example.com/tillie" id="link1">Tillie</a>;

and they lived at the bottom of a well.
</p>

In [17]:
# .parent can be chained: the first <a>'s parent is the story <p>, and that <p>'s parent is <body>
soup.a.parent.parent

<body>
<p class="title">
<b>The Dormouse's story</b>
</p>
<p class="story">
Once upon a time there were three little sisters; and their names were:
    
    <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
    <a class="sister" href="http://example.com/lacie" id="link1">Lacie</a> and
    <a class="sister" href="http://example.com/tillie" id="link1">Tillie</a>;

and they lived at the bottom of a well.
</p>
<p class="story">The story continues</p>
</body>

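#### Other tags are accessed the same way (each of these returns `None` here, because this document contains none of them):
#### `soup.div`
#### `soup.i` - italics
#### `soup.img` - images 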
