In [1]:
import pandas as pd 
import requests
from bs4 import BeautifulSoup

# Learning how to work with HTML tags

In [2]:
# Step 1: store a small sample HTML document in a string so there is something to parse.
# NOTE: the sample never closes <body>; the parser supplies the missing </body> itself
# (visible in the prettified output further down).
html_doc="""
<!DOCTYPE html>
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
</html>
"""

In [3]:
# Step 2: build the parse tree from the sample document using Python's built-in HTML parser
soup = BeautifulSoup(markup=html_doc, features='html.parser')

In [4]:
# pretty-print the parsed tree: each tag and text node on its own line, indented by nesting depth
print(soup.prettify())

<!DOCTYPE html>
<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>



# Navigating the document by HTML tag name

In [5]:
soup.title   # attribute access by tag name: returns the <title> element (not the class="title" paragraph)

<title>The Dormouse's story</title>

In [6]:
# .string gives just the text inside the <title> tag, without the surrounding markup
soup.title.string

"The Dormouse's story"

In [7]:
# attribute access returns only the FIRST <p> in the document (here the class="title" paragraph)
soup.p

<p class="title"><b>The Dormouse's story</b></p>

In [8]:
# find_all collects every <p> tag in the document, not just the first one
p_tags=soup.find_all('p')

In [9]:
# display the result: a list-like collection holding all three <p> tags
p_tags

[<p class="title"><b>The Dormouse's story</b></p>,
 <p class="story">Once upon a time there were three little sisters; and their names were
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
 and they lived at the bottom of a well.</p>,
 <p class="story">...</p>]

In [10]:
# print only the human-readable text of each paragraph (nested tags are stripped out)
for paragraph in p_tags:
    print(paragraph.get_text())

The Dormouse's story
Once upon a time there were three little sisters; and their names were
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.
...


In [12]:
# collect every <a> (link) tag in the document
a_tags=soup.find_all('a')

In [15]:
# display the result: the three <a> tags with class="sister"
a_tags

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [13]:
# print the visible link text of each <a> tag
for anchor in a_tags:
    print(anchor.get_text())

Elsie
Lacie
Tillie


In [16]:
# gather the href attribute (the target URL) of every <a> tag into a list
links = [a.get('href') for a in a_tags]

In [17]:
# display the collected URLs
links

['http://example.com/elsie',
 'http://example.com/lacie',
 'http://example.com/tillie']

In [18]:
# the <title> tag again, used as the starting point for moving up the tree
soup.title

<title>The Dormouse's story</title>

In [19]:
# .parent of <title> is <head>; asking the parent for .string still yields the title text
soup.title.parent.string

"The Dormouse's story"

In [20]:
# .name is the tag name of the parent element that encloses <title>
soup.title.parent.name

'head'

In [50]:
# climb four levels up from the first <a>: <p> -> <body> -> <html> -> the whole document
soup.a.parent.parent.parent.parent


<!DOCTYPE html>

<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body></html>

In [51]:
import re # regex 

In [55]:
# search the text nodes (not the tags) and return those whose content matches the regex 'well'
soup.find_all(string=re.compile('well'))

[';\nand they lived at the bottom of a well.']

In [56]:
# plain substring count over the document text: 'we' also matches inside 'were' and 'well'
soup.text.count('we')

3

## Example: fetching one webpage and counting the mentions of a specific word (case-sensitive)
 - Remember: the count runs over the raw HTML, so it includes words in tags and attributes, not just the visible page text

In [61]:
# Download the page once, with a timeout so the cell cannot hang forever, and stop
# on an HTTP error instead of silently counting words in an error page.
page = requests.get('http://www.ironhack.com/en', timeout=10)
page.raise_for_status()
# \w+ splits the raw HTML into word tokens; count() then tallies exact, case-sensitive matches.
re.findall(r'\w+', page.text).count('bootcamp')

60

In [62]:
# keep the response object so its metadata (e.g. the status code) can be inspected;
# the timeout stops the request from blocking indefinitely if the server never answers
response = requests.get('http://www.ironhack.com/en', timeout=10)

In [63]:
# HTTP status code of the response; 200 means the request succeeded
response.status_code

200

# Using CSS selectors to get data

In [65]:
soup.select('#link2') # a leading '#' in the selector matches by CSS id; the result is still a list

[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

In [67]:
soup.select('.sister')  # a leading '.' in the selector matches by CSS class

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [68]:
# walk over every element with class "sister" and print the text of each one
for sister in soup.select('.sister'):
    print(sister.get_text())

Elsie
Lacie
Tillie


In [71]:
print(soup.select('.sister')[2].get_text()) # index 2 picks the third match out of the result list

Tillie


In [72]:
# select() returns a ResultSet (a list of matches), not a single element, so calling
# get_text() on it directly fails. The error is caught and printed so the demonstration
# stays visible without stopping a top-to-bottom run of the notebook.
try:
    print(soup.select('.sister').get_text())
except AttributeError as err:
    print(f"AttributeError: {err}")

AttributeError: ResultSet object has no attribute 'get_text'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?

# Activities 