In [1]:
# 1. import libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# 2. The html-code: a small sample document kept in a string, so the first
#    examples can be parsed without downloading anything

html_doc = """
<!DOCTYPE html>
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
</html>
"""

HTML code-along: parsing the sample document with BeautifulSoup


In [4]:
# parse the HTML string into a BeautifulSoup object ("the soup"),
# using the 'html.parser' backend

soup = BeautifulSoup(html_doc, 'html.parser')

In [5]:
# prettify() gives the parsed document back as text with each tag on its own
# line, indented by nesting depth: well indented html is easier to read

print(soup.prettify())

<!DOCTYPE html>
<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>



In [6]:
# Extract the title: accessing a tag name as an attribute of the soup
# returns that tag, markup included

soup.title

<title>The Dormouse's story</title>

In [7]:
# .name is the name of the tag itself, as a string
soup.title.name

'title'

In [8]:
# .string is the text contained in the tag, without the surrounding markup
soup.title.string

"The Dormouse's story"

In [11]:
# .parent moves one level up the tree: <title> sits inside <head>
soup.title.parent.name

'head'

In [12]:
# when several tags share a name, attribute access returns only the first one
soup.p

<p class="title"><b>The Dormouse's story</b></p>

In [13]:
# tag attributes are read like dictionary keys; 'class' comes back as a list
soup.p['class']

['title']

In [16]:
# find_all() returns every matching tag, not just the first one
soup.find_all("p")

[<p class="title"><b>The Dormouse's story</b></p>,
 <p class="story">Once upon a time there were three little sisters; and their names were
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
 and they lived at the bottom of a well.</p>,
 <p class="story">...</p>]

In [17]:
# print the target URL (the 'href' attribute) of every <a> tag in the document
for link in soup.find_all('a'):
    print(link.get('href'))

http://example.com/elsie
http://example.com/lacie
http://example.com/tillie


In [19]:
# Clean version without HTML tags: get_text() keeps only the text of the
# document (line breaks from the original string are preserved)

print(soup.get_text())



The Dormouse's story

The Dormouse's story
Once upon a time there were three little sisters; and their names were
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.
...




Pull data from a movie site (the IMDb top-rated movies chart)

In [None]:
# 2. find the url of the page we want to scrape (IMDb's top chart)
#    and store it in a variable
url = "https://www.imdb.com/chart/top"

In [None]:
# 3. download html with a get request
# - timeout: without one, requests may wait forever on a server that never answers
# - User-Agent: NOTE(review) some sites refuse the default python-requests agent
#   string, so a browser-like value is sent; confirm whether it is needed here
response = requests.get(
    url,
    headers={"User-Agent": "Mozilla/5.0"},
    timeout=10,
)
response.status_code # 200 status code means OK!

In [None]:
# 4.1. parse html (create the 'soup')
soup = BeautifulSoup(response.content, "html.parser")
# 4.2. check that the html code looks like it should
# (only the beginning is printed: displaying the whole page would flood the notebook)
print(soup.prettify()[:1000])

In [None]:
# 5. retrieve/extract the desired info
# (here, you'll paste the "Selector" copied from the site inspector to get the element that belongs to the top movie)
# select() takes a CSS selector and returns a list of every matching element;
# tr:nth-child(1) restricts the match to the first table row
# NOTE(review): a selector copied from the browser describes the page as rendered there -
# confirm it also matches the html that requests downloaded

soup.select("#main > div > span > div > div > div.lister > table > tbody > tr:nth-child(1) > td.titleColumn")

In [None]:
# every <td> with class "titleColumn": all the info about all the movies
soup.select("td.titleColumn")

In [None]:
# the <a> tags nested inside the titleColumn cells: all elements containing movie titles
soup.select("td.titleColumn a")

In [None]:
# we can use .get_text() to extract the content of the tags we selected
# we'll need to do it to each tag with a for loop: here we do it to the first one
first_title_link = soup.select("td.titleColumn a")[0]
# a notebook only displays the last expression of a cell, so the tag itself is printed explicitly
print(first_title_link)
first_title_link.get_text()

In [None]:
# the director and main stars are in the same <a> tag as the movie title,
# but stored as the value of its "title" attribute
# we can access attributes as key-value pairs of dictionaries: using ["key"] to get the value:
soup.select("td.titleColumn a")[0]["title"]


In [None]:
# instead of ["title"] we could use .get("title"), which returns None rather than raising a KeyError when the attribute is missing: choose whatever you prefer

In [None]:
# the years are inside a 'span' tag with the 'secondaryInfo' class
# we also specify the parent tag and its class, which is the same we used before
# get_text() returns the year as text, still wrapped in parentheses:
# we'll take care of that later, in the cleaning step
soup.select("td.titleColumn span.secondaryInfo")[0].get_text()

Storing the scraped data in a DataFrame

In [None]:

#1. start with three separate empty lists: movie titles,
#   director-and-stars strings, and release years

title, dir_stars, year = [], [], []

In [None]:
#2. define the number of iterations of our for loop
# by checking how many elements are in the retrieved result set
# (this is equivalent but more robust than just explicitly defining 250 iterations:
#  the count follows whatever the page actually contains)

num_iter = len(soup.select("td.titleColumn a"))

In [None]:
# retrieve all the data from the result sets
# - each selector is evaluated once, instead of three times on every iteration
# - the lists are rebuilt from scratch, so re-running this cell never appends duplicates
title_links = soup.select("td.titleColumn a")
year_spans = soup.select("td.titleColumn span.secondaryInfo")

# titles and years are paired by position, so both result sets must have the same size
assert len(title_links) == len(year_spans), "titles and years do not line up"

title = [link.get_text() for link in title_links]
dir_stars = [link["title"] for link in title_links]
year = [span.get_text() for span in year_spans]

In [None]:
# quick sanity check: show only the first few entries of each list,
# printing all of them would flood the notebook with output
print(title[:5])
print(dir_stars[:5])
print(year[:5])

# each list becomes a column
movies = pd.DataFrame({"title":title,
                       "dir_stars":dir_stars,
                       "year":year
                      })

In [None]:
# preview the first rows of the table to check that the columns look right
movies.head()

CLEANING 

Take the year out of the parentheses: this could be done with a regex, but string methods such as str.replace() or str.strip() are simpler to use here.

Change the data type of the year column to integer.

Split dir_stars into 3 columns, one for each person: director, star_1 and star_2. This could have been done while extracting the data from the HTML document, but it is easier to do afterwards:
    - The "(dir.)" marker can simply be removed from the director's name
    - The string can be split at each comma

In [None]:
# Split every "Director (dir.), Star 1, Star 2" string into one list per person.
director = []
star_1 = []
star_2 = []

for movie in dir_stars:
    # strip() removes the blank left after each comma, so names carry no leading space
    crew = [person.strip() for person in movie.split(",")]
    # pad with None so an entry listing fewer than three people does not raise IndexError
    crew += [None] * (3 - len(crew))
    director.append(crew[0].replace("(dir.)", "").strip())
    star_1.append(crew[1])
    star_2.append(crew[2])

# Take the year out of its parentheses and store it as an integer.
# Text that is not a number becomes <NA> instead of stopping the notebook.
year_clean = pd.to_numeric(
    pd.Series(year, dtype="object").str.strip().str.strip("()"),
    errors="coerce",
).astype("Int64")

# each list becomes a column; the cleaned year is kept alongside the people
movies = pd.DataFrame({"title":title,
                       "director":director,
                       "star_1":star_1,
                       "star_2":star_2,
                       "year":year_clean
                      })

movies.head()