https://realpython.com/python-web-scraping-practical-introduction/

In [2]:
from urllib.request import urlopen

In [3]:
# Target page for the first scraping example.
url = "http://olympus.realpython.org/profiles/aphrodite"
# urlopen() issues the HTTP request and returns a response object, not the page text.
page = urlopen(url)

In [4]:
# Bare last expression: the notebook displays the repr of the response object.
page

<http.client.HTTPResponse at 0x1119b9460>

In [5]:
# read() returns the response body as raw bytes; decoding turns them into a str.
# NOTE: assumes the page is UTF-8 encoded.
html_bytes = page.read()
html = html_bytes.decode("utf-8")

In [6]:
# print() writes the markup with real line breaks rather than the escaped string repr.
print(html)

<html>
<head>
<title>Profile: Aphrodite</title>
</head>
<body bgcolor="yellow">
<center>
<br><br>
<img src="/static/aphrodite.gif" />
<h2>Name: Aphrodite</h2>
<br><br>
Favorite animal: Dove
<br><br>
Favorite color: Red
<br><br>
Hometown: Mount Olympus
</center>
</body>
</html>



In [8]:
# str.find() gives the index of the first match, or -1 when the substring is absent.
title_index = html.find("<title>")
title_index

14

In [9]:
# Skip over the opening tag itself so the slice begins at the first character of the title text.
start_index = title_index + len("<title>")
start_index

21

In [11]:
# The closing tag marks where the title text ends (a slice's end index is exclusive).
end_index = html.find("</title>")
end_index

39

In [12]:
# Slice out everything between the opening and closing title tags.
title = html[start_index:end_index]
title

'Profile: Aphrodite'

In [13]:
# Repeat the str.find() extraction on a second profile to show where it breaks.
url = "http://olympus.realpython.org/profiles/poseidon"
# The context manager closes the HTTP response even if read()/decode() raises.
with urlopen(url) as page:
    html = page.read().decode("utf-8")
# This page writes its opening tag as "<title >", so find("<title>") returns -1
# and start_index becomes -1 + 7 == 6 instead of the start of the title text.
start_index = html.find("<title>") + len("<title>")
end_index = html.find("</title>")
# The slice therefore starts at index 6 and drags in the markup that precedes the title.
title = html[start_index:end_index]
title

'\n<head>\n<title >Profile: Poseidon'

In [14]:
import re
from urllib.request import urlopen

url = "http://olympus.realpython.org/profiles/dionysus"
# The context manager closes the HTTP response even if read()/decode() raises.
with urlopen(url) as page:
    html = page.read().decode("utf-8")

# ".*?" after the tag names tolerates stray characters inside the tags (this page
# uses "<TITLE >" and "</title  / >"); re.IGNORECASE handles the mixed-case names.
# The non-greedy quantifiers keep the match as short as possible.
pattern = "<title.*?>.*?</title.*?>"
match_results = re.search(pattern, html, re.IGNORECASE)
# re.search() returns None when nothing matches; fail with a clear message
# instead of an AttributeError raised by calling .group() on None.
if match_results is None:
    raise ValueError("No <title> element found in " + url)
title = match_results.group()
title = re.sub("<.*?>", "", title) # Remove HTML tags

print(title)

Profile: Dionysus


In [15]:
# Re-fetch the Dionysus profile to inspect its raw markup.
url = "http://olympus.realpython.org/profiles/dionysus"
# The context manager closes the HTTP response even if read()/decode() raises.
with urlopen(url) as page:
    html = page.read().decode("utf-8")
# Bare expression: the notebook shows the escaped string repr, which makes the
# irregular "<TITLE >" and "</title  / >" tags easy to spot.
html

'<html>\n<head>\n<TITLE >Profile: Dionysus</title  / >\n</head>\n<body bgcolor="yellow">\n<center>\n<br><br>\n<img src="/static/dionysus.jpg" />\n<h2>Name: Dionysus</h2>\n<img src="/static/grapes.png"><br><br>\nHometown: Mount Olympus\n<br><br>\nFavorite animal: Leopard <br>\n<br>\nFavorite Color: Wine\n</center>\n</body>\n</html>\n'

In [16]:
# Extract the text between "Name: " and the closing </h2> tag.
# find() searches for "Name:" while the offset uses "Name: " (with the trailing space),
# so the slice starts after the space that follows the colon.
start_index = html.find("Name:") + len("Name: ")
end_index = html.find("</h2>")
name = html[start_index:end_index]
name

'Dionysus'

In [17]:
# Same find-and-slice approach for the favorite color.
# str.find() is case-sensitive: this page spells the label "Favorite Color" with a capital C.
start_index = html.find("Favorite Color:") + len("Favorite Color: ")
# On this page the value is immediately followed by "\n</center>", which marks the end of the slice.
end_index = html.find("\n</center>")
fav_color = html[start_index:end_index]
fav_color

'Wine'

In [1]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
url = "http://olympus.realpython.org/profiles/dionysus"
# The context manager closes the HTTP response even if read()/decode() raises.
with urlopen(url) as page:
    html = page.read().decode("utf-8")
# Parse the markup with Python's built-in "html.parser" backend.
soup = BeautifulSoup(html, "html.parser")

In [2]:
# get_text() returns the document's text with the tags removed; the line breaks
# from the markup remain, hence the blank lines in the output.
print(soup.get_text())



Profile: Dionysus





Name: Dionysus

Hometown: Mount Olympus

Favorite animal: Leopard 

Favorite Color: Wine






In [3]:
# find_all() returns a list of every matching tag; here, every <img> element.
soup.find_all("img")

[<img src="/static/dionysus.jpg"/>, <img src="/static/grapes.png"/>]

In [4]:
# Unpack the two <img> tags; this raises ValueError if the page does not contain exactly two.
image1, image2 = soup.find_all("img")

In [5]:
# Display the first image tag.
image1

<img src="/static/dionysus.jpg"/>

In [6]:
# .name holds the element's tag name.
image1.name

'img'

In [8]:
# Tag attributes are read with dictionary-style indexing.
image1['src']

'/static/dionysus.jpg'

In [9]:
# Same attribute lookup on the second image.
image2['src']

'/static/grapes.png'

In [10]:
# Attribute-style access to the <title> tag. The parser has normalized the irregular
# source markup ("<TITLE >...</title  / >") into a clean <title> element.
soup.title

<title>Profile: Dionysus</title>

In [11]:
# .string gives the text contained in the tag, without the surrounding markup.
soup.title.string

'Profile: Dionysus'

In [12]:
# Keyword arguments filter on attribute values: only <img> tags whose src equals this path.
soup.find_all("img", src="/static/dionysus.jpg")

[<img src="/static/dionysus.jpg"/>]

In [21]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
# Site root, kept separately so the relative links on the page can be resolved against it.
base_url = "http://olympus.realpython.org"
# The context manager closes the HTTP response even if read()/decode() raises.
with urlopen(base_url + '/profiles') as page:
    html = page.read().decode("utf-8")
# Parse the profiles index page with Python's built-in "html.parser" backend.
soup = BeautifulSoup(html, "html.parser")

In [22]:
from urllib.parse import urljoin

# href=True skips <a> tags that have no href attribute, for which link["href"] would raise KeyError.
for link in soup.find_all("a", href=True):
    # urljoin() resolves relative hrefs against base_url and leaves absolute URLs intact,
    # whereas plain string concatenation would produce a malformed address for those.
    link_url = urljoin(base_url, link["href"])
    print(link_url)

http://olympus.realpython.org/profiles/aphrodite
http://olympus.realpython.org/profiles/poseidon
http://olympus.realpython.org/profiles/dionysus


In [23]:
import mechanicalsoup
# Browser object used to issue the requests in the following cells.
browser = mechanicalsoup.Browser()

In [24]:
# Login page of the demo site.
url = "http://olympus.realpython.org/login"
# browser.get() fetches the page and returns a response object.
page = browser.get(url)

In [25]:
# Display the response; its repr includes the HTTP status code.
page

<Response [200]>

In [26]:
# The response exposes the parsed HTML on its .soup attribute as a BeautifulSoup object.
type(page.soup)

bs4.BeautifulSoup

In [27]:
# Show the parsed login page to find the form and the names of its input fields.
page.soup

<html>
<head>
<title>Log In</title>
</head>
<body bgcolor="yellow">
<center>
<br/><br/>
<h2>Please log in to access Mount Olympus:</h2>
<br/><br/>
<form action="/login" method="post" name="login">
Username: <input name="user" type="text"/><br/>
Password: <input name="pwd" type="password"/><br/><br/>
<input type="submit" value="Submit"/>
</form>
</center>
</body>
</html>

In [28]:
import mechanicalsoup

# Fetch the login page; .soup is the parsed HTML of the response.
browser = mechanicalsoup.Browser()
url = "http://olympus.realpython.org/login"
login_page = browser.get(url)
login_html = login_page.soup

# The page contains a single <form>; take it and fill in its fields.
form = login_html.select("form")[0]
# Look the inputs up by their name attribute ("user" / "pwd" in the page markup)
# rather than by position, so a reordered or extended form cannot silently
# receive the values in the wrong fields.
# NOTE(review): credentials are hardcoded; presumably these are the tutorial's public
# demo login. Never commit real credentials; read them from the environment or getpass.
form.find("input", attrs={"name": "user"})["value"] = "zeus"
form.find("input", attrs={"name": "pwd"})["value"] = "ThunderDude"

# Submit the filled-in form, passing the login page's URL to browser.submit().
profiles_page = browser.submit(form, login_page.url)

In [29]:
# URL of the response to the form submission; landing on /profiles presumably
# means the login was accepted.
profiles_page.url

'http://olympus.realpython.org/profiles'