Python Seminar (AY250) UC Berkeley

# Super simple webpage access

In [None]:
%run ../00_AdvancedPythonConcepts/talktools.py

In [None]:
from __future__ import absolute_import, division, print_function

In [None]:
# URL = Uniform Resource Locator
try:
    # For Python 3.0 and later
    from urllib.request import urlopen
except ImportError:
    # Fall back to Python 2's urllib2
    from urllib2 import urlopen

# Brain maps data "Explore the Brain like never before"
url = "http://brainmaps.org/"
response = urlopen(url)  # response is a file-like object
try:
    html_data = response.read()  # the whole response body
finally:
    # Close the response as you would a normal file -- in a `finally` so the
    # connection is released even when read() raises.
    response.close()
print(html_data[:300])  # preview only the start of the page

Small aside: if you have HTML data that you want to render, you can use the `webbrowser` module

see http://docs.python.org/library/webbrowser.html

In [None]:
import webbrowser

# Save the downloaded page to a scratch file so a browser can render it.
# `html_data` is the undecoded body read from the response, so it is written
# in binary mode: that avoids a decode/re-encode round trip through the
# platform's default text encoding. The `with` block guarantees the file is
# flushed and closed before the browser is asked to open it.
tmp_html_path = "/tmp/tmp.html"
with open(tmp_html_path, "wb") as html_file:
    html_file.write(html_data)
webbrowser.open("file://" + tmp_html_path)

# Scripting an HTTP GET request

In [None]:
try:
    # Python 3
    from urllib.parse import urlencode
except ImportError:
    # Python 2 keeps urlencode in the top-level urllib module.
    # Catch only ImportError so unrelated failures are not silently hidden.
    from urllib import urlencode


# create a dictionary to store the GET data
get_info = {"q": "Joshua S. Bloom", "page": "2"}

# encode the data in proper URL format (key=value pairs joined by "&")
url_values = urlencode(get_info)
print(url_values)

In [None]:
url = "http://pubget.com/search"

# A GET request carries its parameters in the URL's query string. Passing
# `data=` to urlopen would put them in the request body and turn the request
# into a POST, so the encoded values are appended after a "?" instead.
response = urlopen(url + "?" + url_values)
try:
    html = response.read()
finally:
    # Release the connection even if reading the body fails.
    response.close()
print(html[8000:9000])

# Scripting an HTTP POST request

In [None]:
# Form fields to submit with the request.
data = {"author": "Sagan, Carl"}
params = urlencode(data).encode("UTF-8")  # same urlencode method, then encoded to bytes
url = "http://adsabs.harvard.edu/cgi-bin/nph-abs_connect"
# POST request is indicated by including the params in urlopen
response = urlopen(url, params)
try:
    html = response.read()
finally:
    # Release the connection even if reading the body fails.
    response.close()
print(html[16474:19000])

# Access an FTP server

In [None]:
import ftplib
# Connect to the FTP host.
ftp = ftplib.FTP("ftp.cdc.gov")
# No user name or password is passed, so ftplib performs an anonymous login.
ftp.login()

In [None]:
# Change the current directory on the server (absolute path).
ftp.cwd("/pub/OPD")

In [None]:
# Print a listing of the current remote directory.
ftp.dir()

In [None]:
# Relative path: move into a subdirectory of the current remote directory.
ftp.cwd("Susanna")

In [None]:
# Print a listing of the directory just entered.
ftp.dir()

In [None]:
# Download the PDF in binary mode; retrbinary calls the given callback with
# each chunk of data it receives. The `with` block makes sure the local file
# is flushed and closed once the transfer finishes or fails -- passing a bare
# open(...).write would leave the handle open.
with open('zika.pdf', 'wb') as pdf_file:
    transfer_status = ftp.retrbinary('RETR SIKA_BANNER_7X3_reduced.pdf', pdf_file.write)
# Keep the server's reply as the cell's displayed value.
transfer_status

In [None]:
import os

# Ask Python for the working directory instead of shelling out to `pwd`, so
# the cell also runs where no `pwd` command is available. The value is kept
# as a one-element list so that pwd[0] still yields the directory path.
pwd = [os.getcwd()]

In [None]:
import webbrowser

# Build a file:// URL for zika.pdf inside the directory stored in pwd[0],
# then ask the browser to show it in a new window.
zika_pdf_url = 'file://{}/zika.pdf'.format(pwd[0])
webbrowser.open_new(zika_pdf_url)

# Parsing HTML with BeautifulSoup

See: http://www.crummy.com/software/BeautifulSoup/bs4/doc/

In [None]:
try:
    # For Python 3.0 and later
    from urllib.request import urlopen
except ImportError:
    # Fall back to Python 2's urllib2
    from urllib2 import urlopen

response = urlopen("http://words.bighugelabs.com/")
try:
    html = response.read()
finally:
    # Release the connection even if reading the body fails.
    response.close()

# pip install beautifulsoup4
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, "html.parser")
# find_all is the current bs4 method name; findAll is a deprecated alias.
forms = soup.find_all("form")
forms

In [None]:
# Dump the complete response body held in `html` (not a slice).
print(html)

In [None]:
# Collect every <form> element in the parsed page and print each one.
# The name `links` is kept for compatibility, although the elements are
# <form> tags. find_all replaces the deprecated findAll alias.
links = soup.find_all("form")
for link in links:
    print(link)

Let's load up a whole bunch of baby names, by combining scripted webpage access with BeautifulSoup:

In [None]:
from bs4 import BeautifulSoup
url = "http://nameberry.com/search/boys_names/J"
response = urlopen(url)
try:
    html = response.read()
finally:
    # Release the connection even if reading the body fails.
    response.close()
soup = BeautifulSoup(html, "html.parser")

In [None]:
# Select the <li> elements carrying the CSS class "name_in_list".
# find_all replaces the deprecated findAll alias.
items = soup.find_all("li", class_="name_in_list")
print(items)

In [None]:
# Text of the first <a> tag inside the item at index 35; raises IndexError
# if the page yielded fewer than 36 matching items.
items[35].a.get_text()

In [None]:
import string

letters = "qwertyuioplkjhgfdsazxcvbnm"  # not used by this cell; retained so the name stays defined
boy_names = []
# One search page per initial letter, A through Z (ascii_uppercase is already
# exactly those 26 letters, so no slice is needed).
for n in string.ascii_uppercase:
    url = "http://nameberry.com/search/boys_names/" + n
    response = urlopen(url)
    try:
        html = response.read()
    finally:
        # A request is made per letter; close each response even if a read fails.
        response.close()
    soup = BeautifulSoup(html, "html.parser")
    # find_all replaces the deprecated findAll alias.
    items = soup.find_all("li", class_="name_in_list")
    for item in items:
        # Keep only list items that contain exactly one link.
        if len(item.find_all("a")) == 1:
            boy_names.append(item.a.get_text())

In [None]:
# Print the full list of names gathered in boy_names.
print(boy_names)

In [None]:
# Sort in place, then report how many names there are and the two extremes.
boy_names.sort()
first_name, last_name = boy_names[0], boy_names[-1]
name_count = str(len(boy_names))
print(name_count + " names from " + first_name + " to " + last_name + ".")

To demonstrate we downloaded and parsed all the names, and to have a little fun, let's make up an official-sounding name for a childish Congressman.

In [None]:
import random

# Draw five names at random (repeats allowed) and join them with single spaces.
picked_names = [random.choice(boy_names) for _ in range(5)]
proper_person_name = "Congressman " + " ".join(picked_names) + " XVI" + " PhD"
print(proper_person_name)

# JSON API

In [None]:
import json
import joshkey
base_domain = "http://words.bighugelabs.com/"

api_key =  joshkey.API # get your own damn key!
word = "hacker"

url = base_domain + "api/2/" + api_key + "/" + word + "/json"
# NOTE: the URL embeds the API key, so this print writes the key into the
# cell output -- clear the output before sharing the notebook.
print(url)

response = urlopen(url)
try:
    # Decode the body as UTF-8 text, then parse the JSON.
    result = json.loads(response.read().decode("UTF-8")) # a dictionary!
finally:
    # Release the connection even if reading or parsing fails.
    response.close()

print(result)

In [None]:
import pprint
# Pretty-print `result` so its nested structure is easier to read than with print().
pprint.pprint(result)

A more fleshed-out example that prints the output more cleanly.

In [None]:
import os
import sys
base_domain = "http://words.bighugelabs.com/"
# Prefer a key supplied through the environment; the literal is only a
# fallback so the cell keeps working when the variable is not set.
api_key = os.environ.get("BIGHUGELABS_API_KEY", "483e281b60496d7961d852629799e733")
word = "notebook"
print("Retrieving thesaurus entry for \"" + word + "\".")
url = base_domain + "api/2/" + api_key + "/" + word + "/json"
try:
    response = urlopen(url)
    try:
        result = json.loads(response.read().decode("UTF-8")) # a dictionary!
    finally:
        # Release the connection even if reading or parsing fails.
        response.close()
except Exception:
    # Any request or parsing failure ends up here (Exception rather than a
    # bare except, so KeyboardInterrupt still stops the cell).
    print("Error - word probably not in thesaurus.")
    # Use an empty result so the loop below prints nothing instead of
    # iterating over a stale `result` left by an earlier cell.
    result = {}
for part_of_speech in result:
    print("-"*50)
    print("These are the " + part_of_speech + " entries:")
    for key in ["syn", "ant", "rel"]:
        # Not every part of speech has all three kinds of entry; skip the
        # ones that are missing.
        try:
            entries = result[part_of_speech][key]
        except (KeyError, TypeError):
            continue
        for synonym in entries:
            print(key + " - " + synonym)