In [1]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Lab 3 SI 618: Fetching and parsing structured documents (100 points)
#
# The utf8 'magic comment' is to tell Python that this source code will
# contain unicode literals outside of the ISO-Latin-1 character set.

# Some lines of code are taken from Google's Python Class
# http://code.google.com/edu/languages/google-python-class/  and
# an earlier lab by Dr. Yuhang Wang.

# The purpose of this lab is to have you practice using some powerful
# modules for fetching and parsing content:
#    urllib3 : for fetching the content of a URL (e.g. HTML page)
#    BeautifulSoup : for parsing HTML and XML pages
#    json : for JSON reading and writing
#
# You should fill in the code for the functions below.
# main() is already set up to call the functions with a few different inputs,
# printing 'OK' when each function is correct.

In [2]:
from bs4 import BeautifulSoup
import json
import urllib3

In [3]:
# Fetch a pinned revision (the `oldid` query parameter) of the UMSI Wikipedia page
# with urllib3. The decoded HTML is the document used by the rest of the lab.
http = urllib3.PoolManager()
response = http.request("GET", 'https://en.wikipedia.org/w/index.php?title=University_of_Michigan_School_of_Information&oldid=1010694377')
# urllib3 does not raise on HTTP error statuses, so check explicitly; otherwise an
# error page would be decoded and parsed below as if it were the article.
if response.status != 200:
    raise RuntimeError('Failed to fetch the UMSI Wikipedia page: HTTP %d' % response.status)
html_doc = response.data.decode('utf-8')

In [4]:
# This is the JSON string used in this lab. It is a tweet posted on the CDC Twitter account regarding COVID vaccines on 9/10/2021.
# Read it inside a `with` block so the file handle is closed as soon as the text has
# been read, and decode explicitly as UTF-8 instead of the platform default encoding.
with open("tweet.json", "r", encoding="utf-8") as tweet_file:
    json_str = tweet_file.read()

In [5]:

# Q1. get_title (15 points)
# The get_title function should process the HTML page stored in the global
# variable html_doc, and return the title of the page in a unicode string.
# get_title() should return u'University of Michigan School of Information - Wikipedia'
def get_title():
    """Return the string of the <title> element of the page in ``html_doc``.

    The global ``html_doc`` is parsed with BeautifulSoup and the ``string``
    of its ``title`` tag is returned.
    """
    parsed_page = BeautifulSoup(html_doc)
    title_tag = parsed_page.title
    return title_tag.string

In [6]:
# Q2. get_tweet_photo (15 points)
# The get_tweet_photo function should load the dictionary stored as a JSON string
# in global variable json_str, and return the shortened url of the photo embedded in the tweet
# get_tweet_photo() should return https://t.co/FSmQo3y3Ir
def get_tweet_photo():
    """Return the ``url`` of the first media entry of the tweet in ``json_str``.

    The global ``json_str`` is decoded as JSON and the value at
    ``entities -> media -> [0] -> url`` is returned.
    """
    tweet = json.loads(json_str)
    first_media = tweet['entities']['media'][0]
    return first_media['url']

In [7]:
# Q3. get_link_count (20 points)
# The get_link_count function should process the HTML page stored in the global variable
# html_doc, and return the number of links  
# get_link_count() should return 390
def get_link_count():
    """Return how many ``<a>`` elements the page in ``html_doc`` contains.

    Every ``a`` tag found by BeautifulSoup is counted; no filtering on
    attributes such as ``href`` is applied.
    """
    parsed_page = BeautifulSoup(html_doc)
    anchors = parsed_page.find_all("a")
    return len(anchors)

In [8]:
# Q4. get_sub_headings (20 points)
# The get_sub_headings function should process the HTML page stored in the global variable
# html_doc, and return the eight sub headings in a JSON string. 
# Note that it should return a string, not a list. 
# get_sub_headings() should return '["Undergraduate degree", "Master\'s degree", "Master\'s of Applied Data Science on Coursera", "Doctoral degree", "Faculty and research", "History", "References", "External links"]'
def get_sub_headings():
    """Return the page's table-of-contents headings as a JSON array string.

    The global ``html_doc`` is parsed with BeautifulSoup, and the text of every
    element whose class is ``toctext`` is collected in document order.

    Returns:
        str: a JSON array of the heading texts, e.g. ``'["History", "References"]'``.
        An empty array (``'[]'``) is returned when no such element exists.
    """
    soup = BeautifulSoup(html_doc)
    headings = [node.text for node in soup.find_all(attrs={'class': "toctext"})]
    # Serialize with json.dumps rather than joining strings by hand: it escapes
    # quotes and backslashes inside a heading and yields '[]' (not '[""]') for
    # an empty list. ensure_ascii=False leaves non-ASCII characters unescaped.
    return json.dumps(headings, ensure_ascii=False)

In [9]:
# Q5. get_school_info (30 points)
# The get_school_info function should process the HTML page stored in the global variable
# html_doc, and return information from the summary info box in a JSON string. 
# Note that it should return a string, not a list.
# get_school_info() should return '{"Type": "Public", "Established": "1969[citation needed]", "Parent institution": "University of Michigan", "Dean": "Thomas Finholt", "Academic staff": "111[1]", "Students": "1199[1]", "Location": "Ann Arbor, Michigan, United States", "Campus": "Urban", "Website": "si.umich.edu"}'
# HINT: construct a list of tuples first, and then convert it to a dictionary to turn into a
#  JSON string.
def get_school_info():
    """Return the page's infobox as a JSON object string.

    The global ``html_doc`` is parsed with BeautifulSoup. Elements with class
    ``infobox-label`` supply the keys and elements with class ``infobox-data``
    supply the values; the two sequences are paired by position in document
    order.

    Returns:
        str: a JSON object mapping each label text to its data text. If the
        two sequences differ in length, the unmatched trailing items are
        ignored. A label that occurs more than once keeps its last value.
    """
    soup = BeautifulSoup(html_doc)
    labels = soup.find_all(attrs={'class': "infobox-label"})
    values = soup.find_all(attrs={'class': "infobox-data"})
    # Build (label, value) tuples first, then turn them into a dictionary.
    # zip() avoids the IndexError that positional indexing raises when there
    # are fewer data cells than labels, and the local names no longer shadow
    # the built-ins `dict` and `str`.
    pairs = [(label.text, value.text) for label, value in zip(labels, values)]
    return json.dumps(dict(pairs))

In [10]:
#######################################################################
# DO NOT MODIFY ANY CODE BELOW
#######################################################################

# Provided simple test() function used in main() to print
# what each function returns vs. what it's supposed to return.
def test(got, expected):
  """Print one result line comparing ``got`` with ``expected``.

  The line starts with ' OK ' when ``got == expected`` and with '  X '
  otherwise, followed by the ``repr`` of both values. Returns None.
  """
  if got == expected:
    prefix = ' OK '
  else:
    prefix = '  X '
  print ('%s got: %s expected: %s' % (prefix, repr(got), repr(expected)))

def test2(got, expected):
  """Print one result line comparing ``got`` with ``expected``.

  Same as test(), except that both values are formatted with ``%s``
  directly instead of being passed through ``repr``. Returns None.
  """
  if got == expected:
    prefix = ' OK '
  else:
    prefix = '  X '
  print ('%s got: %s expected: %s' % (prefix, got, expected))

# Provided main() calls the above functions with interesting inputs,
# using test() to check if each result is correct or not.

In [11]:
def main():
  """Call each of the five lab functions and report the outcome via test().

  For every function its name is printed first, then test() prints whether
  the returned value equals the expected value hard-coded below.
  """
  print ('get_title')

  test(get_title(), u'University of Michigan School of Information - Wikipedia')
  
  print ('get_tweet_photo')

  test(get_tweet_photo(), "https://t.co/FSmQo3y3Ir")

  print ('get_link_counts')

  test(get_link_count(), 390)
  
  print ('get_sub_headings')

  test(get_sub_headings(), '["Undergraduate degree", "Master\'s degree", "Master\'s of Applied Data Science on Coursera", "Doctoral degree", "Faculty and research", "History", "References", "External links"]')

  print ('get_school_info')

  test(get_school_info(), '{"Type": "Public", "Established": "1969[citation needed]", "Parent institution": "University of Michigan", "Dean": "Thomas Finholt", "Academic staff": "111[1]", "Students": "1199[1]", "Location": "Ann Arbor, Michigan, United States", "Campus": "Urban", "Website": "si.umich.edu"}')

In [12]:
# Standard boilerplate: call main() only when __name__ is '__main__', i.e. when
# this code is executed directly rather than imported as a module.
if __name__ == '__main__':
  main()

get_title
 OK  got: 'University of Michigan School of Information - Wikipedia' expected: 'University of Michigan School of Information - Wikipedia'
get_tweet_photo
 OK  got: 'https://t.co/FSmQo3y3Ir' expected: 'https://t.co/FSmQo3y3Ir'
get_link_counts
 OK  got: 390 expected: 390
get_sub_headings
 OK  got: '["Undergraduate degree", "Master\'s degree", "Master\'s of Applied Data Science on Coursera", "Doctoral degree", "Faculty and research", "History", "References", "External links"]' expected: '["Undergraduate degree", "Master\'s degree", "Master\'s of Applied Data Science on Coursera", "Doctoral degree", "Faculty and research", "History", "References", "External links"]'
get_school_info
 OK  got: '{"Type": "Public", "Established": "1969[citation needed]", "Parent institution": "University of Michigan", "Dean": "Thomas Finholt", "Academic staff": "111[1]", "Students": "1199[1]", "Location": "Ann Arbor, Michigan, United States", "Campus": "Urban", "Website": "si.umich.edu"}' expected: '