# File read/write, Web Request, CSV, Excel, JSON, and XML parsing examples
This notebook shows how to handle files manually without using pandas, and how to parse different types of data.

Also shows some basic BeautifulSoup and ElementTree code.

The code may not run as-is because the input files are not uploaded to GitHub, so use it only as a reference.

#### Opening and parsing a CSV file

In [50]:
import csv
import numpy as np
import pandas as pd
import os
import pprint

def parse_file(datafile):
    """Read a CSV file whose first row carries a name and whose second row is a header.

    The value in the second column of the first row is returned as the name,
    the second row is skipped, and every remaining row is collected as a
    list of strings.

    Args:
        datafile: Path of the CSV file to read.

    Returns:
        A ``(name, data)`` tuple, where ``data`` is the list of rows that
        follow the two leading rows.

    Raises:
        StopIteration: If the file has fewer than two rows.
        IndexError: If the first row has fewer than two columns.
    """
    import sys

    # The csv module wants a binary handle on Python 2 but a text handle
    # opened with newline='' on Python 3.
    if sys.version_info[0] < 3:
        handle = open(datafile, 'rb')
    else:
        handle = open(datafile, newline='')

    with handle as f:
        r = csv.reader(f)
        # next(r) works on Python 2 and 3; r.next() exists only on Python 2.
        first_row = next(r)
        next(r)  # skip the column-header row
        name = first_row[1]
        data = list(r)
    return (name, data)

#### Opening and parsing Excel

In [3]:
def parse_file(datafile):
    """Find the maximum value, and when it occurred, for columns 1-8 of an Excel sheet.

    Row 0 of the first sheet is read as the station names and column 0 as
    Excel date serials.  For each of columns 1 through 8 the largest value
    below the header row is recorded together with the date found on the
    same row.

    Args:
        datafile: Path of the workbook to open with xlrd.

    Returns:
        A dict mapping each station name to
        ``{"maxval": <largest value>, "maxtime": <date tuple>}``, where the
        date tuple is whatever ``xlrd.xldate_as_tuple`` returns.
    """
    # Imported locally: no file-level xlrd import is visible in this file.
    import xlrd

    workbook = xlrd.open_workbook(datafile)
    sheet = workbook.sheet_by_index(0)
    data = {}
    # process all columns that contain station data
    for n in range(1, 9):
        station = sheet.cell_value(0, n)
        cv = sheet.col_values(n, start_rowx=1, end_rowx=None)

        maxval = max(cv)
        # col_values started at row 1, so shift the list index back to a
        # sheet row number.
        maxpos = cv.index(maxval) + 1
        maxtime = sheet.cell_value(maxpos, 0)
        # Use the workbook's own date mode rather than assuming mode 0.
        realtime = xlrd.xldate_as_tuple(maxtime, workbook.datemode)
        data[station] = {"maxval": maxval,
                         "maxtime": realtime}

    # Single-argument print() behaves the same on Python 2 and 3.
    print(data)
    return data

def save_file(data, filename):
    """Write per-station maxima to a pipe-delimited CSV file.

    Args:
        data: Dict mapping a station name to a dict with the keys
            ``"maxtime"`` (a 6-item sequence: year, month, day, hour and two
            trailing items that are ignored) and ``"maxval"``.
        filename: Path of the file to create or overwrite.
    """
    import sys

    # csv.writer emits its own "\r\n" terminators, so it needs a binary
    # handle on Python 2 and a text handle opened with newline='' on
    # Python 3; plain "w" lets the platform translate them a second time.
    if sys.version_info[0] < 3:
        handle = open(filename, "wb")
    else:
        handle = open(filename, "w", newline="")

    with handle as f:
        w = csv.writer(f, delimiter='|')
        w.writerow(["Station", "Year", "Month", "Day", "Hour", "Max Load"])
        for station, record in data.items():
            year, month, day, hour, _, _ = record["maxtime"]
            w.writerow([station, year, month, day, hour, record["maxval"]])

#### JSON and web request handling

In [None]:
# To experiment with this code freely you will have to run this code locally.
# Take a look at the main() function for an example of how to use the code.
# We have provided example json output in the other code editor tabs for you to
# look at, but you will not be able to run any queries through our UI.
import json
import requests


# Root of the MusicBrainz web service, and the artist endpoint beneath it.
BASE_URL = "http://musicbrainz.org/ws/2/"
ARTIST_URL = BASE_URL + "artist/"

# Starter query-string parameters, keyed by the kind of query.  Each value
# is a dictionary suitable for the ``params`` argument of requests.get.
query_type = {
    "simple": {},
    "atr": {"inc": "aliases+tags+ratings"},
    "aliases": {"inc": "aliases"},
    "releases": {"inc": "releases"},
}


def query_site(url, params, uid="", fmt="json"):
    """Send a GET request to the musicbrainz API and return the decoded JSON.

    Args:
        url: Endpoint URL.
        params: Dict of query-string parameters; it is left unmodified.
        uid: Optional identifier appended to ``url``.
        fmt: Value sent as the ``fmt`` query parameter.

    Returns:
        The parsed JSON document when the response status is 200 OK, or
        ``None`` for any other status that ``raise_for_status`` does not
        turn into an exception.

    Raises:
        requests.HTTPError: Raised by ``raise_for_status`` for error
            responses.
    """
    # Work on a copy: callers pass the shared dicts stored in query_type,
    # and writing "fmt" into them would leak into every later query.
    request_params = dict(params, fmt=fmt)
    r = requests.get(url + uid, params=request_params)
    # Single-argument print() behaves the same on Python 2 and 3.
    print("requesting %s" % r.url)

    if r.status_code == requests.codes.ok:
        return r.json()
    r.raise_for_status()
    return None


def query_by_name(url, params, name):
    """Search the API for an artist by name.

    Args:
        url: Endpoint URL handed on to ``query_site``.
        params: Dict of starter query parameters; it is left unmodified.
        name: Artist name; sent as the query ``artist:<name>``.

    Returns:
        Whatever ``query_site`` returns for the assembled query.
    """
    # Copy before adding the query so that the caller's dict (for example an
    # entry of query_type) does not keep this artist's search term.
    named_params = dict(params, query="artist:" + name)
    return query_site(url, named_params)


def pretty_print(data, indent=4):
    """Print ``data`` in a more readable form.

    Dicts, including dict subclasses, are printed as indented JSON with
    sorted keys; any other value is printed unchanged.

    Args:
        data: The value to print.
        indent: Number of spaces per JSON nesting level.
    """
    # isinstance also accepts dict subclasses such as OrderedDict, which the
    # exact type comparison used to send down the plain-print branch.
    if isinstance(data, dict):
        print(json.dumps(data, indent=indent, sort_keys=True))
    else:
        print(data)


def main():
    '''
    Run a sample artist search and print the artist and its releases.

    Modify the function calls and indexing below to answer the questions on
    the next quiz. HINT: Note how the output we get from the site is a
    multi-level JSON document, so try making print statements to step through
    the structure one level at a time or copy the output to a separate output
    file.
    '''
    results = query_by_name(ARTIST_URL, query_type["simple"], "Nirvana")
    pretty_print(results)

    # The search hit at index 1 is the one inspected here.
    artist = results["artists"][1]
    artist_id = artist["id"]
    # Single-argument print() behaves the same on Python 2 and 3.
    print("\nARTIST:")
    pretty_print(artist)

    artist_data = query_site(ARTIST_URL, query_type["releases"], artist_id)
    releases = artist_data["releases"]
    print("\nONE RELEASE:")
    pretty_print(releases[0], indent=2)
    release_titles = [r["title"] for r in releases]

    print("\nALL TITLES:")
    for t in release_titles:
        print(t)


# Run the sample queries only when this code is executed directly, not when
# it is imported as a module.
if __name__ == '__main__':
    main()


#### Parsing XML

In [None]:
#!/usr/bin/env python
# Your task here is to extract data from xml on authors of an article
# and add it to a list, one item for an author.
# See the provided data structure for the expected format.
# The tags for first name, surname and email should map directly
# to the dictionary keys, but you have to extract the attributes from the "insr" tag
# and add them to the list for the dictionary key "insr"
import xml.etree.ElementTree as ET

# Path of the XML article that the get_authors test parses via get_root().
article_file = "exampleResearchArticle.xml"


def get_root(fname):
    """Parse the XML file at ``fname`` and return its root element."""
    return ET.parse(fname).getroot()

# Data looks like this:
# <aug>
#     <au>
#         <snm>Mei-Dan</snm>
#         <fnm>Omer</fnm>
#         <insr iid="I1"/>
#         <email>omer@extremegate.com</email>
#     </au>
#     <au>...

def get_authors(root):
    """Collect one dict per ``<au>`` element found under ``./fm/bibl/aug``.

    Args:
        root: Root element of the parsed article.

    Returns:
        A list of dicts with the keys ``"fnm"``, ``"snm"`` and ``"email"``
        (the text of the child element of that name, or ``None`` when the
        child is absent) and ``"insr"`` (the ``iid`` attribute of every
        ``<insr>`` child, in document order).

    Raises:
        KeyError: If an ``<insr>`` element has no ``iid`` attribute.
    """
    authors = []
    for author in root.findall('./fm/bibl/aug/au'):
        record = {}
        for tag in ("fnm", "snm", "email"):
            node = author.find(tag)
            # A missing child gives None instead of an AttributeError.
            record[tag] = node.text if node is not None else None
        record["insr"] = [insr.attrib['iid'] for insr in author.findall('./insr')]
        authors.append(record)

    return authors


def test():
    """Check get_authors() against the known authors of ``article_file``.

    Only the first author's full record and the second author's "insr" list
    are asserted; the remaining expected entries are kept for reference.
    """
    expected = [
        {'fnm': 'Omer', 'snm': 'Mei-Dan', 'email': 'omer@extremegate.com', 'insr': ['I1']},
        {'fnm': 'Mike', 'snm': 'Carmont', 'email': 'mcarmont@hotmail.com', 'insr': ['I2']},
        {'fnm': 'Lior', 'snm': 'Laver', 'email': 'laver17@gmail.com', 'insr': ['I3', 'I4']},
        {'fnm': 'Meir', 'snm': 'Nyska', 'email': 'nyska@internet-zahav.net', 'insr': ['I3']},
        {'fnm': 'Hagay', 'snm': 'Kammar', 'email': 'kammarh@gmail.com', 'insr': ['I8']},
        {'fnm': 'Gideon', 'snm': 'Mann', 'email': 'gideon.mann.md@gmail.com', 'insr': ['I3', 'I5']},
        {'fnm': 'Barnaby', 'snm': 'Clarck', 'email': 'barns.nz@gmail.com', 'insr': ['I6']},
        {'fnm': 'Eugene', 'snm': 'Kots', 'email': 'eukots@gmail.com', 'insr': ['I7']},
    ]

    parsed = get_authors(get_root(article_file))

    assert parsed[0] == expected[0]
    assert parsed[1]["insr"] == expected[1]["insr"]


# Run the check as soon as the cell executes; it reads article_file from disk.
test()

#### Scraping data off a website that has a drop down menu.
1. Build list of carrier values
2. Build list of airport values
3. Make HTTP requests to download all data
4. Parse local data files

In [1]:
from bs4 import BeautifulSoup #this parses HTML
import requests
import json

In [104]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Please note that the function 'make_request' is provided for your reference only.
# You will not be able to actually use it from within the Udacity web UI.
# Your task is to process the HTML using BeautifulSoup, extract the hidden
# form field values for "__EVENTVALIDATION" and "__VIEWSTATE" and set the appropriate
# values in the data dictionary.
# All your changes should be in the 'extract_data' function
from bs4 import BeautifulSoup
import requests
import json
import urllib

# html_page was used in the quiz; here it holds the URL of the "Data Elements"
# page, and test() further down passes it to extract_data().
html_page = "http://www.transtats.bts.gov/Data_Elements.aspx?Data=2"

# Using the urllib is what is suggested in the BeautifulSoup online documentation. 
# r = urllib.urlopen('http://www.transtats.bts.gov/Data_Elements.aspx?Data=2').read()
# soup = BeautifulSoup(r)

# But Stack Overflow and Udacity say the requests module should be used to get the HTML data.
# http://stackoverflow.com/questions/2018026/what-are-the-differences-between-the-urllib-urllib2-and-requests-module
# Some websites use cookies to keep the session alive, so fetch the page
# through a Session; make_request() later posts through this same object.
s = requests.Session()
r = s.get("http://www.transtats.bts.gov/Data_Elements.aspx?Data=2")
# Single-argument print() behaves the same on Python 2 and 3.
print(r.text)
# Name the parser explicitly so the result does not depend on which parsers
# happen to be installed; 'lxml' matches the parser extract_data() uses.
soup = BeautifulSoup(r.text, 'lxml')



<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" lang="en">
<head><title>
	Data Elements
</title>
    
    <script type="text/javascript" src="lib/topNavBar.js"></script>
	<script type="text/javascript" src="lib/no_ms.js"></script>
	<link rel="shortcut icon" href="favicon.ico" /><link href="styles/global.css" REL="stylesheet" TYPE="text/css" /><link rel="stylesheet" type="text/css" href="styles/local_Top_nav_main.css" /><link rel="stylesheet" type="text/css" href="styles/rita_main.css" /><link rel="stylesheet" type="text/css" href="styles/headerFooter.css" />

<script language="javascript" type="text/javascript">
function window_CarrierList(page)
{
    //aUrl="CarrierList.asp?xpage=" + xPage + "&flag=" + flag;
    aUrl="CarrierList.aspx?page=" + page;
  
	windef="Height=750,width=600,Left=0,Top=0,dependent=yes,resizable=Yes, scrollbars=yes";
    objWi

In [103]:
# Notebook cell: evaluating this expression displays the parsed <head> element.
soup.head

<head><title>\r\n\tData Elements\r\n</title>\n<script src="lib/topNavBar.js" type="text/javascript"></script>\n<script src="lib/no_ms.js" type="text/javascript"></script>\n<link href="favicon.ico" rel="shortcut icon"/><link href="styles/global.css" rel="stylesheet" type="text/css"/><link href="styles/local_Top_nav_main.css" rel="stylesheet" type="text/css"/><link href="styles/rita_main.css" rel="stylesheet" type="text/css"/><link href="styles/headerFooter.css" rel="stylesheet" type="text/css"/>\n<script language="javascript" type="text/javascript">\r\nfunction window_CarrierList(page)\r\n{\r\n    //aUrl="CarrierList.asp?xpage=" + xPage + "&flag=" + flag;\r\n    aUrl="CarrierList.aspx?page=" + page;\r\n  \r\n\twindef="Height=750,width=600,Left=0,Top=0,dependent=yes,resizable=Yes, scrollbars=yes";\r\n    objWindow = window.open("","Subwindow", windef);\r\n\tobjWindow.location.href=aUrl;\r\n\tobjWindow.focus();\r\n\r\n}\r\n\r\nfunction window_CarrierList_Foreign()\r\n{\r\n    //aUrl="Carr

In [47]:
# Count the <option> entries under the element with id "CarrierList".
# Single-argument print() behaves the same on Python 2 and 3.
print(len(soup.find(id='CarrierList').find_all('option')))

17


In [90]:
# Count the <input> elements inside the element with id "form1".
# Single-argument print() behaves the same on Python 2 and 3.
print(len(soup.find(id='form1').find_all('input')))
# Look up the two hidden form-field values; in a notebook only the value of
# the last expression in the cell is displayed.
soup.find(id='__EVENTVALIDATION')['value']
soup.find(id='__VIEWSTATE')['value']

5


'/wEPDwULLTEwNTQ0NzIzNzEPFg4eB3N0ckNvbm4FWlByb3ZpZGVyPS5ORVQgRnJhbWV3b3JrIERhdGEgUHJvdmlkZXIgZm9yIE9EQkM7RFNOPUVuZGVhdm91cjt1aWQ9d2VidXNlcjtwd2Q9IVdlYnVzZXIxMjM0Ox4FTUxpc3QFrQEnQVRMJywnQldJJywnQk9TJywnQ0xUJywnTURXJywnT1JEJywnREZXJywnREVOJywnRFRXJywnRkxMJywnSUFIJywnTEFTJywnTEFYJywnTUlBJywnTVNQJywnSkZLJywnTEdBJywnRVdSJywnTUNPJywnUEhMJywnUEhYJywnUERYJywnU0xDJywnU0FOJywnU0ZPJywnU0VBJywnVFBBJywnRENBJywnSUFEJx4MQWlycG9ydF9OYW1lBQ1BbGwgQWlycG9ydHMgHgZIZWFkZXIFB0ZsaWdodHMeCVNlbGVjdGlvbgUcQWxsIENhcnJpZXJzIC0gQWxsIEFpcnBvcnRzIB4EVW5pdGUeBlNvdXJjZQVDPEJSPlNPVVJDRTogQnVyZWF1IG9mIFRyYW5zcG9ydGF0aW9uIFN0YXRpc3RpY3MgVC0xMDAgU2VnbWVudCBkYXRhLhYGAggPDxYCHgRUZXh0BQdGbGlnaHRzZGQCCg8PFgIfBwUcQWxsIENhcnJpZXJzIC0gQWxsIEFpcnBvcnRzIGRkAgwPZBYUAgEPEA8WAh4UQXBwZW5kRGF0YUJvdW5kSXRlbXNnZA8WEWYCAQICAgMCBAIFAgYCBwIIAgkCCgILAgwCDQIOAg8CEBYREAUdQWxsIFUuUy4gYW5kIEZvcmVpZ24gQ2FycmllcnMFA0FsbGcQBRFBbGwgVS5TLiBDYXJyaWVycwUFQWxsVVNnEAUUQWxsIEZvcmVpZ24gQ2FycmllcnMFCkFsbEZvcmVpZ25nEAUQQWxhc2thIEFpcmxpbmVzIAUCQVNnEAUSQW1lcml

In [None]:
# This code is not expected to run here. It was copied from the Udacity tutorial, which stores
# the HTML file locally on its server. It is kept only as a conceptual reference.
def extract_data(page):
    """Read the two hidden form-field values from a saved HTML page.

    Args:
        page: Path of a local HTML file.

    Returns:
        A dict with the keys "eventvalidation" and "viewstate", holding the
        "value" attributes of the elements with ids ``__EVENTVALIDATION``
        and ``__VIEWSTATE`` found inside the element with id ``form1``.
    """
    with open(page, "r") as html:
        parsed = BeautifulSoup(html, 'lxml')

    # Both fields are looked up inside the same form element.
    form = parsed.find(id='form1')
    eventvalidation = form.find(id='__EVENTVALIDATION')['value']
    viewstate = form.find(id='__VIEWSTATE')['value']

    return {"eventvalidation": eventvalidation,
            "viewstate": viewstate}


def make_request(data):
    """POST the carrier/airport form and save the response HTML to disk.

    The request asks for airport "BOS" and carrier "VX" and is sent through
    the module-level session ``s``.  The response body is written to
    ``virgin_and_logan-airport.html`` in the current directory.

    Args:
        data: Dict with the keys ``"eventvalidation"`` and ``"viewstate"``,
            sent as the ``__EVENTVALIDATION`` and ``__VIEWSTATE`` fields.

    Returns:
        The response body as text.
    """
    import io

    eventvalidation = data["eventvalidation"]
    viewstate = data["viewstate"]

    # The variable "s" is the session created at the beginning up there.
    r = s.post("http://www.transtats.bts.gov/Data_Elements.aspx?Data=2",
               data={'AirportList': "BOS",
                     'CarrierList': "VX",
                     'Submit': 'Submit',
                     "__EVENTTARGET": "",
                     "__EVENTARGUMENT": "",
                     "__EVENTVALIDATION": eventvalidation,
                     "__VIEWSTATE": viewstate
                     })
    # The with-block closes the file, which the bare open() never did.
    # io.open takes an explicit encoding on both Python 2 and 3, so
    # non-ASCII text in the page cannot break the write.
    with io.open("virgin_and_logan-airport.html", "w", encoding="utf-8") as f:
        f.write(r.text)
    return r.text



def test():
    """Check that extract_data() returns the expected hidden-field prefixes.

    NOTE(review): html_page is defined in this file as a URL, while
    extract_data() passes its argument to open() - confirm that a local
    copy of the page is intended here.
    """
    fields = extract_data(html_page)
    assert fields["eventvalidation"] != ""
    assert fields["eventvalidation"].startswith("/wEWjAkCoIj1ng0")
    assert fields["viewstate"].startswith("/wEPDwUKLTI")

    
# Run the extract_data check as soon as the cell executes.
test()