# Read csv data

In [2]:
import os
# Directory containing the input file; test() joins it with DATAFILE via
# os.path.join, so the empty string means "current directory".
DATADIR = ""
# Name of the CSV file that test() hands to parse_file().
DATAFILE= "beatles-diskography.csv"


# Parse each row into a dictionary: the header names serve as keys and the row's fields serve as values
def parse_file(datafile, max_rows=10):
    """Parse the leading data rows of a comma-separated file into dicts.

    The first line of the file is the header; its comma-separated names become
    the dictionary keys. Every following line is split on commas and paired
    with the header names position by position. Keys and values are stripped
    of surrounding whitespace.

    Lines are split on every comma, so quoted values that themselves contain
    commas are not supported.

    Args:
        datafile: Path of the file to read; its bytes are decoded with the
            default codec of ``bytes.decode`` (UTF-8).
        max_rows: Maximum number of data rows to parse. Defaults to 10; pass
            ``None`` to read every row.

    Returns:
        A list of dictionaries, one per parsed row, in file order.
    """
    data = []
    with open(datafile, "rb") as f:
        # Strip the header names once instead of once per cell.
        header = [name.strip() for name in f.readline().decode().split(",")]

        for row_number, line in enumerate(f):
            if row_number == max_rows:
                break

            fields = line.decode().split(",")
            # zip() stops at the shorter sequence, so a row that has more
            # fields than the header no longer raises IndexError; the surplus
            # fields are ignored.
            data.append(
                {key: value.strip() for key, value in zip(header, fields)}
            )
    return data


In [3]:
def test():
    """Check the first and tenth parsed rows against their known values."""
    expected_first = {
        'Title': 'Please Please Me',
        'UK Chart Position': '1',
        'Label': 'Parlophone(UK)',
        'Released': '22 March 1963',
        'US Chart Position': '-',
        'RIAA Certification': 'Platinum',
        'BPI Certification': 'Gold',
    }
    expected_tenth = {
        'Title': '',
        'UK Chart Position': '1',
        'Label': 'Parlophone(UK)',
        'Released': '10 July 1964',
        'US Chart Position': '-',
        'RIAA Certification': '',
        'BPI Certification': 'Gold',
    }

    # Parse the configured file and compare rows 1 and 10 with the references.
    parsed = parse_file(os.path.join(DATADIR, DATAFILE))

    assert parsed[0] == expected_first
    assert parsed[9] == expected_tenth


test()

# Work with xlrd

In [4]:
import xlrd
from zipfile import ZipFile
# Excel workbook that test() passes to parse_file(); open_zip() expects the
# zipped copy to be named "<datafile>.zip".
datafile = "2013_ERCOT_Hourly_Load_Data.xls"

In [6]:
def open_zip(datafile):
    """Unpack the archive ``<datafile>.zip`` into the current working directory."""
    archive_name = '{0}.zip'.format(datafile)
    archive = ZipFile(archive_name, 'r')
    with archive:
        # No target path is given, so members are extracted relative to cwd.
        archive.extractall()


def parse_file(datafile):
    """Summarise the values in column 1 of the workbook's first sheet.

    Row 0 is treated as a header and skipped. Column 0 is read as Excel serial
    dates and converted with ``xlrd.xldate_as_tuple`` (datemode 0); column 1
    is converted to floats.
    NOTE(review): column 1 is presumably the COAST region, judging by the
    'avgcoast' key -- confirm against the workbook.

    Args:
        datafile: Path of the Excel workbook to open with xlrd.

    Returns:
        A dict with the keys
        'maxtime'  -- date tuple of the first row holding the largest value,
        'maxvalue' -- the largest value,
        'mintime'  -- date tuple of the first row holding the smallest value,
        'minvalue' -- the smallest value,
        'avgcoast' -- the arithmetic mean of all values.

    Raises:
        ValueError: If the sheet has no data rows (max() of an empty list).
    """
    workbook = xlrd.open_workbook(datafile)
    sheet = workbook.sheet_by_index(0)

    # Only the two columns that are actually used are read; the header row
    # (row 0) is skipped.
    data_rows = range(1, sheet.nrows)
    times = [xlrd.xldate_as_tuple(sheet.cell_value(r, 0), 0) for r in data_rows]
    loads = [float(sheet.cell_value(r, 1)) for r in data_rows]

    # Compute the extremes once and reuse them for both value and lookup.
    max_load = max(loads)
    min_load = min(loads)

    data = {
            'maxtime': times[loads.index(max_load)],
            'maxvalue': max_load,
            'mintime': times[loads.index(min_load)],
            'minvalue': min_load,
            'avgcoast': sum(loads) / len(loads)
    }
    return data


def test():
    """Verify the time and value of the maximum reported by parse_file."""
    summary = parse_file(datafile)

    expected_max_time = (2013, 8, 13, 17, 0, 0)
    expected_max_value = 18770.166858114047

    assert summary['maxtime'] == expected_max_time
    # Compare at 10 decimal places to absorb floating-point noise.
    assert round(summary['maxvalue'], 10) == round(expected_max_value, 10)


test()

# Work with Json

In [7]:
"""
To experiment with this code freely you will have to run this code locally.
Take a look at the main() function for an example of how to use the code. We
have provided example json output in the other code editor tabs for you to look
at, but you will not be able to run any queries through our UI.
"""
import json
import requests

# Root of the MusicBrainz web service (version 2 per the URL path) and the
# artist endpoint built on top of it.
BASE_URL = "http://musicbrainz.org/ws/2/"
ARTIST_URL = BASE_URL + "artist/"


# Query parameters are given to the requests.get function as a dictionary;
# this variable contains some starter parameter sets, selected by name.
# The "inc" entries are passed through as the "inc" query parameter.
query_type = {  "simple": {},
                "atr": {"inc": "aliases+tags+ratings"},
                "aliases": {"inc": "aliases"},
                "releases": {"inc": "releases"}}


def query_site(url, params, uid="", fmt="json"):
    """
    This is the main function for making queries to the musicbrainz API. The
    query should return a json document.

    Args:
        url: Endpoint URL; ``uid`` is appended to it verbatim.
        params: Mapping of query parameters. It is copied, so the caller's
            dictionary is left untouched.
        uid: Optional id appended to ``url``.
        fmt: Value sent as the ``fmt`` query parameter.

    Returns:
        The decoded JSON body when the response status is OK. ``None`` when
        the status is not OK but ``raise_for_status`` does not raise.

    Raises:
        requests.HTTPError: Raised by ``raise_for_status`` for error statuses.
    """
    # Copy before adding "fmt": callers pass the shared dictionaries stored in
    # query_type, and writing into them would leak keys into later queries.
    request_params = dict(params, fmt=fmt)
    r = requests.get(url + uid, params=request_params)
    print("requesting", r.url)

    if r.status_code == requests.codes.ok:
        return r.json()

    r.raise_for_status()
    return None


def query_by_name(url, params, name):
    """
    This adds an artist name to the query parameters before making an API call
    to query_site.

    Args:
        url: Endpoint URL handed on to query_site.
        params: Mapping of base query parameters; it is not modified.
        name: Artist name; sent as the search expression ``artist:<name>``.

    Returns:
        Whatever query_site returns for the search request.
    """
    # Build a new dictionary instead of writing into `params`: callers pass
    # shared entries of query_type, and a stored "query" key would otherwise
    # be sent along with every later request that reuses the same entry.
    search_params = dict(params, query="artist:" + name)
    return query_site(url, search_params)


def pretty_print(data, indent=4):
    """
    After we get our output, we can use this function to format it to be more
    readable.

    Args:
        data: Object to print. Dictionaries (including dict subclasses such as
            OrderedDict) are printed as indented JSON with sorted keys;
            anything else is printed unchanged.
        indent: Number of spaces per JSON nesting level.
    """
    # isinstance() also accepts dict subclasses, which an exact type
    # comparison would print as a raw repr.
    if isinstance(data, dict):
        print(json.dumps(data, indent=indent, sort_keys=True))
    else:
        print(data)


def main():
    """
    Example investigation to get you started. Adjust the calls and the
    indexing below to answer the questions on the next quiz.

    HINT: The site answers with a multi-level JSON document. Print it one
    level at a time, or copy the output into a separate file, to work out how
    the data is structured. Expect to experiment and iterate.
    """

    # Search the database for bands called Nirvana
    search = query_by_name(ARTIST_URL, query_type["simple"], "Nirvana")
    pretty_print(search)

    # Look at the 4th band in the result list (index 3)
    print ("\nARTIST:")
    band = search["artists"][3]
    pretty_print(band)

    # Fetch that band's releases, addressing it by its id
    band_details = query_site(ARTIST_URL, query_type["releases"], band["id"])
    band_releases = band_details["releases"]

    # Show the first release in full
    print ("\nONE RELEASE:")
    pretty_print(band_releases[0], indent=2)

    # Collect every title first, then list them one per line
    titles = [release["title"] for release in band_releases]
    print ("\nALL TITLES:")
    for title in titles:
        print (title)


if __name__ == '__main__':
    main()

  "name": "usa"
                },
                {
                    "count": 0,
                    "name": "alternative"
                },
                {
                    "count": 0,
                    "name": "am\u00e9ricain"
                },
                {
                    "count": 0,
                    "name": "legendary"
                },
                {
                    "count": 1,
                    "name": "acoustic rock"
                },
                {
                    "count": 3,
                    "name": "noise rock"
                },
                {
                    "count": 0,
                    "name": "90"
                },
                {
                    "count": 0,
                    "name": "northwest"
                },
                {
                    "count": 0,
                    "name": "rock and indie"
                },
                {
                    "count": 0,
                    "name": "unit

In [8]:
# How many bands named "First Aid Kit"?
results = query_by_name(ARTIST_URL, query_type["simple"], "First Aid Kit")

# Count only the results whose name equals the query string exactly, and
# remember their ids.
ids, count = [], 0 
for i in results['artists']:
    if i['name'] == "First Aid Kit":
        ids.append(i['id'])
        count += 1
print('The number of bands named "First Aid Kit" is %i'%count)

# Begin-area name for Queen: read from the first search result.
# NOTE(review): assumes the first result is the intended artist -- confirm.
results = query_by_name(ARTIST_URL, query_type["simple"], "Queen")
artist = results["artists"][0]
print("The begin-area name for Queen is %s"%artist["begin-area"]["name"])

# Spanish alias for Beatles: again taken from the first search result.
results = query_by_name(ARTIST_URL, query_type["simple"], "Beatles")
artist = results["artists"][0]

# Print every alias whose locale is 'es'; each alias entry must carry a
# 'locale' key, otherwise this raises KeyError.
for i in artist['aliases']:
    if i['locale'] == 'es':
        print('Spanish alias for Beatles is %s'%i['name'])

# Nirvana disambiguation: the 'disambiguation' field of the first result.
results = query_by_name(ARTIST_URL, query_type["simple"], "Nirvana")
print('Nirvana disambiguation %s'%results['artists'][0]['disambiguation'])

# When was One Direction formed: the 'begin' entry of the first result's
# 'life-span'.
results = query_by_name(ARTIST_URL, query_type["simple"], "one direction")
print('One Direction formed in %s'%results['artists'][0]['life-span']['begin'])

requesting http://musicbrainz.org/ws/2/artist/?query=artist%3AFirst+Aid+Kit&fmt=json
The number of bands named "First Aid Kit" is 2
requesting http://musicbrainz.org/ws/2/artist/?query=artist%3AQueen&fmt=json
The begin-area name for Queen is London
requesting http://musicbrainz.org/ws/2/artist/?query=artist%3ABeatles&fmt=json
Spanish alias for Beatles is Los Beatles
requesting http://musicbrainz.org/ws/2/artist/?query=artist%3ANirvana&fmt=json
Nirvana disambiguation 90s US grunge band
requesting http://musicbrainz.org/ws/2/artist/?query=artist%3Aone+direction&fmt=json
One Direction formed in 2010-07


# Quiz 1

In [None]:
"""
Your task is to process the supplied file and use the csv module to extract data from it.
The data comes from NREL (National Renewable Energy Laboratory) website. Each file
contains information from one meteorological station, in particular - about amount of
solar and wind energy for each hour of day.

Note that the first line of the datafile is neither data entry, nor header. It is a line
describing the data source. You should extract the name of the station from it.

The data should be returned as a list of lists (not dictionaries).
You can use the csv modules "reader" method to get data in such format.
Another useful method is next() - to get the next line from the iterator.
You should only change the parse_file function.
"""
import csv
import os

# Directory and file name of the station data file; test() joins them with
# os.path.join before calling parse_file().
DATADIR = ""
DATAFILE = "745090.csv"


def parse_file(datafile):
    """Read a station CSV file and return the station name and data rows.

    The first row describes the data source; its second field is taken as the
    station name. The second row is the column header and is dropped. All
    remaining rows are returned as lists of strings.

    Args:
        datafile: Path of the CSV file to read.

    Returns:
        A ``(name, data)`` tuple: the station name and the list of data rows.

    Raises:
        IndexError: If the file is empty or its first row has fewer than two
            fields.
    """
    # Open the file that was passed in; previously the argument was ignored
    # and DATADIR + DATAFILE was always read instead.
    with open(datafile, newline='') as f:
        rows = list(csv.reader(f, delimiter=','))

    name = rows[0][1]
    # rows[1] is the header line, which is not part of the returned data.
    data = rows[2:]

    # Do not change the line below
    return (name, data)


def test():
    """Check the station name and a few sample cells returned by parse_file."""
    name, data = parse_file(os.path.join(DATADIR, DATAFILE))

    assert name == "MOUNTAIN VIEW MOFFETT FLD NAS"

    # (row, column, expected value) samples, checked in this order.
    samples = (
        (0, 1, "01:00"),
        (2, 0, "01/01/2005"),
        (2, 5, "2"),
    )
    for row, column, expected in samples:
        assert data[row][column] == expected


if __name__ == "__main__":
    test()

# Quiz 2

In [114]:
# -*- coding: utf-8 -*-
'''
Find the time and value of max load for each of the regions
COAST, EAST, FAR_WEST, NORTH, NORTH_C, SOUTHERN, SOUTH_C, WEST
and write the result out in a csv file, using pipe character | as the delimiter.

An example output can be seen in the "example.csv" file.
'''

import xlrd
import os
import csv
from zipfile import ZipFile

# Input workbook; open_zip() extracts it from "<datafile>.zip".
datafile = "2013_ERCOT_Hourly_Load_Data.xls"
# Pipe-delimited CSV file that test() has save_file() write and then reads back.
outfile = "2013_Max_Loads.csv"


def open_zip(datafile):
    """Unpack the archive ``<datafile>.zip`` into the current working directory."""
    archive_name = '{0}.zip'.format(datafile)
    archive = ZipFile(archive_name, 'r')
    with archive:
        # No target path is given, so members are extracted relative to cwd.
        archive.extractall()


def parse_file(datafile):
    """Find the maximum load and the time it occurred for each region.

    Row 0 of the first sheet is the header; column 0 holds Excel serial dates
    (converted with datemode 0) and each further column holds the values of
    the station named in its header cell. Only the eight regions named in the
    task are processed; any other column is skipped.
    NOTE(review): the workbook presumably carries at least one extra column
    (for example a grid-wide total), which would otherwise produce a ninth
    output row and fail the row-count check in test() -- confirm.

    Args:
        datafile: Path of the Excel workbook to open with xlrd.

    Returns:
        A list of six parallel lists:
        ``[station, year, month, day, hour, max_load]``.

    Raises:
        ValueError: If the sheet has no data rows (max() of an empty list).
    """
    regions = ('COAST', 'EAST', 'FAR_WEST', 'NORTH',
               'NORTH_C', 'SOUTHERN', 'SOUTH_C', 'WEST')

    workbook = xlrd.open_workbook(datafile)
    sheet = workbook.sheet_by_index(0)

    header = [sheet.cell_value(0, col) for col in range(sheet.ncols)]
    data_rows = range(1, sheet.nrows)
    timestamps = [xlrd.xldate_as_tuple(sheet.cell_value(r, 0), 0)
                  for r in data_rows]

    station, max_year, max_month, max_day, max_hour, max_load = (
        [], [], [], [], [], [])
    for col in range(1, sheet.ncols):
        if header[col] not in regions:
            continue

        loads = [sheet.cell_value(r, col) for r in data_rows]
        # Compute the peak once; index() then yields its first occurrence.
        peak = max(loads)
        year, month, day, hour = timestamps[loads.index(peak)][:4]

        station.append(header[col])
        max_year.append(year)
        max_month.append(month)
        max_day.append(day)
        max_hour.append(hour)
        max_load.append(peak)

    data = [station, max_year, max_month, max_day, max_hour, max_load]
    return data

def save_file(data, filename):
    """Write the per-station maxima to ``filename`` as pipe-delimited CSV.

    ``data`` holds six parallel sequences (station, year, month, day, hour,
    max load); one output row is written per entry of ``data[0]``, after a
    header row.
    """
    column_titles = ['Station','Year','Month','Day','Hour','Max Load']
    with open(filename, 'w', newline='') as out:
        pipe_writer = csv.writer(out, delimiter='|')
        pipe_writer.writerow(column_titles)
        for idx, _ in enumerate(data[0]):
            # Pick the idx-th entry from each of the six columns.
            pipe_writer.writerow([data[col][idx] for col in range(6)])
    
def test():
    """Run unzip -> parse -> save and validate the CSV file that was written.

    Checks the FAR_WEST row against known values, that exactly eight data
    rows were written, and that the station names match the expected set.
    """
    open_zip(datafile)
    save_file(parse_file(datafile), outfile)

    expected = {'FAR_WEST': {'Max Load': '2281.2722140000024',
                             'Year': '2013',
                             'Month': '6',
                             'Day': '26',
                             'Hour': '17'}}
    expected_stations = ['COAST', 'EAST', 'FAR_WEST', 'NORTH',
                         'NORTH_C', 'SOUTHERN', 'SOUTH_C', 'WEST']
    exact_fields = ['Year', 'Month', 'Day', 'Hour']

    seen_stations = []
    with open(outfile) as result_file:
        for record in csv.DictReader(result_file, delimiter="|"):
            name = record['Station']
            seen_stations.append(name)
            if name != 'FAR_WEST':
                continue

            # Date parts must match the reference strings exactly.
            for field in exact_fields:
                assert expected[name][field] == record[field]

            # 'Max Load' is compared after rounding both to one decimal.
            reference_load = round(float(expected[name]['Max Load']), 1)
            written_load = round(float(record['Max Load']), 1)
            assert reference_load == written_load

        # Output should be 8 lines not including header
        assert len(seen_stations) == 8

        # Check Station Names
        assert set(seen_stations) == set(expected_stations)

        
   


# Quiz 3

In [122]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
This exercise shows some important concepts that you should be aware about:
- using codecs module to write unicode files
- using authentication with web APIs
- using offset when accessing web APIs

To run this code locally you have to register at the NYTimes developer site 
and get your own API key. You will be able to complete this exercise in our UI
without doing so, as we have provided a sample result. (See the file 
'popular-viewed-1.json' from the tabs above.)

Your task is to modify the article_overview() function to process the saved
file that represents the most popular articles (by view count) from the last
day, and return a tuple of variables containing the following data:
- labels: list of dictionaries, where the keys are the "section" values and
  values are the "title" values for each of the retrieved articles.
- urls: list of URLs for all 'media' entries with "format": "Standard Thumbnail"

All your changes should be in the article_overview() function. See the test() 
function for examples of the elements of the output lists.
The rest of functions are provided for your convenience, if you want to access
the API by yourself.
"""
import json
import codecs
import requests

# Base URL of the NYTimes web services and the "most popular" endpoint (v2)
# built on top of it.
URL_MAIN = "http://api.nytimes.com/svc/"
URL_POPULAR = URL_MAIN + "mostpopular/v2/"
# API keys, looked up by target name in query_site(); both are empty here and
# query_site() refuses to send a request while either one is empty.
API_KEY = { "popular": "",
            "article": ""}


def get_from_file(kind, period):
    """Load the saved JSON document ``popular-<kind>-<period>.json`` from cwd."""
    path = "popular-{0}-{1}.json".format(kind, period)
    with open(path, "r") as source:
        return json.load(source)


def article_overview(kind, period):
    """Summarise the saved articles for ``kind``/``period``.

    Returns a ``(titles, urls)`` tuple: ``titles`` holds one single-entry
    dict ``{section: title}`` per article, ``urls`` holds the URL of every
    media-metadata entry whose format is "Standard Thumbnail".
    """
    articles = get_from_file(kind, period)
    titles, urls = [], []

    for item in articles:
        section, title = item["section"], item["title"]
        titles.append({section: title})

        # Articles without a "media" entry contribute no thumbnails.
        if "media" not in item:
            continue

        urls.extend(
            meta["url"]
            for medium in item["media"]
            for meta in medium["media-metadata"]
            if meta["format"] == "Standard Thumbnail"
        )

    return (titles, urls)


def query_site(url, target, offset):
    """Send one request to ``url`` using the API key for ``target``.

    The service hands out results in chunks, so ``offset`` is passed along to
    select which chunk to return. Returns the decoded JSON on an OK status,
    and False (after printing a hint) while an API key is still missing.
    """
    # Both keys must be filled in before any request is sent.
    if any(API_KEY[name] == "" for name in ("popular", "article")):
        print ("You need to register for NYTimes Developer account to run this program.")
        print ("See Intructor notes for information")
        return False

    response = requests.get(
        url, params={"api-key": API_KEY[target], "offset": offset}
    )

    if response.status_code != requests.codes.ok:
        response.raise_for_status()
        return None
    return response.json()


def get_popular(url, kind, days, section="all-sections", offset=0):
    """Build the "most popular" request URL and fetch it through query_site.

    Prints a message and returns False when ``days`` or ``kind`` is not one
    of the accepted values; otherwise returns what query_site returns.
    """
    if days not in (1, 7, 30):
        print ("Time period can be 1,7, 30 days only")
        return False
    if kind not in ("viewed", "shared", "emailed"):
        print ("kind can be only one of viewed/shared/emailed")
        return False

    endpoint = url + "most{0}/{1}/{2}.json".format(kind, section, days)
    return query_site(endpoint, "popular", offset)


def save_file(kind, period):
    """Download all popular-article results and save them as one JSON file.

    The API is called repeatedly with an increasing offset (20 results per
    request) until ``num_results`` entries have been requested; the combined
    results are written to ``popular-<kind>-<period>.json`` as UTF-8.

    Args:
        kind: Article ranking to fetch, passed on to get_popular.
        period: Time period in days, passed on to get_popular.

    NOTE(review): get_popular returns False for invalid arguments or missing
    API keys, in which case the subscripting below raises TypeError.
    """
    # Ask for the result count of the ranking that is actually being saved;
    # previously this call was hard-coded to "viewed" / 1.
    first_page = get_popular(URL_POPULAR, kind, period)
    num_results = first_page["num_results"]

    # Download everything before opening the output file, so that a failed
    # request cannot leave a truncated, empty file behind.
    full_data = []
    for offset in range(0, num_results, 20):
        page = get_popular(URL_POPULAR, kind, period, offset=offset)
        full_data += page["results"]

    with codecs.open("popular-{0}-{1}.json".format(kind, period), encoding='utf-8', mode='w') as v:
        v.write(json.dumps(full_data, indent=2))


def test():
    """Check list sizes and one sample entry of each article_overview list."""
    expected_title = {'Opinion': 'Professors, We Need You!'}
    expected_thumbnail = 'http://graphics8.nytimes.com/images/2014/02/17/sports/ICEDANCE/ICEDANCE-thumbStandard.jpg'

    titles, urls = article_overview("viewed", 1)

    assert len(titles) == 20
    assert len(urls) == 30
    assert titles[2] == expected_title
    assert urls[20] == expected_thumbnail


if __name__ == "__main__":
    test()