### Using csv Module

In [1]:
import csv
import os

# Directory that holds the data file; test() joins it with DATAFILE, and the
# empty string leaves the path as just the file name.
DATADIR = ""
# Name of the CSV file that test() hands to parse_file.
DATAFILE = "745090.csv"

In [2]:
def read_file(csv):
    """Read a CSV file and return its rows keyed by the header line.

    Args:
        csv: Path of the CSV file to read. The parameter name hides the
            ``csv`` module inside this function, which is why the module is
            imported locally under an alias.

    Returns:
        list: One mapping per data row, as produced by ``csv.DictReader``
        (column name -> string value).
    """
    # The argument is called `csv`, so the module must be reached by alias;
    # calling `csv.DictReader` here would look the attribute up on the path.
    import csv as csv_module

    # The csv module on Python 3 needs a text-mode file opened with newline=''.
    with open(csv, 'r', newline='') as f:
        # Read every row while the file is still open: a DictReader is lazy
        # and cannot be iterated once the `with` block has closed the file.
        return list(csv_module.DictReader(f))

In [11]:
def parse_file(datafile):
    """Split a CSV file into a name and its data rows.

    The name is the second field of the first line. The line after it is
    discarded, and every remaining line is kept as a list of strings.

    Args:
        datafile: Path of the CSV file to read.

    Returns:
        tuple: ``(name, data)`` where ``data`` is a list of rows.
    """
    with open(datafile,'r') as source:
        rows = csv.reader(source)
        first_line = next(rows)
        name = first_line[1]
        next(rows)  # discard the line that follows the first one
        data = list(rows)

    return (name, data)

In [12]:
def test():
    """Check parse_file against known values from DATAFILE and print a sample.

    Raises AssertionError when the parsed name or any of the spot-checked
    cells differs from the expected value.
    """
    name, data = parse_file(os.path.join(DATADIR, DATAFILE))

    assert name == "MOUNTAIN VIEW MOFFETT FLD NAS"
    assert data[0][1] == "01:00"

    # The remaining checks and the printout all look at the third data row.
    third_row = data[2]
    assert third_row[0] == "01/01/2005"
    assert third_row[5] == "2"

    print (name)
    print (third_row)
    
# Run the CSV parsing check when this code executes as the main program.
if __name__ == "__main__":
    test()

MOUNTAIN VIEW MOFFETT FLD NAS
['01/01/2005', '03:00', '0', '0', '0', '2', '0', '0', '2', '0', '0', '2', '0', '0', '2', '0', '0', '2', '0', '0', '2', '0', '0', '2', '0', '8', 'E', '9', '8', 'E', '9', '7.0', 'A', '7', '6.0', 'A', '7', '93', 'A', '7', '1013', 'A', '7', '120', 'A', '7', '2.1', 'A', '7', '16100', 'A', '7', '2100', 'A', '7', '1.1', 'E', '8', '0.099', 'F', '8', '0.160', 'F', '8', '0', '1', 'A', '7']


### Excel to csv

In [13]:
import xlrd
import os
import csv
from zipfile import ZipFile

# Excel workbook that test() passes to parse_file.
datafile = "2013_ERCOT_Hourly_Load_Data.xls"
# CSV file that test() has save_file write and then reads back.
outfile = "2013_Max_Loads.csv"

In [14]:
def open_zip(datafile):
    """Extract ``<datafile>.zip`` into the current working directory.

    Args:
        datafile: Name of the archive without its ``.zip`` suffix.
    """
    archive_name = '{0}.zip'.format(datafile)
    with ZipFile(archive_name, 'r') as archive:
        # No target directory is given, so members land in the working directory.
        archive.extractall()

In [15]:
def parse_file(datafile):
    """Find the maximum value, and when it occurred, for each listed station.

    Reads the first sheet of the workbook. Row 0 is treated as the header and
    column 0 as an Excel date/time value. For every column except the first
    and the last, the largest value and the timestamp on the same row are
    recorded.

    Args:
        datafile: Path of the Excel workbook to open with xlrd.

    Returns:
        list: One row per station in the fixed ``stations`` order, shaped as
        ``[station, year, month, day, hour, max_load]``. The same list is
        also printed.

    Raises:
        KeyError: If one of the listed stations is not a column header.
    """
    workbook = xlrd.open_workbook(datafile)
    sheet = workbook.sheet_by_index(0)

    maxvalues = {} # structure: {label: [maxload(0), time(1)]}
    # Column 0 carries the timestamps and the final column is left out
    # (presumably a total across stations; confirm against the workbook).
    for col in range(1, sheet.ncols - 1):
        label = sheet.cell_value(0, col)
        column = sheet.col_values(col, start_rowx=1, end_rowx=sheet.nrows)
        maxload = max(column)  # computed once and reused for the row lookup
        # `column` starts at sheet row 1, so shift the index back to sheet rows.
        maxrownum = column.index(maxload) + 1
        maxvalues[label] = [maxload, sheet.cell_value(maxrownum, 0)]

    stations = ['COAST', 'EAST', 'FAR_WEST', 'NORTH', 'NORTH_C', 'SOUTHERN', 'SOUTH_C', 'WEST']
    data = []
    for station in stations:
        maxload, xltime = maxvalues[station]
        # Use the workbook's own date mode (1900- or 1904-based) rather than
        # assuming 0, so dates are right for either kind of workbook.
        time = xlrd.xldate_as_tuple(xltime, workbook.datemode)
        row = [station]
        row.extend(time[:4])  # year, month, day, hour
        row.append(maxload)
        data.append(row)

    print (data)
    return data

In [16]:
def save_file(data, filename):
    """Write the station rows to a pipe-delimited CSV file with a header line.

    Args:
        data: Iterable of rows, each ordered as
            ``[station, year, month, day, hour, max_load]``.
        filename: Path of the file to create or overwrite.
    """
    # newline='' stops the text layer from translating the writer's own
    # '\r\n' line endings, which would otherwise give blank rows on Windows.
    with open(filename, 'w', newline='') as csvfile:
        datawriter = csv.writer(csvfile, delimiter = '|')
        datawriter.writerow(['Station', 'Year', 'Month', 'Day', 'Hour', 'Max Load'])
        datawriter.writerows(data)

In [18]:
def test():
    """Run the Excel-to-CSV steps and verify the FAR_WEST row of the output.

    Parses ``datafile``, saves the result to ``outfile``, then reads
    ``outfile`` back and compares each checked field of the FAR_WEST row with
    the expected strings.
    """
    # The open_zip(datafile) step is left disabled here.
    save_file(parse_file(datafile), outfile)

    expected = {'FAR_WEST': {'Max Load': "2281.2722140000024", 'Year': "2013", "Month": "6", "Day": "26", "Hour": "17"}}
    checked_fields = ["Year", "Month", "Day", "Hour", "Max Load"]

    with open(outfile) as of:
        for line in csv.DictReader(of, delimiter="|"):
            station = line["Station"]
            if station != 'FAR_WEST':
                continue
            for field in checked_fields:
                assert expected[station][field] == line[field]

        
# Run the Excel-to-CSV check; parse_file prints the parsed rows as it runs.
test()

[['COAST', 2013, 8, 13, 17, 18779.025510000003], ['EAST', 2013, 8, 5, 17, 2380.1654089999956], ['FAR_WEST', 2013, 6, 26, 17, 2281.2722140000024], ['NORTH', 2013, 8, 7, 17, 1544.7707140000005], ['NORTH_C', 2013, 8, 7, 18, 24415.570226999993], ['SOUTHERN', 2013, 8, 8, 16, 5494.157645], ['SOUTH_C', 2013, 8, 8, 18, 11433.30491600001], ['WEST', 2013, 8, 7, 17, 1862.6137649999998]]


### Wrangling JSON

In [19]:
import json
import codecs
import requests

# Base URL of the NYTimes web service and of its "most popular" endpoint.
URL_MAIN = "http://api.nytimes.com/svc/"
URL_POPULAR = URL_MAIN + "mostpopular/v2/"
# API keys, looked up by target name in query_site. Both are blank here, and
# query_site refuses to send a request while either one is empty.
API_KEY = { "popular": "",
            "article": ""}

In [20]:
def get_from_file(kind, period):
    """Load saved article data from ``popular-<kind>-<period>.json``.

    Args:
        kind: Value substituted into the first slot of the file name.
        period: Value substituted into the second slot of the file name.

    Returns:
        The decoded JSON content of the file.
    """
    path = "popular-{0}-{1}.json".format(kind, period)
    with open(path, "r") as handle:
        return json.load(handle)

In [21]:
def article_overview(kind, period):
    """Summarise saved articles as section/title pairs and thumbnail URLs.

    Args:
        kind: Passed through to get_from_file to pick the saved file.
        period: Passed through to get_from_file to pick the saved file.

    Returns:
        tuple: ``(titles, urls)``. ``titles`` holds one ``{section: title}``
        dict per article, in file order. ``urls`` holds the ``'url'`` of every
        media-metadata entry whose ``'format'`` is ``'Standard Thumbnail'``.
    """
    data = get_from_file(kind, period)
    titles = []
    urls = []

    for article in data:
        titles.append({article['section']: article['title']})

        # Expected layout:
        # { 'media': [ { 'media-metadata': [{},{},{}] } , ... ] ... }
        # An article may lack the 'media' key or carry an empty value there;
        # either way it contributes no URLs instead of raising KeyError.
        for item in article.get('media') or []:
            for media in item['media-metadata']:
                if media['format'] == 'Standard Thumbnail':
                    urls.append(media['url'])

    return (titles, urls)

In [23]:
def query_site(url, target, offset):
    """Send one GET request to the API with the key for ``target`` and an offset.

    Web services often use an offset parameter to return data in small
    chunks. NYTimes returns 20 articles per request, so the next 20 are
    fetched by passing a larger offset.

    Args:
        url: Full URL to request.
        target: Key into API_KEY selecting which API key to send.
        offset: Value sent as the ``offset`` query parameter.

    Returns:
        The decoded JSON body when the status code equals ``requests.codes.ok``.
        ``False`` when either API key is blank. For any other status,
        ``raise_for_status`` is called and, if it does not raise, ``None`` is
        returned.
    """
    keys_missing = API_KEY["popular"] == "" or API_KEY["article"] == ""
    if keys_missing:
        print ("You need to register for NYTimes Developer account to run this program.")
        print ("See Intructor notes for information")
        return False

    response = requests.get(url, params={"api-key": API_KEY[target], "offset": offset})

    if response.status_code != requests.codes.ok:
        response.raise_for_status()
        return None
    return response.json()

In [25]:
def get_popular(url, kind, days, section="all-sections", offset=0):
    """Validate the arguments, build the most-popular URL and query it.

    Args:
        url: Accepted but not used; the request URL is always built from
            URL_POPULAR.
        kind: One of ``"viewed"``, ``"shared"`` or ``"emailed"``.
        days: Time period; one of 1, 7 or 30.
        section: Section name placed in the URL path.
        offset: Offset passed on to query_site.

    Returns:
        Whatever query_site returns for the built URL, or ``False`` (after
        printing a message) when ``days`` or ``kind`` is not allowed.
    """
    allowed_days = (1, 7, 30)
    allowed_kinds = ("viewed", "shared", "emailed")

    if days not in allowed_days:
        print ("Time period can be 1,7, 30 days only")
        return False
    if kind not in allowed_kinds:
        print ("kind can be only one of viewed/shared/emailed")
        return False

    endpoint = "most{0}/{1}/{2}.json".format(kind, section, days)
    return query_site(URL_POPULAR + endpoint, "popular", offset)

In [26]:
def save_file(kind, period):
    """Download every page of a most-popular listing into one JSON file.

    The API is called repeatedly with an increasing offset (20 results per
    request). The combined results are written to
    ``popular-<kind>-<period>-full.json``.

    Args:
        kind: Listing kind passed to get_popular.
        period: Number of days passed to get_popular.

    Returns:
        None. Nothing is written when the first request yields no data (for
        example when get_popular returns ``False``).
    """
    # Ask about the listing that is actually being saved, so that num_results
    # describes the same kind/period as the pages fetched below.
    first_page = get_popular(URL_POPULAR, kind, period)
    if not first_page:
        # get_popular returned False/None instead of data; indexing it would
        # only raise a TypeError, so stop here.
        return
    num_results = first_page["num_results"]

    full_data = []
    for offset in range(0, num_results, 20):
        data = get_popular(URL_POPULAR, kind, period, offset=offset)
        full_data += data["results"]

    # Open the output only after all pages are in hand, so a failed download
    # does not leave an empty or truncated file behind.
    with codecs.open("popular-{0}-{1}-full.json".format(kind, period), encoding='utf-8', mode='w') as v:
        v.write(json.dumps(full_data, indent=2))

In [28]:
def test():
    """Check article_overview("viewed", 1) against known counts and samples.

    Raises AssertionError when the number of titles or URLs, the third title
    entry, or the URL at index 20 differs from the expected value.
    """
    expected_title = {'Opinion': 'Professors, We Need You!'}
    expected_url = 'http://graphics8.nytimes.com/images/2014/02/17/sports/ICEDANCE/ICEDANCE-thumbStandard.jpg'

    section_titles, thumbnail_urls = article_overview("viewed", 1)

    assert len(section_titles) == 20
    assert len(thumbnail_urls) == 30
    assert section_titles[2] == expected_title
    assert thumbnail_urls[20] == expected_url


# Run the article_overview check when this code executes as the main program.
if __name__ == "__main__":
    test()

Answer key (reference solution for `article_overview`):

In [29]:
def article_overview(kind, period):
    """Collect section/title pairs and standard-thumbnail URLs from saved articles.

    Args:
        kind: Passed through to get_from_file to pick the saved file.
        period: Passed through to get_from_file to pick the saved file.

    Returns:
        tuple: ``(titles, urls)``. ``titles`` has one ``{section: title}``
        dict per article. ``urls`` lists the ``"url"`` of each media-metadata
        entry whose ``"format"`` is ``"Standard Thumbnail"``. Articles
        without a ``"media"`` key add no URLs.
    """
    titles = []
    urls = []

    for entry in get_from_file(kind, period):
        titles.append({entry["section"]: entry["title"]})
        urls.extend(
            meta["url"]
            for media_item in entry.get("media", ())
            for meta in media_item["media-metadata"]
            if meta["format"] == "Standard Thumbnail"
        )

    return (titles, urls)