In [79]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
In this problem set you work with cities infobox data, audit it, come up with a
cleaning idea and then clean it up. In the first exercise we want you to audit
the datatypes that can be found in some particular fields in the dataset.
The possible types of values can be:
- NoneType if the value is a string "NULL" or an empty string ""
- list, if the value starts with "{"
- int, if the value can be cast to int
- float, if the value can be cast to float, but CANNOT be cast to int.
   For example, '3.23e+07' should be considered a float because it can be cast
   as float but int('3.23e+07') will throw a ValueError
- 'str', for all other values

The audit_file function should return a dictionary containing fieldnames and a 
SET of the types that can be found in the field. e.g.
{"field1": set([type(float()), type(int()), type(str())]),
 "field2": set([type(str())]),
  ....
}
The type() function returns a type object describing the argument given to the 
function. You can also use examples of objects to create type objects, e.g.
type(1.1) for a float: see the test function below for examples.

Note that the first three rows (after the header row) in the cities.csv file
are not actual data points. The contents of these rows should not be included
when processing data types. Be sure to include functionality in your code to
skip over or detect these rows.
"""
import codecs
import csv
import json
import pprint
import os
from itertools import islice

# Location of the cities infobox CSV, given relative to the current working directory.
CITIES = os.path.join('data', 'DataWrangling', 'cities.csv')

# Column names that test() passes to audit_file to have their value types reported.
FIELDS = ["name", "timeZone_label", "utcOffset", "homepage", "governmentType_label",
          "isPartOf_label", "areaCode", "populationTotal", "elevation",
          "maximumElevation", "minimumElevation", "populationDensity",
          "wgs84_pos#lat", "wgs84_pos#long", "areaLand", "areaMetro", "areaUrban"]

def _audit_value_type(entry):
    """Return the type object the audit assigns to one raw CSV cell.

    None (a cell missing from a short row), "NULL" and "" map to NoneType;
    values starting with "{" map to list; strings accepted by int() map to
    the integer type; strings accepted only by float() map to float; every
    other value maps to str.
    """
    if entry is None or entry in ('NULL', ''):
        return type(None)
    if entry.startswith("{"):
        return list
    try:
        # type(...) rather than a literal `int` so that a Python 2 `long`
        # result is still reported with its actual type.
        return type(int(entry))
    except ValueError:
        pass
    try:
        return type(float(entry))
    except ValueError:
        return str


def audit_file(filename, fields):
    """Collect the set of value types observed in each requested column.

    Args:
        filename: path to a CSV file whose header row is followed by three
            non-data rows; those three rows are skipped.
        fields: sequence of column names to audit.

    Returns:
        dict mapping every name in `fields` to a set of type objects (see
        _audit_value_type for the classification rules). Fields stay mapped
        to an empty set when the file has no data rows.

    Raises:
        KeyError: if a data row is read and a requested field is not one of
            the file's columns.
    """
    fieldtypes = dict((field, set()) for field in fields)
    with open(filename, "r") as f:
        reader = csv.DictReader(f)
        # islice drops the three metadata rows and works on both Python 2 and
        # Python 3 (reader.next() exists only on Python 2); it also simply
        # yields nothing when the file has fewer than three rows.
        for row in islice(reader, 3, None):
            for field in fields:
                fieldtypes[field].add(_audit_value_type(row[field]))
    return fieldtypes


def test():
    """Check the audited types of the "areaLand" column of the CITIES file.

    Raises AssertionError unless that column contains exactly float, list
    and NoneType values.
    """
    audited = audit_file(CITIES, FIELDS)
    expected_area_types = set([float, list, type(None)])
    assert audited["areaLand"] == expected_area_types
    
# Run the self-check only when this code is executed as the main program.
if __name__ == "__main__":
    test()


In [73]:
"""
In this problem set you work with cities infobox data, audit it, come up with a
cleaning idea and then clean it up.

Since in the previous quiz you made a decision on which value to keep for the
"areaLand" field, you now know what has to be done.

Finish the function fix_area(). It will receive a string as an input, and it
has to return a float representing the value of the area or None.
You have to change the function fix_area. You can use extra functions if you
like, but changes to process_file will not be taken into account.
The rest of the code is just an example on how this function can be used.
"""
import os, re, decimal
import codecs
import csv
import json
import pprint

# Location of the cities infobox CSV, given relative to the current working directory.
CITIES = os.path.join('data', 'DataWrangling', 'cities.csv')

def get_accuracy(value):
    """Score how finely `value` is written, as the negated decimal exponent.

    `value` is converted with str() and parsed as a Decimal. For plain
    notation the score is the number of digits after the decimal point
    ('2.333' -> 3); for scientific notation it can be negative
    ('7.15e+06' -> -4, '7.14837e+06' -> -1). A larger score means the
    value was written with finer resolution.
    """
    exponent = decimal.Decimal(str(value)).as_tuple().exponent
    return exponent * -1

def get_value_with_max_accuracy(listofvalues):
    """Return the element of `listofvalues` with the highest get_accuracy score.

    When several elements share the highest score the first one is returned;
    an empty sequence raises ValueError (from max()).
    """
    return max(listofvalues, key=get_accuracy)


def fix_area(area):
    """Convert a raw "areaLand" cell into a float, or None when it is empty.

    Args:
        area: raw string from the CSV.

    Returns:
        None for "NULL" or ""; for a "{a|b|...}" list, the float of the
        candidate that get_value_with_max_accuracy selects; otherwise
        float(area).

    Raises:
        ValueError: if a plain (non-list) value cannot be parsed by float().
    """
    if area in ('NULL', ''):
        return None
    if not area.startswith("{"):
        return float(area)
    # Split the "{a|b}" list on "|" and strip the braces from every piece.
    candidates = [re.sub(r'[{}]', '', piece) for piece in area.split("|")]
    return float(get_value_with_max_accuracy(candidates))



def process_file(filename):
    """Read the cities CSV and return its data rows with "areaLand" cleaned.

    Args:
        filename: path to a CSV file whose header row is followed by three
            non-data rows; those three rows are skipped.

    Returns:
        list with one dict per data row. When a row has an "areaLand" key,
        its value is replaced by the result of fix_area(); every other value
        is left as the raw string read from the file.
    """
    # CHANGES TO THIS FUNCTION WILL BE IGNORED WHEN YOU SUBMIT THE EXERCISE
    data = []

    with open(filename, "r") as f:
        reader = csv.DictReader(f)

        # Skip the extra metadata rows. The next() builtin works on Python 2
        # and Python 3 (reader.next() exists only on Python 2), and the None
        # default keeps a file with fewer than three rows from raising
        # StopIteration.
        for _ in range(3):
            next(reader, None)

        # processing file
        for line in reader:
            # calling your function to fix the area value
            if "areaLand" in line:
                line["areaLand"] = fix_area(line["areaLand"])
            data.append(line)

    return data


def test():
    """Process the CITIES file and spot-check the cleaned "areaLand" values.

    Pretty-prints the values at indices 5, 6 and 7 of the processed rows,
    then asserts the expected results at indices 3, 8, 20 and 33. Raises
    AssertionError when any of them differs.
    """
    data = process_file(CITIES)

    # Function-call form prints the same text on Python 2 and is required on
    # Python 3, where the print statement is a syntax error.
    print("Printing three example results:")
    for n in range(5, 8):
        pprint.pprint(data[n]["areaLand"])

    assert data[3]["areaLand"] is None
    assert data[8]["areaLand"] == 55166700.0
    assert data[20]["areaLand"] == 14581600.0
    assert data[33]["areaLand"] == 20564500.0


# Run the self-check only when this code is executed as the main program.
if __name__ == "__main__":
    test()

Printing three example results:
None
None
None


AssertionError: 

In [None]:
import re
# Scratch check: the character class removes every "{" and "}", leaving 'tutu'.
re.sub(r'[{}]','','{tutu{}')

In [None]:
import decimal
# Scratch check: build a Decimal directly from a float literal to inspect its
# tuple form. Note that get_accuracy goes through str(value) rather than
# passing the float itself.
d = decimal.Decimal(2.59258565654654e+07)

In [None]:
# Exponent field of the Decimal tuple; get_accuracy negates this same field.
d.as_tuple().exponent

In [None]:
# Echo the float literal to see how the interpreter displays it.
2.59258565654654e+07

In [None]:
def get_accuracy(value):
    """Return the negated decimal exponent of str(value).

    For plain notation this is the number of digits after the decimal point
    ('2.333' -> 3); for scientific notation it can be negative
    ('7.15e+06' -> -4).
    """
    as_decimal = decimal.Decimal(str(value))
    return as_decimal.as_tuple().exponent * -1

In [None]:
# Scratch check: get_accuracy score of each sample value.
map(lambda a: get_accuracy(a), [1.1,2.333])

In [None]:
# Scratch check: pick the list element with the highest get_accuracy score.
areas=[1.1654651651,2.333]
max(areas, key=lambda a: get_accuracy(a))

In [None]:
def get_value_with_max_accuracy(listofvalues):
    """Return the element of `listofvalues` that get_accuracy scores highest.

    The first such element wins a tie; an empty sequence raises ValueError
    (from max()).
    """
    most_accurate = max(listofvalues, key=get_accuracy)
    return most_accurate
def fix_area(area):
    """Convert a raw "areaLand" cell into a float, or None when it is empty.

    This definition is the last `fix_area` in the notebook, so on a
    top-to-bottom run it is the one later cells call; it therefore has to
    honour the exercise contract (return a float or None) rather than hand
    back raw strings.

    Args:
        area: raw string from the CSV.

    Returns:
        None for "NULL" or ""; for a "{a|b|...}" list, the float of the
        candidate that get_value_with_max_accuracy selects; otherwise
        float(area).

    Raises:
        ValueError: if a plain (non-list) value cannot be parsed by float().
    """
    if area in ('NULL', ''):
        return None
    if area.startswith("{"):
        # Split the "{a|b}" list on "|" and strip the braces from every piece.
        areas = [re.sub(r'[{}]', '', a) for a in area.split("|")]
        return float(get_value_with_max_accuracy(areas))
    return float(area)

In [74]:
# Check the list branch of fix_area on a two-candidate value; the recorded
# output shows the more finely written candidate, as a float.
fix_area('{7.14837e+06|7.15e+06}')

7148370.0

In [80]:
# Audit the cities_short.csv sample with an ad-hoc column list; 'postalCode'
# is not part of FIELDS.
audit_file(os.path.join('data', 'DataWrangling', 'cities_short.csv'), ['name', 'populationTotal', 'areaMetro', 'postalCode'])

{'areaMetro': {float, NoneType},
 'name': {list, NoneType, str},
 'populationTotal': {int, NoneType},
 'postalCode': {int, NoneType, str}}

In [3]:
# Scratch check: int() truncates 2.3 to 2, as the recorded output shows.
map(int, [2.3,11])

[2, 11]

In [4]:
# Scratch check: call round() with its arguments taken from a list. Argument
# unpacking replaces the deprecated apply() builtin, which Python 3 removed.
round(*[2.3564, 1])

2.4