In [1]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
In this problem set you work with cities infobox data, audit it, come up with a
cleaning idea and then clean it up. In the first exercise we want you to audit
the datatypes that can be found in some particular fields in the dataset.
The possible types of values can be:
- NoneType if the value is a string "NULL" or an empty string ""
- list, if the value starts with "{"
- int, if the value can be cast to int
- float, if the value can be cast to float, but CANNOT be cast to int.
   For example, '3.23e+07' should be considered a float because it can be cast
   as float but int('3.23e+07') will throw a ValueError
- 'str', for all other values

The audit_file function should return a dictionary containing fieldnames and a 
SET of the types that can be found in the field. e.g.
{"field1": set([type(float()), type(int()), type(str())]),
 "field2": set([type(str())]),
  ....
}
The type() function returns a type object describing the argument given to the 
function. You can also use examples of objects to create type objects, e.g.
type(1.1) for a float: see the test function below for examples.

Note that the first three rows (after the header row) in the cities.csv file
are not actual data points. The contents of these rows should note be included
when processing data types. Be sure to include functionality in your code to
skip over or detect these rows.
"""

import codecs
import csv
import json
import pprint

CITIES = 'cities.csv'

FIELDS = ["name", "timeZone_label", "utcOffset", "homepage", "governmentType_label",
          "isPartOf_label", "areaCode", "populationTotal", "elevation",
          "maximumElevation", "minimumElevation", "populationDensity",
          "wgs84_pos#lat", "wgs84_pos#long", "areaLand", "areaMetro", "areaUrban"]


def isfloat(value):#create a function that tests whether a value can be cast as float
    try:
        float(value)
        return True
    except ValueError:
        return False
    
    
def isint(value):#create a function that determines if a value is an integer
    try:
        int(value)
        return True
    except ValueError:
        return False
    
def audit_file(filename, fields):
    fieldtypes = {}
    for i in fields:
        fieldtypes[i] = set()#make each field value a set
    with open(filename,'r') as f:
        reader = csv.DictReader(f)
        for i in range(3):
            reader.next()#skips first 3 rows
        for row in reader:
            for field in fields:
                value = row[field]#match row and column values
                if value == 'NULL' or value == '':
                    fieldtypes[field].append(type(None))
                elif value.startswith('{'):
                    fieldtypes[field].add(type(None))
                else:
                    try:
                        int(value)
                        fieldtypes[field].add(int)
                    except ValueError:
                        try:
                            float(value)
                            fieldtypes[field].add(float)
                        except ValueError:
                            fieldtypes[field].add(str)

    # YOUR CODE HERE


    return fieldtypes


def test():
    fieldtypes = audit_file(CITIES, FIELDS)

    pprint.pprint(fieldtypes)

    assert fieldtypes["areaLand"] == set([type(1.1), type([]), type(None)])
    assert fieldtypes['areaMetro'] == set([type(1.1), type(None)])
    
if __name__ == "__main__":
    test()


KeyError: 'n'

Everythiing below this is notes to test out code

In [2]:
fieldtypes = {}
for i in FIELDS:
    fieldtypes[i] = []
print(fieldtypes)

{'name': [], 'timeZone_label': [], 'utcOffset': [], 'homepage': [], 'governmentType_label': [], 'isPartOf_label': [], 'areaCode': [], 'populationTotal': [], 'elevation': [], 'maximumElevation': [], 'minimumElevation': [], 'populationDensity': [], 'wgs84_pos#lat': [], 'wgs84_pos#long': [], 'areaLand': [], 'areaMetro': [], 'areaUrban': []}


In [None]:
def audit_file(filename, fields):
    fieldtypes = {}
    for i in FIELDS:
        fieldtypes[i] = []
    with open(filename,'r') as f:
        reader = csv.DictReader(f)
        header = reader.fieldnames
        for field in FIELDS:
            for row in reader:
                if row[field] == "NULL" or row[field] == "":
                    fieldtypes[field].append(type(None))
                elif row[field[0]] == '{':
                    fieldtypes[field].append(type([]))
                elif isint(row[field]):
                    fieldtypes[field].append(type(1.1))


In [6]:
with open(CITIES, "r") as f:
    reader = csv.DictReader(f)
    header = reader.fieldnames
    for field in FIELDS:
        for row in reader:
            if row[field] == "NULL" or row[field] == "":
                fieldtypes[field].append(type(None))

['URI', 'rdf-schema#label', 'rdf-schema#comment', 'administrativeDistrict_label', 'administrativeDistrict', 'anthem_label', 'anthem', 'area', 'areaCode', 'areaLand', 'areaMetro', 'areaRural', 'areaTotal', 'areaUrban', 'areaWater', 'city_label', 'city', 'code', 'country_label', 'country', 'daylightSavingTimeZone_label', 'daylightSavingTimeZone', 'district_label', 'district', 'division_label', 'division', 'elevation', 'federalState_label', 'federalState', 'foundingDate', 'foundingPerson_label', 'foundingPerson', 'foundingYear', 'governingBody_label', 'governingBody', 'government_label', 'government', 'governmentType_label', 'governmentType', 'isPartOf_label', 'isPartOf', 'isoCodeRegion_label', 'isoCodeRegion', 'leader_label', 'leader', 'leaderName_label', 'leaderName', 'leaderParty_label', 'leaderParty', 'leaderTitle', 'location_label', 'location', 'maximumElevation', 'mayor_label', 'mayor', 'minimumElevation', 'motto', 'municipality_label', 'municipality', 'part_label', 'part', 'percent

In [19]:
fieldtypes = {}
for i in FIELDS:
    fieldtypes[i] = []
with open(CITIES,'r') as f:
    reader = csv.DictReader(f)
    header = reader.fieldnames
    for field in FIELDS:
        for row in reader:
            if row[field] == "NULL" or row[field] == "":
                fieldtypes[field].append(type(None))
            elif row[field][0] == '{':
                fieldtypes[field].append(type([]))
            elif isint(row[field]):
                    fieldtypes[field].append(type(1.1))
        f.seek(0)
for i in fieldtypes:
    fieldtypes[i] = set(fieldtypes[i])
print(fieldtypes)

{'name': {<class 'NoneType'>, <class 'list'>}, 'timeZone_label': {<class 'NoneType'>}, 'utcOffset': {<class 'float'>, <class 'NoneType'>, <class 'list'>}, 'homepage': {<class 'NoneType'>}, 'governmentType_label': {<class 'NoneType'>}, 'isPartOf_label': {<class 'NoneType'>, <class 'list'>}, 'areaCode': {<class 'NoneType'>, <class 'float'>}, 'populationTotal': {<class 'NoneType'>, <class 'float'>}, 'elevation': {<class 'NoneType'>, <class 'list'>}, 'maximumElevation': {<class 'NoneType'>}, 'minimumElevation': {<class 'NoneType'>}, 'populationDensity': {<class 'NoneType'>, <class 'list'>}, 'wgs84_pos#lat': set(), 'wgs84_pos#long': set(), 'areaLand': {<class 'NoneType'>, <class 'list'>}, 'areaMetro': {<class 'NoneType'>}, 'areaUrban': {<class 'NoneType'>}}
