### Auditing Data Quality 

In [1]:
import codecs
import csv
import json
import pprint

# Relative path of the CSV file that the audit below reads.
CITIES = "cities.csv"

In [2]:
# Column names whose observed value types are audited, one entry per line.
FIELDS = [
    "name",
    "timeZone_label",
    "utcOffset",
    "homepage",
    "governmentType_label",
    "isPartOf_label",
    "areaCode",
    "populationTotal",
    "elevation",
    "maximumElevation",
    "minimumElevation",
    "populationDensity",
    "wgs84_pos#lat",
    "wgs84_pos#long",
    "areaLand",
    "areaMetro",
    "areaUrban",
]

In [5]:
def audit_file(filename, fields, skip_rows=3):
    """Collect the set of Python types observed in each audited CSV column.

    Every cell is read as text and classified as follows:

    * ``'NULL'``, the empty string, or a cell missing from a short row
      -> ``NoneType``
    * text starting with ``'{'`` -> ``list``
    * text accepted by ``int()`` -> ``int``
    * text accepted by ``float()`` -> ``float``
    * anything else -> ``str``

    Args:
        filename: Path of the CSV file; its first line is used as the header.
        fields: Iterable of column names to audit.
        skip_rows: Number of rows directly after the header to ignore
            (extra metadata rows). Defaults to 3.

    Returns:
        dict mapping each name in ``fields`` to the set of type objects
        seen in that column.

    Raises:
        KeyError: If a data row has no column named after a requested field.
    """
    # Materialize once so a one-shot iterable can be reused for every row.
    fields = list(fields)
    fieldtypes = {field: set() for field in fields}

    # newline='' lets the csv module handle line endings itself, which keeps
    # newlines embedded in quoted cells intact.
    with open(filename, 'r', newline='') as f:
        reader = csv.DictReader(f)

        # Skip the metadata rows; the None default keeps a file with fewer
        # rows than skip_rows from raising StopIteration.
        for _ in range(skip_rows):
            if next(reader, None) is None:
                break

        for row in reader:
            for field in fields:
                fieldtypes[field].add(_classify_value(row[field]))

    return fieldtypes


def _classify_value(value):
    """Return the type object that describes one raw CSV cell."""
    # DictReader fills cells missing from a short row with None, so None has
    # to be checked before any string method is called.
    if value is None or value == 'NULL' or value == '':
        return type(None)
    if value.startswith('{'):
        return list
    # Try the narrower numeric type first so whole numbers report as int.
    # NOTE: float() also accepts spellings such as 'nan' and 'inf', so text
    # cells with those values are reported as float.
    for numeric_type in (int, float):
        try:
            numeric_type(value)
        except ValueError:
            continue
        return numeric_type
    return str

In [6]:
# Run the audit over the cities file for every column listed in FIELDS.
fieldtypes = audit_file(filename=CITIES, fields=FIELDS)

In [7]:
# Show the mapping of each audited field to the set of types seen in it.
fieldtypes

{'areaCode': {list, NoneType, str, int},
 'areaLand': {float, NoneType, list},
 'areaMetro': {float, NoneType, list},
 'areaUrban': {float, NoneType, list},
 'elevation': {float, NoneType, list},
 'governmentType_label': {list, NoneType, str},
 'homepage': {list, NoneType, str},
 'isPartOf_label': {list, NoneType, str},
 'maximumElevation': {float, NoneType, list},
 'minimumElevation': {float, NoneType},
 'name': {list, NoneType, str},
 'populationDensity': {float, NoneType, list},
 'populationTotal': {list, NoneType, int},
 'timeZone_label': {list, NoneType, str},
 'utcOffset': {str, int, float, list, NoneType},
 'wgs84_pos#lat': {float, list, NoneType},
 'wgs84_pos#long': {float, list, NoneType}}