### Auditing Data Quality 

In [1]:
import codecs
import csv
import json
import pprint

# Name of the CSV file that audit_file and process_file are run against.
CITIES = "cities.csv"

In [2]:
# Column names of the cities CSV that are handed to audit_file for type auditing.
FIELDS = [
    "name",
    "timeZone_label",
    "utcOffset",
    "homepage",
    "governmentType_label",
    "isPartOf_label",
    "areaCode",
    "populationTotal",
    "elevation",
    "maximumElevation",
    "minimumElevation",
    "populationDensity",
    "wgs84_pos#lat",
    "wgs84_pos#long",
    "areaLand",
    "areaMetro",
    "areaUrban",
]

In [5]:
def _detect_type(value):
    """Return the Python type that one raw CSV cell would be parsed as.

    Args:
        value: The cell text, or ``None`` when the row had fewer columns
            than the header (``csv.DictReader`` fills those with ``None``).

    Returns:
        ``type(None)`` for ``None``, ``"NULL"`` or ``""``; ``list`` for text
        starting with ``"{"``; otherwise ``int`` or ``float`` if the text
        parses as such, else ``str``.
    """
    if value is None or value in ("NULL", ""):
        return type(None)
    if value.startswith("{"):
        return list
    # int is tried before float so that "12" is reported as int, not float.
    for numeric_type in (int, float):
        try:
            numeric_type(value)
        except ValueError:
            continue
        return numeric_type
    return str


def audit_file(filename, fields):
    """Collect the set of value types that occur in each audited CSV column.

    The file is read with ``csv.DictReader``; the three rows following the
    header are skipped as metadata, and every remaining row contributes the
    detected type of each requested field.

    Args:
        filename: Path of the CSV file to audit (read as UTF-8).
        fields: Iterable of column names to audit.

    Returns:
        dict mapping each field name to a set drawn from
        ``{type(None), list, int, float, str}``. Fields that never receive a
        row (e.g. a file with no data rows) map to an empty set.

    Raises:
        KeyError: If a requested field is not a column of the file.
    """
    fieldtypes = {field: set() for field in fields}

    # newline="" is what the csv module expects; an explicit encoding keeps
    # the non-ASCII values readable regardless of the platform default.
    with open(filename, "r", encoding="utf-8", newline="") as f:
        reader = csv.DictReader(f)

        # Skip the extra metadata rows; the default stops a short file from
        # raising StopIteration.
        for _ in range(3):
            next(reader, None)

        for row in reader:
            for field in fields:
                fieldtypes[field].add(_detect_type(row[field]))

    return fieldtypes

In [6]:
# Audit the cities file: gather the set of value types seen in each FIELDS column.
fieldtypes = audit_file(CITIES, FIELDS)

In [7]:
# Show the per-field type sets returned by audit_file.
fieldtypes

{'areaCode': {list, NoneType, str, int},
 'areaLand': {float, NoneType, list},
 'areaMetro': {float, NoneType, list},
 'areaUrban': {float, NoneType, list},
 'elevation': {float, NoneType, list},
 'governmentType_label': {list, NoneType, str},
 'homepage': {list, NoneType, str},
 'isPartOf_label': {list, NoneType, str},
 'maximumElevation': {float, NoneType, list},
 'minimumElevation': {float, NoneType},
 'name': {list, NoneType, str},
 'populationDensity': {float, NoneType, list},
 'populationTotal': {list, NoneType, int},
 'timeZone_label': {list, NoneType, str},
 'utcOffset': {str, int, float, list, NoneType},
 'wgs84_pos#lat': {float, list, NoneType},
 'wgs84_pos#long': {float, list, NoneType}}

### Fixing the Area 

In [24]:
def _significant_digits(number_text):
    """Count the significant digits in the mantissa of a numeric string.

    The exponent part (after ``e``/``E``) is ignored, as are the sign, the
    decimal point and any leading or trailing zeros of the mantissa, so
    ``"1.0001e+06"`` counts 5 and ``"1.23e+06"`` counts 3.
    """
    mantissa = number_text.lower().split("e", 1)[0]
    digits = "".join(ch for ch in mantissa if ch.isdigit())
    return len(digits.strip("0"))


def fix_area(area):
    """Convert a raw ``areaLand`` cell into a float, or ``None`` if missing.

    Args:
        area: Cell text. May be ``"NULL"``/empty (missing), a single number
            such as ``"1.5e+06"``, or a ``{a|b|...}`` array of alternative
            numbers. ``None`` is also accepted and treated as missing.

    Returns:
        ``None`` for a missing value; otherwise a float. For an array, the
        alternative written with the most significant digits is returned;
        on a tie the later alternative wins.

    Raises:
        ValueError: If the chosen text cannot be parsed as a float.
    """
    if area is None:
        return None

    text = area.strip()
    if text in ("", "NULL"):
        return None

    if not text.startswith("{"):
        return float(text)

    # "{a|b}" -> ["a", "b"]; empty alternatives are dropped.
    candidates = [part.strip() for part in text.strip("{}").split("|")]
    candidates = [part for part in candidates if part]
    if not candidates:
        return None

    best = candidates[0]
    for candidate in candidates[1:]:
        # ">=" keeps the two-element tie behaviour of preferring the second.
        if _significant_digits(candidate) >= _significant_digits(best):
            best = candidate
    return float(best)

In [25]:
def process_file(filename):
    """Read the cities CSV and return its data rows with ``areaLand`` cleaned.

    The three rows following the header are skipped as metadata. Every
    remaining row that has an ``areaLand`` column gets that value replaced
    by ``fix_area(...)``.

    Args:
        filename: Path of the CSV file to read (read as UTF-8).

    Returns:
        list of row dicts as produced by ``csv.DictReader``; empty when the
        file has no rows beyond the metadata.
    """
    # CHANGES TO THIS FUNCTION WILL BE IGNORED WHEN YOU SUBMIT THE EXERCISE
    data = []

    # newline="" is what the csv module expects; an explicit encoding keeps
    # the non-ASCII values readable regardless of the platform default.
    with open(filename, "r", encoding="utf-8", newline="") as f:
        reader = csv.DictReader(f)

        # Skip the extra metadata rows; the default stops a short file from
        # raising StopIteration.
        for _ in range(3):
            next(reader, None)

        # processing file
        for line in reader:
            # calling your function to fix the area value
            if "areaLand" in line:
                line["areaLand"] = fix_area(line["areaLand"])
            data.append(line)

    return data

In [27]:
# Load the data rows of the cities file through process_file.
result = process_file(CITIES)

### Fixing Names

In [32]:
def fix_name(name):
    """Normalise a raw ``name`` cell into a list of names.

    Args:
        name: Cell text. May be ``"NULL"``/empty (missing), a single name,
            or a ``{a|b|...}`` array of alternative names. ``None`` is also
            accepted and treated as missing.

    Returns:
        ``[]`` for a missing value, the ``|``-separated alternatives for an
        array (all braces removed), otherwise a one-element list holding
        the name unchanged.
    """
    # "" and None are treated as missing, matching how audit_file classifies
    # "NULL" and "" cells as NoneType.
    if name is None or name in ("NULL", ""):
        return []

    if name.startswith("{"):
        return name.replace("{", "").replace("}", "").split("|")

    return [name]

In [33]:
def process_file(filename):
    """Read the cities CSV and return its data rows with ``name`` normalised.

    The three rows following the header are skipped as metadata. Every
    remaining row that has a ``name`` column gets that value replaced by
    ``fix_name(...)``.

    Note: this redefines the ``process_file`` declared earlier in the
    notebook (the ``areaLand`` variant); after this cell runs, only this
    version is in effect.

    Args:
        filename: Path of the CSV file to read (read as UTF-8).

    Returns:
        list of row dicts as produced by ``csv.DictReader``; empty when the
        file has no rows beyond the metadata.
    """
    data = []

    # newline="" is what the csv module expects; an explicit encoding keeps
    # the non-ASCII names readable regardless of the platform default.
    with open(filename, "r", encoding="utf-8", newline="") as f:
        reader = csv.DictReader(f)

        # Skip the extra metadata rows; the default stops a short file from
        # raising StopIteration.
        for _ in range(3):
            next(reader, None)

        # processing file
        for line in reader:
            # calling your function to fix the name value
            if "name" in line:
                line["name"] = fix_name(line["name"])
            data.append(line)

    return data

In [34]:
# Reload the data rows of the cities file through process_file.
result = process_file(CITIES)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.


In [35]:
# Preview the "name" value of the first 20 loaded rows; slicing (rather than
# indexing 0..19) avoids an IndexError when fewer rows were loaded.
for row in result[:20]:
    pprint.pprint(row["name"])

'Kud'
'Kuju'
'Kumbhraj'
'Kumhari'
'{Kundagola|Kundgol ಕುಂದಗೋಳ}'
'Kunigal'
'Kunzer'
'{Kurduvadi|कुर्डुवाडी}'
'Kurgunta'
'Kurinjipadi'
'Kurud'
'Kushtagi'
'{Ladnun|लाडनूँ}'
'{Lahar|लहार}'
'Laharpur'
'Lakheri'
'Lakhipur'
'{Laksar|लक्सर}'
'Lalkuan'
'Lalsot'


### Crossfield Auditing 