# CSV tags cleaning

## street type naming

In [1]:
import os, re, sqlite3, time, pandas, csv, codecs
project_path = "C:\\Users\\TO72078\\Documents\\BIG_DATA\\UDACITY\\projects\\openstreetmap"
db_name = 'toulouse'

As for previous step (OSM2CVS and CSV2SQL), defining some helpful functions for CSV reading/writing management with Unicode support:

In [2]:
def lazyread_csv_data(csv_path):
    """build a CSV row generator"""
    with open(csv_path, 'rU') as data:
        reader = csv.DictReader(data)
        for row in reader:
            # yealding instead of returning avoids in-memory work
            yield {k: str_encode(v) for k, v in row.iteritems()}

            
def str_encode(v):
    """Return string object properly encoded if necessary"""
    return v.encode('utf-8') if isinstance(v, unicode) else str(v)

class UnicodeDictWriter(csv.DictWriter, object):
    """Extend csv.DictWriter to handle Unicode input"""

    def writerow(self, row):
        super(UnicodeDictWriter, self).writerow({
            k: str_encode(v) for k, v in row.iteritems()
        })

writing function for cleaning street names:

In [3]:
def capitalize_streetname(name):
    # this function capitalizes every word except junction words
    # words are separated by white spaces, dash, point...
    
    # initialize with lowering full name
    s0 = name.lower()
    # 1st step: capitalize first letter of every word
    s1 = re.sub(r'((^|[\.\'\s-])\w{1})', lambda pattern: pattern.group(1).upper(), s0)
    # 2d step: lower case of every junction word, such as 'le', 'la'...
    s2 = re.sub(r'([DLE][aeut]?[s]?[\'\s-])', lambda pattern: pattern.group(1).lower(), s1)
    return s2

validating functions with fake names:

In [4]:
stest = ["rue du Rendez-vous de l'estrapade", "rue de la dalbade", "AVENUE JEAN RIEUX du t.o.E.c"]
for s in stest:
    print '%s ==> %s' % (s, capitalize_streetname(s))

rue du Rendez-vous de l'estrapade ==> Rue du Rendez-Vous de l'Estrapade
rue de la dalbade ==> Rue de la Dalbade
AVENUE JEAN RIEUX du t.o.E.c ==> Avenue Jean Rieux du T.O.E.C


## zipcodes

In [5]:
def standardize_zipcode(code):
    """
    Standardizes the zipcode
    Extracts or completes the given string to get the standard 5 digits code
    
    Args:
        code: a string that must include at least two digits (french departments numbering)
    Returns:
        a string made of 5 digits
        an empty string if given code did not contain the sequence '31'
    Examples:
        '31000'         ==> '31000'
        '31'            ==> '31000'
        ' 31200 blabla' ==> '31200'
        'blabla3'       ==> ''
    """
    
    # searching for 2 to 5 digits
    mysearch = re.search(r'(\d{2,5})',code)
    if mysearch:
        scode = mysearch.group(1)
        for i in range(len(scode), 5): scode += '0'
    else:
        scode = ''
        
    return scode

Validating zipcode cleaning function: 

In [6]:
for code in ('31000', '31', ' 31200 blabla', 'blabla3'):
    print '%s ==> %s' % (repr(code), repr(standardize_zipcode(code)))

'31000' ==> '31000'
'31' ==> '31000'
' 31200 blabla' ==> '31200'
'blabla3' ==> ''


## Cities

In [7]:
RE_CITYMAP = {
    'Toulouse': re.compile(r'.*toulouse.*', re.I),
    'Ramonville-Saint-Agne': re.compile(r'.*ramonville.*', re.I),
    'Rouffiac-Tolosan': re.compile(r'.*rouffiac.*', re.I),
    'Saint-Orens-de-Gameville': re.compile(r'.*saint.*orens.*', re.I)
}

In [8]:
def map_city(name):
    'replace given name by mapping dict key if given name matches mapping dict corresponding regexp'
    for k,v in RE_CITYMAP.items():
        if v.match(name): return k
    return name

In [9]:
for city in ('Ramonville st agne', 'Rouffiac', 'saint orens', 'Toulouse'):
    print '%s ==> %s' % (repr(city), repr(map_city(city)))

'Ramonville st agne' ==> 'Ramonville-Saint-Agne'
'Rouffiac' ==> 'Rouffiac-Tolosan'
'saint orens' ==> 'Saint-Orens-de-Gameville'
'Toulouse' ==> 'Toulouse'


## applying cleaning procedure to all node and way tags with type `addr`:

In [12]:
def clean_data(input_path, output_path):
    """Clean csv street types (limited to tags of type 'addr' and key 'street')"""
    igenerator = lazyread_csv_data(input_path)
    with codecs.open(output_path, 'w') as ofile:
        owriter = UnicodeDictWriter(ofile, ['id', 'key', 'value', 'type', 'valid'])
        owriter.writeheader()
        for i, row in enumerate(igenerator):
            if row['type'] == 'addr':
                value = row['value']
                if row['key'] == 'street':
                    row['value'] = capitalize_streetname(value)
                elif row['key'] == 'postcode':
                    row['value'] = standardize_zipcode(value)
                elif row['key'] == 'city':
                    row['value'] = map_city(value)
                if row['value'] != value: print '%s ==> %s' % (value, row['value'])
            owriter.writerow(row)

In [13]:
for table_name in ('node_tags', 'way_tags'):
    print '********* Cleaning %s ***********' % table_name
    input_file = os.path.join(project_path, "%s_%s.csv" % (db_name, table_name))
    output_file = os.path.join(project_path, "%s_%s_clean.csv" % (db_name, table_name))
    clean_data(input_file, output_file)

********* Cleaning node_tags ***********
31000;31100;31200;31300;31400;31500 ==> 31000
toulouse ==> Toulouse
31000;31100;31200;31300;31400;31500 ==> 31000
TOULOUSE ==> Toulouse
TOULOUSE ==> Toulouse
31200‎ ==> 31200
31200‎ ==> 31200
TOULOUSE ==> Toulouse
31520 Ramonville Saint Agne ==> 31520
Vieille-Toulouse ==> Toulouse
Vieille-Toulouse ==> Toulouse
Vieille-Toulouse ==> Toulouse
Vieille-Toulouse ==> Toulouse
Vieille-Toulouse ==> Toulouse
Vieille-Toulouse ==> Toulouse
toulouse ==> Toulouse
Ramonville ==> Ramonville-Saint-Agne
Ramonville ==> Ramonville-Saint-Agne
Ramonville ==> Ramonville-Saint-Agne
Ramonville ==> Ramonville-Saint-Agne
Ramonville ==> Ramonville-Saint-Agne
Ramonville ==> Ramonville-Saint-Agne
Ramonville ==> Ramonville-Saint-Agne
Ramonville ==> Ramonville-Saint-Agne
Ramonville ==> Ramonville-Saint-Agne
TOULOUSE ==> Toulouse
TOULOUSE ==> Toulouse
Ramonville Saint Agne ==> Ramonville-Saint-Agne
Toulouse Cedex 1 ==> Toulouse
Toulouse Cedex 1 ==> Toulouse
TOULOUSE ==> Toulous