## Map parser

In [1]:
import xml.etree.cElementTree as ET
from collections import defaultdict
import re
import pprint

OSMFILE = "tampa_florida.osm"

def count_tags(filename):
    tag_count = {}
    for _, element in ET.iterparse(filename, events=("start",)):
        add_tag(element.tag, tag_count)
    return tag_count

def add_tag(tag, tag_count):
    if tag in tag_count:
        tag_count[tag] += 1
    else:
        tag_count[tag] = 1


In [4]:
tags = count_tags('tampa_florida.osm')
pprint.pprint(tags)

{'bounds': 1,
 'member': 31857,
 'nd': 1957582,
 'node': 1655566,
 'osm': 1,
 'relation': 1252,
 'tag': 1131585,
 'way': 182866}


## Tags

In [None]:
lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')


def key_type(element, keys):
    if element.tag == "tag":
        key = element.get("k")
        print key,
        # value = element.get("v")
        if problemchars.search(key):
            keys['problemchars'] += 1
            print '--> problemchars'
        elif lower_colon.search(key):
            keys['lower_colon'] += 1
            print '--> lower_colon'
        elif lower.search(key):
            keys['lower'] += 1
            print '--> lower'
        else:
            keys['other'] += 1
            print '--> other'
return keys

def process_map(filename):
    keys = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0}
    for _, element in ET.iterparse(filename):
        keys = key_type(element, keys)

    return keys


In [9]:
keys = process_map(OSMFILE)
pprint.pprint(keys)

set(['1',
     '100042',
     '1007528',
     '1009527',
     '1012362',
     '101999',
     '102411',
     '102582',
     '102999',
     '103095',
     '103107',
     '103253',
     '10344',
     '103464',
     '103574',
     '1037425',
     '104473',
     '104519',
     '104962',
     '1051550',
     '1058308',
     '105839',
     '1058666',
     '10786',
     '108295',
     '1083136',
     '1086615',
     '1087647',
     '1089595',
     '109035',
     '10927',
     '109925',
     '110046',
     '110126',
     '110263',
     '1103752',
     '110639',
     '1108251',
     '1110270',
     '1110420',
     '11126',
     '1118605',
     '113450',
     '1139460',
     '114161',
     '114388',
     '1149057',
     '11515',
     '115918',
     '116044',
     '1161073',
     '1164',
     '118021',
     '1180961',
     '118134',
     '1185729',
     '118613',
     '118856',
     '1198074',
     '1199709',
     '120468',
     '1207207',
     '1208453',
     '121241',
     '1214881',
     '12198

## Users

In [7]:
def get_user(element):
    return


def process_map(filename):
    users = set()
    for _, element in ET.iterparse(filename):
        if 'uid' in element.attrib:
            users.add(element.get('uid'))

    return users

In [8]:
users = process_map(OSMFILE)
pprint.pprint(users)

set(['1',
     '100042',
     '1007528',
     '1009527',
     '1012362',
     '101999',
     '102411',
     '102582',
     '102999',
     '103095',
     '103107',
     '103253',
     '10344',
     '103464',
     '103574',
     '1037425',
     '104473',
     '104519',
     '104962',
     '1051550',
     '1058308',
     '105839',
     '1058666',
     '10786',
     '108295',
     '1083136',
     '1086615',
     '1087647',
     '1089595',
     '109035',
     '10927',
     '109925',
     '110046',
     '110126',
     '110263',
     '1103752',
     '110639',
     '1108251',
     '1110270',
     '1110420',
     '11126',
     '1118605',
     '113450',
     '1139460',
     '114161',
     '114388',
     '1149057',
     '11515',
     '115918',
     '116044',
     '1161073',
     '1164',
     '118021',
     '1180961',
     '118134',
     '1185729',
     '118613',
     '118856',
     '1198074',
     '1199709',
     '120468',
     '1207207',
     '1208453',
     '121241',
     '1214881',
     '12198

## Audit

In [4]:
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)
direction_re = re.compile(r'\w*(North|South|East|West|Northeast|Northwest|Southeast|Southwest|S|NE|W|N|E|SE|N.)$')

direction_mapping = {"N":"North",
                    "S":"South",
                    "NE":"Northeast",
                    "W":"West",
                    "E":"East",
                    "SE": "Southeast"}

expected = ["Passage","Cutoff","Bridge","Crossing","Lane","Way","Run","Loop","Plaza","Causeway","Terrace","Causeway","Highway","Bayway","Circle","Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road",
            "Trail", "Parkway", "Commons"]

# UPDATE THIS VARIABLE
mapping = { "St": "Street",
            "St.": "Street",
            "Ave": "Avenue",
            "Rd": "Road",
            "Rd.": "Road",
            "Dr": "Drive",
            "Av": "Avenue",
            "AVE": "Avenue",
            "Blvd": "Boulevard",
            "Cir": "Circle",
            "Hwy": "Highway",
            "Blvd.": "Boulevard",
            "Pkwy": "Parkway",
            "dr": "Drive",
            "Dr.": "Drive",
            "Ave.": "Avenue",
            "Pl": "Place",
            "Cswy": "Causeway",
            "Plz": "Plaza",
            "Ct": "Court",
            "Pky": "Parkway",
            "Ln": "Lane",
            "st": "Street",
            "road": "Road",
            "drive": "Drive",
            "lane": "Lane"
            }


def audit_street_type(street_types, street_name):
    try:
        comma_index = street_name.index(',')
        street_name = remove(street_name,comma_index)
    except:
        pass
        #print street_name
        
    try:
        pound_index = street_name.index('#')
        street_name = remove(street_name, pound_index)
    except:
        pass
    
    try:
        suite_index = street_name.index('Suite')
        street_name = remove(street_name, suite_index)
    except:
        pass
    
    
    end_direction = direction_re.search(street_name)
    if end_direction:
        street_name = street_name[:-len(end_direction.group(0))]
        street_name = end_direction.group(0) + " " + street_name
        try:
            street_name = update_direction(street_name,direction_mapping)
        except:
            pass

    
    m = street_type_re.search(street_name)
    if m:
        street_type = m.group()
        if street_type not in expected:
            street_types[street_type].add(street_name)


def is_street_name(elem):
    return (elem.attrib['k'] == "addr:street")


def audit(osmfile):
    osm_file = open(osmfile, "r")
    street_types = defaultdict(set)
    for event, elem in ET.iterparse(osm_file, events=("start",)):
        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    audit_street_type(street_types, tag.attrib['v'])
    return street_types


def update_name(name, mapping):
    name_array = name.split(' ')
    last = name_array[-1]
    name_array[-1] = mapping[last]
    joined = ' '.join(name_array)
    
    return joined

            
def update_direction(name, mapping):
    name_array = name.split(' ')
    first = name_array[0]
    name_array[0] = mapping[first]
    return ' '.join(name_array)

def remove(name,index):
    subname = name[:index]
    return subname


In [5]:
st_types = audit(OSMFILE)
#pprint.pprint(dict(st_types))
for st_type, ways in st_types.iteritems():
    for name in ways:
        try:
            better_name = update_name(name, mapping)
            #print name, "=>", better_name
        except:
            print name

State Road 52
SR 52
FL 52
Boulevard of the Arts
SR 56
FL 56
N U.S. Hwy 41
US 41
N US Highway 41
N US 41
Main St m104
6010 US-301
US-301
State Road 64
U.S.19
US 98 Bypass
Corey Ave  St Pete Beach
4th Street Notth
US 92
12000 US Highway 92
North Avenue Republica de Cuba
Avenue Republica de Cuba
West Brandon Blvd (S.R. 60)
S Howard Av 105
North Westshore Bolevard
8492 Manatee Bay Dr Tampa
Avenue B
SR 580
State Road 580
Avenue F
US 301
US Highway 301
S US Highway 301
South US Highway 301
us Highway 301
CR 672
University Square Mall
E FL 70
Lakewood Main St Ste 102
FL 60
US-19
W Swann Av Prkg
US 98
US Highway 19
US 19
us 19
3001 US Hwy 19
U.S. 19
US Hwy 19
28519 State Road 54
FL 54
SR 54
State Road 54
Sunshine skyway
Bay Esplanade
US 301 (FL)
US 19 (FL)
S Fort Harrison
Broadway


In [9]:
import csv
import sqlite3

conn = sqlite3.connect('tampa.db')
#conn.text_factory = str
cur = conn.cursor()

In [10]:
cur.execute('CREATE TABLE nodes (id INTEGER PRIMARY KEY NOT NULL, lat REAL, lon REAL, user TEXT, uid INTEGER, version INTEGER, changeset INTEGER, timestamp TEXT);')
with open('nodes.csv','rb') as fname:
    dr = csv.DictReader(fname)
    to_Db = 
conn.commit()

OperationalError: table nodes already exists