# From clean CSV files to an SQLite database

In [1]:
import os, sqlite3, time, pandas, csv
# Root folder holding the cleaned CSV files, the myeasysql helper module and
# the SQLite database file. Set the OSM_PROJECT_PATH environment variable to
# run the notebook from another location; the original path stays the default.
project_path = os.environ.get(
    'OSM_PROJECT_PATH',
    "C:\\Users\\TO72078\\Documents\\BIG_DATA\\UDACITY\\projects\\openstreetmap")
# Base name of the SQLite file (".db" is appended when connecting).
db_name = 'toulouse_clean'

Let's import our SQLite helper functions (empty_db, create_tables, list_tables, display_schema, db_query...)

In [2]:
import sys
# Make the project folder importable so the local helper module can be found.
# The membership test keeps sys.path free of duplicate entries when this cell
# is executed more than once.
if project_path not in sys.path:
    sys.path.append(project_path)
import myeasysql

In [3]:
# Open (or create, if it does not exist yet) "<db_name>.db" in the project folder.
db_conn = sqlite3.connect(os.path.join(project_path, '%s.db' % db_name))
# Have sqlite3 return TEXT values as plain `str` objects.
db_conn.text_factory = str
db_cursor = db_conn.cursor()
# Helper from myeasysql; presumably creates the project's tables - verify in that module.
myeasysql.create_tables(db_cursor)

In [4]:
def lazyread_csv_data(csv_path):
    """Generate the rows of a CSV file one at a time.

    The first line of the file supplies the column names. Every row is
    yielded as a dict mapping column name to the cell value after it has
    been passed through ``str_encode``.
    """
    with open(csv_path, 'rU') as csv_file:
        for record in csv.DictReader(csv_file):
            # Yielding (rather than collecting rows in a list) keeps only the
            # current row in memory.
            encoded_row = {}
            for column, cell in record.iteritems():
                encoded_row[column] = str_encode(cell)
            yield encoded_row
                
def str_encode(v):
    """Coerce a value to a plain ``str``.

    ``unicode`` objects are encoded as UTF-8; any other value is converted
    with ``str()``.
    """
    if isinstance(v, unicode):
        return v.encode('utf-8')
    return str(v)


def insert_data(db_conn, table_name, csv_path):
    """Insert csv data into SQL"""
    from sqlite3 import OperationalError
    db_cursor = db_conn.cursor()
    generator = lazyread_csv_data(csv_path)
    try:
        for i, row in enumerate(generator):
            QUERY="INSERT INTO %s (%s) VALUES (%s)" % (table_name, ','.join(row.keys()), ','.join(['?' for k in row.keys()]))
            db_cursor.execute(QUERY, row.values())
    except OperationalError as e:
        print 'Failed while inserting:'
        print QUERY
        print 'sqlite error:', e
        return None
    finally:
        db_conn.commit()


Let's apply the previous functions to the 5 tables required by our project. Again, memory and CPU usage are checked or monitored.

In [5]:
# Each CSV file is named "toulouse_<name>.csv"; the destination table is
# <name> with the "_clean" suffix removed.
for name in ('nodes', 'node_tags_clean', 'ways', 'way_nodes', 'way_tags_clean'):
    table_name = name.replace('_clean','')
    csv_file = os.path.join(project_path, "toulouse_%s.csv" % name)
    print 'Populating %s with CSV %s' % (table_name, os.path.basename(csv_file))
    insert_data(db_conn, table_name, csv_file)

Populating nodes with CSV toulouse_nodes.csv
Populating node_tags with CSV toulouse_node_tags_clean.csv
Populating ways with CSV toulouse_ways.csv
Populating way_nodes with CSV toulouse_way_nodes.csv
Populating way_tags with CSV toulouse_way_tags_clean.csv


### Checking effect of cleaning on street names

Street types before cleaning:
```
10, 7, ALLEE, allée, Allée, Allées, allées, André, Angle, Av., AVENUE, avenue, Avenue, Barrière, Bd, BIS, Bis, Boulevard, boulevard, bvd, C.c., CC, Centre, Chemin, chemin, Cheminement, Clos, Descente, esplanade, Esplanade, face, Frédéric, Grande, Impasse, impasse, La, la, Lotissement, Mail, Passage, Place, place, Port, Promenade, Quai, R.n., Rond-Point, ROUTE, Route, route, rte, Rue, rue, RUE, Savary, Square, Sur, Voie, voie
```

In [6]:
# Street names (rows with type='addr' and key='street') taken from both the
# node and way tag tables; GROUP BY value collapses duplicates.
QUERY = """
SELECT value FROM (
    SELECT * FROM node_tags UNION ALL SELECT * FROM way_tags
    ) WHERE type='addr' AND key='street' GROUP BY value;
"""
# db_query is a myeasysql helper; the next cell unpacks each result row as a 1-tuple.
street_names = myeasysql.db_query(db_cursor, QUERY)

In [7]:
# Collect the first word of every street name, i.e. its street type.
# NULL or blank values have no first word, so they are skipped instead of
# raising on the [0] lookup.
street_types = set(
    street.split(None, 1)[0]
    for (street,) in street_names
    if street and street.strip()
)

In [8]:
def str_encode(v):
    """Return string object properly encoded if necessary"""
    return v.encode('utf-8') if isinstance(v, unicode) else str(v)
pretty_street_types = sorted([str_encode(stype) for stype in street_types], key=str.lower)
print ', '.join(pretty_street_types)

10, 7, Allee, Allée, Allées, André, Angle, Av., Avenue, Barrière, Bd, Bis, Boulevard, Bvd, C.C., Cc, Centre, Chemin, Cheminement, Clos, Descente, Esplanade, Face, Frédéric, Grande, Impasse, la, Lotissement, Mail, Passage, Place, Port, Promenade, Quai, R.N., Rond-Point, Route, Rte, Rue, Savary, Square, Sur, Voie


### Checking effect of cleaning on zipcodes

In [9]:
# Distinct postcode values (rows with type='addr' and key='postcode') taken
# from both the node and way tag tables.
QUERY = """
SELECT DISTINCT(value) FROM (
    SELECT * FROM node_tags UNION ALL SELECT * FROM way_tags
    ) WHERE type='addr' AND key='postcode';
"""
# Each result row is unpacked as a 1-tuple; values are printed on one comma-separated line.
print ','.join([v for (v,) in myeasysql.db_query(db_cursor, QUERY)])

31500,31700,31140,31130,31200,31000,31170,31270,31240,31320,31120,31180,31850,31670,31400,31300,31650,31100,31024,31770,31520,68199,31055,31701,31776,31076,31028,31020,31242,31015,31065,31062,31150,31127,31901,31094,31070,31432,31047,31036,31081,31026,31027,31035,31022,31060,31079,31018,31750,31053,31840,31506,31021,31037


### Checking effect of cleaning on city names

In [10]:
# Distinct city values (rows with type='addr' and key='city') taken from both
# the node and way tag tables.
QUERY = """
SELECT DISTINCT(value) FROM (
    SELECT * FROM node_tags UNION ALL SELECT * FROM way_tags
    ) WHERE type='addr' AND key='city';
"""
# Each result row is unpacked as a 1-tuple; values are printed on one comma-separated line.
print ','.join([v for (v,) in myeasysql.db_query(db_cursor, QUERY)])

Toulouse,Labège,Montrabé,Tournefeuille,Blagnac,Saint-Orens-de-Gameville,Colomiers,Beauzelle,Ramonville-Saint-Agne,Quint-Fonsegrives,Balma,Cugnaux,L'Union,Castanet-Tolosan,Portet-sur-Garonne,Rouffiac-Tolosan,Aucamville,Auzeville-Tolosane,Saint-Jean,Fenouillet,Villeneuve-Tolosane,Cornebarrieu


In [11]:
# All checks are done: release the SQLite connection opened at the top of the notebook.
db_conn.close()

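## Conclusion: street names, zipcodes and city names are now largely standardized

Note: a few inconsistencies are still visible in the outputs above, e.g. `Allee`/`Allée`, `Av.`/`Avenue`, `Bd`/`Bvd`/`Boulevard`, `Rte`/`Route`, and `68199`, the only zipcode that does not start with `31`.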