# from CSV files to SQLite database

In [1]:
import os, sqlite3, time, pandas, csv
project_path = "C:\\Users\\TO72078\\Documents\\BIG_DATA\\UDACITY\\projects\\openstreetmap"
db_name = 'toulouse'

Let's import our SQLite helper functions (empty_db, create_tables, list_tables, display_schema, db_query...)

In [None]:
import sys
sys.path.append(project_path)
import myeasysql

## SQLite database preparation

Let's create a database (or connect to an existing one and empty it)

In [2]:
db_conn = sqlite3.connect(os.path.join(project_path, '%s.db' % db_name))
db_conn.text_factory = str # by default sqlite3 uses unicode, raises error while inserting data with pandas.to_sql()
myeasysql.empty_db(db_conn) # for testing purpose, the database may be already filled with test data

dropping table nodes
dropping table node_tags
dropping table ways
dropping table way_tags
dropping table way_nodes
Freeing memory on disk


Let's create the openstreetmap clean data tables and check their scheme 

In [3]:
c = db_conn.cursor()
myeasysql.create_tables(c)
myeasysql.list_tables(c)
myeasysql.display_schema(c)

[('CREATE TABLE nodes\n    (\n        id INTEGER PRIMARY KEY,\n        lat REAL,\n        lon REAL,\n        user TEXT,\n        uid INTEGER,\n        version TEXT,\n        changeset INTEGER,\n        timestamp TEXT,\n        FOREIGN KEY (uid)  REFERENCES ways (uid),\n        FOREIGN KEY (user) REFERENCES ways (user)\n    )',),
 ('CREATE TABLE node_tags\n    (\n        id INTEGER,\n        key TEXT,\n        value TEXT,\n        type TEXT,\n        valid TEXT,\n        FOREIGN KEY (id) REFERENCES nodes (id)\n    )',),
 ('CREATE TABLE ways\n    (\n        id INTEGER PRIMARY KEY,\n        user TEXT,\n        uid INTEGER,\n        version TEXT,\n        changeset INTEGER,\n        timestamp TEXT,\n        FOREIGN KEY (uid)  REFERENCES nodes (uid),\n        FOREIGN KEY (user) REFERENCES nodes (user)\n    )',),
 ('CREATE TABLE way_tags\n    (\n        id INTEGER,\n        key TEXT,\n        value TEXT,\n        type TEXT,\n        valid TEXT,\n        FOREIGN KEY (id) REFERENCES way (id)\n

## Lazy coding with Pandas one-liners
Now for every table, let's let pandas module read the CSV data and insert it into our database. We call it "lazy coding" since pandas offers one-line features for both actions. Let's check memory and CPU of the whole procedure.

In [4]:
# before: 50Mb (python core + imported modules)

start_time = time.time()
for table_name in ('nodes', 'node_tags', 'ways', 'way_nodes', 'way_tags'):
    csv_file = os.path.join(project_path, "%s_%s.csv" % (db_name, table_name))
    df = pandas.read_csv(csv_file)
    print '###### %s ######' % table_name
    print df.head()
    df.to_sql(table_name, db_conn, if_exists='replace', index=False)
    # mem peak ~1Gb
    # after: 500Mb
end_time = time.time()
print 'elapsed time = %s' % (end_time - start_time)

###### nodes ######
       id        lat       lon            user      uid  version  changeset  \
0  625219  43.549540  1.471045  Quentin SALLES  6604122        3   53577892   
1  625226  43.552357  1.478125           Yod4z   318821        9   36224374   
2  625235  43.554292  1.471321         hromain    39569        2     750508   
3  625236  43.555496  1.472247         hromain    39569        2     750508   
4  625239  43.554530  1.470092   Florian Birée    26867        4    4785749   

              timestamp  
0  2017-11-07T10:35:43Z  
1  2015-12-28T17:05:52Z  
2  2009-03-07T17:58:17Z  
3  2009-03-07T17:58:17Z  
4  2010-05-23T17:59:11Z  
###### node_tags ######
       id               key          value     type valid
0  625219               bus            yes  regular   yes
1  625219           highway       bus_stop  regular   yes
2  625219              name      Lapeyrade  regular   yes
3  625219          operator         Tisséo  regular   yes
4  625219  public_transport  stop_p

### First observation: big amount of memory consumption occurs since panda.read_csv returns a dataframe stored in-memory. this statement could raise some blocking issue with bigger OSM data

In [5]:
print myeasysql.list_tables(c)
myeasysql.display_schema(c)

['nodes', 'node_tags', 'ways', 'way_nodes', 'way_tags']


[('CREATE TABLE "nodes" (\n"id" INTEGER,\n  "lat" REAL,\n  "lon" REAL,\n  "user" TEXT,\n  "uid" INTEGER,\n  "version" INTEGER,\n  "changeset" INTEGER,\n  "timestamp" TEXT\n)',),
 ('CREATE TABLE "node_tags" (\n"id" INTEGER,\n  "key" TEXT,\n  "value" TEXT,\n  "type" TEXT,\n  "valid" TEXT\n)',),
 ('CREATE TABLE "ways" (\n"id" INTEGER,\n  "user" TEXT,\n  "uid" INTEGER,\n  "version" INTEGER,\n  "changeset" INTEGER,\n  "timestamp" TEXT\n)',),
 ('CREATE TABLE "way_nodes" (\n"id" INTEGER,\n  "node_id" INTEGER,\n  "position" INTEGER\n)',),
 ('CREATE TABLE "way_tags" (\n"id" INTEGER,\n  "key" TEXT,\n  "value" TEXT,\n  "type" TEXT,\n  "valid" TEXT\n)',)]

### Second observation: the table scheme have been broken and partially rebuilt by Pandas

Let's close the link with the database

In [40]:
db_conn.close()

Even after closing the link, memory has not been freed, let's collect the garbage

In [57]:
# before 500Mb
import gc
gc.collect()
# after 100Mb
del df
gc.collect()
# after 50Mb

28

## Lazy CSV reading and row-by-row SQL inserting
Now let's try another strategy, by coding some CSV reading based on generators to preserve memory

In [2]:
db_conn = sqlite3.connect(os.path.join(project_path, '%s.db' % db_name))
db_conn.text_factory = str
db_cursor = db_conn.cursor()

In [3]:
myeasysql.empty_db(db_cursor)
db_conn.commit() # in case db is locked following a crash during previous transactions
myeasysql.create_tables(db_cursor)

dropping table nodes
dropping table node_tags
dropping table ways
dropping table way_tags
dropping table way_nodes
Freeing memory on disk


Lazy loadind requires more effort in terms of coding:

In [4]:
def lazyread_csv_data(csv_path):
    """build a CSV row generator"""
    with open(csv_path, 'rU') as data:
        reader = csv.DictReader(data)
        for row in reader:
            # yealding instead of returning avoids in-memory work
            yield {k: str_encode(v) for k, v in row.iteritems()}
                
def str_encode(v):
    """Return string object properly encoded if necessary"""
    return v.encode('utf-8') if isinstance(v, unicode) else str(v)


def insert_data(db_conn, table_name, csv_path):
    """Insert csv data into SQL"""
    from sqlite3 import OperationalError
    db_cursor = db_conn.cursor()
    generator = lazyread_csv_data(csv_path)
    try:
        for i, row in enumerate(generator):
            QUERY="INSERT INTO %s (%s) VALUES (%s)" % (table_name, ','.join(row.keys()), ','.join(['?' for k in row.keys()]))
            db_cursor.execute(QUERY, row.values())
    except OperationalError as e:
        print 'Failed while inserting:'
        print QUERY
        print 'sqlite error:', e
        return None
    finally:
        db_conn.commit()


Let's apply previous functions to the 5 tables required by our project. Again, memory and CPU are checked or monitored.

In [5]:
# before: 50Mb (python core + imported modules)
# mem peak ~50Mb
start_time = time.time()
for table_name in ('nodes', 'node_tags', 'ways', 'way_nodes', 'way_tags'):
    print 'Populating %s' % table_name
    csv_file = os.path.join(project_path, "%s_%s.csv" % (db_name, table_name))
    insert_data(db_conn, table_name, csv_file)
# after: 50Mb
end_time = time.time()
print 'elapsed time = %s' % (end_time - start_time)

Populating nodes
Populating node_tags
Populating ways
Populating way_nodes
Populating way_tags
elapsed time = 93.3800001144


### Observation: CSV lazy reading instead of pandas read_csv allows to avoid any memory consumption. The cost of this strategie is a CPU factor 3. This strategy is preferred since preserving the table scheme and handling even big OSM data.

To finish, let's close the database connector and switch to another notebook dedicated to database checking

In [None]:
db_conn.close()