# SQLite database analysis

In [1]:
import sys, os, sqlite3, pandas, pprint
import numpy as np
project_path = "C:\\Users\\TO72078\\Documents\\BIG_DATA\\UDACITY\\projects\\openstreetmap"
if project_path not in sys.path: sys.path.append(project_path)
from myeasysql import db_query
db_name = 'toulouse'
db_conn = sqlite3.connect(os.path.join(project_path, '%s.db' % db_name))
c = db_conn.cursor()

## Nodes and Ways

In [3]:
print 'Number of nodes is %d' % db_query(c, 'SELECT COUNT(*) FROM nodes')[0]

Number of nodes is 1898299


Almost 2 millions of nodes within Toulouse OSM, not ridiculous!

In [22]:
print 'Number of ways is %d' % db_query(c, 'SELECT COUNT(*) FROM ways')[0]

Number of ways is 322102


## Contributors

In [5]:
# number of unique users
QUERY = '''
SELECT COUNT(*)
FROM
    (SELECT e.uid
     FROM (SELECT uid FROM nodes UNION ALL SELECT uid FROM ways) e
     GROUP BY e.uid) u;
'''
print 'Number of unique users is %d' % db_query(c, QUERY)[0]

Number of unique users is 1360


Let's try to query the same information without grouping thanks to `DISTINCT` sql function:

In [6]:
# number of unique users (simpler query)
QUERY = '''SELECT COUNT(DISTINCT(e.uid))          
FROM (SELECT uid FROM nodes UNION ALL SELECT uid FROM ways) e;'''
print 'Number of unique users is %d' % db_query(c, QUERY)[0]

Number of unique users is 1360


In [7]:
# number of users with single contribution
QUERY = '''
SELECT COUNT(*)
FROM
    (SELECT e.user, COUNT(*) as num
     FROM (SELECT user FROM nodes UNION ALL SELECT user FROM ways) e
     GROUP BY e.user
     HAVING num=1)  u;
'''
print 'Number of users with single contribution is %d' % db_query(c, QUERY)[0]

Number of users with single contribution is 309


We may assume that people who made only one contribution were rather bystanders than contributors. Unless their single contribution aimed at fixing some defect!

In [8]:
# users sorted by number of contributions
QUERY = '''
SELECT e.user, COUNT(*) as num
FROM (SELECT user FROM nodes UNION ALL SELECT user FROM ways) e
GROUP BY e.user
ORDER by num DESC
LIMIT 10;
'''
for (k,v) in db_query(c, QUERY): print '%s:%d' % (k,v)

Pinpin:825803
emarsden:221134
Sebastien Dinot:125725
don-vip:119111
PierenBot:105636
Spontex:103742
Chouloute:55983
square:55749
FredB:41053
Florian Birée:40605


Let's build an intermediate table to ease next queries based on contribution level:

In [11]:
# create table with uid, user and number of contributions
db_query(c, 'DROP TABLE IF EXISTS contribution_level')
QUERY = '''
CREATE TABLE contribution_level AS
SELECT e.uid as uid, e.user as user, COUNT(*) as num
FROM (SELECT user, uid FROM nodes UNION ALL SELECT user, uid FROM ways) e
GROUP BY e.uid;
'''
result = db_query(c, QUERY)

Checking this new table:

In [19]:
QUERY = '''
select user, num from contribution_level order by num DESC limit 3;
'''
for (k,v) in db_query(c, QUERY): print '%s:%d' % (k,v)

Pinpin:825803
emarsden:221134
Sebastien Dinot:125725


Now let's exploit this new table to extract the number of users with contribution level greater than level average:

In [20]:
QUERY = '''
select count(*) from
(select user, num
from contribution_level, (select avg(num) as av from contribution_level) as subq
where num > av);
'''
print 'Number of users with contribution level greater than average is %d' % db_query(c, QUERY)[0]

Number of users with contribution level greater than average is 55


Let's figure out how complex would be the same query without the table `contribution_level`:

In [21]:
QUERY = '''
select count(*) from
(select user, num from
(SELECT e.uid as uid, e.user as user, COUNT(*) as num
FROM (SELECT user, uid FROM nodes UNION ALL SELECT user, uid FROM ways) e
GROUP BY e.uid) as contribution_level,
(select avg(num) as av from contribution_level) as subq
where num > av);
'''
print 'Number of users with contribution level greater than average is %d' % db_query(c, QUERY)[0]

Number of users with contribution level greater than average is 55


Conclusion: possible, but not so easy to build...

Let's check that the analysis could be performed as well with pandas module by converting the query result into a pandas dataframe:

In [15]:
QUERY = '''
SELECT e.user, COUNT(*) as num
FROM (SELECT user FROM nodes UNION ALL SELECT user FROM ways) e
GROUP BY e.user;
'''
rows = db_query(c, QUERY)
contribution_level_df = pandas.DataFrame(rows)
contribution_level_df.head()

Unnamed: 0,0,1
0,Jean Yves Garinet,27
1,00seb,1
2,0120134,21
3,0live,2
4,0livier,1


It works. Now pandas could be used to explore the contributors dataframe...

## Sort amenity nodes by occurrence (descending/ascending)

In [23]:
db_query(c, "SELECT value, COUNT(*) as num FROM node_tags WHERE key = 'amenity' GROUP BY value ORDER BY num DESC LIMIT 10;")

[(u'bench', 843),
 (u'restaurant', 773),
 (u'waste_basket', 454),
 (u'bicycle_parking', 431),
 (u'post_box', 415),
 (u'recycling', 415),
 (u'fast_food', 333),
 (u'bicycle_rental', 271),
 (u'parking', 257),
 (u'bank', 238)]

In [25]:
db_query(c, "SELECT value, COUNT(*) as num FROM node_tags WHERE key = 'amenity' GROUP BY value ORDER BY num ASC LIMIT 20;")

[(u'animal_training', 1),
 (u'bicycle_repair_station', 1),
 (u'casino', 1),
 (u'copyrama', 1),
 (u'courthouse', 1),
 (u'dancing_school', 1),
 (u'financial_advice', 1),
 (u'library;recycling', 1),
 (u'memorial', 1),
 (u'mobile_library', 1),
 (u'money_transfer', 1),
 (u'music_school', 1),
 (u'music_venue', 1),
 (u'nursing_home', 1),
 (u'post_pickup', 1),
 (u'retirement_home', 1),
 (u'showcase', 1),
 (u'sloped_curb', 1),
 (u'smoking_area', 1),
 (u'social_centre', 1)]

There are twice more restaurants than fast food, that's a very good indicator for gastronomy level of the city.
Yet Toulouse seems to be a quite "young" city since we find there many schools and only one retirement home. Only one??? Is this map really complete? For a city of half a million of people living intra-muros, it seems very low...
Let's add way_tags to our query

In [54]:
db_query(c, "SELECT COUNT(*) FROM node_tags WHERE value LIKE '%retire%' UNION ALL SELECT COUNT(*) FROM way_tags WHERE value LIKE '%retire%';")

[(1,), (5,)]

Now it seems a better estimate of this kind of amenities. Why are some (5) of these locations referenced as ways rather than nodes? Is the single node referenced as retirement home linked with one of the ways objects? Let's investigate.

In [94]:
rh_ways_ids = db_query(c, "SELECT id FROM way_tags WHERE value LIKE '%retire%';")
print 'ways referenced as retirement home:', rh_ways_ids
for (wid,) in rh_ways_ids:
    print '*****************'
    print 'way =', db_query(c, "SELECT * FROM ways WHERE id = %d;" % wid)
    for kv in db_query(c, "SELECT key, value FROM way_tags WHERE id = %d;" % wid):
        print '%s = %s' % kv

ways referenced as retirement home: [(64420853,), (70491815,), (74711365,), (105556140,), (186123723,)]
*****************
way = [(64420853, u'Tiss\xe9o', 2875126, u'6', 31843590, u'2015-06-09T13:01:21Z')]
name = EHPAD Résidence Émeraude, Maison de Retraite Médicalisée
source = cadastre-dgi-fr source : Direction Générale des Impôts - Cadastre. Mise à jour : 2010
amenity = retirement_home
building = yes
street = Rue des Amandiers
housenumber = 14
*****************
way = [(70491815, u'Spontex', 115737, u'2', 5385690, u'2010-08-02T22:35:43Z')]
name = Résidence Loubayssens
source = cadastre-dgi-fr source : Direction Générale des Impôts - Cadastre. Mise à jour : 2010
amenity = retirement_home
building = yes
*****************
way = [(74711365, u'Luuuddooo', 388649, u'5', 30824077, u'2015-05-05T21:29:55Z')]
name = EHPAD Arc en Ciel
source = cadastre-dgi-fr source : Direction Générale des Impôts - Cadastre. Mise à jour : 2010
amenity = retirement_home
website = http://www.maisonretraite.com/
bu

All these locations described as ways seem valid. The common point within these ways is the key-value building=yes. By reading the wiki https://wiki.openstreetmap.org/wiki/Way we can learn that ways objects, when they are "closed" (with the last node corresponding to the first one), can be interpreted as "polygons", or "areas". That's the case here, the building limits are given by the list of way nodes. Let's confirm it with a screen capture of OSM result for "L'Edelweiss Toulouse":

![title](img/edelweiss.png)
<center>https://www.openstreetmap.org/way/186123723</center>


In [90]:
rh_nodes_ids = db_query(c, "SELECT id FROM node_tags WHERE value LIKE '%retire%';")
print 'nodes referenced as retirement home:', rh_nodes_ids
for (nid,) in rh_nodes_ids:
    print '*****************'
    print 'node =',db_query(c, "SELECT * FROM nodes WHERE id = %d;" % nid)
    for kv in db_query(c, "SELECT key, value FROM node_tags WHERE id = %d;" % nid):
        print '%s = %s' % kv

nodes referenced as retirement home: [(1572879008,)]
*****************
node = [(1572879008, 43.5357128, 1.4688832, u'Tiss\xe9o', 2875126, u'2', 35262786, u'2015-11-12T14:45:53Z')]
name = EHPAD les Fontenelles
amenity = retirement_home
emergency = no


As already seen, only one node is referenced as retirement home.

Let's check that this node is not referenced by the 5 previous ways:

In [91]:
for (wid,) in rh_ways_ids:
    result = db_query(c, "SELECT * FROM way_nodes WHERE id = %d AND node_id = %d;" % (wid, nid))
    print 'match!' if len(result) > 0 else 'no match'

no match
no match
no match
no match
no match


Now let's see if there is some building in Toulouse defined as closed way and referecing our node:

In [92]:
result = db_query(c, "SELECT * FROM way_nodes WHERE node_id = %d;" % nid)
print 'match!' if len(result) > 0 else 'no match'

no match


Failed, there is no relation between our retirement home node and any already existing building?. Do we have to draw this building and inject it into OSM? That's some work, let's try a last shot by comparing lat/lon locations of the existing buildings and our mystery node. For that we need two helper functions:
- a function or table that computes the center of gravity of a (closed) way
- a function that roughly computes the distance between two points given their lat/lon attributes

In [157]:
def compute_cdg(wid):
    '''Computes average latitude/longitude of the given way object'''
    QUERY = """
    SELECT avg(lat), avg(lon) FROM
    (SELECT lat, lon FROM nodes JOIN way_nodes ON nodes.id=way_nodes.node_id WHERE way_nodes.id=%d)
    """ % wid
    (cdg_lat, cdg_lon) = db_query(c, QUERY)[0]
    return (cdg_lat, cdg_lon)

def equi_rect_distance(lat1deg,lon1deg,lat2deg,lon2deg):
    '''
    Computes equirectangular distance between two points expressed in (lat, lon) in degrees
    '''
    lat1rad,lon1rad,lat2rad,lon2rad = tuple(map((lambda x:x*np.pi/180.), (lat1deg,lon1deg,lat2deg,lon2deg)))
    R = 6371000.  #radius of the earth in m
    dx = (lon1rad - lon2rad) * np.cos( 0.5*(lat1rad + lat2rad) )
    dy = lat1rad - lat2rad
    d = np.sqrt( dx*dx + dy*dy )
    return R*d

Let's build a table made of buildings with on-th-fly averaging of referenced nodes, starting from position 1 to avoid over-weighting of duplicated first and last node:

In [121]:
QUERY = """
CREATE TABLE IF NOT EXISTS buildings AS
SELECT wid, avglat, avglon
FROM (
    SELECT way_nodes.id as wid, avg(lat) as avglat, avg(lon) as avglon
    FROM way_tags
    JOIN way_nodes ON way_tags.id=way_nodes.id
    JOIN nodes ON nodes.id=way_nodes.node_id
    WHERE way_nodes.position > 0 AND way_tags.key='building' AND way_tags.value='yes'
    GROUP BY wid
    )
"""
db_query(c, QUERY)

[]

Now let's find within buildings table which one is the closest from our orphan retirement home node.
Let's use pandas to have some good exercise...

In [126]:
df_buildings = pandas.DataFrame(db_query(c, 'select * from buildings'), columns=['id', 'lat', 'lon'])

In [127]:
df_buildings.head()

Unnamed: 0,id,lat,lon
0,14804388,43.65834,1.427857
1,17027846,43.563258,1.46269
2,17027867,43.56283,1.466148
3,17028232,43.56869,1.46535
4,17028292,43.56757,1.466419


In [128]:
df_buildings.size

771174

Let's compute distance between these 771174 buildings and our mystery node, whose lat/lon are:

In [140]:
reflat, reflon = db_query(c, "SELECT lat, lon FROM nodes WHERE id = %d;" % nid)[0]
print reflat, reflon

43.5357128 1.4688832


In [148]:
np.where(df_buildings['lon'].isnull())

(array([], dtype=int64),)

In [155]:
df_buildings['distance'] = equi_rect_distance(df_buildings['lat'].values, df_buildings['lon'].values, reflat, reflon) 

In [158]:
df_buildings.head()

Unnamed: 0,id,lat,lon,distance
0,14804388,43.65834,1.427857,14030.10215
1,17027846,43.563258,1.46269,3103.30896
2,17027867,43.56283,1.466148,3023.358828
3,17028232,43.56869,1.46535,3677.982346
4,17028292,43.56757,1.466419,3547.880663


In [167]:
df_buildings[df_buildings['distance']==df_buildings['distance'].min()]

Unnamed: 0,id,lat,lon,distance
234854,129748161,43.535836,1.469016,17.339462


The closest building from our retirement home node is referenced with id `129748161`, with center only at 17 meters from our orphan node.
Let's check that both `way #129748161` and `node #1572879008` describe both the same place:
- [https://www.openstreetmap.org/way/129748161](https://www.openstreetmap.org/way/129748161)
- [https://www.openstreetmap.org/node/1572879008](https://www.openstreetmap.org/node/1572879008)

Answer: Yes, that's definitely the same place, despite the lack of relation between them!
Conclusion: our mystery node could be removed from the database...
