# OpenStreetMap Data Case Study
## by Sergio Marfull
__________________________________________

## Map Area

Madrid, Spain

* https://www.openstreetmap.org/relation/5326784

I was born in Madrid and I decided to investigate the dataset of my city and, who knows, I may help the OpenStreetMap community someday to get an improved database!

## Setup and preparing the dataset

In [1]:
## import all the necessary packages to run the code

import sqlite3 as sqlite
import xml.etree.cElementTree as ET
from pprint import pprint
import re
from collections import defaultdict
from time import time
import mapparser
import tags
import users
import audit 
import schema
import data
import cerberus
import csv
import codecs

Open the dataset and store it in order to manipulate it in python

In [9]:
## store the dataset from the .osm file into a python variable
osm_file = open("OSM Madrid - XML/madrid_spain.osm","rb")

Create a smaller sample that can be inspected faster before investigating the entire dataset

In [3]:
## create a sample file, 1000 times smaller than the original file in order to speed up the 
## investigation of the dataset and the iterations of the code
sample_file = "OSM Madrid - XML/sample.osm"
t0=time()
k = 1000 # Parameter: take every k-th top level element

def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Yield element if it is the right type of tag

    Reference:
    http://stackoverflow.com/questions/3095434/inserting-newlines-in-xml-file-generated-via-xml-etree-elementtree-in-python
    """
    context = iter(ET.iterparse(osm_file, events=('start', 'end')))
    _, root = next(context)
    for event, elem in context:
        if event == 'end' and elem.tag in tags:
            yield elem
            root.clear()


with open(sample_file, 'w') as output:
    output.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    output.write('<osm>\n  ')

    # Write every kth top level element
    for i, element in enumerate(get_element(osm_file)):
        if i % k == 0:
            output.write(ET.tostring(element).decode('UTF-8'))

    output.write('</osm>')
print(time()-t0)

63.55955743789673


## Problems found in the Map
________________________________________________-

I have encountered so many challenges for this project. First of all, it is the first project of a kind in my life and I have gone through all existing problems of types, formats and coding. Regarding dataset problems, I encountered many syntactical problems which a fixed with the same function (audit.py), namely:

* **Abbreviated street names** (Avenida, Avda., AV.,...)
* **Misspelling** (Carretetera, punctuation,...)
* **Incongruent versions of the same street names, mixing lower and uppercases** (Calle, calle, CALLE)

I have also had one special problem: the language. In Spain, we call the street type at the beginning, and all the references that I found explained the English-speaking case (even when I asked directly in some forums). So I had to practice with the regular expression by myself until I got with the formula:


In [4]:
street_type_re = re.compile(r'^\w+\.?', re.IGNORECASE)

Since that moment, I could accelerate the project path again and correct the words replacing them for their mapping in audit.py 

In [None]:
def audit(filename):
    street_types = defaultdict(set)
    for event, elem in ET.iterparse(filename, events=("start",)):
        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    audit_street_type(street_types, tag.attrib['v'])
    return street_types

> All helper functions used in the code can be found included within the .zip file of the project

## Mapparser.py

In [None]:
mapparser.count_tags(osm_file)

## tags.py

In [None]:
tags.process_map(osm_file)

## users.py

In [6]:
users.process_map(sample_file)

{'1011130',
 '101150',
 '101355',
 '1014878',
 '1068293',
 '107257',
 '1078372',
 '110263',
 '1168897',
 '117664',
 '1181326',
 '121415',
 '1214953',
 '1215580',
 '1222737',
 '123364',
 '1241009',
 '1280319',
 '13091',
 '13135',
 '1326584',
 '1334676',
 '13832',
 '13957',
 '1432104',
 '1438481',
 '1440585',
 '1461937',
 '1465214',
 '1476663',
 '14794',
 '149606',
 '1504312',
 '15188',
 '1522593',
 '153530',
 '153567',
 '1569519',
 '1580969',
 '159810',
 '1635134',
 '1639933',
 '165869',
 '170106',
 '1722789',
 '1725553',
 '1727619',
 '173225',
 '1735748',
 '1754095',
 '175769',
 '1761936',
 '1767321',
 '1773299',
 '1781370',
 '178711',
 '179025',
 '1792089',
 '179540',
 '1796109',
 '18004',
 '1803492',
 '1815833',
 '1829683',
 '1833457',
 '18675',
 '1868925',
 '1872708',
 '1877329',
 '1888813',
 '1890655',
 '1902358',
 '193123',
 '1947048',
 '1949691',
 '1957205',
 '1957538',
 '196617',
 '1974064',
 '1977383',
 '1995961',
 '2018195',
 '2019603',
 '2024941',
 '2047207',
 '2049744',
 '20

## audit.py

In [3]:
osm_file.seek(0)
audit.test(osm_file)

{'11': {'11 Posterior'},
 'A': {'A-4 KM 23.3', 'A-5'},
 'AUTOP.': {'AUTOP. VALENCIA KM 7.1'},
 'AUTOPISTA': {'AUTOPISTA AUTOPISTA AP-41 KM 31.780',
               'AUTOPISTA AUTOPISTA AP41 KM 31.760'},
 'AUTOVIA': {'AUTOVIA A-1 KM 29.1',
             'AUTOVIA A-3 KM 62.5',
             'AUTOVIA A-3 KM 71.9',
             'AUTOVIA A-42 KM 57.700',
             'AUTOVIA E-901/A-3 KM 35.2',
             'AUTOVIA NACIONAL I KM 26.200'},
 'AV': {'AV del Deporte', 'AV DE MOSTOLES, Nº 6'},
 'AV.': {'AV. CASTILLA'},
 'AVDA': {'AVDA ARCAS DE AGUA SN',
          'AVDA DE MADRID',
          'AVDA DEL VALLE 36 (CRTA M 604 KM 24.100)'},
 'AVDA.': {'AVDA. ALTO DEL LEÓN, Nº 3',
           'AVDA. DEL ANTIGÜO FERROCARRIL, Nº 1',
           'AVDA. GENERALISIMO,12',
           'AVDA. SAN PABLO'},
 'AVENIDA': {'AVENIDA ANDALUCIA',
             'AVENIDA AURELIO ALVAREZ, S/N',
             'AVENIDA AVDA DEL SOL ESQ.AVDA DE FRANCIA, SN',
             'AVENIDA AVDA. DE LA CABRERA S/N JUNTO AL HOSTAL EL CANCHO

CARRETERA N-110 A SORIA KM 189 PARKING SUPERMERCADO KM. 110 => Carretera N-110 A SORIA KM 189 PARKING SUPERMERCADO KM. 110
CARRETERA VALDEMAQUEDA KM. 0 => Carretera VALDEMAQUEDA KM. 0
CARRETERA N-400 KM 24.6 => Carretera N-400 KM 24.6
CARRETERA CARRETERA TOLEDO KM 80.250 => Carretera Carretera TOLEDO KM 80.250
CARRETERA M-501 KM 17.2 => Carretera M-501 KM 17.2
CARRETERA M-404 KM. 31 => Carretera M-404 KM. 31
CARRETERA M-600 KM. 14 => Carretera M-600 KM. 14
CARRETERA TOLEDO-MOCEJON KM 4.300 => Carretera TOLEDO-MOCEJON KM 4.300
CARRETERA N-II KM. 38 => Carretera N-II KM. 38
CARRETERA CTRA.N-301 KM. 67 => Carretera CTRA.N-301 KM. 67
CARRETERA COMARCAL M236 KM 51.600 => Carretera COMARCAL M236 KM 51.600
CARRETERA M-501 KM. 27 => Carretera M-501 KM. 27
CARRETERA M-300 KM 8.1 => Carretera M-300 KM 8.1
CARRETERA M-610 KM 19.8 => Carretera M-610 KM 19.8
CARRETERA FONTANAR KM. 1 => Carretera FONTANAR KM. 1
CARRETERA CM-4004 KM 17.50 => Carretera CM-4004 KM 17.50
CARRETERA M-405 KM 5.6 => Carret

In [12]:
data.process_map(sample_file, validate = False)

UnicodeEncodeError: 'ascii' codec can't encode characters in position 0-4: ordinal not in range(128)

In [8]:
db = sqlite.connect("osm_madrid")
c = db.cursor()
query = "CREATE TABLE nodes (id INTEGER PRIMARY KEY NOT NULL,lat REAL,lon REAL,user TEXT,uid INTEGER,version INTEGER,changeset INTEGER,timestamp TEXT);"
c.execute(query)
rows = c.fetchall()
db.close()

In [12]:
db = sqlite.connect("osm_madrid")
c = db.cursor()
query = ".schema"
c.execute(query)
rows = c.fetchall()
db.close()

OperationalError: near ".": syntax error

In [37]:
# counting phone entries in 'nodes' 'tags'
t0 = time()
counter = 0 
for event, element in ET.iterparse(sample_file):
    if element.tag == 'node':
        for tag in element.iter('tag'):
            if tag.attrib['k'] == 'phone' or tag.attrib['k'] == 'contact:phone':
                counter += 1
print(time() - t0)
print(counter)

0.005881547927856445
1


In [38]:
# counting phone entries in 'nodes' 'tags'
t0 = time()
counter = 0 
for event, element in ET.iterparse(sample_file):
    if element.tag == 'tag':
        for tag in element.iter('tag'):
            if tag.attrib['k'] == 'phone' or tag.attrib['k'] == 'contact:phone':
                counter += 1
print(time() - t0)
print(counter)

0.007307291030883789
1


In [39]:
# Audit tag key string types
lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

def key_type(element, keys):
    if element.tag == "tag":
        k_value = element.attrib['k']
        if lower.search(k_value):
            keys['lower'] += 1
        elif lower_colon.search(k_value):
            keys['lower_colon'] += 1
        elif problemchars.search(k_value):
            keys['problemchars'] += 1
        else:
            keys['other'] += 1
    return keys

def process_map(filename):
    keys = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0}
    for _, element in ET.iterparse(filename):
        keys = key_type(element, keys)

    return keys

In [44]:

t0= time()
print(process_map(sample_file))
print(time() - t0)

{'lower': 183, 'lower_colon': 101, 'problemchars': 0, 'other': 5}
0.007745981216430664


In [45]:
# Extract unique user ids
def get_user(element):
    if element.tag == 'way' or element.tag == 'node':
        if 'uid' in element.attrib.keys():
            uid = element.attrib['uid']
        else:
            uid = 'uid_absent'
    else:
        uid = None
    return uid 


def process_map(filename):
    users = set()
    for _, element in ET.iterparse(filename):
        if get_user(element):
            users.add(get_user(element))
    return users

In [51]:
t0= time()
user_id = process_map(sample_file)
print(len(user_id))
print(time() - t0)

123
0.008822917938232422


In [58]:
print('type:', type(user_id))
print('length:', len(user_id))
missing_uids = 0
index_list = []
for i in user_id:
    if i == 'uid_absent':
        missing_uids += 1
        index_list.append('uid_missing')
    else:
        index_list.append('uid_present')
print('missing uids:', missing_uids)
#print 'missing uid index', index_list.index('uid_missing')

type: <class 'set'>
length: 123
missing uids: 0


In [59]:

# Extract unique user names# Extrac 
def get_user(element):
    if element.tag == 'way' or element.tag == 'node':
        if 'user' in element.attrib.keys():
            user = element.attrib['user']
        else:
            user = 'user_name_absent'
    else:
        user = None
    return user

def process_map(filename):
    users = set()
    for _, element in ET.iterparse(filename):
        if get_user(element):
            users.add(get_user(element))
    return users

In [64]:

t0= time()
user_name = process_map(sample_file)
print(user_name)
print(len(user_name))
print(type(user_name))
print(time()-t0)

{'Luiyo', 'Pelanas', 'Sabiki', 'Mateño', 'scambelo', 'JeLuF', 'valdenebro', 'nicole_germany', 'Diego Esteban Alonso Blas', 'BCNorwich', 'Steeley', 'Marion_Moseby', 'dardhal', 'Hispadat', 'mojitopt', 'polkillas', 'Aaco', 'Urii_67', 'Pluvium', 'jorvime', 'jamesks', 'Polyglot', 'davidd95', 'jrgamo', 'Sercontr', 'FrViPofm', 'alcatoo', 'Antonio Garzón', 'Angel Fernandez', 'ANewTolay', 'flierfy', 'rafaerti', 'Habbit', 'c4rodriguez', 'carlosz22', 'sanchi', 'Marcsinnick', 'jcsogo', 'Pozuelo de Alarcon', 'R2ree2', 'Latze', 'NachoBlanco', 'Feuerfluss', 'Eduarson26', 'jespa', 'Cresques', 'lesp78', 'cronoser', 'C-SIm', 'gpesquero', 'VOST Madrid', 'intercacia', 'wingmaster', 'FahRadler', 'yabanci', 'Takeoff', 'Importación catastro de Sevilla la Nueva (Madrid)', 'antecessor', 'felixi', 'dvidrie', 'Anonxak', 'CarlosMG', 'Imp_GL', 'MorenoAraujo', 'robertogeb', 'Jesús Gómez', 'oscarorbe', 'cirdancarpintero', 'Graeme Herbert', 'JavierSp', 'Davpalnan', '_Reppu_', 'Canellone', 'Serfuen', 'x0c0', 'c_gonzal

In [73]:
sorted(lowers)


['access',
 'amenity',
 'barrier',
 'bench',
 'bridge',
 'building',
 'bus',
 'created_by',
 'crossing',
 'cuisine',
 'destination',
 'emergency',
 'entrance',
 'height',
 'hgv',
 'highway',
 'int_ref',
 'is_in',
 'landuse',
 'lanes',
 'layer',
 'leisure',
 'maxspeed',
 'motorroad',
 'name',
 'natural',
 'noexit',
 'note',
 'oneway',
 'opening_hours',
 'operator',
 'parking',
 'place',
 'power',
 'public_transport',
 'ref',
 'service',
 'shelter',
 'shop',
 'smoking',
 'source',
 'surface',
 'tourism',
 'type',
 'waterway',
 'wheelchair']

In [74]:
sorted(lowers_colon)


['addr:city',
 'addr:country',
 'addr:district',
 'addr:housenumber',
 'addr:postcode',
 'addr:state',
 'addr:street',
 'building:levels',
 'contact:email',
 'contact:phone',
 'contact:website',
 'educamadrid:bilingue',
 'educamadrid:cod_centro',
 'educamadrid:codigo_postal',
 'educamadrid:comedor',
 'educamadrid:distrito',
 'educamadrid:fax',
 'educamadrid:horario_amp',
 'educamadrid:municipio',
 'educamadrid:nombre_via',
 'educamadrid:numero_portal',
 'educamadrid:publico_bilingue',
 'educamadrid:telefono',
 'educamadrid:tipo',
 'educamadrid:tipo_via',
 'educamadrid:transporte',
 'educamadrid:url',
 'fire_hydrant:type',
 'is_in:state',
 'is_in:state_code',
 'ngbe:codigo',
 'ngbe:grupo',
 'ngbe:huso',
 'ngbe:id',
 'ngbe:subgrupo',
 'ngbe:tema',
 'ngbe:tipotexto',
 'ngbe:version',
 'note:es',
 'ref:colour',
 'ref:colour_bg',
 'source:date',
 'source:file',
 'source:name',
 'source:url']

In [75]:
sorted(others)


['ngbe:hojabcn25',
 'ngbe:lat_ed50',
 'ngbe:lon_ed50',
 'ngbe:xutm_ed50',
 'ngbe:yutm_ed50']

In [77]:
# Find values listed under 'addr:street:en' and 'addr:street_1'
t0 = time()
streets_under_alt_k = []
for _, element in ET.iterparse(sample_file):
    if element.tag == 'tag':
        if element.attrib['k'] == 'addr:street:en' or element.attrib['k'] == 'addr:street_1':
            streets_under_alt_k.append(element.attrib['v'])
for i in streets_under_alt_k:
    print(i)

## ENTENDER

In [80]:

'''Auditing phone numbers'''
'''def audit_phone_number(phone_number)'''

#regex_normal = re.compile(r'(\+?38)?\W*(0?\W*\d{2})?\W*(\d\W*\d\W*\d)\W*(\d{2})\W*(\d{2})')
#regex_normal = re.compile(r'(\+?38)?\W*(0?\d{2})?\W*(\d\d\d)\W*(\d{2})\W*(\d{2})')
#regex_normal = re.compile(r'(38)?\W*(0)?\W*(\d{2})?\W*(\d{3})\W*(\d{2})\W*(\d{2})')
#regex_38 = re.compile(r'(\+?)(38)\W*(0)\W*(\d{2})\W*(\d{3})\W*(\d{2})\W*(\d{2})')
regex_38 = re.compile(r'(3)\W*(8)\W*(0)\W*(\d)\W*(\d)\W*(\d)\W*(\d)\W*(\d)\W*(\d)\W*(\d)\W*(\d)\W*(\d)\b')
#regex_0xx = re.compile(r'^\W?(0\d{2})\W*(\d{3})\W*(\d{2})\W*(\d{2})\b')
regex_0xx = re.compile(r'^\W?(0)\W*(\d{2})\W*(\d)\W*(\d)\W*(\d)\W*(\d)\W*(\d)\W*(\d)\W*(\d)\b')
#regex_800 = re.compile(r'\d\W800\W\d{3}\W\d{3}')
regex_800 = re.compile(r'^(0|8)\W*(800)\W*(\d)\W*(\d)\W*(\d)\W*(\d)\W*(\d)\W*(\d)\b')
regex_xxx = re.compile(r'^(\d{3})\W*(\d{2})\W*(\d{2})$')

In [81]:
ph = r'0800500-609'
m = regex_800.findall(ph)
m

[('0', '800', '5', '0', '0', '6', '0', '9')]

In [85]:
# Count regex matches 
t0 = time()
counter = {'w38':0, 'w800':0, 'w0xx':0, 'wxxx':0, 'problematic':0}
w38 = []
w800 = []
w0xx = []
wxxx = []
problematics = []
for event, elem in ET.iterparse(sample_file):
    if elem.tag == "node" or elem.tag == "way":
        for tag in elem.iter("tag"):
            if tag.attrib['k'] == "phone" or tag.attrib['k'] == "contact:phone":
                if regex_38.search(tag.attrib['v']):
                    counter['w38'] += 1
                    w38.append(tag.attrib['v'])
                elif regex_800.search(tag.attrib['v']):
                    counter['w800'] += 1
                    w800.append(tag.attrib['v'])
                elif regex_0xx.search(tag.attrib['v']):
                    counter['w0xx'] += 1
                    w0xx.append(tag.attrib['v'])
                elif regex_xxx.search(tag.attrib['v']):
                    counter['wxxx'] += 1
                    wxxx.append(tag.attrib['v'])
                else:
                    counter['problematic'] += 1
                    problematics.append(tag.attrib['v'])
print(time() - t0)
print(counter)

0.0059735774993896484
{'w38': 0, 'w800': 0, 'w0xx': 0, 'wxxx': 0, 'problematic': 1}


In [86]:
sorted(problematics)

['+34 915424131']

In [89]:
phone_mapping = {
    '(44)4247431' : '+38-044-424-7431',
     '+1 347 868 0740' : '+1-347-868-0740', 
     '+3-044-257-20-97' : '+38-044-257-2097',
     '+3-8-044-446-77e-70' : '+38-044-446-7770',
     '+30 (44) 536-99-06; +30 (44) 536-99-08; +30 (44) 536-99-07' : '+38-044-536-9906', 
     '+3044 401-42-94' : '+38-044-401-4294',
     '+30442556013' : '+38-044-255-6013', 
     '+38 44 2784864' : '+38-044-278-4864', 
     '+38 44 425 03 98' : '+38-044-4250-0398', 
     '+380 (044) 235-73-82' : '+38-044-235-7382', 
     '+380 (044) 275-33-00' : '+38-044-275-3300', 
     '+380 (044) 360 02 09' : '+38-044-360 0209', 
     '+380 (044) 486-18-08' : '+38-044-486-1808', 
     '+380 (067) 912-20-66' : '+38 067 912-2066', 
     '+380 44 01010' : 'ERRONEUS', 
     '+380 9905577327' : '+38-099-055-77327', 
     '+380(044) 528-30-47' : '+38-044-528-3047',
     '+380-044-4172526' : '+38-044-417-2526', 
     '+3800675055958' : '+38-067-505-5958', 
     '+3804118875' : 'ERRONEOUS', 
     '+38986073213' : '+38-098-607-3213',
     '+39 044 5939575' : '+38-044-593-9575',
     '+800 1800 1800' : 'ERRONEUOUS', 
     '044526' : 'ERRONEOUS', 
     '08005005000' : 'ERRONEOUS', 
     '102' : ' ERRONEOUS', 
     '234-55-83;234-05-88;235-23-21' : '+38-044-234-5583', 
     '287-32-11 066-563-57-29' : '38-044-287-3211', 
     '2870711,2870020' : '+38-044-287-0711', 
     '2876149,2876216' : '+38-044-287-6149', 
     '4-60-85' : 'ERRONEOUS', 
     '5-74-41' : 'ERRONEOUS',
     '67 401 21 66, 044 287 5252' : '+38-067-401-2166', 
     '8097-331-17-93' : '+38-097-331-1793', 
     '88003000500' : 'ERRONEOUS', 
 u'\u0420\u0435\u0433\u0438\u0441\u0442\u0440\u0430\u0442\u0443\u0440\u0430 - (044) 408-03-41, \u0412\u044b\u0437\u043e\u0432 \u0432\u0440\u0430\u0447\u0430 - (044) 408-74-40, \u041d\u0435\u043e\u0442\u043b\u043e\u0436\u043d\u0430\u044f \u043f\u043e\u043c\u043e\u0449\u044c - (044) 497-60-61'
: '38-044-408-0341'}

In [87]:
# Define function to standardize phone numbers
def standardize_phone(number, phone_mapping):
    stnd = ''
    if regex_38.search(number):
        m = regex_38.search(number)
        stnd = stnd + '+' + m.group(1) + m.group(2) + '-' + m.group(3) + m.group(4) + m.group(5) + '-' + m.group(6) + \
                m.group(7) + m.group(8) + '-' + m.group(9) + m.group(10) + m.group(11) + m.group(12)
    elif regex_800.search(number):
        m = regex_800.search(number)
        stnd = stnd + m.group(1) + '-' + m.group(2) + '-' + m.group(3) + m.group(4) + m.group(5) + '-' + m.group(6) + \
                m.group(7) + m.group(8)
    elif regex_0xx.search(number):
        m = regex_0xx.search(number)
        stnd = stnd + '+38-' + m.group(1) + m.group(2) + '-' + m.group(3) + m.group(4) + m.group(5) + '-' + m.group(6) + \
                m.group(7) + m.group(8) + m.group(9) 
    elif regex_xxx.search(number):
        m = regex_xxx.search(number)
        stnd = stnd + '+38-044-' + m.group(1) + '-' + m.group(2) + '-' + m.group(3)
    else:
        stnd = phone_mapping[number]
    return stnd

In [90]:
standardize_phone('0 (44) 52026 08', phone_mapping)


'+38-044-520-2608'

In [93]:

# Count phones number entries in 'node' and 'way' 'tags'# Count  
t0 = time()
counter = 0 
for event, elem in ET.iterparse(sample_file):
    if elem.tag == "node" or elem.tag == "way":
        for tag in elem.iter("tag"):
            if tag.attrib['k'] == "phone":
                counter += 1
print(time() - t0)
print(counter)


0.006216526031494141
0


In [97]:
sqlite> SELECT value, COUNT(*) as num
FROM nodes_tags
WHERE key='amenity'
GROUP BY value
ORDER BY num DESC
LIMIT 10;

SyntaxError: invalid syntax (<ipython-input-97-c09f27433d22>, line 1)

In [29]:
sample_file = open("OSM Madrid - XML/sample.osm","rb")
k_values = defaultdict(int)
for event, elem in ET.iterparse(sample_file):
    if elem.tag == 'node':
        pprint(elem.tag.tag)

AttributeError: 'str' object has no attribute 'tag'

In [95]:
#importing cleaned .csv files to SQL schema
CREATE TABLE nodes (
    id INTEGER PRIMARY KEY NOT NULL,
    lat REAL,
    lon REAL,
    user TEXT,
    uid INTEGER,
    version INTEGER,
    changeset INTEGER,
    timestamp TEXT
);

CREATE TABLE nodes_tags (
    id INTEGER,
    key TEXT,
    value TEXT,
    type TEXT,
    FOREIGN KEY (id) REFERENCES nodes(id)
);

CREATE TABLE ways (
    id INTEGER PRIMARY KEY NOT NULL,
    user TEXT,
    uid INTEGER,
    version TEXT,
    changeset INTEGER,
    timestamp TEXT
);

CREATE TABLE ways_tags (
    id INTEGER NOT NULL,
    key TEXT NOT NULL,
    value TEXT NOT NULL,
    type TEXT,
    FOREIGN KEY (id) REFERENCES ways(id)
);

CREATE TABLE ways_nodes (
    id INTEGER NOT NULL,
    node_id INTEGER NOT NULL,
    position INTEGER NOT NULL,
    FOREIGN KEY (id) REFERENCES ways(id),
    FOREIGN KEY (node_id) REFERENCES nodes(id)
);

SyntaxError: invalid syntax (<ipython-input-95-f628155ad85e>, line 2)