# What are we going to do?

We will build a data processing pipeline that takes our data sources and aggregates them by precise city boundaries. It then runs a weighted algorithm to calculate a "cultural" score for each city, normalized by its total population. The pipeline looks something like this:

![](images/sketch.png)

## What do we mean by culture/cultural score?

This is completely subjective, and our algorithm is therefore only meant as an excuse to have fun learning PySpark and spatial analysis. We collect data from OpenStreetMap that seems culturally relevant. This includes the following tags:

    "tourism"="artwork"
    "tourism"="gallery"
    "amenity"="theatre"
    "amenity"="arts_centre"
    "tourism"="museum"

We will also assign a different weight to each tag, from 1 to 5. The tags above are listed from the highest weight (artwork) to the lowest (museum). This should make our analysis interesting. We will calculate a score by performing a weighted count of the sites found in each city. Finally, we will normalize our count by dividing it by the total population of the city.

In [1]:
# %load './code/helpers/imports.py'
import notebook
import os.path, json, io, pandas
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['figure.figsize'] = (16, 20)

from retrying import retry # for exponential back-off when calling the Overpass (overpass turbo) API

import pyspark.sql.functions as func # reuse as func.coalesce, for example
from pyspark.sql.types import StringType, IntegerType, FloatType, DoubleType,DecimalType

import pandas as pandas
from geopandas import GeoDataFrame # Loading boundaries Data
from shapely.geometry import Point, Polygon, shape # creating geospatial data
from shapely import wkb, wkt # creating and parsing geospatial data
import overpy # OpenStreetMap API

from ast import literal_eval as make_tuple # used to decode data from java

# make sure nbextensions are installed
notebook.nbextensions.check_nbextension('usability/codefolding', user=True)

# Reuse the Spark context when the name `sc` is already defined in this
# session; evaluating the bare name raises NameError otherwise, in which
# case a new context is created below.
try:
    sc
except NameError:
    import pyspark
    # 'local[*]' is the master argument handed to the new SparkContext.
    sc = pyspark.SparkContext('local[*]')
    # NOTE(review): sqlContext is only assigned in this branch; when `sc`
    # already exists, presumably the environment provides sqlContext too -
    # confirm.
    sqlContext = pyspark.sql.SQLContext(sc)


In [5]:
# %load './code/helpers/osm2geojson.py'
### Helper functions

def bbox(bounds):
    """Reorder shapely bounds into the bbox order used in the Overpass queries.

    :param bounds: indexable holding at least four values, e.g. the
        ``bounds`` of a shapely geometry
    :return: the tuple ``(bounds[1], bounds[0], bounds[3], bounds[2])``
    """
    # Swap each x/y pair: the first two items trade places, and so do the
    # last two.
    south, west = bounds[1], bounds[0]
    north, east = bounds[3], bounds[2]
    return (south, west, north, east)

def nodeToFeature(node):
    """Convert an OpenStreetMap node into a GeoJSON Point feature.

    :param node: object exposing ``tags`` (a mapping), ``lon`` and ``lat``,
        e.g. a node taken from an overpy result
    :return: dict with "type", "properties" and "geometry" keys; the
        properties are the node's tags plus a 'wkt' entry holding the WKT
        text of the node's point, and the coordinates are
        ``[float(lon), float(lat)]``
    """
    # Work on a copy of the tags so that adding 'wkt' does not modify the
    # node that was passed in.
    properties = dict(node.tags)
    properties['wkt'] = Point(node.lon, node.lat).wkt
    return {
        "type": "Feature",
        "properties": properties,
        "geometry": {
            "type": "Point",
            "coordinates": [
                float(node.lon),
                float(node.lat)
            ]
        }
    }

def nodesToFeatures(nodes):
    """Convert every node into a GeoJSON feature.

    :param nodes: iterable of nodes, e.g. ``result.nodes`` of an
        overpy.Result
    :return: list with one ``nodeToFeature`` result per node, in the same
        order as the input
    """
    return [nodeToFeature(osm_node) for osm_node in nodes]

def waysToFeatures(ways):
    """Placeholder for converting OpenStreetMap ways to GeoJSON features.

    The conversion is not implemented: the ways are only printed and an
    empty list is returned.

    :param ways: the ways to convert (currently only printed)
    :return: an empty list
    """
    # Parenthesised single-argument form prints the same text on Python 2
    # and Python 3, whereas the bare print statement is Python 2 only.
    print(ways)
    features = []
    return features


In [7]:
# %load './code/helpers/query_overpass_api.py'
def retry_if_overpass_too_many_requests(exception):
    """Retry predicate for calls to the OpenStreetMap/overpass turbo API.

    :param exception: the exception raised by the attempted call
    :return: True when the exception is an
        ``overpy.exception.OverpassTooManyRequests``, False otherwise
    """
    too_many_requests = overpy.exception.OverpassTooManyRequests
    return isinstance(exception, too_many_requests)

# Retried with exponential back off, but only for exceptions accepted by
# retry_if_overpass_too_many_requests.
@retry(
    wait_exponential_multiplier=2000,
    wait_exponential_max=60000,
    retry_on_exception=retry_if_overpass_too_many_requests,
)
def call_overpass_api(q):
    """Run the query ``q`` through the module-level OVERPASS_API client.

    :param q: query passed unchanged to ``OVERPASS_API.query``
    :return: whatever ``OVERPASS_API.query`` returns
    """
    response = OVERPASS_API.query(q)
    return response

def run_overpass_api(bounding_geo_df):
    """Collect the cultural points of interest for every boundary row.

    For each row of ``bounding_geo_df`` one Overpass query is issued for
    gallery, artwork, museum, arts_centre and theatre nodes, restricted to
    the bounding box of the row's geometry. The resulting nodes are
    converted with ``nodesToFeatures`` and accumulated.

    :param bounding_geo_df: object whose ``iterrows()`` yields
        ``(index, row)`` pairs, where each row has ``geometry.bounds``
    :return: list of all features found, in row order; an empty list when
        there are no rows
    """
    # For documentation see:
    # http://wiki.openstreetmap.org/wiki/Tag:{key}={value}
    # e.g: http://wiki.openstreetmap.org/wiki/Tag:amenity=theatre
    # The template is the same for every row, so build it once.
    query_template = """
            [out:json][timeout:60];
            (
              node["tourism"="gallery"]%(box)s;
              node["tourism"="artwork"]%(box)s;
              node["tourism"="museum"]%(box)s;
              node["amenity"="arts_centre"]%(box)s;
              node["amenity"="theatre"]%(box)s;
            );
            out body;"""
    local_pois = []
    for _, row in bounding_geo_df.iterrows():
        payload = query_template % {'box': str(bbox(row.geometry.bounds))}
        result = call_overpass_api(payload)
        local_pois.extend(nodesToFeatures(result.nodes))
    # Hand the accumulated features back to the caller; without this the
    # function would return None and the collected results would be lost.
    return local_pois


In [8]:
# Client used for the OpenStreetMap Overpass queries.
OVERPASS_API = overpy.Overpass()

# Dataset directory: 'work-flow' under the absolute path of the current
# working directory.
BASE_DIR = os.path.join(os.path.abspath('.'), 'work-flow')
URBAN_BOUNDARIES_FILE = '06_Europe_Cities_Boundaries_with_Labels_Population.geo.json'

# Paths to base datasets that we are using:
URBAN_BOUNDARIES_PATH = os.path.join(BASE_DIR, URBAN_BOUNDARIES_FILE)
POIS_PATH = os.path.join(BASE_DIR, "pois.json")