In [1]:
import pandas as pd
import numpy as np
import feather
import pickle
import re
import sqlite3
import spatialite
import shapely.wkb
import shapely.wkt
from shapely.geometry import *

# optional libs to run other non-core code
from polyfuzz import PolyFuzz
from polyfuzz.models import EditDistance, TFIDF, Embeddings
from flair.embeddings import TransformerWordEmbeddings
import geopandas as gpd

# note pandarallel works well on mac but has issue with windows
# see requirements for windows  - https://github.com/nalepae/pandarallel
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)

pd.options.display.max_columns = None
pd.set_option('display.float_format', lambda x: '%.6f' % x)

# connect to the database
# note: connects to/creates a db file with the name in the quotes if does not exist
db_name='streetsofnyc_spatial.db'
con = sqlite3.connect(db_name)    # for regular SQL
spatcon = spatialite.connect(db_name)    # for spatial SQL
cur = con.cursor()

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [5]:
%%time

query='''
Select `Summons Number`, lat, lon from ticketstreetdem
'''

result = pd.read_sql(query,spatcon)

CPU times: user 1min 27s, sys: 2min 13s, total: 3min 41s
Wall time: 23min 10s


In [7]:
# result.to_feather('Geospatial_conversion/summonlatlonall.feather')

In [2]:
result=pd.read_feather('Geospatial_conversion/summonlatlonall.feather')

In [3]:
# convert dataframe to GeoDataFrame to get Point Geom from Lat Lon columns

gdf = gpd.GeoDataFrame(result,geometry=gpd.points_from_xy(result.lon,result.lat))

In [4]:
# Convert each shapely geometry into WKT representation

records = [
    {'Summons Number': gdf['Summons Number'].iloc[i],'wkb':shapely.wkt.dumps(gdf['geometry'].iloc[i])}
    for i in range(gdf.shape[0])
]

In [5]:
# Create tuple of tuples for query parameter (for batch update with executemany)

tuples = tuple((d['wkb'],d['Summons Number'].astype(str)) for d in records)

In [None]:
with open('Geospatial_conversion/ticket_tuple.pickle','wb') as f:
    pickle.dump(tuples,f)

In [7]:
# Then add new column to store geometry

con.enable_load_extension(True)
con.load_extension('mod_spatialite')
con.execute('SELECT InitSpatialMetaData(1);')
con.execute(
    '''
    SELECT AddGeometryColumn('ticketstreetdem','wkb_geometry',4326,'POINT',2);
    '''
)

<sqlite3.Cursor at 0x7fa4cff74b90>

In [8]:
con.execute("CREATE INDEX  `Summons Number` ON ticketstreetdem(`Summons Number`)")

<sqlite3.Cursor at 0x7fa4cff74dc0>

In [9]:
%%time


# update geometry


with sqlite3.connect(db_name) as conn:
    conn.enable_load_extension(True)
    conn.load_extension("mod_spatialite")
    conn.executemany(
        """
        UPDATE ticketstreetdem
        SET wkb_geometry=ST_PointFromText(? , 4326)
        WHERE ticketstreetdem.`Summons Number` = ?;
        """, tuples
    )



CPU times: user 22min 22s, sys: 24min 55s, total: 47min 18s
Wall time: 1h 48min 29s


In [10]:
# Create a spatial index
spatcon.execute("SELECT CreateSpatialIndex('ticketstreetdem','wkb_geometry')")

<sqlite3.Cursor at 0x7fa4cff74b20>

In [4]:
spatcon.execute('DROP TABLE IF EXISTS ticketstreetdem19')

spatcon.execute(
    '''
    CREATE TABLE IF NOT EXISTS ticketstreetdem19 AS
    SELECT * 
    FROM ticketstreetdem
    WHERE `ISSUE DATE` LIKE '%2019%'
    '''
)
spatcon.execute("SELECT RecoverGeometryColumn('ticketstreetdem19','wkb_geometry',4326,'POINT',2)")
spatcon.execute("SELECT CreateSpatialIndex('ticketstreetdem19','wkb_geometry')")

<sqlite3.Cursor at 0x7fea2d5c9110>

In [5]:
spatcon.execute('DROP TABLE IF EXISTS ticketstreetdem20')

spatcon.execute(
    '''
    CREATE TABLE IF NOT EXISTS ticketstreetdem20 AS
    SELECT * 
    FROM ticketstreetdem
    WHERE `ISSUE DATE` LIKE '%2020%'
    '''
)
spatcon.execute("SELECT RecoverGeometryColumn('ticketstreetdem20','wkb_geometry',4326,'POINT',2)")
spatcon.execute("SELECT CreateSpatialIndex('ticketstreetdem20','wkb_geometry')")

<sqlite3.Cursor at 0x7fea2d5e9340>

In [2]:
con.execute("CREATE INDEX  tix_cat20 ON ticketstreetdem20(`Ticket Category`)")
con.execute("CREATE INDEX  tix_cat19 ON ticketstreetdem19(`Ticket Category`)")

<sqlite3.Cursor at 0x7f853e3450a0>

In [3]:
%%time
spatcon.execute('DROP TABLE IF EXISTS ticketstreetdem20tc1')

spatcon.execute(
    '''
    CREATE TABLE IF NOT EXISTS ticketstreetdem20tc1 AS
    SELECT * 
    FROM ticketstreetdem20
    WHERE `Ticket Category` = 1
    '''
)
spatcon.execute("SELECT RecoverGeometryColumn('ticketstreetdem20tc1','wkb_geometry',4326,'POINT',2)")
spatcon.execute("SELECT CreateSpatialIndex('ticketstreetdem20tc1','wkb_geometry')")

CPU times: user 2min 2s, sys: 59.6 s, total: 3min 1s
Wall time: 7min 28s


<sqlite3.Cursor at 0x7f852b0d1340>

In [4]:
%%time
spatcon.execute('DROP TABLE IF EXISTS ticketstreetdem19tc1')

spatcon.execute(
    '''
    CREATE TABLE IF NOT EXISTS ticketstreetdem19tc1 AS
    SELECT * 
    FROM ticketstreetdem19
    WHERE `Ticket Category` = 1
    '''
)
spatcon.execute("SELECT RecoverGeometryColumn('ticketstreetdem19tc1','wkb_geometry',4326,'POINT',2)")
spatcon.execute("SELECT CreateSpatialIndex('ticketstreetdem19tc1','wkb_geometry')")

CPU times: user 2min 14s, sys: 1min 6s, total: 3min 21s
Wall time: 7min 16s


<sqlite3.Cursor at 0x7f852b0d1570>

In [5]:
%%time
spatcon.execute('DROP TABLE IF EXISTS ticketstreetdem20tc7')

spatcon.execute(
    '''
    CREATE TABLE IF NOT EXISTS ticketstreetdem20tc7 AS
    SELECT * 
    FROM ticketstreetdem20
    WHERE `Ticket Category` = 7
    '''
)
spatcon.execute("SELECT RecoverGeometryColumn('ticketstreetdem20tc7','wkb_geometry',4326,'POINT',2)")
spatcon.execute("SELECT CreateSpatialIndex('ticketstreetdem20tc7','wkb_geometry')")

CPU times: user 1min 23s, sys: 30.4 s, total: 1min 53s
Wall time: 3min 56s


<sqlite3.Cursor at 0x7f852b0d1490>

In [2]:
%%time
spatcon.execute('DROP TABLE IF EXISTS ticketstreetdem19tc7')

spatcon.execute(
    '''
    CREATE TABLE IF NOT EXISTS ticketstreetdem19tc7 AS
    SELECT * 
    FROM ticketstreetdem19
    WHERE `Ticket Category` = 7
    '''
)
spatcon.execute("SELECT RecoverGeometryColumn('ticketstreetdem19tc7','wkb_geometry',4326,'POINT',2)")
spatcon.execute("SELECT CreateSpatialIndex('ticketstreetdem19tc7','wkb_geometry')")

CPU times: user 9.21 s, sys: 10.9 s, total: 20.1 s
Wall time: 1min 32s


<sqlite3.Cursor at 0x7ffa2afd2340>

In [None]:
%%time

query='''
Select * FROM ticketstreetdem19tc7
'''

result = pd.read_sql(query,spatcon)

In [8]:
result

Unnamed: 0,count(*)
0,2739973


In [4]:
# convert dataframe to GeoDataFrame to get Point Geom from Lat Lon columns

gdf = gpd.GeoDataFrame(result,geometry=gpd.points_from_xy(result.lon,result.lat))

# Convert each shapely geometry into WKT representation

records = [
    {'collision_id': gdf['collision_id'].iloc[i],'wkb':shapely.wkt.dumps(gdf['geometry'].iloc[i])}
    for i in range(gdf.shape[0])
]

# Create tuple of tuples for query parameter (for batch update with executemany)

tuples = tuple((d['wkb'],d['collision_id'].astype(str)) for d in records)

In [None]:
# Then add new column to store geometry

con.enable_load_extension(True)
con.load_extension('mod_spatialite')
con.execute('SELECT InitSpatialMetaData(1);')
con.execute(
    '''
    SELECT AddGeometryColumn('collisionstreetdem','wkb_geometry',4326,'POINT',2);
    '''
)

In [6]:
con.execute("CREATE INDEX  collision_id ON collisionstreetdem(collision_id)")

OperationalError: index collision_id already exists

In [9]:
%%time


# update geometry


with sqlite3.connect(db_name) as conn:
    conn.enable_load_extension(True)
    conn.load_extension("mod_spatialite")
    conn.executemany(
        """
        UPDATE collisionstreetdem
        SET wkb_geometry=ST_PointFromText(? , 4326)
        WHERE collisionstreetdem.collision_id = ?;
        """, tuples
    )

CPU times: user 11.6 s, sys: 15.1 s, total: 26.7 s
Wall time: 34.3 s


In [10]:
# Create a spatial index
spatcon.execute("SELECT CreateSpatialIndex('collisionstreetdem','wkb_geometry')")

<sqlite3.Cursor at 0x7ff11f111570>