In [1]:
%load_ext Cython
%matplotlib inline

import pylab
import re
import psycopg2

import pandas as pd
import numpy as np

from geopandas import GeoDataFrame,GeoSeries
from difflib import get_close_matches
from itertools import combinations

# Notebook-wide pandas display settings.
# Every option is spelled with its full "display." key: the abbreviated
# 'precision' pattern matches more than one registered option in newer
# pandas releases and is rejected as ambiguous.
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 20)
pd.set_option("display.precision", 5)
pd.set_option("display.large_repr", "truncate")

In [None]:
# Column list for the `chicago` table, aliased so its rows line up with the
# f_add / t_add / pre_dir / street_nam / street_typ / geom / source layout
# selected from `transportation` below.
addr_cols = """f_add1 as f_add, 
               t_add1 as t_add, 
               pre_dir as pre_dir, 
               st_name1 as street_nam, 
               st_type1 as street_typ,
               geom, 
               'buildings' as source"""

# Template for one side of a street segment. The three %s slots all receive
# the same side letter: it prefixes the *_f_add / *_t_add columns and forms
# the 'lhs' / 'rhs' source tag.
curb_sql = """
             SELECT 
                 %s_f_add AS f_add,
                 %s_t_add AS t_add,
                 pre_dir, 
                 street_nam, 
                 street_typ, 
                 geom,
                 '%shs' AS source
             FROM transportation 
             WHERE street_nam IS NOT NULL 
             AND geom IS NOT null"""

# One query per side of the street: 'l' then 'r'.
lhs_sql, rhs_sql = (curb_sql % (side, side, side) for side in ('l', 'r'))

buildings_sql = """SELECT %s FROM chicago WHERE st_name1 IS NOT NULL  
                   AND geom IS NOT NULL """ % addr_cols

# Single statement returning left-side, right-side and building rows.
curbs_sql = " UNION ".join((lhs_sql, rhs_sql, buildings_sql))

# Run the combined query once and keep the result in memory; the connection
# is closed as soon as the frame is loaded, even if the query fails.
connection = psycopg2.connect(database='postgres', user='postgres')
try:
    mapdata = GeoDataFrame.from_postgis(curbs_sql, connection)
finally:
    connection.close()

# Address bounds are compared numerically later on, so store them as ints.
mapdata['f_add'] = mapdata['f_add'].astype(int)
mapdata['t_add'] = mapdata['t_add'].astype(int)

# Group on the bare column name rather than a one-element list: a list
# grouper can produce 1-tuple group labels in newer pandas, which would break
# the plain-string membership tests and fuzzy matching done on this list.
temp_mapgroups = mapdata.groupby('street_nam')
unique_mapstreets = list(temp_mapgroups.groups.keys())

In [None]:
#pycallgraph

import usaddress
from usaddress import parse

def get_mapgeo(streetdir, streetname, addressnumber, frame=None):
    """Return the (x, y) centroid of the building that contains an address.

    Only rows whose ``source`` is ``"buildings"`` and whose ``street_nam``
    equals ``streetname`` are considered. A row matches when
    ``addressnumber`` lies between its ``f_add`` and ``t_add`` bounds
    (inclusive), whichever of the two is the larger.

    Parameters
    ----------
    streetdir : str
        Street pre-directional. Currently unused by the lookup.
    streetname : str
        Street name, compared for exact equality with ``street_nam``.
    addressnumber : int
        House number to locate.
    frame : DataFrame, optional
        Table to search; needs ``f_add``, ``t_add``, ``street_nam``,
        ``source`` and ``geom`` columns. Defaults to the module-level
        ``mapdata``.

    Returns
    -------
    tuple
        ``(x, y)`` of the first matching geometry's centroid, or
        ``(nan, nan)`` when nothing matches.

    Raises
    ------
    AssertionError
        If ``addressnumber`` is not an ``int``.
    """
    assert type(addressnumber) is int, "addressnumber is not an int"

    if frame is None:
        frame = mapdata

    # Boolean masks instead of a string-built .query(): street names that
    # contain quote characters cannot break the expression.
    on_street = frame[(frame['source'] == 'buildings')
                      & (frame['street_nam'] == streetname)]
    if on_street.empty:
        return np.nan, np.nan

    # The number must fall inside the stored range. Taking min/max of the two
    # bounds accepts ranges stored in either order.
    bounds = on_street[['f_add', 't_add']]
    low = bounds.min(axis=1)
    high = bounds.max(axis=1)
    hits = on_street[(low <= addressnumber) & (high >= addressnumber)]
    if hits.empty:
        return np.nan, np.nan

    centroid = hits['geom'].values[0].centroid
    return centroid.x, centroid.y
        

def get_closest(streetname, streetdir, candidates=None):
    """Return the known street name that most closely resembles ``streetname``.

    A strict fuzzy match (similarity cutoff 0.8) is tried first. If that
    fails and the name starts with a digit, a second, looser pass
    (cutoff 0.6) is made.

    Parameters
    ----------
    streetname : object
        Name to look up; it is converted to ``str`` before matching.
    streetdir : str
        Street pre-directional. Currently unused by the matching.
    candidates : sequence of str, optional
        Names to match against. Defaults to the module-level
        ``unique_mapstreets``.

    Returns
    -------
    str or float
        The best matching candidate, or ``nan`` when nothing is close enough.
    """
    if candidates is None:
        candidates = unique_mapstreets

    street_str = "%s" % streetname

    match = get_close_matches(street_str, candidates, n=1, cutoff=.8)
    if not match and re.match('^[0-9]+', street_str):
        # Names with a leading number get a second chance at a lower cutoff.
        match = get_close_matches(street_str, candidates, n=1, cutoff=.6)

    # np.nan (lowercase) exists in every NumPy release; the np.NaN alias does not.
    return match[0] if match else np.nan

def parse_addr(addr):
    """Split a free-form address into labelled parts and geocode it.

    The address is tokenised with ``usaddress.parse``. Tokens that share a
    label are joined with a single space, so a multi-word street name is kept
    whole. If the street name is not in ``unique_mapstreets`` it is replaced
    by the closest known name, and that corrected name is the one used for
    the coordinate lookup.

    Parameters
    ----------
    addr : str
        Raw address text.

    Returns
    -------
    pandas.Series
        One entry per usaddress label plus ``x`` and ``y``. The coordinates
        are ``nan`` when no street or building could be matched. An empty
        Series is returned when the street name, address number or street
        pre-directional is missing, or when the address number is not an
        integer.
    """
    # usaddress.parse yields (token, label) pairs and a label may repeat;
    # collect every token per label instead of keeping only the last one.
    tokens_by_label = {}
    for token, label in usaddress.parse(addr):
        tokens_by_label.setdefault(label, []).append(token)
    addr_dict = dict((label, ' '.join(tokens))
                     for label, tokens in tokens_by_label.items())

    for key in ('StreetName', 'AddressNumber', 'StreetNamePreDirectional'):
        if key not in addr_dict:
            return pd.Series(dtype=object)

    try:
        addressnumber = int(addr_dict['AddressNumber'])
    except ValueError:
        # A non-numeric house number cannot be range-matched; treat it like
        # any other unparseable address rather than aborting the whole apply.
        return pd.Series(dtype=object)

    streetdir = addr_dict['StreetNamePreDirectional']
    streetname = addr_dict['StreetName']

    if streetname not in unique_mapstreets:
        streetname = get_closest(streetname, streetdir)
        addr_dict['StreetName'] = streetname

    if pd.isnull(streetname):
        # No known street resembles the parsed name, so there is nothing to look up.
        addr_dict['x'], addr_dict['y'] = np.nan, np.nan
    else:
        addr_dict['x'], addr_dict['y'] = get_mapgeo(streetdir, streetname, addressnumber)

    return pd.Series(addr_dict)

In [None]:
# Read only the ticket columns this notebook works with.
useful_cols = ['address', 'timestamp', 'ticket_type', 'ticket_id']
df_tickets = pd.read_csv("/opt/ramdisk/overstay.csv", usecols=useful_cols)

# Parse and geocode every address; row order and index follow df_tickets.
parsed_addrs = df_tickets['address'].apply(parse_addr)

# Distinct raw ticket types, captured before the column is rewritten below.
ticket_types = list(set(df_tickets['ticket_type']))

# NOTE(review): mask_ttype is not defined in the cells shown here -- confirm
# it is defined before this cell runs on a fresh kernel.
df_tickets['ticket_type'] = df_tickets['ticket_type'].apply(mask_ttype)

# Attach the parsed address parts and coordinates by index.
df_tickets = df_tickets.join(parsed_addrs, how='inner')

In [None]:
# Preview the first five joined rows.
df_tickets.head(n=5)

In [None]:
import matplotlib.pyplot as plt
import pickle as pkl
#pkl.dumps(df_tickets, open('/tmp/df_tickets','w'))

# Persist the geocoded tickets. Called without a path, to_csv() only returns
# the CSV text (and echoes it as cell output) instead of writing a file.
df_tickets.to_csv('/tmp/df_tickets.csv')

#plt.plot(xdata, ydata, kind="scatter")

#a = df_tickets.groupby(['x','y','ticket_type'])

##someplot = df_tickets.plot(kind='scatter', x=a.x, y=a.y)
#someplot.scatter(df_tickets.x, y=df_tickets.y)



#df_tickets.plot(x='x',y='y',z='ticket_type', projection='3d')

In [None]:
from geopandas import points_from_xy

# parse_addr stores each ticket's location as the 'x'/'y' centroid
# coordinates; no cell shown here creates a 'geometry' column on df_tickets.
# Build the points from those coordinates and skip tickets that could not be
# located.
located = df_tickets.dropna(subset=['x', 'y'])
geopoints = points_from_xy(located['x'], located['y'])
a = GeoDataFrame(geometry=geopoints)
a.plot()

In [None]:
# Number of tickets that ended up with a plotted point.
a.shape[0]