In [8]:
import array, math, os, psycopg2, random
from shapely.geometry import *
from shapely.wkb import loads
from multiprocessing import Pool
from multiprocessing.dummy import Pool as ThreadPool 

def LonLatToPixelXY(lonlat, scale = 1.):
    (lon, lat) = lonlat
    x = (lon + 180.0) * 256.0 / 360.0
    y = 128.0 - math.log(math.tan((lat + 90.0) * math.pi / 360.0)) * 128.0 / math.pi
    return [x*scale, y*scale]

def randomPoint(geom):
    poly = loads(geom, True)
    bbox = poly.bounds
    l,b,r,t = bbox
    while True:
        point = Point(random.uniform(l,r),random.uniform(t,b))
        if point is None:
            break
        if poly.contains(point):
            break
    return point.__geo_interface__['coordinates']

def split_list(alist, wanted_parts=1):
    length = len(alist)
    return [ alist[i*length // wanted_parts: (i+1)*length // wanted_parts] 
             for i in range(wanted_parts) ]

def my_range(start, end, step):
    while start <= end:
        yield start
        start += step

def get_count(conn, table):
    query = "SELECT count(*) FROM %s" % table
    cur = conn.cursor()
    cur.execute(query)
    rows = cur.fetchall()
    cur.close()
    return rows
    
def pack_color(color):
    return color['r'] + color['g'] * 256.0 + color['b'] * 256.0 * 256.0;

def unpack_color(f):
    b = math.floor(f / 256.0 / 256.0)
    g = math.floor((f - b * 256.0 * 256.0) / 256.0)
    r = math.floor(f - b * 256.0 * 256.0 - g * 256.0)
    return {'r':r,'g':g,'b':b}

se01_color = {'r':25, 'g':75, 'b':255}
se02_color = {'r':20, 'g':138, 'b':9}
se03_color = {'r':227, 'g':30, 'b':30}
ANSI_CODES = [
    ('01', 'al'), ('02', 'ak'), ('04', 'az'), ('05', 'ar'), ('06', 'ca'),
    ('08', 'co'), ('09', 'ct'), ('10', 'de'), ('11', 'dc'), ('12', 'fl'),
    ('13', 'ga'), ('15', 'hi'), ('16', 'id'), ('17', 'il'), ('18', 'in'),
    ('19', 'ia'), ('20', 'ks'), ('21', 'ky'), ('22', 'la'), ('23', 'me'),
    ('24', 'md'), ('25', 'ma'), ('26', 'mi'), ('27', 'mn'), ('28', 'ms'),
    ('29', 'mo'), ('30', 'mt'), ('31', 'ne'), ('32', 'nv'), ('33', 'nh'),
    ('34', 'nj'), ('35', 'nm'), ('36', 'ny'), ('37', 'nc'), ('38', 'nd'),
    ('39', 'oh'), ('40', 'ok'), ('41', 'or'), ('42', 'pa'), ('44', 'ri'),
    ('45', 'sc'), ('46', 'sd'), ('47', 'tn'), ('48', 'tx'), ('49', 'ut'),
    ('50', 'vt'), ('51', 'va'), ('53', 'wa'), ('54', 'wi'), ('55', 'wy'),
    ('56', 'wv')
]

In [9]:
# faster
# runs multiple queries to build rows
# then runs multiple processes to process rows

state = 'ak'
dest = '%s-od-jt00-2011.bin' % state
STEP = 100000 # run SQL query 100000 rows at a time
    
packed_se01_color = pack_color(se01_color)
packed_se02_color = pack_color(se02_color)
packed_se03_color = pack_color(se03_color)

# find how many rows total:
conn = psycopg2.connect(dbname="tl_2010_tabblock", host="/var/run/postgresql")
rows = get_count(conn, '%s_od_jt00_2011' % state)
TOTAL = rows[0][0]
print 'Total Rows: %s' % TOTAL
conn.close()

def get_rows(index):
    offset = index * STEP    
    conn = psycopg2.connect(dbname="tl_2010_tabblock", host="/var/run/postgresql")
    print "Batch start: %s. Batch limit: %s" % (offset, STEP)
    query = (
            "select w.geom, h.geom, od.se01, od.se02, od.se03 "
            "from %s_od_jt00_2011 od  "
            "left join tl_2010_tabblock10 w on od.w_geocode = w.geoid10 "
            "left join tl_2010_tabblock10 h on od.h_geocode = h.geoid10 "
            "order by od.gid "
            "limit %s "
            "offset %s "
            ) % (state, STEP, offset)
    cur = conn.cursor()
    cur.execute(query)
    rows = cur.fetchall()
    cur.close()
    conn.close()
    return rows

def process_row(row):
    data = []
    workGeom = row[0]
    homeGeom = row[1]
    se01 = row[2]
    se02 = row[3]
    se03 = row[4]
    for i in range(se01):
        wpoint = randomPoint(workGeom)
        hpoint = randomPoint(homeGeom)
        data += LonLatToPixelXY(wpoint)
        data += LonLatToPixelXY(hpoint)
        data.append(packed_se01_color)
    for i in range(se02):
        wpoint = randomPoint(workGeom)
        hpoint = randomPoint(homeGeom)
        data += LonLatToPixelXY(wpoint)
        data += LonLatToPixelXY(hpoint)
        data.append(packed_se02_color)
    for i in range(se03):
        wpoint = randomPoint(workGeom)
        hpoint = randomPoint(homeGeom)
        data += LonLatToPixelXY(wpoint)
        data += LonLatToPixelXY(hpoint)
        data.append(packed_se03_color)
        
    #print "Randomizing points..."
    split = split_list(data,len(data)/3)
    random.shuffle(split)        
    data = []
    for x in split:
        for y in x:
            data += [y]
    return data

        
print "Querying rows..."
index = range(0, TOTAL / STEP)
pool = Pool(23)
all_rows = pool.map(get_rows, index)
pool.close()
pool.join()

rows = [row for rows in all_rows for row in rows] # flatten results
print 'Resulting rows: %s ' % len(rows)

print "Processing rows..."
pool = Pool(23) # use 24 of available 32 cores. Let's not be greedy...
results = pool.map(process_row, rows)
pool.close()
pool.join()

print "Appending results to %s" % dest
# at this point we have an array of arrays (e.g. [[73.1, 96.8, 73.2, 96.7, 625172], [73.2, 96.7, 73.1, 96.8, 622372], ...])
results = [item for sublist in results for item in sublist] # flatten results
array.array('f', results).tofile(open(dest, 'aw'))


print "Process finished."

Total Rows: 215901
Querying rows...
Batch start: 0. Batch limit: 100000
Batch start: 100000. Batch limit: 100000
Resulting rows: 200000 
Processing rows...


TypeError: object of type 'NoneType' has no len()

In [2]:
%%HTML
<iframe src="index.html" width="1000" height="500">