Skip to content

Commit

Permalink
updated analysis. fixed to_utf bug
Browse files Browse the repository at this point in the history
  • Loading branch information
sirrice committed May 17, 2012
1 parent 8be0e46 commit 7ab200a
Show file tree
Hide file tree
Showing 13 changed files with 876 additions and 193 deletions.
1 change: 1 addition & 0 deletions README.md
Expand Up @@ -64,6 +64,7 @@ Immediate ToDos
* an HTML file may contain multiple tables to be imported
* ~~Support downloading URLS and HTML files~~
* ~~Support CSV output~~
* ~~Support Excel Files~~


## Comments
Expand Down
258 changes: 258 additions & 0 deletions dbtruck/analyze/fuzzyjoin.py
@@ -0,0 +1,258 @@
import csv
import os
import sys
import pdb
import traceback
sys.path.append(['..', '.', '../exporters'])

from geopy import geocoders
from collections import defaultdict

from load_data import *
from hidden import *
from dbtruck.exporters.db import *
import dbtruck.settings as settings




def get_join_candidates(db):#, t1, t2):
"""
Types of joins that make sense
Location
- same zoom level
- nearest neighbor with radius
- different zoom level
- within
- aggregate (avg, std, max, count)
Time
- same hour, same day, same day of week, same month, same day of month
Location: has shape file, no shape file
"""
tablestats = []
for table in get_hidden_table_names(db):
radius = compute_radius(db, table)
bshape = has_shape(db, table)
tablestats.append((table, radius, bshape))

res = []
for idx, (t1, r1, s1) in enumerate(tablestats):
for t2, r2, s2 in tablestats[idx+1:]:
print t1, t2
loc_join = run_join(db, t1, t2, r1, r2, s1, s2, 'loc_join')
agg_join = run_join(db, t1, t2, r1, r2, s1, s2, 'aggregate_loc_join')
res.append((t1, t2, loc_join, agg_join))
return res


def run_join(db, t1, t2, r1, r2, s1, s2, prefix):
if s1 and s2:
suffix = "3(db, t1, t2)"
elif s1:
suffix = "2(db, t2, t1, r2)"
elif s2:
suffix = "2(db, t1, t2, r1)"
else:
suffix = "1(db, t1, t2, r1, r2)"

cmd = "res = %s_%s" % (prefix, suffix)
exec(cmd)
return res





### Same zoom level

def loc_join_1(db, t1, t2, r1, r2):
# given radius r1, r2, find overlapping lat lons
t1cols, t2cols = get_numeric_columns(db, t1), get_numeric_columns(db, t2)
sels1 = map(lambda s: '%s.%s' % (t1, s), t1cols)
sels2 = map(lambda s: '%s.%s' % (t2, s), t2cols)
sels = sels1 + sels2
cols = [(t1, c) for c in t1cols] + [(t2, c) for c in t2cols]
q = """select %s from %s, %s where
circle(point(%s.latitude, %s.longitude), %%s) &&
circle(point(%s.latitude, %s.longitude), %%s)
""" % (','.join(sels), t1, t2, t1, t1, t2, t2)

res = query(db, q, (r1, r2))
return [dict(zip(cols, row)) for row in res]

def loc_join_2(db, t1, t2, r1):
# given radius r1, and t2 has shape file
t1cols, t2cols = get_numeric_columns(db, t1), get_numeric_columns(db, t2)
sels1 = map(lambda s: '%s.%s' % (t1, s), t1cols)
sels2 = map(lambda s: '%s.%s' % (t2, s), t2cols)
sels = sels1 + sels2
cols = [(t1, c) for c in t1cols] + [(t2, c) for c in t2cols]
q = """select %s from %s, %s where
circle(point(%s.latitude, %s.longitude), %%s) &&
%s.shape""" % (','.join(sels), t1, t2, t1, t1, t2)

res = query(db, q, (r1, ))
return [dict(zip(cols, row)) for row in res]

def loc_join_3(db, t1, t2):
# given both shape files
t1cols, t2cols = get_numeric_columns(db, t1), get_numeric_columns(db, t2)
sels1 = map(lambda s: '%s.%s' % (t1, s), t1cols)
sels2 = map(lambda s: '%s.%s' % (t2, s), t2cols)
sels = sels1 + sels2
cols = [(t1, c) for c in t1cols] + [(t2, c) for c in t2cols]
q = """select %s from %s, %s where
%s.shape && %s.shape""" % (','.join(sels), t1, t2, t1, t2)

res = query(db, q)
return [dict(zip(cols, row)) for row in res]


### Different zoom level -- t1 contains t2

def check_agg_loc_join_1(db, t1, t2, r1, r2):
# given radius r1, r2
q = """select %s.id, count(*) from %s, %s where
circle (point(%s.latitude, %s.longitude), %%s) ~
circle (point(%s.latitude, %s.longitude), %%s)
group by %s.id""" % (t1, t1, t2, t1, t1, t2, t2, t1)
return query(db, q, (r1, r2))

def check_agg_loc_join_2(db, t1, t2, r1):
# given radius r1, t2.shape
q = """select %s.id, count(*) from %s, %s where
circle (point(%s.latitude, %s.longitude), %%s) ~ %s.shape
group by %s.id""" % (t1, t1, t2, t1, t1, t2, t1)
return query(db, q, (r1,))

def check_agg_loc_join_3(db, t1, t2):
# t1.shape, t2.shape
q = """select %s.id, count(*) from %s, %s where
%s.shape ~ %s.shape
group by %s.id""" % (t1, t1, t2, t1, t2, t1)
return query(db, q)


def get_numeric_columns(db, table):
data_types = ['smallint', 'integer', 'bigint', 'decimal', 'numeric', 'real', 'double precision']
args = ','.join(['%s'] * len(data_types))
q = """select column_name
from information_schema.columns
where table_name = %%s and column_name != 'latitude' and
column_name != 'longitude' and data_type in (%s)""" % (args)
cols = query(db, q, tuple([table] + data_types))
cols = [c[0] for c in cols]
return cols


def aggregate_loc_join_1(db, t1, t2, r1, r2):
# given radius r1, r2
aggs = ['stddev', 'avg', 'min', 'max', 'count', 'sum']
t1cols = get_numeric_columns(db, t1)
t2cols = get_numeric_columns(db, t2)
sels1 = ['%s.%s' % (t1, c) for c in t1cols]
sels2 = ['%s(t2.%s) as %s_%s' % (agg, c, c, agg)
for c in t2cols for agg in aggs]
sels = [(t1, c) for c in t1cols] + [(t2, c, agg) for c in t2cols for agg in aggs]

subq = """select %s.id, %s
from %s, %s as t2
where circle(point(%s.latitude, %s.longitude), %%s) &&
circle(point(t2.latitude, t2.longitude), %%s)
group by %s.id""" % (t1, ','.join(sels2), t1, t2, t1, t1, t1)

q = """select %s, subq.* from %s, (%s) as subq
where subq.id = %s.id;""" % (','.join(sels1), t1, subq, t1)

res = query(db, q, (r1, r2))
return [dict(zip(sels, row)) for row in res]


def aggregate_loc_join_2(db, t1, t2, r1):
# given radius r1, r2
aggs = ['stddev', 'avg', 'min', 'max', 'count', 'sum']
t1cols = get_numeric_columns(db, t1)
t2cols = get_numeric_columns(db, t2)
sels1 = ['%s.%s' % (t1, c) for c in t1cols]
sels2 = ['%s(t2.%s) as %s_%s' % (agg, c, c, agg)
for c in t2cols for agg in aggs]
sels = [(t1, c) for c in t1cols] + [(t2, c, agg) for c in t2cols for agg in aggs]

subq = """select %s.id, %s
from %s, %s as t2
where circle(point(%s.latitude, %s.longitude), %%s) &&
t2.shape
group by %s.id""" % (t1, ','.join(sels2), t1, t2, t1, t1, t1)

q = """select %s, subq.* from %s, (%s) as subq
where subq.id = %s.id;""" % (','.join(sels1), t1, subq, t1)

res = query(db, q, (r1,))
return [dict(zip(sels, row)) for row in res]


def aggregate_loc_join_3(db, t1, t2):
# given radius r1, r2
aggs = ['stddev', 'avg', 'min', 'max', 'count', 'sum']
t1cols = get_numeric_columns(db, t1)
t2cols = get_numeric_columns(db, t2)
sels1 = ['%s.%s' % (t1, c) for c in t1cols]
sels2 = ['%s(t2.%s) as %s_%s' % (agg, c, c, agg)
for c in t2cols for agg in aggs]
sels = [(t1, c) for c in t1cols] + [(t2, c, agg) for c in t2cols for agg in aggs]

subq = """select %s.id, %s
from %s, %s as t2
where t1.shape && t2.shape
group by %s.id""" % (t1, ','.join(sels2), t1, t2, t1)

q = """select %s, subq.* from %s, (%s) as subq
where subq.id = %s.id;""" % (','.join(sels1), t1, subq, t1)

res = query(db, q)
return [dict(zip(sels, row)) for row in res]



def compute_radius(db, table):
"""Computing radius sizes for a table"""
# ath.erf(0.063 / (2 ** 0.5))
# 5% of distribution falls within 0.063 * stddev
q = """select stddev(latitude), stddev(longitude)
from %s
where latitude is not null and longitude is not null""" % table
latstd, lonstd = query(db, q)[0]
return 0.063 * (latstd + lonstd) / 2.

def has_shape(db, table):
try:
q = "select count(*) from %s where shape is not null" % table
return query(db, q)[0][0] > 0
except:
return False



if __name__ == '__main__':

db = connect('test')
t1, t2 = 'test2', 'test1'
r1, r2 = 40, 5
# print loc_join_1(db, t1, t2, r1, r2)
# print loc_join_2(db, t1, t2, r1)
# print loc_join_3(db, t1, t2)
# print check_agg_loc_join_1(db, t1, t2, r1, r2)
# print check_agg_loc_join_2(db, t1, t2, r1)
# print check_agg_loc_join_3(db, t1, t2)
# print aggregate_loc_join_1(db, t1, t2, r1, r2)
# print aggregate_loc_join_2(db, t1, t2, r1)
# print aggregate_loc_join_3(db, t1, t2)
# print compute_radius(db, t1)
# print compute_radius(db, t2)
ress = get_join_candidates(db)
for res in ress:
print res[:2]
print res[2][0].values()
print res[3][0].values()
db.close()
91 changes: 91 additions & 0 deletions dbtruck/analyze/hidden.py
@@ -0,0 +1,91 @@
import csv
import os
import sys
import pdb
import traceback
sys.path.append(['..', '.', '../exporters'])

from geopy import geocoders
from collections import defaultdict

from load_data import *
from dbtruck.exporters.db import *
import dbtruck.settings as settings



def create_hidden_table(db, table):
loctable = '_%s_loc_' % table
try:
query(db, 'select count(*) from %s' % loctable)[0]
return loctable
except:
try:


# create if it doesn't exist
q = """create table %s (
id int references %s(id),
latitude float null,
longitude float null,
address text null,
city varchar(128) null,
state varchar(128) null,
country varchar(128) null,
query text null,
description text null,
zoom float null,
shape polygon null
)""" % (loctable, table)
prepare(db, q, bexit=False)

q = 'create index %s_id on %s (id);' % (loctable, loctable)
prepare(db, q, bexit=False)
return loctable
except:
traceback.print_exc()
return None

def get_table_cols(db, table):
cols = query(db, """select column_name from
information_schema.columns where table_name = '%s' order by
ordinal_position asc""" % table)
return [c[0] for c in cols]


def get_all_table_names(db):
try:
tables = query(db, """select table_name from
information_schema.tables where table_schema = 'public' and
table_type = 'BASE TABLE';""")
return [row[0] for row in tables]
except:
return []

def get_hidden_table_names(db):
for regtable, loctable in get_table_name_pairs(db):
if regtable and loctable:
yield loctable

def get_regular_table_names(db):
for regtable, loctable in get_table_name_pairs(db):
yield regtable

def get_table_name_pairs(db):
tables = get_all_table_names(db)
loctables, regtables = [], []
for table in tables:
if table.startswith('_') and table.endswith('_loc_'):
loctables.append(table)
else:
regtables.append(table)

pairs = []
for regtable in regtables:
if ('_%s_loc_' % regtable) in loctables:
pairs.append((regtable, '_%s_loc_' % regtable))
else:
pairs.append((regtable, None))
return pairs


0 comments on commit 7ab200a

Please sign in to comment.