In [141]:
import OSGridConverter #To convert from =SGB36 to WGS84
import pandas as pd #To use pandas for elegant data handling
import numpy as np
import math

class SpatialIndex:
    
    def __init__(self, resolution):
        geograph = pd.read_csv('./data/geograph_mini_corpus.csv', encoding='latin-1')
        #First we need to reproject the data so that we have them in the correct projection
        
        sample = geograph.sample(n = 10000) # For testing
        
        sample.dropna() # Get rid of problematic rows with nas
        
        for i in sample.index:
            try:
                g = OSGridConverter.latlong2grid (sample.at[i, 'lat'], sample.at[i, 'lon'], tag = 'WGS84')
                sample.at[i, 'x'] = g.E
                sample.at[i, 'y'] = g.N
            except ValueError:
                #print("Problem with a document", sample.at[i,'id'])
                sample = sample.drop(i)

        # Now we can set up the parameters for our index        
        self.resolution = resolution

        self.minx = sample['x'].min()
        self.maxx = sample['x'].max()
        self.miny = sample['y'].min()
        self.maxy = sample['y'].max()

        w = self.maxx - self.minx
        h = self.maxy - self.miny

        nc = int(w/self.resolution) + 1
        nr = int(h/self.resolution) + 1

        #print(maxx, minx, maxy, miny)
        #print(nr, nc)

        #Build the spatial index now
        self.spatialIndex = pd.DataFrame(index=range(nc),columns=range(nr))

        #Now we populate the index with document ids
        for index, row in sample.iterrows():
            i = int((row['x'] - self.minx)/self.resolution)
            j = int((row['y'] - self.miny)/self.resolution)
            id = row['id']
    
            #print(row['id'])
            #print(row['x'],row['y'],i,j)
            if pd.isnull(self.spatialIndex.at[i,j]):
                self.spatialIndex.at[i,j] = {id:(row['x'],row['y'])}
            else:
                names = self.spatialIndex.at[i,j]
                names.update({id:(row['x'],row['y'])})
                self.spatialIndex.at[i,j] = names

        
    def rangeQuery(self, dist, point):
        x1 = point[0] - dist/2
        x2 = point[0] + dist/2
        y1 = point[1] - dist/2
        y2 = point[1] + dist/2
    
        i1 = int((x1 - self.minx)/self.resolution)
        j1 = int((y1 - self.miny)/self.resolution)
        i2 = int((x2 - self.minx)/self.resolution) + 1
        j2 = int((y2 - self.miny)/self.resolution) + 1

        # Retrieve only the relevant part of the index
        result = self.spatialIndex.iloc[j1:j2, i1:i2]
        # Turn the data frame into a 1d list
        tlist = result.values.flatten()
        # Remove all the nans
        filtered = filter(lambda i:not(type(i) is float), tlist)
        
        #Rank by distance
        ranked = []
        for item in filtered:
            for key in item:
                d = si.dist(point, item[key])
                #print(key, item[key], dist)
                ranked.append((key, d/dist))    
        ranked.sort(key = lambda x: x[1]) # Sort by rank - values are normalised between 0 and 1
        
        return ranked
    
    def dist(self, p1, p2):
        #print(p1[0], p1[1], p2[0], p2[1])
        dist = (((p1[0] - p2[0]) ** 2) + ((p1[1] - p2[1]) ** 2)) ** 0.5
        #print(dist)
        return dist

In [142]:
si = SpatialIndex(10000)

In [143]:
test = si.rangeQuery(100000, (164500,374500))
print(test)

[(468943, 0.5778278117224889), (1438140, 0.5959704238466872), (2314638, 0.6066402970459513), (404569, 0.6074849216235741), (865328, 0.635026023088818), (519511, 0.6553544460824234), (1267123, 0.6687383045706295), (1294612, 0.6837296513242642), (1059608, 0.6845898041893408), (2122130, 0.6870489182729277), (606008, 0.688472294286415), (2483509, 0.6938398230139289), (1200265, 0.7018782501972832), (132974, 0.7025510657596357), (2009756, 0.7026800836796216), (1903705, 0.7034965529410929), (1221696, 0.7079896962527068), (2054653, 0.7117238580376521), (2141139, 0.7123270667326913), (1023817, 0.7127463139855582), (665671, 0.7143261160002482), (2768787, 0.7191507937143643), (509484, 0.7198601253021312), (1286440, 0.7260365594100616), (1268706, 0.7328243241050341), (1935407, 0.7373670185192717), (997474, 0.7374032817936194), (2103754, 0.7510833242723473), (1335030, 0.7515087052722677), (1182903, 0.7524662371827722), (2121508, 0.7594703746690847), (468193, 0.7616308082660522), (2222894, 0.7621458

In [106]:
tlist = test.values.flatten()


In [124]:
#for item in tlist:
    #print(type(item), item)
    
filtered = filter(lambda i:not(type(i) is float), tlist)


In [125]:
ranked = []
for item in filtered:
    for key in item:
        dist = si.dist((164500,374500), item[key])
        print(key, item[key], dist)
        ranked.append((key, dist))    
ranked.sort(key = lambda x: x[1])
print(ranked)

164500 374500 220650.0 382150.0
56668.730354579144
2606571 (220650.0, 382150.0) 56668.730354579144
164500 374500 221689.0 381370.0
57600.1616403982
903327 (221689.0, 381370.0) 57600.1616403982
164500 374500 229459.0 328040.0
79863.65431784348
641312 (229459.0, 328040.0) 79863.65431784348
164500 374500 228189.0 327480.0
79165.45408825747
640120 (228189.0, 327480.0) 79165.45408825747
164500 374500 223710.0 329950.0
74098.08769462272
621806 (223710.0, 329950.0) 74098.08769462272
164500 374500 222579.0 331460.0
72288.39354280879
2204228 (222579.0, 331460.0) 72288.39354280879
164500 374500 228660.0 338160.0
73736.70185192717
1935407 (228660.0, 338160.0) 73736.70185192717
164500 374500 231499.0 338800.0
75916.77022239553
228546 (231499.0, 338800.0) 75916.77022239553
164500 374500 222959.0 331270.0
72706.86061851385
2207982 (222959.0, 331270.0) 72706.86061851385
164500 374500 224379.0 334780.0
71855.22278164615
2009108 (224379.0, 334780.0) 71855.22278164615
164500 374500 222860.0 331230.0
726