In [4]:
import OSGridConverter #To convert from =SGB36 to WGS84
import pandas as pd #To use pandas for elegant data handling
import numpy as np
import math

In [5]:
geograph = pd.read_csv('./data/geograph_mini_corpus.csv', encoding='latin-1')
        
sample = geograph.sample(n = 10000) # For testing

si = SpatialIndex(10000, sample)
postings = Postings(True, sample)
gaz = Gazetteer()

In [6]:
location = gaz.getLocation('Ben Nevis')
print(location)

['Ben Nevis' 771500 216500 'Highland' 'H']
(216500, 771500, 216250, 771250, 216750, 771750)


In [28]:
maxDist = 100000
docsSpatial = si.rangeQuery(maxDist, (location[0],location[1]))
docsSpatial

{1055018: 500794.3406639097,
 574101: 502740.85889253125,
 682910: 503352.03963131015,
 233904: 507179.0618716431,
 3174881: 511904.2510284516,
 232407: 515467.74234766624,
 3087618: 515802.40592789015,
 1263239: 522333.4856593056,
 235009: 525111.7709602404,
 669745: 530595.9291975392,
 833091: 555284.9011462494,
 833016: 555629.0006479144,
 949141: 558172.3765477829,
 1329739: 558498.6379580527,
 11734: 558804.733337147,
 489867: 559513.7133261347,
 1484005: 562460.4626993794,
 1483519: 568337.2928992079,
 3145200: 569120.7880300982,
 804905: 571929.4055921587,
 2667472: 573317.9257105084,
 793564: 576520.5643522181,
 3146058: 583429.9365819345,
 1217735: 587403.3517949996,
 269599: 587637.7285368937,
 2872330: 590154.9953029288}

In [19]:
docsThematic = postings.tfIdf('hill mountain summit')
maxThematic = next(iter(docsThematic.items()))[1]
print(maxThematic)

3.447076391653512


In [32]:
candidates = set(list(docsSpatial.keys()))

scores = dict()

for doc in candidates:
    st = 0
    ss = 0
    if doc in docsThematic:
        st = docsThematic[doc]/ maxThematic
        print(f'thematic {st}')
    if doc in docsSpatial:
        ss = docsSpatial[doc]
        print(f'spatial {ss}')
    

spatial 525111.7709602404
spatial 515802.40592789015
spatial 522333.4856593056
spatial 590154.9953029288
spatial 559513.7133261347
spatial 502740.85889253125
spatial 558172.3765477829
spatial 503352.03963131015
spatial 587637.7285368937
spatial 571929.4055921587
spatial 500794.3406639097
spatial 507179.0618716431
spatial 530595.9291975392
spatial 555284.9011462494
spatial 587403.3517949996
spatial 583429.9365819345
spatial 558498.6379580527
spatial 573317.9257105084
spatial 558804.733337147
spatial 515467.74234766624
spatial 576520.5643522181
spatial 511904.2510284516
spatial 562460.4626993794
spatial 569120.7880300982
spatial 555629.0006479144
spatial 568337.2928992079


In [33]:
class SpatialIndex:
    
    def __init__(self, resolution, sample):
        
        sample.dropna() # Get rid of problematic rows with nas
        
        for i in sample.index:
            try:
                g = OSGridConverter.latlong2grid (sample.at[i, 'lat'], sample.at[i, 'lon'], tag = 'WGS84')
                sample.at[i, 'x'] = g.E
                sample.at[i, 'y'] = g.N
            except ValueError:
                #print("Problem with a document", sample.at[i,'id'])
                sample = sample.drop(i)

        # Now we can set up the parameters for our index        
        self.resolution = resolution

        self.minx = sample['x'].min()
        self.maxx = sample['x'].max()
        self.miny = sample['y'].min()
        self.maxy = sample['y'].max()

        w = self.maxx - self.minx
        h = self.maxy - self.miny

        nc = int(w/self.resolution) + 1
        nr = int(h/self.resolution) + 1

        #print(maxx, minx, maxy, miny)
        #print(nr, nc)

        #Build the spatial index now
        self.spatialIndex = pd.DataFrame(index=range(nc),columns=range(nr))

        #Now we populate the index with document ids
        for index, row in sample.iterrows():
            i = int((row['x'] - self.minx)/self.resolution)
            j = int((row['y'] - self.miny)/self.resolution)
            id = row['id']
    
            #print(row['id'])
            #print(row['x'],row['y'],i,j)
            if pd.isnull(self.spatialIndex.at[i,j]):
                self.spatialIndex.at[i,j] = {id:(row['x'],row['y'])}
            else:
                names = self.spatialIndex.at[i,j]
                names.update({id:(row['x'],row['y'])})
                self.spatialIndex.at[i,j] = names

        
    def rangeQuery(self, dist, point):
        print(dist)
        x1 = point[0] - dist/2
        x2 = point[0] + dist/2
        y1 = point[1] - dist/2
        y2 = point[1] + dist/2
    
        i1 = int((x1 - self.minx)/self.resolution)
        j1 = int((y1 - self.miny)/self.resolution)
        i2 = int((x2 - self.minx)/self.resolution) + 1
        j2 = int((y2 - self.miny)/self.resolution) + 1

        # Retrieve only the relevant part of the index
        result = self.spatialIndex.iloc[j1:j2, i1:i2]
        # Turn the data frame into a 1d list
        tlist = result.values.flatten()
        # Remove all the nans
        filtered = filter(lambda i:not(type(i) is float), tlist)
        
        #Rank by distance
        ranked = {}
        for item in filtered:
            for key in item:
                d = si.dist(point, item[key])
                #print(key, item[key], dist)
                ranked[key] = d    
        ranked = dict(sorted(ranked.items(), key = lambda x: x[1], reverse=False))
                
        return ranked
    
    def dist(self, p1, p2):
        #print(p1[0], p1[1], p2[0], p2[1])
        dist = (((p1[0] - p2[0]) ** 2) + ((p1[1] - p2[1]) ** 2)) ** 0.5
        #print(dist)
        return dist

In [10]:
si.dist((3,3),(4,4))

1.4142135623730951

In [2]:
import pandas as pd #To use pandas for elegant data handling
import spacy #Our NLP tools
import math

class Postings:
    
    def __init__(self, firstMondayTerms, sample):
        #Load a language model to do NLP
        self.nlp = spacy.load("en_core_web_md")
        self.ndocs = len(sample)
                
        # firstMonday works like an inverse stop list, and we only use words in these lists for our posting file
        if firstMondayTerms:
            list = {}
            elements = set(pd.read_csv('./data/elements.txt', header=None)[0])
            qualities = set(pd.read_csv('./data/qualities.txt', header=None)[0])
            activities = set(pd.read_csv('./data/activities.txt', header=None)[0])

            terms = elements.union(qualities).union(activities)
            lemmas = ' '.join(str(e) for e in terms)

            doc = self.nlp(lemmas)
            terms = set()
            for token in doc:
                terms.add(token.lemma_)
                
            # Now we process our corpus and create a postings file
            docs = self.nlp.pipe(sample.text,n_process=2, batch_size=100)

            self.postings = dict()

            for (idxRow, s1), (_, s2) in zip(sample.iterrows(), enumerate(docs)):
                id = s1.id
                for token in s2:
                    lemma = token.lemma_
                    if lemma in terms:

                        if lemma in self.postings:
                            tf = self.postings[lemma]
                            if id in tf:
                                tf[id] = tf[id] + 1
                            else:
                                tf[id] = 1
                        else:
                            tf = {id: 1}
                        self.postings[lemma] = tf
                        
    def tfIdf(self, query):
        results = {}
        qdoc = self.nlp(query)
        for token in qdoc:
            qt = token.lemma_
            if qt in self.postings:
                dc = len(self.postings[qt])
                idf = math.log10(self.ndocs/(dc + 1))
                for doc in self.postings[qt]:
                    tf = self.postings[qt][doc]
                    tfidf = tf * idf
                    if doc in results:
                        score = results[doc]
                        results[doc] = tfidf + score
                    else:
                        results[doc] = tfidf
        results = dict(sorted(results.items(), key = lambda x: x[1], reverse=True))
        
        return results

In [3]:
import pandas as pd #To use pandas for elegant data handling
# Feature codes in gazetteer are as follows:
# A Antiquity (non-Roman)
# F Forest or wood
# FM Farm
# H Hill or mountain
# R Antiquity (Roman)
# C City
# T Town
# O Other
# W Water feature
# X All other features

class Gazetteer:
    
    def __init__(self):
        self.gaz = dict()
        self.offset = {'C': 2000, 'T':500, 'H':250, 'F':500}
        # Read in gazetteer data
        os_50k = pd.read_csv('./data/50kgaz2012.txt',sep=':', encoding='utf8', header=None)
        os_trimmed = os_50k.drop([0,1,3,4,5,6,7,10,11,12,15,16,17,18,19], axis = 1)
        os_trimmed.columns = ['name','y','x','county','type']
        for index, row in os_trimmed.iterrows():
            name = row['name']
            entry = os_trimmed.iloc[index].values 
            # Store gazetteer in a dictionary of unique names
            if name in self.gaz:
                entries = self.gaz[name]
                entries.append(entry)
                self.gaz[name] = entries
            else:
                self.gaz[name] = [entry]
            
    def getLocation(self, name):
        if (name in self.gaz) == False:
            return('Name not found in gazetteer')

        if len(self.gaz[name]) > 1:
            # We let the user disambiguate
            i = 0
            print("This place name is ambigous - choose an entry")
            for entry in self.gaz[name]:
                print(f'{i}: {name}, {entry[3]}')
                i = i + 1
            index = int(input("Choose a value:"))
            entry = self.gaz[name][index]
        else:
            entry = self.gaz[name][0]
            
        print(entry)
        x = entry[2]
        y = entry[1]
            
        if entry[4] in self.offset:
            diff = self.offset[entry[4]]
            return (x,y,x-diff, y-diff, x + diff, y + diff)
        else:
            return(x,y)
                                    
    def gazDump(self):
        for name in self.gaz:
            print(name)
            print(self.gaz[name])
            