In [1]:
# Root URL of the site whose search pages are scraped (handed to ONSScraper).
baseUrl = "http://localhost:20000/"

import urllib, urllib2
from bs4 import BeautifulSoup
from modules.ONS import utils as search_utils
reload(search_utils)

class ONSScraper(object):
    """Small scraper for search-result pages served under one base URL.

    Args:
        baseUrl: Root URL of the site. One trailing "/" is dropped, so the
            extensions given to getPage (appended verbatim) can start with "/".
    """

    def __init__(self, baseUrl):
        if baseUrl.endswith("/"):
            baseUrl = str(baseUrl)[:-1]
        self.baseUrl = baseUrl

    def getPage(self, extension):
        """Fetch ``baseUrl + extension`` and return it parsed by BeautifulSoup.

        Args:
            extension: Path (and query string) appended to the base URL.

        Returns:
            The BeautifulSoup document built from the HTTP response.

        Raises:
            Whatever urllib2.urlopen raises (e.g. HTTPError); nothing is
            caught here.
        """
        targetUrl = self.baseUrl + extension
        response = urllib2.urlopen(targetUrl)
        try:
            # BeautifulSoup consumes the file-like response in its constructor,
            # so the connection can be released as soon as parsing is done.
            return BeautifulSoup(response)
        finally:
            response.close()

    def searchResultsList(self, searchTerm, method, **kwargs):
        """Run a search and return the href of every result link, in page order.

        Args:
            searchTerm: Passed as the first positional argument to ``method``.
            method: Callable invoked as ``method(searchTerm, **kwargs)``; it
                must return an object with a ``select(cssSelector)`` method.
            **kwargs: Forwarded untouched to ``method``.

        Returns:
            A list with the ``href`` of the first ``<a>`` inside each
            ``li.search-results__item``, or None when the page does not hold
            exactly one ``div.search-results`` containing exactly one
            ``ul.list--neutral``.

        Raises:
            IndexError: if a result item contains no ``<a>`` element.
            KeyError: if that ``<a>`` element has no ``href`` attribute.
        """
        page = method(searchTerm, **kwargs)

        divElements = page.select("div.search-results")
        if len(divElements) != 1:
            return None

        ulElements = divElements[0].select("ul.list--neutral")
        if len(ulElements) != 1:
            return None

        liElements = ulElements[0].select("li.search-results__item")
        return [elem.select("a")[0].attrs["href"] for elem in liElements]

# Shared scraper instance bound to baseUrl.
onsScraper = ONSScraper(baseUrl)

In [2]:
from pymongo import MongoClient, ASCENDING, DESCENDING
# Connect to the MongoDB server on localhost:27017.
client = MongoClient('localhost', 27017)

# Judgement documents are read from the "judgements" collection of the "local" database.
db = client.local
collection = db.judgements

def mergeDicts(x, y):
    """Return a new dict holding x's entries overlaid with y's.

    Neither input is modified; where a key is in both, y's value wins.
    """
    merged = x.copy()
    merged.update(y)
    return merged

In [3]:
# Search queries that are evaluated against every model.
searchTerms = [
    "rpi", "gender pay gap", "cpi", "gdp", "inflation",
    "crime", "unemployment", "population", "immigration", "mental health",
    "london", "london population", "retail price index", "life expectancy", "obesity",
    "religion", "migration", "poverty", "social media", "employment",
]

# Model names: ons_model_1 .. ons_model_9, followed by "all".
models = ["ons_model_%d" % number for number in range(1, 10)]
models.append("all")

def termQuery(term):
    """Build the query dict that selects entries whose "term" field equals term."""
    query = {}
    query["term"] = term
    return query

def timeQuery(dateTime):
    """Build the query dict that selects entries whose "timeStamp" field equals dateTime."""
    query = {}
    query["timeStamp"] = dateTime
    return query

# Get date of most recent entries: sort by timeStamp descending and take the
# first document only.
latestCursor = collection.find().sort([("timeStamp", DESCENDING)]).limit(1)
doc = next(latestCursor, None)
if doc is None:
    # An empty collection used to surface as a bare StopIteration; fail with
    # a message that says what is actually missing.
    raise RuntimeError("judgements collection is empty; cannot determine latest timeStamp")
timeStamp = doc["timeStamp"]

# Filter reused below to restrict every lookup to that most recent batch.
timeStampQuery = timeQuery(timeStamp)
        

In [38]:
import numpy as np

# Highest relevance score a judgement can carry; the ideal ranking starts here.
MAX_SCORE = 4.0

def idealJudgement(num):
    """Return the ideal judgement scores for a result list of ``num`` items.

    The scores fall linearly from MAX_SCORE at the first position to 0.0 at
    the last one.

    Args:
        num: Number of positions (non-negative integer).

    Returns:
        A float numpy array of length ``num``. ``num == 0`` gives an empty
        array and ``num == 1`` gives ``[MAX_SCORE]`` (the previous
        subtract-in-a-loop version divided by zero for a single item).

    Raises:
        ValueError: if ``num`` is negative.
    """
    # linspace pins both endpoints exactly, so the last score is a true 0.0
    # instead of whatever rounding residue repeated subtraction leaves behind.
    return np.linspace(MAX_SCORE, 0.0, num)

def idealDiscountedCumulativeGain(num):
    """Return the cumulative discounted gain of the ideal ranking of ``num`` items.

    Entry k is the running sum, over positions 1..k+1, of the ideal judgement
    at that position divided by the position number.
    """
    idealGain = idealJudgement(num)
    positions = np.arange(1, num + 1, dtype=float)
    # cumsum accumulates left to right, matching a running-total loop.
    return np.cumsum(idealGain / positions)

class Judgements(object):
    """Relevance judgements for one query, with DCG / NDCG scoring.

    Attributes set by the constructor:
        qid: the query identifier given by the caller.
        judgements: the sequence of judgement dicts; the scoring methods read
            each dict's "judgement" and "rank" entries.
    """

    def __init__(self, qid, judgements):
        self.judgements = judgements
        self.qid = qid

    def dcg(self):
        """Return the running sum of judgement / rank, one entry per judgement.

        Entries follow the order in which the judgements are stored.
        """
        running = 0.0
        cumulative = []
        for entry in self.judgements:
            running += entry["judgement"] / float(entry["rank"])
            cumulative.append(running)
        return np.array(cumulative)

    def ndcg(self):
        """Return dcg() divided element-wise by the ideal DCG, capped at 1.0."""
        actual = self.dcg()
        ideal = idealDiscountedCumulativeGain(len(actual))

        normalised = np.zeros(len(actual))
        for position, (gain, best) in enumerate(zip(actual, ideal)):
            normalised[position] = min(1.0, gain / best)
        return normalised

    def __iter__(self):
        return iter(self.judgements)

    def __getitem__(self, i):
        return self.judgements[i]

    def __len__(self):
        return len(self.judgements)

    def remove(self, item):
        """Remove ``item`` from the underlying judgements sequence."""
        self.judgements.remove(item)

In [51]:
import copy, jwt
from urllib2 import HTTPError
reload(search_utils)

def getUriFromJwtToken(token, secret='secret'):
    """Decode an HS256 JWT and return the value stored under its "uri" key.

    Args:
        token: The encoded JWT string.
        secret: Key used to verify the token signature. Defaults to the
            value that was previously hard-coded here, so existing
            single-argument calls behave as before.

    Returns:
        The "uri" entry of the decoded payload.

    Raises:
        KeyError: if the decoded payload has no "uri" entry.
        Any error raised by jwt.decode for a token it rejects.
    """
    # NOTE(review): the default key is a literal in source; pass the real key
    # in from configuration if this is ever pointed at a non-local service.
    claims = jwt.decode(token, secret, algorithms=['HS256'])
    return claims["uri"]

# model name -> list of mean NDCG values, one per search term that produced
# more than one matched judgement for that model.
modelPerformanceDict = {}

RESULTS_PER_PAGE = 10  # sent as size= and used to turn (page, position) into a rank
MAX_PAGES = 9          # result pages 1..MAX_PAGES are requested per (term, model)

qid = 0
for searchTerm in searchTerms:
    qid += 1
    # Single-argument print(...) produces the same output under Python 2's
    # print statement and Python 3's print function.
    print("***********************************************")
    print(searchTerm)
    print("***********************************************")

    # Load the judgements stored for this term in the most recent batch.
    query = mergeDicts(termQuery(searchTerm), timeStampQuery)
    judgements = collection.find_one(query)
    if judgements is None:
        continue

    # Only read below (matches are deep-copied before their rank is set), so
    # one list can be shared by every model.
    judgementList = judgements["judgements"]["judgementList"]

    # Get search results for each model
    for model in models:
        keep = []
        for page in range(1, MAX_PAGES + 1):
            # Only the HTTP request is guarded: an HTTPError ends paging for
            # this model, any other failure still propagates.
            try:
                searchResults = onsScraper.searchResultsList(
                    searchTerm, search_utils.sltr,
                    model=model, size=RESULTS_PER_PAGE, page=page, verbose=False)
            except HTTPError:
                break

            # searchResultsList returns None when the page has no recognisable
            # result list; iterating None would raise TypeError, so stop here.
            if searchResults is None:
                break

            # Decoded uri -> zero-based position on this page (first hit wins,
            # the same answer list.index() would give).
            positionByUri = {}
            for position, searchResult in enumerate(searchResults):
                token = searchResult.replace("/redir/", "")
                positionByUri.setdefault(getUriFromJwtToken(token), position)

            # Scrape new ranks: keep a re-ranked copy of every judgement whose
            # uri appears on this page.
            # NOTE(review): matches are appended in judgementList order within
            # a page, not in result-rank order, and Judgements.dcg() cumulates
            # in list order -- confirm that this ordering is intended.
            for judgement in judgementList:
                url = judgement["attrs"]["uri"]
                if url in positionByUri:
                    j = copy.deepcopy(judgement)
                    j["rank"] = (page - 1) * RESULTS_PER_PAGE + positionByUri[url] + 1
                    keep.append(j)

        # A single match (or none) is not scored.
        if len(keep) > 1:
            meanNdcg = Judgements(qid, keep).ndcg().mean()
            print("%s %s" % (model, meanNdcg))
            modelPerformanceDict.setdefault(model, []).append(meanNdcg)
         

***********************************************
rpi
***********************************************
ons_model_1 0.00826294121339
ons_model_2 0.01337037047
ons_model_3 0.749187714201
ons_model_4 0.00157126824964
ons_model_5 0.00854230578143
ons_model_7 0.00833333339542
ons_model_8 0.00753233397734
ons_model_9 0.796919538895
all 0.795164537127
***********************************************
gender pay gap
***********************************************
ons_model_1 0.898208546214
ons_model_2 0.898208546214
ons_model_3 0.936789639785
ons_model_4 0.015909090879
ons_model_5 0.898208546214
ons_model_6 0.493020628081
ons_model_7 0.898208546214
ons_model_8 0.898208546214
ons_model_9 0.936249297645
all 0.936499219908
***********************************************
cpi
***********************************************
ons_model_3 0.896437847866
ons_model_6 0.016917773238
ons_model_9 0.896437847866
all 0.896331168831
***********************************************
gdp
*******************************

In [56]:
def printStats(modelPerformanceDict):
    """Print the mean NDCG of every model listed in the global ``models``.

    Args:
        modelPerformanceDict: Mapping of model name to a list of NDCG values.

    A model that has no entry in the mapping, or an empty list, is reported
    as having no scores instead of raising KeyError / ZeroDivisionError.
    """
    for model in models:
        entry = modelPerformanceDict.get(model)
        if not entry:
            print("model: %s, mean ndcg: n/a (no scores recorded)" % model)
            continue
        meanNdcg = float(sum(entry)) / float(len(entry))
        print("model: %s, mean ndcg: %f" % (model, meanNdcg))
    
# Report the mean NDCG per model from the scores gathered in modelPerformanceDict.
printStats(modelPerformanceDict)

model: all, mean ndcg: 0.638534
model: ons_model_6, mean ndcg: 0.177339
model: ons_model_7, mean ndcg: 0.333300
model: ons_model_4, mean ndcg: 0.142874
model: ons_model_5, mean ndcg: 0.336833
model: ons_model_2, mean ndcg: 0.326902
model: ons_model_3, mean ndcg: 0.714560
model: ons_model_1, mean ndcg: 0.344628
model: ons_model_8, mean ndcg: 0.303266
model: ons_model_9, mean ndcg: 0.638115
