In [43]:
from modules.ONS import queries
reload(queries)

# Elasticsearch index that every query in this notebook is run against.
index = "ons1515656903908"

# Model names ons_model_1 .. ons_model_9, in the order their weights are listed.
models = []
for i in range(1, 10):
    modelName = "ons_model_%d" % i
    models += [modelName]

# Default per-model settings: one entry per model, aligned with `models`.
boosts = len(models) * [1.0]
queryWeights = len(models) * [0.5]
rescoreWeights = len(models) * [1.0]

# Number of hits requested per result page.
size = 10

# Search terms that have relevance judgements to evaluate against.
searchTerms = [
    "rpi", "gender pay gap", "cpi", "gdp", "inflation",
    "crime", "unemployment", "population", "immigration", "mental health",
    "london", "london population", "retail price index", "life expectancy", "obesity",
    "religion", "migration", "poverty", "social media", "employment",
]

# 1-based query id -> search term.
qidDict = dict(enumerate(searchTerms, 1))

In [2]:
from elasticsearch import Elasticsearch

# Elasticsearch endpoint used for every search issued from this notebook.
esUrl = "http://localhost:9200"
# NOTE(review): timeout=1000 is handed straight to the client constructor;
# presumably a deliberately generous per-request timeout — confirm the unit
# against the installed elasticsearch-py version.
esClient = Elasticsearch(esUrl, timeout=1000)

from pymongo import MongoClient, ASCENDING, DESCENDING
# MongoDB instance on this machine.
mongoClient = MongoClient(host='localhost', port=27017)

# Judgement documents are read from the `judgements` collection of the `local` database.
db = mongoClient["local"]
collection = db["judgements"]

def mergeDicts(x, y):
    """Return a copy of ``x`` overlaid with the entries of ``y``.

    Neither argument is modified. When both contain the same key, the value
    from ``y`` wins.
    """
    merged = x.copy()
    merged.update(y)  # update() works in place, so the copy is what gets returned
    return merged

def termQuery(term):
    """Build the MongoDB filter that matches documents whose ``term`` equals ``term``."""
    return dict(term=term)

def timeQuery(dateTime):
    """Build the MongoDB filter that matches documents whose ``timeStamp`` equals ``dateTime``."""
    return dict(timeStamp=dateTime)

# Take the timestamp of the newest judgement document; later lookups are
# restricted to documents carrying exactly this timestamp.
doc = next(collection.find().sort("timeStamp", DESCENDING).limit(1))
timeStamp = doc["timeStamp"]

timeStampQuery = timeQuery(timeStamp)

In [63]:
import numpy as np

# Highest judgement score; the ideal ranking starts at this value.
MAX_SCORE = 4.0

def idealJudgement(num):
    """Return the ideal judgement score for each of ``num`` ranked positions.

    The scores fall linearly from ``MAX_SCORE`` at the first position to
    exactly ``0.0`` at the last one.

    Args:
        num: number of positions (non-negative integer).

    Returns:
        A float numpy array of length ``num``. ``num == 1`` yields
        ``[MAX_SCORE]`` and ``num == 0`` yields an empty array.
    """
    # linspace replaces the previous hand-rolled countdown, which divided by
    # (num - 1) and therefore failed for num == 1, and which could leave a
    # rounding residue instead of 0.0 in the final slot.
    return np.linspace(MAX_SCORE, 0.0, num)

def idealDiscountedCumulativeGain(num):
    """Return the cumulative discounted gain of the ideal ranking.

    Entry ``k`` is the sum, over the first ``k + 1`` positions, of the ideal
    judgement at that position divided by its 1-based position.
    """
    gains = idealJudgement(num)
    cumulative = np.zeros(num)

    running = 0.0
    for position, gain in enumerate(gains, 1):
        running += gain / float(position)
        cumulative[position - 1] = running
    return cumulative

class Judgements(object):
    """List-like wrapper around judgement dicts that can score itself.

    Each judgement is read as a mapping with a numeric ``"judgement"`` score
    and a ``"rank"``; iteration, indexing, ``len`` and ``remove`` are passed
    through to the wrapped list.
    """

    def __init__(self, judgements):
        self.judgements = judgements

    def dcg(self):
        """Return the running sum of ``judgement / rank`` in list order.

        The result is a numpy array with one entry per wrapped judgement.
        """
        running = 0.0
        cumulative = []
        for entry in self.judgements:
            running += entry["judgement"] / float(entry["rank"])
            cumulative.append(running)
        return np.array(cumulative)

    def ndcg(self):
        """Return ``dcg()`` divided position-wise by the ideal DCG, capped at 1.0."""
        actual = self.dcg()
        ideal = idealDiscountedCumulativeGain(len(actual))

        normalised = np.zeros(len(actual))
        for pos, (gain, best) in enumerate(zip(actual, ideal)):
            normalised[pos] = min(1.0, gain / best)
        return normalised

    def __iter__(self):
        return iter(self.judgements)

    def __getitem__(self, i):
        return self.judgements[i]

    def __len__(self):
        return len(self.judgements)

    def remove(self, item):
        """Remove ``item`` from the wrapped list (modifies it in place)."""
        self.judgements.remove(item)

In [64]:
import copy, json

def processTerm(searchTerm, boosts, queryWeights, rescoreWeights, pages=10):
    """Score one search term against its stored judgements for the given weights.

    The judgement document for ``searchTerm`` at the latest timestamp is read
    from MongoDB. The term is then searched in Elasticsearch page by page,
    using rescore queries built from the weights, and every judged URI that
    appears among the hits is given a rank from its position in the results.

    Args:
        searchTerm: query text; also the ``term`` value of the judgement document.
        boosts: passed unchanged to ``queries.getRescoreQueriesForModels``.
        queryWeights: passed unchanged to ``queries.getRescoreQueriesForModels``.
        rescoreWeights: passed unchanged to ``queries.getRescoreQueriesForModels``.
        pages: number of result pages (``size`` hits each) to scan.

    Returns:
        The mean of the NDCG curve when at least two judged URIs were found.
        ``0.0`` when no judgement document exists for the term.
        ``None`` when a document exists but fewer than two of its URIs were found.
    """
    mongoQuery = mergeDicts(termQuery(searchTerm), timeStampQuery)
    # Fetch the first match directly rather than calling cursor.count() first:
    # count() costs an extra round trip and is gone from newer PyMongo releases.
    judgements = next(collection.find(mongoQuery), None)
    if judgements is None:
        return 0.0

    judgementList = judgements["judgements"]["judgementList"]
    rescoreQueries = queries.getRescoreQueriesForModels(searchTerm, models, boosts, queryWeights, rescoreWeights)

    keep = []
    for page in range(1, pages + 1):
        fromParam = (page - 1) * size
        esQuery = queries.getBaseQuery(searchTerm, rescoreQueries, fromParam, size)
        hits = esClient.search(index=index, body=esQuery)

        # First position of each document id on this page.
        positions = {}
        for position, hit in enumerate(hits["hits"]["hits"]):
            positions.setdefault(hit["_id"], position)

        for judgement in judgementList:
            url = judgement["attrs"]["uri"]
            if url in positions:
                # Copy before setting the rank, so a URI matched again on a
                # later page does not overwrite the entry already kept.
                ranked = copy.deepcopy(judgement)
                # Derive the rank from the same offset used for paging, so it
                # stays correct if `size` is changed from 10.
                ranked["rank"] = fromParam + positions[url] + 1
                keep.append(ranked)

    # NOTE(review): `keep` is ordered by page and then by judgement-list order,
    # not by rank within a page; the cumulative DCG depends on that order.
    # Confirm whether sorting by rank is intended.
    if len(keep) > 1:
        return Judgements(keep).ndcg().mean()

    # Fewer than two matches: no NDCG curve is computed. Callers check for None.
    return None
        

In [65]:
# Smoke test: score the default weights for a single search term.
processTerm("rpi", boosts, queryWeights, rescoreWeights)
# NOTE(review): the commented-out loop below passes (qid, term, weights), which
# does not match the processTerm signature defined in this notebook; it is stale.
# for qid in qidDict:
#     processTerm(qid, qidDict[qid], weights)

0.75368310310807696

In [13]:
# import random
# import numpy as np

# X = []
# for i in range(10):
#     W = np.zeros(len(models))
#     for j in range(len(models)):
#         W[j] = random.random()
#     X.append(W)

In [67]:
import random
reload(queries)

# Search space: one (low, high) bound per model. Each sampled point is passed
# to processTerm as the rescore weights.
X = len(models) * [(0.0, 100.0)]

# Alternative space that would also tune the query weights (not in use):
# X = [(0.0, 1.0)]*len(models)
# X.extend([(0.0, 100.0)]*len(models))
def fn(X):
    """Objective to minimise: 1 - NDCG for one randomly chosen search term.

    ``X`` is the candidate list of rescore weights. The global ``boosts`` and
    ``queryWeights`` are used unchanged. A term for which processTerm returns
    ``None`` counts as the worst score, 1.0.
    """
    score = processTerm(random.choice(searchTerms), boosts, queryWeights, X)
    return 1.0 if score is None else 1.0 - score
print(X)

[(0.0, 100.0), (0.0, 100.0), (0.0, 100.0), (0.0, 100.0), (0.0, 100.0), (0.0, 100.0), (0.0, 100.0), (0.0, 100.0), (0.0, 100.0)]


In [None]:
from skopt import gp_minimize
from skopt import Optimizer

# Minimise fn over the search space X; the result object is kept in `res`.
print("Optimizing...")
res = gp_minimize(func=fn, dimensions=X, verbose=False)

Optimizing...


In [62]:
print(res.x)

# res.x has one value per dimension of the search space given to gp_minimize.
# If that space covered query and rescore weights (two values per model), split
# it in half. Otherwise the point holds rescore weights only, as passed by fn,
# and the default query weights are kept. An unconditional np.split(..., 2)
# raises ValueError when the point cannot be divided into two equal halves.
if len(res.x) == 2 * len(models):
    queryWeightsNew, rescoreWeightsNew = np.split(np.array(res.x), 2)
else:
    queryWeightsNew, rescoreWeightsNew = queryWeights, res.x

# Report the score of every search term under the optimised weights.
# processTerm takes the search term as its first argument (no query id).
for searchTerm in searchTerms:
    ndcg = processTerm(searchTerm, boosts, queryWeightsNew, rescoreWeightsNew)
    print("%s %s" % (searchTerm, ndcg))

rpi 0.752345044613
gender pay gap 0.381042769502
cpi 0.875
gdp 0.857279266168
inflation 0.625943113654
crime 0.63212293121
unemployment 0.331586267837
population 0.273889506311
immigration 0.788106496236
mental health 0.636923056164
london None
london population 1.0
retail price index 0.671139674231
life expectancy 0.0371328264369
obesity 0.0
religion 0.568614989855
migration 0.157618326452
poverty 0.389800284551
social media 0.961466992539
employment 0.585233426968


In [None]:

[0.91512786228791776, 0.18637723563018363, 0.092858794570568129, 0.53273612668253156,
 0.76706400990704571, 0.58349107122737787, 0.6834580839907477, 0.88558707199299047,
 0.48277903739251804, 6.1817401341518003, 9.7805931268469362, 9.3722047829479767,
 4.713174354828487, 2.601749548125281, 4.3597225473416383, 8.9907343794135848,
 6.4090652493697053, 2.9974113081044855]

Out[53]:
0.76181999486796548