In [1]:
from __future__ import print_function
import sys
import os
import os.path
import csv
import re
import random
import struct
import math
import datetime
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from IPython.display import display, HTML

# Inject a stylesheet into the notebook page that sets the widths of the
# `notebook-container`, `menubar-container` and `maintoolbar-container` divs
# (presumably the classic Jupyter Notebook layout elements -- TODO confirm).
_NOTEBOOK_WIDTH_CSS = """
<style>
    div#notebook-container    { width: 95%; }
    div#menubar-container     { width: 65%; }
    div#maintoolbar-container { width: 99%; }
</style>
"""
display(HTML(data=_NOTEBOOK_WIDTH_CSS))

def print_message(s):
    """Print ``s`` prefixed with the current UTC time and flush stdout.

    The line has the form ``[Mon DD, HH:MM:SS] <s>``.

    Args:
        s: Object to log; it is rendered with ``str.format``.
    """
    # datetime.utcnow() is deprecated since Python 3.12. An aware UTC "now"
    # renders identically under this format string.
    timestamp = datetime.datetime.now(datetime.timezone.utc).strftime("%b %d, %H:%M:%S")
    print("[{}] {}".format(timestamp, s), flush=True)

In [2]:
# Load the per-term posting lists.
#   impacts[term_id][doc_id] -> float impact of the term in that document
#   idfs[term_id]            -> IDF weight (fixed to 1 for this file)
# File layout as read below: for each term, a native int32 posting count
# followed by that many (int32 doc id, float32 impact) pairs.
# The disabled alternative input
#   'F:\\bmitra\\data\\quant\\bhaskar_data_m0708_filtered\\bhd_impacts.v2'
# presumably also stores a float32 IDF right after the count (see the disabled
# unpack next to `idfs[term_id] = 1`) -- TODO confirm before switching files.
impacts = {}
idfs = {}
term_id = 0
_POSTING = struct.Struct('=if')  # 4-byte doc id + 4-byte impact, native byte order, no padding
with open('F:\\bmitra\\data\\quant\\bhaskar_data_m0708_filtered\\bhd_impacts', mode='rb') as reader:
    while True:
        size_packed = reader.read(4)
        if len(size_packed) != 4:
            # End of file. Stop *before* registering a term, so no phantom
            # empty posting list is created for a term that does not exist.
            break
        size = max(struct.unpack('i', size_packed)[0], 0)
        idfs[term_id] = 1 #struct.unpack('f', reader.read(4))[0]
        # Read the whole posting list at once instead of two reads per posting.
        postings_packed = reader.read(_POSTING.size * size)
        if len(postings_packed) != _POSTING.size * size:
            raise struct.error(
                'truncated posting list for term {}: expected {} bytes, got {}'.format(
                    term_id, _POSTING.size * size, len(postings_packed)))
        # A later duplicate doc id overwrites an earlier one, as with per-item assignment.
        impacts[term_id] = {did: impact for did, impact in _POSTING.iter_unpack(postings_packed)}
        term_id = term_id + 1
        if term_id % 100 == 0:
            print('{} terms loaded.'.format(term_id))
def _iter_impact_values():
    """Yield every stored impact value, term by term, in dictionary order."""
    for postings in impacts.values():
        for weight in postings.values():
            yield weight

# Pass 1: count, minimum, maximum and mean over all (term, document) impacts.
IMPACTS_COUNT = 0
IMPACTS_MEAN = 0
IMPACTS_MIN = float("inf")
IMPACTS_MAX = 0
for weight in _iter_impact_values():
    if weight < IMPACTS_MIN:
        IMPACTS_MIN = weight
    if weight > IMPACTS_MAX:
        IMPACTS_MAX = weight
    IMPACTS_MEAN = IMPACTS_MEAN + weight
    IMPACTS_COUNT = IMPACTS_COUNT + 1
IMPACTS_MEAN = IMPACTS_MEAN / IMPACTS_COUNT

# Pass 2: population standard deviation around the mean computed above.
_squared_deviation = 0
for weight in _iter_impact_values():
    _squared_deviation += (weight - IMPACTS_MEAN)**2
IMPACTS_STD = (_squared_deviation / IMPACTS_COUNT)**0.5

# doc_map: document identifier string (2nd space-separated field of each line)
# -> integer document id (1st field). A later line with the same identifier
# replaces an earlier one.
with open('F:\\bmitra\\data\\quant\\bhaskar_data_m0708_filtered\\bhd_doc_map.txt', mode='r') as doc_map_file:
    doc_map = {
        fields[1]: int(fields[0])
        for fields in (record.split(' ') for record in doc_map_file)
    }
        
# Split each query's result list into two groups of integer doc ids:
#   qd_pos[qid] -- documents whose 6th field is <= 1000
#   qd_neg[qid] -- all remaining documents
# The 6th field is presumably the rank in the original run -- TODO confirm.
# Both dictionaries always share the same set of query ids.
qd_pos = {}
qd_neg = {}
with open('F:\\bmitra\\data\\quant\\bhaskar_data_m0708_filtered\\07-08.results.filtered.cust', mode='r') as results_file:
    for record in results_file:
        fields = record.split(' ')
        qid = int(fields[1])
        did = doc_map[fields[4]]
        pos_docs = qd_pos.setdefault(qid, [])
        neg_docs = qd_neg.setdefault(qid, [])
        bucket = pos_docs if int(fields[5]) <= 1000 else neg_docs
        bucket.append(did)

# terms: term string (2nd space-separated field) -> zero-based line number of
# the line it appears on. `term_id` ends up equal to the number of lines read.
terms = {}
term_id = 0
with open('F:\\bmitra\\data\\quant\\bhaskar_data_m0708_filtered\\bhd_term_stem.txt', mode='r') as vocab_file:
    for line_no, record in enumerate(vocab_file):
        terms[record.split(' ')[1]] = line_no
        term_id = line_no + 1
        
# q_terms: query id -> list of term ids for the first 10 space-separated tokens
# of the query text. Tokens missing from `terms` are dropped.
q_terms = {}
with open('F:\\bmitra\\data\\quant\\bhaskar_data_m0708_filtered\\07-08.queries.filtered', mode='r') as queries_file:
    for record in queries_file:
        fields = record.split('\t')
        tokens = fields[1].strip().split(' ')[:10]
        looked_up = [terms.get(token) for token in tokens]
        known_ids = [tid for tid in looked_up if tid is not None]
        q_terms[int(fields[0])] = known_ids
        
# qrels: query id -> set of integer doc ids whose 3rd field is > 0.
# A query that appears in a file gets an entry even if none of its documents
# has a positive value, in which case the set stays empty.
qrels = {}
for qrels_file in ['F:\\bmitra\\data\\quant\\bhaskar_data_m0708_filtered\\07.prels.txt', 'F:\\bmitra\\data\\quant\\bhaskar_data_m0708_filtered\\prels']:
    with open(qrels_file, mode='r') as judgments:
        for record in judgments:
            fields = record.split(' ')
            qid = int(fields[0])
            relevant_docs = qrels.setdefault(qid, set())
            if int(fields[2]) > 0:
                relevant_docs.add(doc_map[fields[1]])

# Random 80/20 train/test split over the queries that have at least one
# negative document.
# The `random` module is seeded first so the split -- and the later random
# sampling and tie-breaking that use the same generator -- is reproducible
# across kernel restarts.
RANDOM_SEED = 1
random.seed(RANDOM_SEED)
qids = [qid for (qid, docs) in qd_neg.items() if len(docs) > 0]
random.shuffle(qids)
split_cnt = int(0.8*len(qids))
qids_train = qids[:split_cnt]
qids_test = qids[split_cnt:]

100 terms loaded.
200 terms loaded.
300 terms loaded.
400 terms loaded.
500 terms loaded.
600 terms loaded.
700 terms loaded.
800 terms loaded.
900 terms loaded.
1000 terms loaded.
1100 terms loaded.
1200 terms loaded.
1300 terms loaded.
1400 terms loaded.
1500 terms loaded.
1600 terms loaded.
1700 terms loaded.
1800 terms loaded.
1900 terms loaded.
2000 terms loaded.
2100 terms loaded.
2200 terms loaded.
2300 terms loaded.
2400 terms loaded.
2500 terms loaded.
2600 terms loaded.
2700 terms loaded.
2800 terms loaded.
2900 terms loaded.
3000 terms loaded.
3100 terms loaded.
3200 terms loaded.
3300 terms loaded.
3400 terms loaded.
3500 terms loaded.


In [7]:
# Use the first CUDA device when one is usable; otherwise fall back to the CPU
# so the notebook still runs (slowly) on machines without a GPU.
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Numbers of quantization bins swept by the experiment loop.
NUM_BINS_TO_TRY = [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]
# Sharpness factors (passed to Quant as `alpha`) swept for every bin count.
ALPHAS_TO_TRY = [8, 16, 32, 64]
# Number of per-term feature/idf buffers; queries are truncated to 10 terms when loaded.
MAX_TERMS = 10
# Row capacity of the per-query test buffers.
MAX_DOCS_PER_Q = 10000
# Size of the "positive" set: evaluation treats the first NUM_POS_DOCS rows as
# positives and scores the top NUM_POS_DOCS of each ranking.
NUM_POS_DOCS = 1000
# Training epochs per (bins, alpha) configuration.
NUM_EPOCHS = 4
# Minibatches (optimizer steps) per epoch.
EPOCH_SIZE = 8192
# One training row per training query, i.e. every minibatch covers all of them.
MB_SIZE = len(qids_train)
# Step size handed to the Adam optimizer.
LEARNING_RATE = 0.1
# Added to the denominator of the uniform log-scale quantizer.
EPSILON = 1e-6

class Quant(torch.nn.Module):
    """Learnable scalar quantizer with ``num_bins - 1`` learnable levels.

    Two schemes are implemented; ``self.scheme`` is fixed to 1 in ``__init__``.

    * scheme 1: ``softplus(b)`` holds ``num_bins - 1`` thresholds. Each
      threshold the input exceeds contributes ``softplus(v)`` for that
      threshold. Training uses a sigmoid of slope ``alpha`` as a soft step;
      evaluation uses the rounded, hard step.
    * scheme 0: a linear map ``x @ w + b`` produces ``num_bins - 1`` logits.
      Training mixes the values ``softplus(v)`` with a softmax sharpened by
      ``alpha``; evaluation picks the single arg-max value.

    Args:
        alpha: Sharpness multiplier applied inside the sigmoid/softmax during
            training.
        num_bins: Number of quantization bins. Defaults to the module-level
            ``NUM_BINS`` when omitted.

    ``b`` and ``v`` are initialised from a normal distribution with the
    module-level ``IMPACTS_MEAN`` / ``IMPACTS_STD``.
    """

    def __init__(self, alpha, num_bins=None):
        super(Quant, self).__init__()
        if num_bins is None:
            num_bins = NUM_BINS
        self.scheme = 1
        self.alpha = alpha
        if self.scheme == 0:
            self.w = nn.Parameter(torch.Tensor(1, num_bins - 1))
        else:
            self.register_parameter('w', None)
        self.b = nn.Parameter(torch.Tensor(num_bins - 1))
        self.v = nn.Parameter(torch.Tensor(num_bins - 1, 1))

        if self.scheme == 0:
            nn.init.normal_(self.w, mean=0, std=1)
        nn.init.normal_(self.b, mean=IMPACTS_MEAN, std=IMPACTS_STD)
        nn.init.normal_(self.v, mean=IMPACTS_MEAN, std=IMPACTS_STD)

    def forward(self, x, idf):
        """Differentiable (soft) quantization used for training.

        Args:
            x: 2-D tensor of raw impacts; callers in this file pass shape (N, 1).
            idf: Tensor multiplied element-wise into the result.

        Returns:
            Tensor of soft-quantized impacts scaled by ``idf``.
        """
        if self.scheme == 0:
            # dim=1 is what the implicit-dim softmax resolved to for 2-D input.
            return F.softmax((x.mm(self.w) + self.b) * self.alpha, dim=1).mm(F.softplus(self.v)) * idf
        return torch.sigmoid((x - F.softplus(self.b)) * self.alpha).mm(F.softplus(self.v)) * idf

    def forward_eval(self, x, idf):
        """Hard quantization used for evaluation; runs without gradients.

        Args:
            x: 2-D tensor of raw impacts; callers in this file pass shape (N, 1).
            idf: Tensor multiplied element-wise into the result.

        Returns:
            Tensor of hard-quantized impacts scaled by ``idf``.
        """
        with torch.no_grad():
            if self.scheme == 0:
                # Softmax is monotonic, so the arg-max of the logits is the
                # selected bin. Build a zero-initialised one-hot matrix on
                # x's device, sized from the input rather than a global.
                logits = x.mm(self.w) + self.b
                winners = torch.argmax(logits, dim=1, keepdim=True)
                one_hot = torch.zeros(x.shape[0], self.b.shape[0], dtype=x.dtype, device=x.device)
                one_hot.scatter_(1, winners, 1.0)
                return one_hot.mm(F.softplus(self.v)) * idf
            # alpha only changes the slope, not which side of 0.5 the sigmoid
            # lands on, so it is not needed for the rounded step.
            return torch.round(torch.sigmoid(x - F.softplus(self.b))).mm(F.softplus(self.v)) * idf

def get_mb_train():
    """Refill the shared training buffers with one freshly sampled minibatch.

    For each of the first ``MB_SIZE`` training queries, one document is drawn
    at random from ``qd_pos`` (side 0) and one from ``qd_neg`` (side 1). Row
    ``r`` of ``feat_train[side][k]`` receives the impact of the query's k-th
    term in the drawn document (0 when the term has no posting for it), and
    the matching ``idf_train`` cell receives the term's IDF. Cells for term
    slots a query does not use stay 0.

    Returns:
        The module-level ``(feat_train, idf_train)`` buffers, mutated in place.
    """
    zero = np.float32(0)
    for side in range(2):
        for slot in range(MAX_TERMS):
            feat_train[side][slot].fill(zero)
            idf_train[side][slot].fill(zero)
    for row in range(MB_SIZE):
        qid = qids_train[row]
        for side in range(2):
            pool = qd_neg[qid] if side != 0 else qd_pos[qid]
            did = random.sample(pool, 1)[0]
            for slot, term in enumerate(q_terms[qid]):
                impact_value = impacts[term].get(did, 0)
                idf_value = idfs.get(term, 0)
                feat_train[side][slot][row, 0] = np.float32(impact_value)
                idf_train[side][slot][row, 0] = np.float32(idf_value)
    return feat_train, idf_train

def get_mb_test(qid):
    """Refill the shared test buffers with every candidate document of ``qid``.

    Rows are laid out as all ``qd_pos[qid]`` documents followed by all
    ``qd_neg[qid]`` documents. Row ``r`` of ``feat_test[k]`` receives the
    impact of the query's k-th term in that document (0 when there is no
    posting) and ``idf_test[k]`` the term's IDF. Unused rows and term slots
    stay 0.

    Args:
        qid: Query id; must be a key of ``qd_pos``, ``qd_neg``, ``q_terms``
            and ``qrels``.

    Returns:
        Tuple ``(feat_test, idf_test, num_neg, labels_test, num_rel)``: the
        module-level buffers (mutated in place), the number of negative
        documents, a per-row 0/1 list marking membership in ``qrels[qid]``,
        and ``len(qrels[qid])``.
    """
    zero = np.float32(0)
    for slot in range(MAX_TERMS):
        feat_test[slot].fill(zero)
        idf_test[slot].fill(zero)
    pos_docs = qd_pos[qid]
    neg_docs = qd_neg[qid]
    num_neg = len(neg_docs)
    labels_test = []
    for row, did in enumerate(pos_docs + neg_docs):
        for slot, term in enumerate(q_terms[qid]):
            impact_value = impacts[term].get(did, 0)
            idf_value = idfs.get(term, 0)
            feat_test[slot][row, 0] = np.float32(impact_value)
            idf_test[slot][row, 0] = np.float32(idf_value)
        labels_test.append(1 if did in qrels[qid] else 0)
    return feat_test, idf_test, num_neg, labels_test, len(qrels[qid])

# Pre-allocate the shared buffers that get_mb_train() / get_mb_test() refill in place.
# feat_train[side][k] and idf_train[side][k] are (MB_SIZE, 1) float32 columns,
# one per term slot k; get_mb_train() fills side 0 from qd_pos and side 1 from qd_neg.
# NOTE(review): the loop variables `i` and `j` stay bound at module scope after
# these loops, and the evaluation code further down reads `j` (in `idf[i][j]`).
# Keep these as plain loops until that read is fixed.
feat_train = []
idf_train = []
for i in range(2):
    feat_train.append([])
    idf_train.append([])
    for j in range(MAX_TERMS):
        feat_train[i].append(np.zeros((MB_SIZE, 1), dtype=np.float32))
        idf_train[i].append(np.zeros((MB_SIZE, 1), dtype=np.float32))
# feat_test[k] and idf_test[k] are (MAX_DOCS_PER_Q, 1) float32 columns, one per term slot k.
feat_test = []
idf_test = []
for i in range(MAX_TERMS):
    feat_test.append(np.zeros((MAX_DOCS_PER_Q, 1), dtype=np.float32))
    idf_test.append(np.zeros((MAX_DOCS_PER_Q, 1), dtype=np.float32))
# Training targets for the cross-entropy loss: always class 0, i.e. the first
# (positive-document) score column should win.
labels = np.zeros((MB_SIZE), dtype=np.int64)

def evaluate_on_test(score_fn):
    """Rank every test query's candidates with ``score_fn`` and average two metrics.

    Args:
        score_fn: Callable ``(features, idf, num_docs) -> scores``. ``features``
            and ``idf`` are the buffers returned by ``get_mb_test``; ``scores``
            must be indexable by ``0 .. num_docs - 1``.

    Returns:
        ``(overlap, recall)``. Overlap is the mean fraction of the top
        ``NUM_POS_DOCS`` ranked documents that come from the first
        ``NUM_POS_DOCS`` buffer rows. Recall is the mean fraction of a query's
        relevant documents found in that top set, averaged over queries with
        at least one relevant document. A metric with nothing to average over
        is reported as NaN.
    """
    overlap_sum = 0
    recall_sum = 0
    judged_queries = 0
    for qid in qids_test:
        features, idf, num_neg, labels_test, num_rel = get_mb_test(qid)
        # NOTE(review): this assumes every test query has exactly NUM_POS_DOCS
        # positive documents occupying the first rows -- confirm against the data.
        num_docs = NUM_POS_DOCS + num_neg
        doc_scores = score_fn(features, idf, num_docs)
        ranked = [(doc_scores[i], 1 if i < NUM_POS_DOCS else 0, labels_test[i]) for i in range(num_docs)]
        # Shuffle before the stable sort so that ties are broken at random.
        random.shuffle(ranked)
        ranked.sort(key=lambda tup: tup[0], reverse=True)
        top_ranked = ranked[:NUM_POS_DOCS]
        overlap_sum += sum([is_pos for (score, is_pos, ignore) in top_ranked]) / NUM_POS_DOCS
        if num_rel > 0:
            recall_sum += sum([is_rel for (score, ignore, is_rel) in top_ranked]) / num_rel
            judged_queries += 1
    mean_overlap = overlap_sum / len(qids_test) if len(qids_test) > 0 else float('nan')
    mean_recall = recall_sum / judged_queries if judged_queries > 0 else float('nan')
    return mean_overlap, mean_recall


def _random_scores(features, idf, num_docs):
    """Baseline scorer: an independent uniform random score per document."""
    return [random.random() for _ in range(num_docs)]


def _exact_scores(features, idf, num_docs):
    """Unquantized scorer: sum over term slots of idf * impact."""
    totals = sum([idf[t] * features[t] for t in range(MAX_TERMS)])
    return totals[:num_docs, 0]


def _make_uniform_scorer(bins):
    """Build a scorer that buckets log-impacts into ``bins`` equal-width bins."""
    log_min = math.log(IMPACTS_MIN)
    log_span = math.log(IMPACTS_MAX) - log_min + EPSILON

    def _bucket(impact):
        # NOTE(review): a zero impact (term absent from the document) is mapped
        # to log-value 0, i.e. the bucket of impact 1.0, not to "no
        # contribution" -- confirm this is intended.
        log_impact = math.log(impact) if impact > 0 else 0
        return math.floor(bins * (log_impact - log_min) / log_span)

    def _score(features, idf, num_docs):
        return [sum([idf[t][d, 0] * _bucket(features[t][d, 0]) for t in range(MAX_TERMS)]) for d in range(num_docs)]

    return _score


def _make_learned_scorer(net):
    """Build a scorer that sums ``net.forward_eval`` over all term slots."""

    def _score(features, idf, num_docs):
        # Each term slot gets its full idf column. Indexing that column with a
        # leftover module-level loop index would pass a single row instead.
        totals = sum([net.forward_eval(torch.from_numpy(features[t]).to(DEVICE),
                                       torch.from_numpy(idf[t]).to(DEVICE))
                      for t in range(MAX_TERMS)])
        return totals.cpu().numpy()[:num_docs, 0]

    return _score


print_message('Starting...')

overlap, recall = evaluate_on_test(_random_scores)
print_message('model: random scores, overlap: {}, recall: {}'.format(overlap, recall))

overlap, recall = evaluate_on_test(_exact_scores)
print_message('model: exact scores, overlap: {}, recall: {}'.format(overlap, recall))

for bins in NUM_BINS_TO_TRY:

    overlap, recall = evaluate_on_test(_make_uniform_scorer(bins))
    print_message('model: uniformly quantized scores, bins: {}, overlap: {}, recall: {}'.format(bins, overlap, recall))
    for alpha in ALPHAS_TO_TRY:
        torch.manual_seed(1)
        NUM_BINS = bins  # Quant reads this module-level name when it is constructed
        net = Quant(alpha)
        net = net.to(DEVICE)
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)
        learned_scorer = _make_learned_scorer(net)
        # The targets never change, so move them to the device once.
        targets = torch.from_numpy(labels).to(DEVICE)
        net.eval()
        overlap, recall = evaluate_on_test(learned_scorer)
        print_message('model: learned quantized scores, bins: {}, alpha: {}, epochs: 0, loss: NA, overlap: {}, recall: {}'.format(bins, alpha, overlap, recall))
        for ep_idx in range(NUM_EPOCHS):
            train_loss = 0.0
            net.train()
            for mb_idx in range(EPOCH_SIZE):
                features, idf = get_mb_train()
                # Column 0 = summed score of the sampled positive document,
                # column 1 = summed score of the sampled negative document.
                out = torch.cat(tuple([sum([net(torch.from_numpy(features[side][t]).to(DEVICE),
                                                torch.from_numpy(idf[side][t]).to(DEVICE))
                                            for t in range(MAX_TERMS)])
                                       for side in range(2)]), 1)
                loss = criterion(out, targets)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                train_loss += loss.item()
            net.eval()
            overlap, recall = evaluate_on_test(learned_scorer)
            print_message('model: learned quantized scores, bins: {}, alpha: {}, epochs: {}, loss: {}, overlap: {}, recall: {}'.format(bins, alpha, ep_idx + 1, train_loss / EPOCH_SIZE, overlap, recall))
print_message('Finished')

[Sep 02, 15:37:07] Starting...
[Sep 02, 15:37:48] model: random scores, overlap: 0.10117751479289927, recall: 0.10300096876951861
[Sep 02, 15:38:29] model: exact scores, overlap: 0.9886844181459575, recall: 0.8655453971210136
[Sep 02, 15:42:50] model: uniformly quantized scores, bins: 2, overlap: 0.10024654832347137, recall: 0.1128778861112227
[Sep 02, 15:46:30] model: learned quantized scores, bins: 2, alpha: 8, epochs: 0, loss: NA, overlap: 0.3010946745562133, recall: 0.5400326884197078
[Sep 02, 15:58:23] model: learned quantized scores, bins: 2, alpha: 8, epochs: 1, loss: 0.4626709517069685, overlap: 0.2989447731755424, recall: 0.5349217529020001
[Sep 02, 16:10:16] model: learned quantized scores, bins: 2, alpha: 8, epochs: 2, loss: 0.46227058895965456, overlap: 0.29987179487179483, recall: 0.5481251864088187
[Sep 02, 16:22:07] model: learned quantized scores, bins: 2, alpha: 8, epochs: 3, loss: 0.4626789004141756, overlap: 0.29883037475345187, recall: 0.5330855893536621
[Sep 02, 16