# Assignment 2B: Ranking

This notebook contains the skeleton for training a model and then applying it to produce a document ranking.

## Loading the precomputed features

The code below loads the precomputed features and combines them into feature vectors for query-document pairs.

For this part to work, you'll need to run the `1_Feature_computation` notebook first to generate the sample features JSON files.

In [40]:
import json
from sklearn import preprocessing
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
cv = KFold(n_splits=5, random_state=42, shuffle=False)

In [41]:
QUERIES_FILE = "data/queries.txt"
QRELS_FILE = "data/qrels.csv"
QUERY2_FILE ='data/queries2.txt'

In [42]:
def load_queries(query_file):
    queries = {}
    with open(query_file, "r") as fin:
        for line in fin.readlines():
            qid, query = line.strip().split(" ", 1)
            queries[qid] = query
    return queries
queries = load_queries(QUERIES_FILE)

In [77]:
def load_qrels(qrels_file):
    labels = {}
    with open(qrels_file, "r") as fin:
        for line in fin.readlines()[1:]:
            qid, docID, relevance = line.strip().split(",")
            label = qid+"-"+docID
            labels[label] = int(relevance)

    return labels

qrels = load_qrels(QRELS_FILE)

In [79]:
# qrels

In [93]:
def load_features(queries, features_to_load):
    feature_names = []
    features = {}
    for f in features_to_load:
        print("Loading features from {}".format(f['file']))
        feature_names += f['features']
        with open(f['file']) as json_file:
            data = json.load(json_file)
            
            for q, qdocs in data.items():
                for d, feature_values in qdocs.items():
                    
                    key = "{}-{}".format(q, d)
                    
                    for feature_name in f['features']:
                        
                        # Note: no error checking is performed. It is assumed that all feature files
                        # contain the same queries and documents.
                        fvect = features.get(key, [])
                        
                        fvect.append(feature_values[feature_name])
                        
                        features[key] = fvect

    print("Feature vector: {}".format(feature_names))
    new_feature = {}
    for key,value in features.items():
        if sum(value[1:]) > 0:
            new_feature[key] = value
    return new_feature

In [94]:
# Specify the features to be loaded from each file.
# They will make up the feature vector in this exact order.
features_to_load = [
    {
        'file': "data/features_1.json",
        'features': ["qlen"]
    },
    {
        'file': "data/features_2.json",
        'features': ["bm25_title", "bm25_content"]
    },
    #{
    #    'file': "data/features_3.json",
    #    'features': ["mlm"]
    #},
    {
        'file': "data/features_4.json", 
        'features': ["idf_title", "idf_content"]
    } ,
    {
        'file': "data/features_5.json", 
        'features': ["docLen_title", "docLen_content", "docLen_anchor"]
    },
    {
        'file': "data/features_6.json", 
        'features': ["pagerank"]
    } 
]
features = load_features(queries, features_to_load)
NO_OF_FEATURES = features
list(features.items())[0][1]

Loading features from data/features_1.json
Loading features from data/features_2.json
Loading features from data/features_4.json
Loading features from data/features_5.json
Loading features from data/features_6.json
Feature vector: ['qlen', 'bm25_title', 'bm25_content', 'idf_title', 'idf_content', 'docLen_title', 'docLen_content', 'docLen_anchor', 'pagerank']


[2,
 31.36776352363769,
 30.951880129983103,
 18.170072596082754,
 11.587501074035565,
 9,
 3355,
 0,
 5.2002362647602316e-08]

In [96]:
# This is how the feature vectors look like
# for k, v in features.items():
#     print(k, v)
# features

In [63]:
class PointWiseLTRModel(object):
    def __init__(self, regressor):
        """
        :param classifier: an instance of scikit-learn regressor
        """
        self.regressor = regressor

    def _train(self, X, y):
        """
        Trains and LTR model.
        :param X: features of training instances
        :param y: relevance assessments of training instances
        :return:
        """
        assert self.regressor is not None
        self.model = self.regressor.fit(X, y)

    def rank(self, ft, doc_ids):
        """
        Predicts relevance labels and rank documents for a given query
        :param ft: a list of features for query-doc pairs
        :param ft: a list of document ids
        :return:
        """
        assert self.model is not None
        rel_labels = self.model.predict(ft)
        sort_indices = np.argsort(rel_labels)[::-1]

        results = []
        for i in sort_indices:
            results.append((doc_ids[i], rel_labels[i]))
        return results

## Training a model

Training needs to be done differently based on the scenario:

  * **Scenario 1**: The model is trained using cross-validation, that is on 4/5 of queries, then applied on the remaining 1/5 of queries (repeated 5 times).
  * **Scenario 2**: The model is trained on all available training data.
  
The feature vectors at this point are already created. These should contain both (a) the training queries and (b) the queries on which you want to apply your model.

Train your model on queries (a). For that you'll also need to load the corresponding relevance labels.

### Evaluation of model

In [64]:
import math
def dcg(rel, p):
    dcg = rel[0]
    for i in range(1, min(p, len(rel))): 
        dcg += rel[i] / math.log(i + 1, 2)  # rank position is indexed from 1..
    return dcg



def load_qrelsdata(file):
    gtruth = {}
    with open(file, "r") as fin:
        fin.readline()  # excluding header line from processing
        for line in fin:
            qid, did, rel = line.strip().split(",", 2)
            if not qid in gtruth:
                gtruth[qid] = {}
            gtruth[qid][did] = int(rel)
    return gtruth
qrels = load_qrelsdata(QRELS_FILE)

def eval_scores(rankings):
    sum_ndcg5 = 0
    sum_ndcg10 = 0
    sum_ndcg20 = 0
    for qid, ranking in sorted(rankings.items()):
        gt = qrels[qid]    

        gains = [] # holds corresponding relevance levels for the ranked docs
        for doc_id,score in ranking: 
            gain = gt.get(doc_id, 0)
            gains.append(gain)

        gain_ideal = sorted([v for _, v in gt.items()], reverse=True)

        ndcg5 = dcg(gains, 5) / dcg(gain_ideal, 5)
        ndcg10 = dcg(gains, 10) / dcg(gain_ideal, 10)
        ndcg20 = dcg(gains, 20) / dcg(gain_ideal, 20)
        sum_ndcg5 += ndcg5
        sum_ndcg10 += ndcg10
        sum_ndcg20 += ndcg20
    return {
        "ndcg@5" : round(sum_ndcg5 / len(rankings), 3), 
        "ndcg@10": round(sum_ndcg10 / len(rankings), 3),
        "ndcg@20": round(sum_ndcg20 / len(rankings), 3)
    }

In [80]:
def load_data(features,qrels):
    X, Y, qids, doc_ids = [], [], [], []
    for label,feat in features.items():
        qdid=label.split('-')
        qid=qdid[0]
        doc_id = "{}-{}-{}-{}".format(qdid[1], qdid[2],qdid[3],qdid[4])
        qids.append(qid)
        doc_ids.append(doc_id)
        X.append(feat)
        Y.append(qrels.get(label,0))
    feature_vector = X
    mm_scaler = preprocessing.MinMaxScaler()
    feature_vector = mm_scaler.fit_transform(feature_vector)
    return feature_vector, Y, qids, doc_ids

In [66]:
def get_train_data(train_qids):
    train_X, train_y = [], []
    for i in range(len(X)):
        if qids[i] in train_qids:
            train_X.append(X[i])
            train_y.append(Y[i])
    
    return train_X,train_y

In [67]:
def load_documents_by_qid(qrels_file,query_id):
    docs = []
    with open(qrels_file, "r") as fin:
        for line in fin.readlines()[1:]:
            qid, docID, query = line.strip().split(",")
            if(qid == query_id):
                docs.append(docID)
    return list(set(docs))

def get_features_docs_by_qid(qid):
    test_doc_ids = load_documents_by_qid(QRELS_FILE,qid)
    test_X = []

    for d in test_doc_ids:
        key = "{}-{}".format(qid, d)
        if key in list(features.keys()):
            test_X.append(features[key])
        else:
            test_X.append([-1])
    
    return test_X, test_doc_ids

### Ranking with cross validation

In [81]:
X, Y, qids, doc_ids=load_data(features,qrels)

In [85]:
qids_unique= list(set(qids))

train_qids = []
test_qids = []

rankings = {}
clf = RandomForestRegressor(max_depth=3, random_state=0)

# split the data into test train set
for train_index, test_index in cv.split(qids_unique):

    train_qids = [qids_unique[i] for i in train_index]
    test_qids = [qids_unique[i] for i in test_index]

    train_X, train_Y = get_train_data(train_qids)
    
    ltr = PointWiseLTRModel(clf)
    
    # train with train split
    ltr._train(train_X, train_Y)
    
    # generate ranking for validation fold
    for test_qid in test_qids:
        test_X, test_doc_ids = get_features_docs_by_qid(test_qid)
        
        ranks = ltr.rank(test_X, test_doc_ids)
        rankings[test_qid] = sorted(ranks, key=lambda score: score[1], reverse = True)[:100]
        



In [87]:
# rankings

In [17]:
eval_scores(rankings)

{'ndcg@5': 0.131, 'ndcg@10': 0.144, 'ndcg@20': 0.168}

## Applying the model to produce a ranking

Apply the train model on queries (b) and sort documents according to the predicted relevance score.

#### Use all of query 1 as training set

In [18]:
# Specify the features to be loaded from each file.
# They will make up the feature vector in this exact order.
features_to_load_query2 = [
    {
        'file': "data/queries/features_1.json",
        'features': ["qlen"]
    },  # feature 1
    {
        'file': "data/queries/features_2.json",
        'features': ["bm25_title", "bm25_content"]
    },  # feature 2, feature 3
    {
        'file': "data/queries/features_3.json",
        'features': ["mlm"]
    },
]

In [19]:
queries2 = load_queries(QUERY2_FILE)
features2 = load_features(queries, features_to_load)

Loading features from data/features_1.json
Loading features from data/features_2.json
Feature vector: ['qlen', 'bm25_title', 'bm25_content']


In [20]:
# features2

In [21]:
ltr = PointWiseLTRModel(clf)

# get training data
train_X, train_Y = get_train_data(qids_unique)

ltr._train(train_X, train_Y)

In [328]:
X, Y, _, _=load_data(features,qrels)

In [329]:
def predict_result(features2, queries2, ltr):
    # output_format = "trec 22"
    OUTPUT_FILE = "data/queries2_ranking.csv"

    with open(OUTPUT_FILE, "w") as fout:
        fout.write("QueryId,DocumentId\n")
        for qid, query in sorted(queries2.items()):
            # Convert into the format required by the `PointWiseLTRModel` class
            # and deal with missing feature values
            features, _, qids, doc_ids=load_data(features2,qrels)
            
            r = ltr.rank(features, doc_ids)
            # Write the results to file
            rank = 1
            for doc_id, score in r:
                if rank <= 20:
                    fout.write(qid + "," + doc_id + "\n")
                    rank += 1
                else:
                    break


In [330]:
predict_result(features2, queries2, ltr)