# Assignment 2B: Feature computation

The purpose of this notebook is to perform the computation of features. 

Note that some features might be expensive, so you don't want to keep re-computing them. Instead, aim for writing a set of relatively simple feature extractors, each computing one or multiple features, and save their output to separate files. Then, load the pre-computed features from multiple files in the learning step (in the [ranking notebook](2_Ranking.ipynb)).

In [1]:
import urllib
import requests
import json
import math
from IPython.display import clear_output

Tunnel local 5002 to the server : `ssh -N -L 5002:gustav1.ux.uis.no:5002 username@ssh1.ux.uis.no`

In [2]:
API = "http://127.0.0.1:5002"

MAIN_INDEX = "clueweb12b"
ANCHORS_INDEX = "clueweb12b_anchors"

In [3]:
QUERIES_FILE = "data/queries.txt"
QRELS_FILE = "data/qrels.csv"
QUERY2_FILE ='data/queries2.txt'

In [4]:
def load_queries(query_file):
    queries = {}
    with open(query_file, "r") as fin:
        for line in fin.readlines():
            qid, query = line.strip().split(" ", 1)
            queries[qid] = query
    return queries

In [5]:
def exists(indexname, doc_id):
    url = "/".join([API, indexname, doc_id, "_exists"])
    response = requests.get(url).text
    return json.loads(response)['exists']
    
print(exists(MAIN_INDEX, "clueweb12-0713wb-35-00870"))
print(exists(MAIN_INDEX, "clueweb12-0906wb-09-33744"))
#clear_output()

False
True


In [6]:
def analyze_query(indexname, query):
    url = "/".join([API, indexname, "_analyze"]) + "?" \
          + urllib.parse.urlencode({"text": query})
    response = requests.get(url).text
    r = json.loads(response)
    return [t["token"] for t in r["tokens"]]

In [7]:
def term_vectors(indexname, doc_id, term_statistics=False):
    ret = {}    
    url = "/".join([API, indexname, doc_id, "_termvectors"]) + "?" \
          + urllib.parse.urlencode({"term_statistics": str(term_statistics).lower()})
    response = requests.get(url).text
    try:
        ret = json.loads(response)
    except:
        print("Failed to json-decode this response:\n{}".format(response))
    return ret

## Feature extractors

Example feature extractors.

In [8]:
QUERIES_FILE = "data/queries.txt"
QRELS_FILE = "data/qrels.csv"

In [9]:
def feature_qlen(query, doc):
    """Feature: query length (number of terms). 
    This is a query feature, so it'll have the same value for all documents."""
    return len(query.split())

In [10]:
def searchByDocID(indexname, query, field, docID, size=10):
    url = "/".join([API, indexname, "_search"]) + "?" \
          + urllib.parse.urlencode({"q": query, "df": field, "size": size, "_id":docID})
    print(url)
    response = requests.get(url).text
    return json.loads(response)

In [11]:
def search(indexname, query, field, size=10):
    url = "/".join([API, indexname, "_search"]) + "?" \
          + urllib.parse.urlencode({"q": query, "df": field, "size": size})
    response = requests.get(url).text
    return json.loads(response)

In [12]:
def feature_bm25(query, doc, field, index):

    k1 = 1.2
    b=0.75
    
    score = 0
    try:
        term_vector = term_vectors(index, doc, term_statistics=True)['term_vectors'][field]
    except:
        return 0
    
    avg_doc_len = term_vector['field_statistics']['sum_ttf']/term_vector['field_statistics']['doc_count']
    doc_len = sum([stats['term_freq'] for term, stats in term_vector['terms'].items()])

    for term in query:
        if term in term_vector['terms'].keys():            
            ftd = term_vector['terms'][term]['term_freq']
            
            idf = math.log(term_vector['field_statistics']['doc_count']/term_vector['terms'][term]['doc_freq'])

            term_score = idf*((ftd*(k1+1))/(ftd*(1-b+b*(doc_len/avg_doc_len))))

            score = score + term_score

    return score

feature_bm25(['raspberri', 'pi'],"clueweb12-0906wb-09-33744","content",MAIN_INDEX)

51.31441387836048

In [13]:
def feature_idf(query, doc, field, index):
    score = 0
    try:
        term_vector = term_vectors(index, doc, term_statistics=True)['term_vectors'][field]
    except:
        return 0
    
    avg_doc_len = term_vector['field_statistics']['sum_ttf']/term_vector['field_statistics']['doc_count']
    doc_len = sum([stats['term_freq'] for term, stats in term_vector['terms'].items()])

    for term in query:
        if term in term_vector['terms'].keys():            
            idf = math.log(term_vector['field_statistics']['doc_count']/term_vector['terms'][term]['doc_freq'])
            score = score + idf

    return score

feature_idf(['raspberri', 'pi'],"clueweb12-0906wb-09-33744","content",MAIN_INDEX)

11.583445882212153

In [14]:
def feature_docLen(doc, field, index):
    score = 0
    try:
        term_vector = term_vectors(index, doc, term_statistics=True)['term_vectors'][field]
    except:
        return 0
    
    doc_len = sum([stats['term_freq'] for term, stats in term_vector['terms'].items()])
    return doc_len

feature_docLen("clueweb12-0906wb-09-33744","content",MAIN_INDEX)

1409

##### MLM

In [15]:
MLM_FIELDS=['title','content'] 
MLM_FIELD_WEIGHTS = [0.2, 0.8] 
MLM_LAMBDA=0.1

In [16]:
class CollectionLM(object):
    def __init__(self, qterms):
        self._probs = {}
        # computing P(t|C_i) for each field and for each query term
        for field in MLM_FIELDS:
            self._probs[field] = {}
            for t in qterms:
                self._probs[field][t] = self._get_prob(field, t)
        
    def _get_prob(self, field, term):
        # Use a boolean query to find a document that contains the term
        res = search(MAIN_INDEX, term, field, size=1)
        hits = res.get('hits', {}).get("hits", {})
        doc_id = hits[0]["_id"] if len(hits) > 0 else None
        if doc_id is not None:
            # Ask for global term statistics when requesting the term vector of that doc (`term_statistics=True`)
            # TODO: complete this part   
            tv = term_vectors(MAIN_INDEX, doc_id, term_statistics=True)['term_vectors'][field]
            ttf = tv['terms'].get(term, {}).get("ttf", 0)  # total term count in the collection (in that field)
            sum_ttf = tv['field_statistics']['sum_ttf']
            return ttf / sum_ttf

        return 0  # this only happens if none of the documents contain that term

    def prob(self, field, term):
        return self._probs.get(field, {}).get(term, 0)

#qterms = analyze_query(MAIN_INDEX, "ford edge problems")
#clm = CollectionLM(qterms)
#clm.prob("content", "problem")

In [17]:
def score_mlm(clm, qterms, doc_id):
    score = 0  # log P(q|d)
    
    # Getting term frequency statistics for the given document field from Elasticsearch
    # Note that global term statistics are not needed (`term_statistics=False`)
    tv = term_vectors(MAIN_INDEX, doc_id, term_statistics=False).get("term_vectors", {})
    #es.termvectors(index=INDEX_NAME, id=doc_id, fields=FIELDS,
    #                          term_statistics=False).get("term_vectors", {})

    # compute field lengths $|d_i|$
    len_d_i = []  # document field length
    for i, field in enumerate(MLM_FIELDS):
        if field in tv: 
            len_d_i.append(sum([s["term_freq"] for t, s in tv[field]["terms"].items()]))
        else:  # that document field may be empty
            len_d_i.append(0)
        
    # scoring the query
    for t in qterms:
        Pt_theta_d = 0  # P(t|\theta_d)
        for i, field in enumerate(MLM_FIELDS):
            if field in tv:
                Pt_di = tv[field]["terms"].get(t, {}).get("term_freq", 0) / len_d_i[i]  # $P(t|d_i)$
            else:  # that document field is empty
                Pt_di = 0
            Pt_Ci = clm.prob(field, t)  # $P(t|C_i)$
            Pt_theta_di = (1 - MLM_LAMBDA) * Pt_di + MLM_LAMBDA * Pt_Ci  # $P(t|\theta_{d_i})$ with J-M smoothing
            Pt_theta_d += MLM_FIELD_WEIGHTS[i] * Pt_theta_di
        try:
            score += math.log(Pt_theta_d)    
        except:
            pass
    
    return score

In [18]:
def feature_mlm(query, docId):
    """Feature: MLM retrieval score"""
    # TODO
    #get query terms
    qterms = analyze_query(MAIN_INDEX, query)
    clm = CollectionLM(qterms)
    return score_mlm(clm, qterms, docId)
    
    #get terms and continue
feature_mlm("raspberry pi","clueweb12-0906wb-09-33744")

-5.710131249740202

### Query feature

In [19]:
"""get number of documents that have a given term in the given field"""
def feature_query_matching_docs(term, docId, field):
    res = search(MAIN_INDEX, term, field, size=1)
    hits = res.get('hits', {}).get("hits", {})
    doc_id = hits[0]["_id"] if len(hits) > 0 else None
    if doc_id is not None:  
        tv = term_vectors(MAIN_INDEX, doc_id, term_statistics=True)['term_vectors'][field]
        return tv['field_statistics']['doc_count']
            
    return 0

feature_query_matching_docs("raspberry","clueweb12-0108wb-86-18203","content")

12264890

## Feature computation

Computes features for document-query pairs and saves them to a file.

Specifically, we will save features to a JSON file, using a nested map structure, with queries on the first level, documents on the second level, and individual features on the third level. 

```python
  features = {
      'query_i': {
          'doc_j': {
              'feature_1': 0,  # value of feature_1 for (query_i, doc_j) pair
              'feature_2': 0,  # value of feature_2 for (query_i, doc_j) pair
              ...
          }
          ...
      }
      ...
  }
```

**Note**: The set of documents for a query (for which you want to compute features) should be a combination of the documents for which you have relevance labels and the top-100 documents retrieved in first-pass retrieval.
You can then decide in the learning part if/how you want to deal with class imbalance.

In [20]:
import json

In [21]:
queries = load_queries(QUERIES_FILE)

In [None]:
# TODO load actual queries from file
#queries = ["q1", "q2", "q3"]

features_1 = {}
features_2 = {}
features_3 = {}
features_4 = {}
features_5 = {}

i = 0
for qid, query in sorted(queries.items()):
    i+=1
    clear_output()
    print(i)
    
    qterms = analyze_query(MAIN_INDEX, query)
    print(qterms)
    features_1[qid] = {}
    features_2[qid] = {}
    features_3[qid] = {}
    features_4[qid] = {}
    features_5[qid] = {}

    # load document_ids from qrels file
    # loaded document_ids are indexed
    doc_ids = load_vaid_documents_by_qid(QRELS_FILE, qid)
    
    for d in doc_ids:
        print(d)
        # Here, two sets of features are computed in a single go to produce some toy data.
        # Normally, you would run these sequentially.
        #features_1[qid][d] = {
        #    'qlen': feature_qlen(query, d)
        #}
        
        #feature_bm(['raspberri', 'pi'],"clueweb12-0906wb-09-33744","content",MAIN_INDEX)
        #features_2[qid][d] = {
        #    'bm25_content': feature_bm25(qterms, d, "content", MAIN_INDEX),
        #    'bm25_title': feature_bm25(qterms, d, "title", MAIN_INDEX),
        #    'bm25_anchor': feature_bm25(qterms, d, "anchor",ANCHORS_INDEX)
        #}
        
        #features_3[qid][d] = {
        #    "mlm" : feature_mlm(query, d)
        #}
        
        #idf
        #features_4[qid][d] = {
        #    "idf_title" : feature_idf(qterms, d, "title",MAIN_INDEX),
        #    "idf_content" : feature_idf(qterms, d, "content",MAIN_INDEX)
        #}
        
        #docLen
        # feature_docLen(doc, field, index)
        features_5[qid][d] = {
            "docLen_title" : feature_docLen(d, "title",MAIN_INDEX),
            "docLen_content" : feature_docLen(d, "content",MAIN_INDEX),
            "docLen_anchor" : feature_docLen(d, "anchor", ANCHORS_INDEX )
        }


# Write computed features to file
#with open("data/features_1.json", "w") as f:
#    json.dump(features_1, f, indent=4, sort_keys=True)
    
#with open("data/features_2.json", "w") as f:
#    json.dump(features_2, f, indent=4, sort_keys=True)

#with open("data/features_3.json", "w") as f:
#    json.dump(features_3, f, indent=4, sort_keys=True)

#with open("data/features_4.json", "w") as f:
#    json.dump(features_4, f, indent=4, sort_keys=True)

# with open("data/features_5.json", "w") as f:
#     json.dump(features_5, f, indent=4, sort_keys=True)

### Feature computation for queries2

In [None]:
features_1 = {}
features_2 = {}
features_3 = {}
features_4 = {}
features_5 = {}

i = 0
for qid, query in sorted(queries2.items()):
    i+=1
    clear_output()
    print(i)
    
    qterms = analyze_query(MAIN_INDEX, query)
    
    features_1[qid] = {}
    features_2[qid] = {}
    features_3[qid] = {}
    features_4[qid] = {}

    doc_ids = []
    # get document IDS
    for field in ["content","title","anchor"]:
        
        current_index = MAIN_INDEX
        if(field == 'anchors'):
            
            current_index = ANCHORS_INDEX
        
        
        res = search(current_index, ' '.join(qterms), field, size=100)['hits']['hits']
        for doc in res:
                if exists(MAIN_INDEX, doc['_id']) and exists(ANCHORS_INDEX, doc['_id']):
                    doc_ids.append(doc['_id'])
        
        key = 'bm25_{}'.format(field)           
        for d in doc_ids:
            features_1[qid][d] = {}
            features_2[qid][d] = {}
            
            
            if field == "anchor":
                features_2[qid][d][key] = feature_bm25(qterms, d, field, ANCHORS_INDEX)
               
            else:
                features_2[qid][d][key] = feature_bm25(qterms, d, field, MAIN_INDEX)

            features_1[qid][d] = {
                'qlen': len(query.split()),
                'q_token_len': len(qterms)
            }
            
            clear_output()
            print('done' + d)
    

# Write computed features to file
# with open("data/queries2/features_1.json", "w") as f:
#     json.dump(features_1, f, indent=4, sort_keys=True)
    
# with open("data/queries2/features_2.json", "w") as f:
#     json.dump(features_2, f, indent=4, sort_keys=True)

6


### Updated under one function

In [22]:
def generate_features(given_queries):
    features_1 = {}
    features_2 = {}
    features_3 = {}
    features_4 = {}
    features_5 = {}
    
    i = 0
    for qid, query in sorted(given_queries.items()):
        i+=1
        clear_output()
        print(i)
        
        qterms = analyze_query(MAIN_INDEX, query)
    
        features_1[qid] = {}
        features_2[qid] = {}
        features_3[qid] = {}
        features_4[qid] = {}
        features_5[qid] = {}
        
        doc_ids = []
        # get document IDS
        for field in ["content","title","anchors"]:
        
            current_index = MAIN_INDEX
            if(field == 'anchors'):
            
                current_index = ANCHORS_INDEX
        
        
            res = search(current_index, ' '.join(qterms), field, size=100)['hits']['hits']
            for doc in res:
                    if exists(MAIN_INDEX, doc['_id']) and exists(ANCHORS_INDEX, doc['_id']):
                        doc_ids.append(doc['_id'])
        
         
        for d in doc_ids:
            features_1[qid][d] = {}
            features_2[qid][d] = {}
            features_3[qid][d] = {}
            features_4[qid][d] = {}
            features_5[qid][d] = {}
            
            features_1[qid][d] = {
                'qlen': len(query.split()),
                'q_term_len': len(qterms)
            }
            
            for field in ["content","title","anchors"]:
                
                current_index = MAIN_INDEX
                if(field == 'anchors'):
                    current_index = ANCHORS_INDEX
                    
                key = 'bm25_{}'.format(field)
                
                features_2[qid][d][key] = feature_bm25(qterms, d, field, current_index)
               
                features_2[qid][d][key] = feature_bm25(qterms, d, field, current_index)
                
            features_5[qid][d] = {
            "docLen_title" : feature_docLen(d, "title",MAIN_INDEX),
            "docLen_content" : feature_docLen(d, "content",MAIN_INDEX),
            "docLen_anchor" : feature_docLen(d, "anchors", ANCHORS_INDEX )
        }
            
    print('finished')
    return features_1,features_2,features_3,features_4,features_5

In [33]:
queries = load_queries(QUERIES_FILE)

In [26]:
feat_1,feat_2,feat_3,feat_4,feat_5 = generate_features(queries)

50
finished


In [27]:
# feat_2

In [28]:
queries2 = load_queries(QUERY2_FILE)

In [29]:
test_feat_1,test_feat_2,test_feat_3,test_feat_4,test_feat_5 = generate_features(queries2)

24


JSONDecodeError: Expecting value: line 1 column 1 (char 0)