In [2]:
from whoosh.index import *
from whoosh.fields import *
from whoosh.qparser import *
from whoosh import scoring
from bs4 import BeautifulSoup as bs4 
import os
import json
# Directory holding the on-disk index, and the name of the index inside it.
INDEX_PATH = "./indexdir"
INDEX_NAME = "papers"

# Index handle shared by every searcher and parser created later in the notebook.
# NOTE(review): open_dir presumably comes from the `whoosh.index` star import and
# expects the index to exist already — confirm it was built by a separate step.
idx = open_dir(INDEX_PATH, indexname=INDEX_NAME)

from whoosh import qparser, query, scoring
from whoosh.analysis import RegexTokenizer
from whoosh.lang.morph_en import variations

# One searcher per weighting model, all reading the same index. Each one is
# used to produce a different family of ranking features.
freq_searcher, tfidf_searcher, bm25_searcher = (
    idx.searcher(weighting=weighting_model)
    for weighting_model in (
        scoring.Frequency(),
        scoring.TF_IDF(),
        scoring.BM25F(B=0.74, K1=1.52),
    )
)


def _fuzzy_parser(default_field):
    """Build a QueryParser over the index schema with fuzzy-term syntax enabled."""
    parser = QueryParser(default_field, idx.schema)
    parser.add_plugin(FuzzyTermPlugin())
    return parser


# Parsers whose default field is the abstract and the title respectively.
query_parser = _fuzzy_parser('abstract')
title_parser = _fuzzy_parser('title')

# Tokenizer used to measure field lengths (number of tokens).
tokenizer = RegexTokenizer()

In [3]:
def _top_score(searcher, parsed_query):
    """Return the score of the first hit for ``parsed_query``, or 0.0 when nothing matches."""
    hits = searcher.search(parsed_query, limit=None)
    return hits[0].score if len(hits) else 0.0


def genearte_LETOR_data(qid, cur_query, docid, content, title, rel):
    """Build one LETOR-formatted feature line for a (query, document) pair.

    Parameters
    ----------
    qid : query identifier written into the ``qid:`` column.
    cur_query : raw query text; it is split on whitespace into terms.
    docid : value of the document's ``path`` field, used to restrict the search.
    content : abstract text of the document (only its token count is used).
    title : title text of the document (only its token count is used).
    rel : relevance label written as the first column.

    Returns
    -------
    str
        ``"<rel> qid:<qid> 1:<f1> ... 10:<f10> #docid = <docid>\\n"`` where the
        features are, in order:
        1/2  top score under Frequency weighting (abstract / title parser),
        3/4  sum of ``freq_searcher.idf`` over the query terms (abstract / title),
        5/6  top score under TF_IDF weighting (abstract / title parser),
        7/8  top score under BM25F weighting (abstract / title parser),
        9/10 number of tokens in ``content`` / ``title``.

    Reads the module-level ``query_parser``, ``title_parser``, ``freq_searcher``,
    ``tfidf_searcher``, ``bm25_searcher`` and ``tokenizer``.
    """
    # split() with no argument drops the empty strings that split(' ') produces
    # for repeated/leading/trailing spaces; those used to yield "a OR  OR b"
    # and an extra idf('') contribution.
    terms = cur_query.split()

    # NOTE(review): the path clause is part of the scored query, so it may
    # contribute to every score below — confirm this is intended.
    query_text = "path:'" + docid + "' " + " OR ".join(terms)
    abstract_q = query_parser.parse(query_text)
    title_q = title_parser.parse(query_text)

    tf_a = _top_score(freq_searcher, abstract_q)
    tf_t = _top_score(freq_searcher, title_q)

    # NOTE(review): terms are passed to idf() exactly as typed; verify they
    # match how the fields were analyzed at indexing time.
    idf_a = sum(freq_searcher.idf("abstract", term) for term in terms)
    idf_t = sum(freq_searcher.idf("title", term) for term in terms)

    tfidf_a = _top_score(tfidf_searcher, abstract_q)
    tfidf_t = _top_score(tfidf_searcher, title_q)

    bm25_a = _top_score(bm25_searcher, abstract_q)
    bm25_t = _top_score(bm25_searcher, title_q)

    # Field lengths in tokens; counted lazily instead of materialising a list.
    dl = sum(1 for _ in tokenizer(content))
    tl = sum(1 for _ in tokenizer(title))

    features = (tf_a, tf_t, idf_a, idf_t, tfidf_a, tfidf_t, bm25_a, bm25_t, dl, tl)
    feature_text = " ".join(
        "%d:%f" % (number, value) for number, value in enumerate(features, start=1)
    )
    # The "#docid = " comment is parsed back by the re-ranking step; keep its shape.
    return "%s qid:%s %s #docid = %s\n" % (rel, qid, feature_text, docid)

In [5]:
# Build the query map: 1-based position of each <query> element (as a string)
# -> the second line of that element's stripped text.
query_map = dict()
with open("queries.indri", 'r') as queries_file:
    soup = bs4(queries_file, 'lxml')
    # The loop variable is deliberately NOT called `query`: that name is the
    # whoosh `query` module imported at the top of the notebook, and rebinding
    # it here would break every later use of `query.And` / `query.Variations`.
    for query_number, query_tag in enumerate(soup.find_all('query'), start=1):
        query_lines = query_tag.text.strip().split('\n')
        # NOTE(review): index 1 assumes each element's text has at least two
        # lines with the query string on the second — confirm against the file.
        query_map[str(query_number)] = query_lines[1]

# Turn every judged (query, document) pair in the qrels file into one LETOR
# line in train_f.txt. Pairs whose document is not in the index are skipped.
with open("./bob_acl_anth.qrels", 'r') as qrels, open('train_f.txt', 'w+') as w:
    for line in qrels:
        line = line.strip()
        if not line:
            # A blank line would otherwise fail on data[2] with IndexError.
            continue
        # Tab-separated columns; 0 = query id, 2 = document id, 3 = relevance.
        data = line.split('\t')
        qid = data[0]
        docid = data[2]
        rel = data[3]
        cur_query = query_map[qid]
        # Documents are looked up by their `path` field, which carries this suffix.
        doc_path = docid + ".tei.xml"
        stored = freq_searcher.document(path=doc_path)
        if not stored:
            continue
        w.write(genearte_LETOR_data(qid, cur_query, doc_path, stored['abstract'], stored['title'], rel))

In [6]:
# Train data using Learning-To-Rank LambdaMART model
import pyltr

# query_subsample and max_features below make training stochastic; a fixed
# seed keeps the fitted model identical across Restart & Run All.
LTR_RANDOM_STATE = 42

# Only the parsing needs the file, so it is closed before the (long) fit starts.
with open('train_f.txt') as f:
    features, rels, qids, comment = pyltr.data.letor.read_dataset(f)

# The model is optimised for NDCG over the top 10 results of each query.
metric = pyltr.metrics.NDCG(k=10)
model = pyltr.models.LambdaMART(
    metric=metric,
    n_estimators=800,
    learning_rate=0.02,
    max_features=0.5,
    query_subsample=0.5,
    max_leaf_nodes=10,
    min_samples_leaf=64,
    verbose=1,
    random_state=LTR_RANDOM_STATE,
)
model.fit(features, rels, qids)

 Iter  Train score  OOB Improve    Remaining                           Monitor Output 
    1       0.3836       0.2530       42.20s                                         
    2       0.4675       0.0969       40.38s                                         
    3       0.5149       0.0529       50.91s                                         
    4       0.5620      -0.0088       55.68s                                         
    5       0.5000      -0.0085       53.19s                                         
    6       0.5395       0.0027       50.69s                                         
    7       0.5626      -0.0146       48.49s                                         
    8       0.5195       0.0005       46.94s                                         
    9       0.5353      -0.0043       46.02s                                         
   10       0.4942       0.0030       45.35s                                         
   15       0.5700       0.0091       42.37s         

In [None]:
# serialize the model
import pickle

with open('ltr_model.pickle', 'wb') as model_file:
    # Write the `model` object using the newest pickle protocol this
    # interpreter supports.
    pickle.dump(model, model_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Read `model` back from disk. pickle detects the protocol version on its own,
# so none is passed. Unpickling executes code from the file: only load pickle
# files you trust.
with open('ltr_model.pickle', 'rb') as model_file:
    model = pickle.load(model_file)

In [None]:
from whoosh import qparser, query, scoring

# Text that precedes the document path in the "#docid = <path>" comment that
# genearte_LETOR_data appends to every feature line.
DOCID_COMMENT_PREFIX = "docid = "

words = "information"

# First-stage retrieval: BM25F over morphological variations of every query
# word longer than one character, all of which must match the abstract.
searcher = idx.searcher(weighting=scoring.BM25F(B=0.74, K1=1.52))

# This parser used to be assigned to `query_parser`, which silently replaced the
# abstract parser that genearte_LETOR_data reads, so features at prediction time
# were built with a different parser than the training features. It now has its
# own name and the global `query_parser` is left untouched.
multifield_parser = MultifieldParser(['title', 'abstract', 'area'], idx.schema, termclass=query.Variations)
multifield_parser.add_plugin(FuzzyTermPlugin())

query_parsed = query.And([query.Variations('abstract', x) for x in words.split(' ') if len(x) > 1])
results = searcher.search(query_parsed, limit=100)

# Write one feature line per candidate; qid and relevance are placeholders
# because only the feature columns and the docid comment are read back.
with open('result_f.txt.tmp', 'w+') as wr:
    for r in results:
        wr.write(genearte_LETOR_data('1', words, r['path'], r['abstract'], r['title'], '1'))

with open('result_f.txt.tmp', 'r') as f:
    features, _unused_rels, qids, docs = pyltr.data.letor.read_dataset(f)

# Second stage: score each candidate with the trained model and sort best-first.
p = model.predict(features)
result = [
    (score, searcher.document(path=doc_comment[len(DOCID_COMMENT_PREFIX):].strip()))
    for score, doc_comment in zip(p, docs)
]
result.sort(key=lambda x: x[0], reverse=True)

In [None]:
# Look up a document by its path and fetch the 30 documents whose abstracts
# are most similar to it.
docnum = searcher.document_number(path=u"W05-0307.tei.xml")
if docnum is None:
    # No document with this path: fail with a clear message instead of
    # passing None on to more_like.
    raise ValueError("No indexed document with path 'W05-0307.tei.xml'")
r = searcher.more_like(docnum, 'abstract', top=30)