In [15]:
from whoosh.index import *
from whoosh.fields import *
from whoosh.qparser import *
from whoosh import scoring
from bs4 import BeautifulSoup as bs4 
import os
import json
INDEX_PATH = "./indexdir"
INDEX_NAME = "papers"

idx = open_dir(INDEX_PATH, indexname=INDEX_NAME)

from whoosh import qparser, query, scoring
from whoosh.analysis import RegexTokenizer
from whoosh.lang.morph_en import variations

freq_searcher = idx.searcher(weighting=scoring.Frequency())
tfidf_searcher = idx.searcher(weighting=scoring.TF_IDF())
bm25_searcher = idx.searcher(weighting=scoring.BM25F(B=0.74, K1=1.52))
query_parser = QueryParser('abstract', idx.schema)
query_parser.add_plugin(FuzzyTermPlugin())
title_parser = QueryParser('title', idx.schema)
title_parser.add_plugin(FuzzyTermPlugin())
tokenizer = RegexTokenizer()

In [40]:
def genearte_LETOR_data(qid, cur_query, docid, content, title, rel) :
    terms = cur_query.split(' ')
    q = query_parser.parse("path:\'"+docid+"\' "+" OR ".join(terms))
    q_2 = title_parser.parse("path:\'"+docid+"\' "+" OR ".join(terms))
    
    results = freq_searcher.search(q, limit=None)
    tf_a = 0.0
    if len(results):
#         print("Abstract TF feature", results[0].score)
        tf_a = results[0].score
    
    results = freq_searcher.search(q_2, limit=None)
    tf_t = 0.0
    if len(results):
#         print("Title TF feature", results[0].score)
        tf_t = results[0].score

    idf_a = sum(freq_searcher.idf("abstract", x) for x in terms)
#     print("Abstract IDF feature", idf_a)
    idf_t = sum(freq_searcher.idf("title", x) for x in terms)
#     print("Title IDF feature", idf_t)

    results = tfidf_searcher.search(q, limit=None)
    tfidf_a = 0.0
    if len(results):
#         print("Abstract TF-IDF feature", results[0].score)
        tfidf_a = results[0].score
    
    results = tfidf_searcher.search(q_2, limit=None)
    tfidf_t = 0.0
    if len(results):
#         print("Title TF-IDF feature", results[0].score)
        tfidf_t = results[0].score

    results = bm25_searcher.search(q, limit=None)
    bm25_a = 0.0
    if len(results):
#         print("Abstract BM25 feature", results[0].score)
        bm25_a = results[0].score
    
    results = bm25_searcher.search(q_2, limit=None)
    bm25_t = 0.0
    if len(results):
#         print("Title BM25 feature", results[0].score)
        bm25_t = results[0].score

    dl = len(list(x for x in tokenizer(content)))
#     print("DL feature", dl)
    
    tl = len(list(x for x in tokenizer(title)))
#     print("TL feature", tl)
    
    return rel + " qid:%s 1:%f 2:%f 3:%f 4:%f 5:%f 6:%f 7:%f 8:%f 9:%f 10:%f #docid = %s\n" % (qid, tf_a, tf_t, idf_a, idf_t, tfidf_a, tfidf_t, bm25_a, bm25_t, dl, tl, docid)

In [34]:
# Build the query map
query_map = dict()
with open("queries.indri", 'r') as q:
    soup = bs4(q, 'lxml')
    x = soup.find_all('query')
    i = 1
    for query in x :
        query_1 = query.text.strip().split('\n')
        query_map[str(i)] = query_1[1]
        i += 1

with open("./bob_acl_anth.qrels", 'r') as qrels, open('train_f.txt', 'w+') as w :
    for line in qrels :
        data = line.strip().split('\t')
        qid = data[0]
        docid = data[2]
        rel = data[3]
        cur_query = query_map[qid]
        stored = freq_searcher.document(path=docid+".tei.xml")
        if not stored :
            continue
        w.write(genearte_LETOR_data(qid, cur_query, docid+".tei.xml", stored['abstract'], stored['title'], rel))           

Abstract IDF feature 92.4560724513172
Title IDF feature 104.56255732141997
DL feature 91
TL feature 6
Abstract IDF feature 92.4560724513172
Title IDF feature 104.56255732141997
DL feature 0
TL feature 7
Abstract IDF feature 92.4560724513172
Title IDF feature 104.56255732141997
DL feature 140
TL feature 5
Abstract IDF feature 92.4560724513172
Title IDF feature 104.56255732141997
DL feature 22
TL feature 5
Abstract IDF feature 92.4560724513172
Title IDF feature 104.56255732141997
DL feature 137
TL feature 6
Abstract IDF feature 92.4560724513172
Title IDF feature 104.56255732141997
DL feature 78
TL feature 6
Abstract IDF feature 92.4560724513172
Title IDF feature 104.56255732141997
DL feature 311
TL feature 7
Abstract IDF feature 92.4560724513172
Title IDF feature 104.56255732141997
DL feature 62
TL feature 14
Abstract IDF feature 92.4560724513172
Title IDF feature 104.56255732141997
DL feature 69
TL feature 8
Abstract IDF feature 92.4560724513172
Title IDF feature 104.56255732141997
DL f

Abstract TF feature 11.0
Title TF feature 3.0
Abstract IDF feature 64.49623960490962
Title IDF feature 67.12048582819972
Abstract TF-IDF feature 49.23167738548157
Title TF-IDF feature 20.76465190152212
Abstract BM25 feature 16.12837739875028
Title BM25 feature 11.778901804717318
DL feature 257
TL feature 10
Abstract TF feature 10.0
Title TF feature 6.0
Abstract IDF feature 64.49623960490962
Title IDF feature 67.12048582819972
Abstract TF-IDF feature 43.08131738412866
Title TF-IDF feature 33.8854671825644
Abstract BM25 feature 21.330901394283728
Title BM25 feature 19.245745786010808
DL feature 67
TL feature 11
Abstract IDF feature 31.632573305569885
Title IDF feature 36.105225673384595
DL feature 514
TL feature 7
Abstract IDF feature 31.632573305569885
Title IDF feature 36.105225673384595
DL feature 69
TL feature 10
Title TF feature 2.0
Abstract IDF feature 31.632573305569885
Title IDF feature 36.105225673384595
Title TF-IDF feature 16.112882136766654
Title BM25 feature 7.05186404942154

Abstract IDF feature 162.7096826022062
Title IDF feature 197.45717373512363
DL feature 154
TL feature 16
Abstract IDF feature 162.7096826022062
Title IDF feature 197.45717373512363
DL feature 106
TL feature 10
Abstract IDF feature 162.7096826022062
Title IDF feature 197.45717373512363
DL feature 88
TL feature 64
Abstract IDF feature 162.7096826022062
Title IDF feature 197.45717373512363
DL feature 90
TL feature 4
Abstract IDF feature 98.99568033369104
Title IDF feature 122.53810893846364
DL feature 0
TL feature 123
Abstract TF feature 3.0
Abstract IDF feature 98.99568033369104
Title IDF feature 122.53810893846364
Abstract TF-IDF feature 18.36827479620158
Abstract BM25 feature 9.317064266321873
DL feature 104
TL feature 11
Abstract TF feature 3.0
Abstract IDF feature 98.99568033369104
Title IDF feature 122.53810893846364
Abstract TF-IDF feature 20.327247200758816
Abstract BM25 feature 9.216322138322935
DL feature 56
TL feature 10
Abstract IDF feature 98.99568033369104
Title IDF feature 

Abstract BM25 feature 10.079051933435434
Title BM25 feature 18.939974564733543
DL feature 153
TL feature 8
Abstract TF feature 17.0
Title TF feature 5.0
Abstract IDF feature 108.89390630210647
Title IDF feature 134.44464349482573
Abstract TF-IDF feature 61.883922155988905
Title TF-IDF feature 28.11012853918465
Abstract BM25 feature 23.764098893078902
Title BM25 feature 20.91279699275468
DL feature 186
TL feature 8
Abstract TF feature 7.0
Title TF feature 3.0
Abstract IDF feature 108.89390630210647
Title IDF feature 134.44464349482573
Abstract TF-IDF feature 35.904017014406314
Title TF-IDF feature 19.746404046193902
Abstract BM25 feature 24.53563155888299
Title BM25 feature 10.15069629550727
DL feature 68
TL feature 9
Abstract TF feature 10.0
Title TF feature 3.0
Abstract IDF feature 108.89390630210647
Title IDF feature 134.44464349482573
Abstract TF-IDF feature 45.295692252223205
Title TF-IDF feature 21.87657305648589
Abstract BM25 feature 36.032390578261726
Title BM25 feature 12.99077

Title BM25 feature 9.390555456524384
DL feature 56
TL feature 11
Abstract TF feature 10.0
Title TF feature 5.0
Abstract IDF feature 90.34485415969442
Title IDF feature 107.20541401357636
Abstract TF-IDF feature 39.13371056856936
Title TF-IDF feature 28.33558725824617
Abstract BM25 feature 21.018376428335962
Title BM25 feature 20.030367541168914
DL feature 85
TL feature 9
Abstract TF feature 5.0
Title TF feature 2.0
Abstract IDF feature 90.34485415969442
Title IDF feature 107.20541401357636
Abstract TF-IDF feature 26.14337943421647
Title TF-IDF feature 14.482977737191236
Abstract BM25 feature 16.095905530955193
Title BM25 feature 4.9325853732218965
DL feature 87
TL feature 7
Abstract TF feature 2.0
Abstract IDF feature 156.6197711422267
Title IDF feature 186.70464966430882
Abstract TF-IDF feature 16.123634928542916
Abstract BM25 feature 7.3445641424467345
DL feature 69
TL feature 10
Abstract TF feature 2.0
Title TF feature 2.0
Abstract IDF feature 156.6197711422267
Title IDF feature 186

Abstract BM25 feature 29.177845410563002
Title BM25 feature 14.071938418280268
DL feature 56
TL feature 9
Abstract TF feature 4.0
Title TF feature 4.0
Abstract IDF feature 227.09629581313118
Title IDF feature 268.7218620887404
Abstract TF-IDF feature 23.192819889972274
Title TF-IDF feature 31.923293189150158
Abstract BM25 feature 18.839915816586764
Title BM25 feature 18.87659466869372
DL feature 37
TL feature 15
Abstract TF feature 3.0
Abstract IDF feature 227.09629581313118
Title IDF feature 268.7218620887404
Abstract TF-IDF feature 18.78522285670819
Abstract BM25 feature 8.583816638995753
DL feature 125
TL feature 9
Abstract TF feature 5.0
Title TF feature 3.0
Abstract IDF feature 227.09629581313118
Title IDF feature 268.7218620887404
Abstract TF-IDF feature 31.744934519143072
Title TF-IDF feature 22.97301116867247
Abstract BM25 feature 19.140808341834315
Title BM25 feature 14.977665129467278
DL feature 53
TL feature 8
Abstract TF feature 8.0
Title TF feature 3.0
Abstract IDF feature

DL feature 74
TL feature 5
Abstract IDF feature 201.86405195656255
Title IDF feature 240.80815917337404
DL feature 178
TL feature 11
Abstract IDF feature 201.86405195656255
Title IDF feature 240.80815917337404
DL feature 123
TL feature 11
Abstract TF feature 15.0
Abstract IDF feature 201.86405195656255
Title IDF feature 240.80815917337404
Abstract TF-IDF feature 52.914531195652806
Abstract BM25 feature 12.772510586674882
DL feature 603
TL feature 17
Abstract IDF feature 201.86405195656255
Title IDF feature 240.80815917337404
DL feature 192
TL feature 13
Abstract TF feature 3.0
Abstract IDF feature 116.15382370963232
Title IDF feature 147.9304044198475
Abstract TF-IDF feature 17.616502299196206
Abstract BM25 feature 5.614645809881072
DL feature 140
TL feature 5
Abstract TF feature 2.0
Abstract IDF feature 116.15382370963232
Title IDF feature 147.9304044198475
Abstract TF-IDF feature 14.934912964469447
Abstract BM25 feature 5.586760221401869
DL feature 86
TL feature 5
Abstract TF feature

Abstract BM25 feature 5.144231796865608
DL feature 62
TL feature 7
Abstract TF feature 3.0
Abstract IDF feature 108.40320387993046
Title IDF feature 122.26691027371939
Abstract TF-IDF feature 22.873789146191804
Abstract BM25 feature 10.359480776730457
DL feature 93
TL feature 13
Abstract IDF feature 108.40320387993046
Title IDF feature 122.26691027371939
DL feature 76
TL feature 12
Abstract IDF feature 108.40320387993046
Title IDF feature 122.26691027371939
DL feature 163
TL feature 13
Title TF feature 4.0
Abstract IDF feature 108.40320387993046
Title IDF feature 122.26691027371939
Title TF-IDF feature 29.838792614757928
Title BM25 feature 16.425221725611543
DL feature 0
TL feature 16
Abstract IDF feature 108.40320387993046
Title IDF feature 122.26691027371939
DL feature 86
TL feature 8
Abstract TF feature 2.0
Abstract IDF feature 108.40320387993046
Title IDF feature 122.26691027371939
Abstract TF-IDF feature 14.28380055159535
Abstract BM25 feature 4.516687329048166
DL feature 116
TL f

Abstract IDF feature 18.08954256769863
Title IDF feature 22.264102434440925
DL feature 106
TL feature 3
Abstract TF feature 2.0
Abstract IDF feature 18.08954256769863
Title IDF feature 22.264102434440925
Abstract TF-IDF feature 14.810201915687898
Abstract BM25 feature 6.925436423902942
DL feature 33
TL feature 9
Abstract IDF feature 18.08954256769863
Title IDF feature 22.264102434440925
DL feature 61
TL feature 7
Abstract IDF feature 18.08954256769863
Title IDF feature 22.264102434440925
DL feature 70
TL feature 10
Abstract IDF feature 18.08954256769863
Title IDF feature 22.264102434440925
DL feature 91
TL feature 9
Abstract TF feature 2.0
Abstract IDF feature 18.08954256769863
Title IDF feature 22.264102434440925
Abstract TF-IDF feature 18.85908410383324
Abstract BM25 feature 6.67797668130726
DL feature 247
TL feature 7
Abstract IDF feature 18.08954256769863
Title IDF feature 22.264102434440925
DL feature 150
TL feature 9
Title TF feature 2.0
Abstract IDF feature 18.08954256769863
Tit

Abstract TF-IDF feature 46.09378102177512
Title TF-IDF feature 17.590572778369733
Abstract BM25 feature 10.117128503820243
Title BM25 feature 9.254868066928722
DL feature 158
TL feature 6
Abstract TF feature 2.0
Abstract IDF feature 48.49475630469372
Title IDF feature 55.95056627499971
Abstract TF-IDF feature 15.277109567228505
Abstract BM25 feature 5.720746888125078
DL feature 94
TL feature 7
Abstract IDF feature 48.49475630469372
Title IDF feature 55.95056627499971
DL feature 40
TL feature 8
Abstract TF feature 3.0
Title TF feature 2.0
Abstract IDF feature 48.49475630469372
Title IDF feature 55.95056627499971
Abstract TF-IDF feature 16.667561852858668
Title TF-IDF feature 15.765519581042216
Abstract BM25 feature 5.306383559199342
Title BM25 feature 6.044610336715
DL feature 112
TL feature 11
Abstract TF feature 2.0
Abstract IDF feature 48.49475630469372
Title IDF feature 55.95056627499971
Abstract TF-IDF feature 15.277109567228505
Abstract BM25 feature 5.973337126750587
DL feature 85

Title TF feature 2.0
Abstract IDF feature 61.865609434118674
Title IDF feature 73.6078372600448
Title TF-IDF feature 15.853676564755832
Title BM25 feature 6.75239274311777
DL feature 0
TL feature 7
Abstract TF feature 2.0
Title TF feature 2.0
Abstract IDF feature 61.865609434118674
Title IDF feature 73.6078372600448
Abstract TF-IDF feature 14.70237351752617
Title TF-IDF feature 15.976458318051732
Abstract BM25 feature 4.745157826237808
Title BM25 feature 6.560310781031817
DL feature 132
TL feature 10
Abstract IDF feature 61.865609434118674
Title IDF feature 73.6078372600448
DL feature 0
TL feature 6
Abstract TF feature 3.0
Title TF feature 2.0
Abstract IDF feature 61.865609434118674
Title IDF feature 73.6078372600448
Abstract TF-IDF feature 19.099881182347417
Title TF-IDF feature 15.853676564755832
Abstract BM25 feature 7.415765563774222
Title BM25 feature 6.135539916853459
DL feature 92
TL feature 10
Abstract TF feature 9.0
Title TF feature 4.0
Abstract IDF feature 61.865609434118674


Abstract BM25 feature 25.350984192405353
DL feature 159
TL feature 14
Abstract IDF feature 205.58561401076946
Title IDF feature 228.02206442647693
DL feature 145
TL feature 4
Abstract TF feature 23.0
Title TF feature 4.0
Abstract IDF feature 205.58561401076946
Title IDF feature 228.02206442647693
Abstract TF-IDF feature 127.41307973707981
Title TF-IDF feature 30.334140170185258
Abstract BM25 feature 61.02375879243697
Title BM25 feature 22.20856483097996
DL feature 128
TL feature 8
Abstract IDF feature 205.58561401076946
Title IDF feature 228.02206442647693
DL feature 71
TL feature 9
Abstract TF feature 59.0
Abstract IDF feature 290.53681136841794
Title IDF feature 323.3392705589211
Abstract TF-IDF feature 184.1453573078077
Abstract BM25 feature 31.348246711576518
DL feature 133
TL feature 11
Abstract TF feature 59.0
Abstract IDF feature 290.53681136841794
Title IDF feature 323.3392705589211
Abstract TF-IDF feature 184.1453573078077
Abstract BM25 feature 31.348246711576518
DL feature 11

DL feature 169
TL feature 8
Abstract IDF feature 208.00907621912023
Title IDF feature 236.87004817511956
DL feature 193
TL feature 9
Abstract TF feature 7.0
Abstract IDF feature 208.00907621912023
Title IDF feature 236.87004817511956
Abstract TF-IDF feature 31.84659509453484
Abstract BM25 feature 18.980254298198265
DL feature 110
TL feature 8
Abstract IDF feature 208.00907621912023
Title IDF feature 236.87004817511956
DL feature 155
TL feature 13
Abstract TF feature 5.0
Abstract IDF feature 208.00907621912023
Title IDF feature 236.87004817511956
Abstract TF-IDF feature 21.427790430373292
Abstract BM25 feature 8.264405471899787
DL feature 107
TL feature 1
Abstract IDF feature 208.00907621912023
Title IDF feature 236.87004817511956
DL feature 0
TL feature 10
Abstract IDF feature 208.00907621912023
Title IDF feature 236.87004817511956
DL feature 131
TL feature 17
Abstract IDF feature 208.00907621912023
Title IDF feature 236.87004817511956
DL feature 186
TL feature 8
Abstract IDF feature 2

Abstract BM25 feature 7.77522937953684
DL feature 140
TL feature 10
Abstract IDF feature 109.37076890717958
Title IDF feature 145.27528987999153
DL feature 265
TL feature 11
Abstract TF feature 5.0
Abstract IDF feature 109.37076890717958
Title IDF feature 145.27528987999153
Abstract TF-IDF feature 26.975814271973476
Abstract BM25 feature 8.128335915154885
DL feature 503
TL feature 6
Abstract TF feature 5.0
Title TF feature 3.0
Abstract IDF feature 109.37076890717958
Title IDF feature 145.27528987999153
Abstract TF-IDF feature 32.23181132949395
Title TF-IDF feature 23.26777531698402
Abstract BM25 feature 14.083939603546817
Title BM25 feature 15.318218782930765
DL feature 137
TL feature 7
Abstract TF feature 6.0
Title TF feature 2.0
Abstract IDF feature 109.37076890717958
Title IDF feature 145.27528987999153
Abstract TF-IDF feature 29.317421762644948
Title TF-IDF feature 16.479023698670904
Abstract BM25 feature 17.60869144525677
Title BM25 feature 6.2207157335279994
DL feature 115
TL fea

DL feature 265
TL feature 11
Abstract IDF feature 150.323617816598
Title IDF feature 173.19371757902306
DL feature 93
TL feature 13
Abstract IDF feature 150.323617816598
Title IDF feature 173.19371757902306
DL feature 76
TL feature 12
Abstract IDF feature 150.323617816598
Title IDF feature 173.19371757902306
DL feature 98
TL feature 12
Abstract IDF feature 150.323617816598
Title IDF feature 173.19371757902306
DL feature 163
TL feature 13
Abstract IDF feature 150.323617816598
Title IDF feature 173.19371757902306
DL feature 142
TL feature 13
Abstract IDF feature 150.323617816598
Title IDF feature 173.19371757902306
DL feature 62
TL feature 7
Abstract IDF feature 150.323617816598
Title IDF feature 173.19371757902306
DL feature 64
TL feature 6
Abstract IDF feature 150.323617816598
Title IDF feature 173.19371757902306
DL feature 83
TL feature 8
Abstract IDF feature 150.323617816598
Title IDF feature 173.19371757902306
DL feature 58
TL feature 8
Abstract IDF feature 150.323617816598
Title ID

Title TF feature 4.0
Abstract IDF feature 146.21403157413445
Title IDF feature 178.68316876204688
Abstract TF-IDF feature 84.76168907196059
Title TF-IDF feature 25.799270191342735
Abstract BM25 feature 35.04143904973538
Title BM25 feature 17.266067968099144
DL feature 124
TL feature 7
Abstract TF feature 12.0
Abstract IDF feature 146.21403157413445
Title IDF feature 178.68316876204688
Abstract TF-IDF feature 47.249276642157305
Abstract BM25 feature 24.469329339395856
DL feature 126
TL feature 8
Abstract TF feature 14.0
Title TF feature 7.0
Abstract IDF feature 146.21403157413445
Title IDF feature 178.68316876204688
Abstract TF-IDF feature 61.63936506494514
Title TF-IDF feature 39.50862558685784
Abstract BM25 feature 24.519275579405573
Title BM25 feature 23.57654927511902
DL feature 134
TL feature 11
Abstract TF feature 8.0
Title TF feature 5.0
Abstract IDF feature 146.21403157413445
Title IDF feature 178.68316876204688
Abstract TF-IDF feature 40.05501463334933
Title TF-IDF feature 29.5

Abstract TF feature 2.0
Abstract IDF feature 96.06021823154984
Title IDF feature 107.7386405972583
Abstract TF-IDF feature 14.673224432775367
Abstract BM25 feature 3.5131471563847008
DL feature 254
TL feature 5
Abstract TF feature 3.0
Title TF feature 2.0
Abstract IDF feature 96.06021823154984
Title IDF feature 107.7386405972583
Abstract TF-IDF feature 20.314467604561273
Title TF-IDF feature 17.324609229872596
Abstract BM25 feature 7.2584655923631205
Title BM25 feature 8.029643238952751
DL feature 146
TL feature 9
Abstract IDF feature 96.06021823154984
Title IDF feature 107.7386405972583
DL feature 168
TL feature 11
Abstract TF feature 3.0
Abstract IDF feature 50.489573110700356
Title IDF feature 67.21522700082394
Abstract TF-IDF feature 16.667561852858668
Abstract BM25 feature 4.965109973278763
DL feature 144
TL feature 10
Abstract TF feature 2.0
Abstract IDF feature 50.489573110700356
Title IDF feature 67.21522700082394
Abstract TF-IDF feature 13.693504793890332
Abstract BM25 feature

Abstract TF-IDF feature 19.264549211941407
Title TF-IDF feature 20.140017949295306
Abstract BM25 feature 10.311836441625154
Title BM25 feature 14.057108866432069
DL feature 102
TL feature 3
Abstract TF feature 6.0
Title TF feature 3.0
Abstract IDF feature 90.1316079580514
Title IDF feature 96.67230012722317
Abstract TF-IDF feature 31.157772754535053
Title TF-IDF feature 21.31603410407991
Abstract BM25 feature 17.00603996667587
Title BM25 feature 13.063288512485444
DL feature 104
TL feature 9
Abstract IDF feature 90.1316079580514
Title IDF feature 96.67230012722317
DL feature 0
TL feature 1
Abstract TF feature 20.0
Title TF feature 2.0
Abstract IDF feature 90.1316079580514
Title IDF feature 96.67230012722317
Abstract TF-IDF feature 90.09691622913388
Title TF-IDF feature 16.983017528145194
Abstract BM25 feature 16.8662309085183
Title BM25 feature 8.508083626868785
DL feature 689
TL feature 7
Abstract TF feature 3.0
Abstract IDF feature 82.4874714192386
Title IDF feature 94.34751529832675

Abstract TF-IDF feature 19.425056750305828
Title TF-IDF feature 14.621762162382009
Abstract BM25 feature 11.33512494276447
Title BM25 feature 5.0838447454469815
DL feature 80
TL feature 10
Abstract TF feature 3.0
Abstract IDF feature 44.17526137215017
Title IDF feature 54.36725184386913
Abstract TF-IDF feature 17.146111134915955
Abstract BM25 feature 8.580472090202115
DL feature 81
TL feature 10
Abstract TF feature 9.0
Title TF feature 2.0
Abstract IDF feature 44.17526137215017
Title IDF feature 54.36725184386913
Abstract TF-IDF feature 35.96026074706971
Title TF-IDF feature 14.869585786236804
Abstract BM25 feature 7.030646192659361
Title BM25 feature 6.629752961834452
DL feature 650
TL feature 3
Abstract IDF feature 44.17526137215017
Title IDF feature 54.36725184386913
DL feature 0
TL feature 10
Abstract TF feature 10.0
Title TF feature 3.0
Abstract IDF feature 44.17526137215017
Title IDF feature 54.36725184386913
Abstract TF-IDF feature 40.96799601002466
Title TF-IDF feature 21.82746

Abstract BM25 feature 17.49091791182494
Title BM25 feature 10.584379222774249
DL feature 88
TL feature 8
Title TF feature 4.0
Abstract IDF feature 47.74482156258694
Title IDF feature 57.26212978909952
Title TF-IDF feature 25.926595870218083
Title BM25 feature 18.39006811906722
DL feature 0
TL feature 6
Abstract TF feature 4.0
Title TF feature 4.0
Abstract IDF feature 47.74482156258694
Title IDF feature 57.26212978909952
Abstract TF-IDF feature 22.757331628052512
Title TF-IDF feature 25.926595870218083
Abstract BM25 feature 16.338697535222625
Title BM25 feature 17.404838599085615
DL feature 62
TL feature 7
Abstract TF feature 5.0
Title TF feature 4.0
Abstract IDF feature 47.74482156258694
Title IDF feature 57.26212978909952
Abstract TF-IDF feature 25.57992225702278
Title TF-IDF feature 25.926595870218083
Abstract BM25 feature 16.128159194444375
Title BM25 feature 17.404838599085615
DL feature 85
TL feature 7
Abstract TF feature 6.0
Title TF feature 4.0
Abstract IDF feature 47.7448215625

Abstract BM25 feature 11.727402459983157
Title BM25 feature 11.127379588058734
DL feature 103
TL feature 8
Abstract IDF feature 511.187740526226
Title IDF feature 571.1592585707709
DL feature 147
TL feature 7
Abstract IDF feature 511.187740526226
Title IDF feature 571.1592585707709
DL feature 64
TL feature 7
Abstract IDF feature 511.187740526226
Title IDF feature 571.1592585707709
DL feature 77
TL feature 6
Abstract IDF feature 511.187740526226
Title IDF feature 571.1592585707709
DL feature 86
TL feature 9
Abstract IDF feature 511.187740526226
Title IDF feature 571.1592585707709
DL feature 169
TL feature 12
Abstract IDF feature 511.187740526226
Title IDF feature 571.1592585707709
DL feature 368
TL feature 2
Abstract IDF feature 511.187740526226
Title IDF feature 571.1592585707709
DL feature 98
TL feature 6
Abstract IDF feature 511.187740526226
Title IDF feature 571.1592585707709
DL feature 128
TL feature 8
Abstract IDF feature 511.187740526226
Title IDF feature 571.1592585707709
DL fea

DL feature 0
TL feature 9
Abstract TF feature 6.0
Title TF feature 3.0
Abstract IDF feature 118.52805493841825
Title IDF feature 139.75623528382187
Abstract TF-IDF feature 26.991829959527067
Title TF-IDF feature 18.320238782552146
Abstract BM25 feature 13.775576764261542
Title BM25 feature 7.935910818777874
DL feature 95
TL feature 10
Abstract TF feature 4.0
Title TF feature 3.0
Abstract IDF feature 118.52805493841825
Title IDF feature 139.75623528382187
Abstract TF-IDF feature 20.140797844970653
Title TF-IDF feature 18.320238782552146
Abstract BM25 feature 8.732499097962704
Title BM25 feature 8.679677175203922
DL feature 116
TL feature 10
Abstract TF feature 5.0
Title TF feature 3.0
Abstract IDF feature 118.52805493841825
Title IDF feature 139.75623528382187
Abstract TF-IDF feature 22.801605358847105
Title TF-IDF feature 18.320238782552146
Abstract BM25 feature 10.743121157737008
Title BM25 feature 9.602117784569211
DL feature 81
TL feature 9
Abstract TF feature 5.0
Title TF feature 2

Title BM25 feature 13.348500942384298
DL feature 116
TL feature 10
Abstract TF feature 11.0
Title TF feature 3.0
Abstract IDF feature 156.74859614402715
Title IDF feature 183.2668509914772
Abstract TF-IDF feature 47.270966103758305
Title TF-IDF feature 21.891969765256214
Abstract BM25 feature 23.737302966053917
Title BM25 feature 12.363741499363016
DL feature 102
TL feature 11
Abstract TF feature 5.0
Title TF feature 3.0
Abstract IDF feature 156.74859614402715
Title IDF feature 183.2668509914772
Abstract TF-IDF feature 22.932742018736697
Title TF-IDF feature 18.320238782552146
Abstract BM25 feature 9.422949403009147
Title BM25 feature 7.935910818777874
DL feature 95
TL feature 10
Abstract TF feature 5.0
Title TF feature 3.0
Abstract IDF feature 156.74859614402715
Title IDF feature 183.2668509914772
Abstract TF-IDF feature 22.932742018736697
Title TF-IDF feature 18.320238782552146
Abstract BM25 feature 8.9772035863116
Title BM25 feature 7.056120228553651
DL feature 116
TL feature 15
Abs

Abstract IDF feature 299.063306744729
Title IDF feature 345.44774916179614
DL feature 129
TL feature 15
Abstract IDF feature 299.063306744729
Title IDF feature 345.44774916179614
DL feature 61
TL feature 7
Abstract IDF feature 299.063306744729
Title IDF feature 345.44774916179614
DL feature 64
TL feature 6
Abstract IDF feature 299.063306744729
Title IDF feature 345.44774916179614
DL feature 135
TL feature 6
Abstract IDF feature 299.063306744729
Title IDF feature 345.44774916179614
DL feature 337
TL feature 7
Abstract IDF feature 299.063306744729
Title IDF feature 345.44774916179614
DL feature 117
TL feature 12
Abstract IDF feature 299.063306744729
Title IDF feature 345.44774916179614
DL feature 116
TL feature 6
Abstract IDF feature 299.063306744729
Title IDF feature 345.44774916179614
DL feature 179
TL feature 14
Abstract IDF feature 299.063306744729
Title IDF feature 345.44774916179614
DL feature 81
TL feature 6
Abstract TF feature 7.0
Abstract IDF feature 299.063306744729
Title IDF f

DL feature 525
TL feature 12
Abstract TF feature 5.0
Abstract IDF feature 90.87597538984059
Title IDF feature 110.80307748439503
Abstract TF-IDF feature 22.87681021853667
Abstract BM25 feature 11.790941450173069
DL feature 95
TL feature 10
Abstract IDF feature 90.87597538984059
Title IDF feature 110.80307748439503
DL feature 0
TL feature 213
Abstract TF feature 6.0
Title TF feature 2.0
Abstract IDF feature 90.87597538984059
Title IDF feature 110.80307748439503
Abstract TF-IDF feature 25.36468059022444
Title TF-IDF feature 16.491960489701626
Abstract BM25 feature 11.770781739803155
Title BM25 feature 8.375765984669439
DL feature 134
TL feature 5
Abstract IDF feature 90.87597538984059
Title IDF feature 110.80307748439503
DL feature 80
TL feature 6
Abstract TF feature 5.0
Title TF feature 2.0
Abstract IDF feature 90.87597538984059
Title IDF feature 110.80307748439503
Abstract TF-IDF feature 22.165090446745772
Title TF-IDF feature 14.64162097103161
Abstract BM25 feature 5.6596497168786035


DL feature 169
TL feature 8
Abstract IDF feature 67.49448811243029
Title IDF feature 79.85611292492274
DL feature 69
TL feature 7
Abstract IDF feature 67.49448811243029
Title IDF feature 79.85611292492274
DL feature 191
TL feature 11
Abstract TF feature 3.0
Title TF feature 3.0
Abstract IDF feature 67.49448811243029
Title IDF feature 79.85611292492274
Abstract TF-IDF feature 17.920474617109946
Title TF-IDF feature 20.16686324877388
Abstract BM25 feature 10.902251032401674
Title BM25 feature 10.584379222774249
DL feature 51
TL feature 10
Abstract TF feature 9.0
Abstract IDF feature 67.49448811243029
Title IDF feature 79.85611292492274
Abstract TF-IDF feature 33.45545296262684
Abstract BM25 feature 6.705543109580925
DL feature 177
TL feature 10
Abstract IDF feature 67.49448811243029
Title IDF feature 79.85611292492274
DL feature 72
TL feature 7
Abstract TF feature 5.0
Title TF feature 3.0
Abstract IDF feature 67.49448811243029
Title IDF feature 79.85611292492274
Abstract TF-IDF feature 2

Abstract TF-IDF feature 209.1236821840261
Title TF-IDF feature 67.02983238579905
Abstract BM25 feature 39.12571155631426
Title BM25 feature 24.473015911470622
DL feature 0
TL feature 8
Abstract TF feature 69.0
Title TF feature 14.0
Abstract IDF feature 327.9075331508061
Title IDF feature 381.70019703564543
Abstract TF-IDF feature 209.1236821840261
Title TF-IDF feature 67.02983238579905
Abstract BM25 feature 39.12571155631426
Title BM25 feature 24.473015911470622
DL feature 96
TL feature 10
Abstract TF feature 69.0
Title TF feature 14.0
Abstract IDF feature 327.9075331508061
Title IDF feature 381.70019703564543
Abstract TF-IDF feature 209.1236821840261
Title TF-IDF feature 67.02983238579905
Abstract BM25 feature 39.12571155631426
Title BM25 feature 24.473015911470622
DL feature 611
TL feature 16
Abstract TF feature 69.0
Title TF feature 14.0
Abstract IDF feature 327.9075331508061
Title IDF feature 381.70019703564543
Abstract TF-IDF feature 209.1236821840261
Title TF-IDF feature 67.02983

Title TF-IDF feature 16.00165650165643
Abstract BM25 feature 6.585465863859733
Title BM25 feature 7.301830953729674
DL feature 71
TL feature 5
Abstract TF feature 3.0
Abstract IDF feature 98.97404706679542
Title IDF feature 119.44592957440543
Abstract TF-IDF feature 20.116526169443162
Abstract BM25 feature 8.549667588816884
DL feature 73
TL feature 8
Title TF feature 2.0
Abstract IDF feature 98.97404706679542
Title IDF feature 119.44592957440543
Title TF-IDF feature 16.00165650165643
Title BM25 feature 6.923360306301642
DL feature 97
TL feature 7
Abstract IDF feature 98.97404706679542
Title IDF feature 119.44592957440543
DL feature 148
TL feature 14
Abstract TF feature 5.0
Title TF feature 2.0
Abstract IDF feature 98.97404706679542
Title IDF feature 119.44592957440543
Abstract TF-IDF feature 27.377322939155043
Title TF-IDF feature 16.00165650165643
Abstract BM25 feature 11.58613985647513
Title BM25 feature 6.28817392228857
DL feature 117
TL feature 10
Abstract TF feature 6.0
Title TF f

Abstract TF-IDF feature 14.943179751697123
Abstract BM25 feature 5.515561971704077
DL feature 89
TL feature 11
Abstract IDF feature 135.08231005006775
Title IDF feature 163.4842769238313
DL feature 0
TL feature 16
Abstract TF feature 2.0
Abstract IDF feature 135.08231005006775
Title IDF feature 163.4842769238313
Abstract TF-IDF feature 14.413495987669615
Abstract BM25 feature 5.277450701940743
DL feature 91
TL feature 5
Abstract IDF feature 135.08231005006775
Title IDF feature 163.4842769238313
DL feature 45
TL feature 12
Abstract TF feature 9.0
Title TF feature 2.0
Abstract IDF feature 135.08231005006775
Title IDF feature 163.4842769238313
Abstract TF-IDF feature 53.62222955063801
Title TF-IDF feature 16.553725083956472
Abstract BM25 feature 22.367647385121273
Title BM25 feature 7.980413234032985
DL feature 297
TL feature 6
Abstract IDF feature 135.08231005006775
Title IDF feature 163.4842769238313
DL feature 47
TL feature 13
Abstract TF feature 3.0
Abstract IDF feature 135.0823100500

Abstract IDF feature 61.74713634937396
Title IDF feature 65.82373173009178
DL feature 160
TL feature 7
Abstract IDF feature 61.74713634937396
Title IDF feature 65.82373173009178
DL feature 51
TL feature 10
Abstract IDF feature 61.74713634937396
Title IDF feature 65.82373173009178
DL feature 164
TL feature 9
Abstract IDF feature 61.74713634937396
Title IDF feature 65.82373173009178
DL feature 128
TL feature 5
Abstract IDF feature 61.74713634937396
Title IDF feature 65.82373173009178
DL feature 103
TL feature 10
Abstract TF feature 5.0
Abstract IDF feature 87.80637912823863
Title IDF feature 97.14768601695161
Abstract TF-IDF feature 36.650372458950265
Abstract BM25 feature 12.095763908386507
DL feature 153
TL feature 8
Abstract IDF feature 87.80637912823863
Title IDF feature 97.14768601695161
DL feature 137
TL feature 7
Abstract TF feature 3.0
Abstract IDF feature 87.80637912823863
Title IDF feature 97.14768601695161
Abstract TF-IDF feature 23.762550194907483
Abstract BM25 feature 9.6962

Title TF feature 2.0
Abstract IDF feature 150.24485062360012
Title IDF feature 178.50934803252431
Abstract TF-IDF feature 72.13596905144975
Title TF-IDF feature 16.234013016192723
Abstract BM25 feature 16.468660801162788
Title BM25 feature 6.841016356609747
DL feature 643
TL feature 11
Abstract IDF feature 150.24485062360012
Title IDF feature 178.50934803252431
DL feature 175
TL feature 6
Abstract TF feature 2.0
Title TF feature 2.0
Abstract IDF feature 150.24485062360012
Title IDF feature 178.50934803252431
Abstract TF-IDF feature 13.89507465107704
Title TF-IDF feature 14.202217926773017
Abstract BM25 feature 4.115690459543934
Title BM25 feature 4.626588842188112
DL feature 102
TL feature 8
Abstract TF feature 6.0
Title TF feature 2.0
Abstract IDF feature 150.24485062360012
Title IDF feature 178.50934803252431
Abstract TF-IDF feature 30.795458339448253
Title TF-IDF feature 16.234013016192723
Abstract BM25 feature 10.0345356298042
Title BM25 feature 9.149576384885483
DL feature 220
TL 

Title TF feature 2.0
Abstract IDF feature 80.30384741527877
Title IDF feature 93.20312951179545
Abstract TF-IDF feature 40.72823460148554
Title TF-IDF feature 15.406454548538814
Abstract BM25 feature 18.613083530212812
Title BM25 feature 7.386335201831228
DL feature 105
TL feature 3
Title TF feature 2.0
Abstract IDF feature 80.30384741527877
Title IDF feature 93.20312951179545
Title TF-IDF feature 15.406454548538814
Title BM25 feature 6.570230729524147
DL feature 41
TL feature 6
Title TF feature 3.0
Abstract IDF feature 80.30384741527877
Title IDF feature 93.20312951179545
Title TF-IDF feature 18.7737668629549
Title BM25 feature 7.425016418036973
DL feature 0
TL feature 18
Abstract TF feature 4.0
Title TF feature 4.0
Abstract IDF feature 80.30384741527877
Title IDF feature 93.20312951179545
Abstract TF-IDF feature 21.067351170780356
Title TF-IDF feature 23.21154346425397
Abstract BM25 feature 10.732195715287506
Title BM25 feature 15.25325171350277
DL feature 76
TL feature 7
Abstract TF

Abstract TF feature 5.0
Title TF feature 3.0
Abstract IDF feature 114.28481978018587
Title IDF feature 134.6813331384893
Abstract TF-IDF feature 25.14893541776137
Title TF-IDF feature 19.399554934214272
Abstract BM25 feature 12.344133293199942
Title BM25 feature 10.291100057384913
DL feature 73
TL feature 8
Abstract TF feature 5.0
Title TF feature 3.0
Abstract IDF feature 114.28481978018587
Title IDF feature 134.6813331384893
Abstract TF-IDF feature 26.87019428397366
Title TF-IDF feature 19.399554934214272
Abstract BM25 feature 18.48717401784617
Title BM25 feature 10.849097991613213
DL feature 68
TL feature 8
Abstract TF feature 14.0
Title TF feature 5.0
Abstract IDF feature 114.28481978018587
Title IDF feature 134.6813331384893
Abstract TF-IDF feature 63.12573478545024
Title TF-IDF feature 32.19572630287752
Abstract BM25 feature 29.8526438584105
Title BM25 feature 25.63306318855149
DL feature 103
TL feature 7
Abstract TF feature 2.0
Abstract IDF feature 114.59966757996159
Title IDF fe

Abstract BM25 feature 4.767978584484824
Title BM25 feature 5.937155509922722
DL feature 124
TL feature 9
Abstract IDF feature 135.25336395195035
Title IDF feature 164.3547914423473
DL feature 77
TL feature 12
Abstract IDF feature 135.25336395195035
Title IDF feature 164.3547914423473
DL feature 138
TL feature 12
Title TF feature 4.0
Abstract IDF feature 135.25336395195035
Title IDF feature 164.3547914423473
Title TF-IDF feature 28.200868447505414
Title BM25 feature 18.871059288009118
DL feature 0
TL feature 9
Abstract TF feature 9.0
Title TF feature 4.0
Abstract IDF feature 135.25336395195035
Title IDF feature 164.3547914423473
Abstract TF-IDF feature 43.05306656090979
Title TF-IDF feature 28.465067387851924
Abstract BM25 feature 19.65942683538374
Title BM25 feature 17.386387472397796
DL feature 169
TL feature 12
Abstract TF feature 6.0
Title TF feature 2.0
Abstract IDF feature 135.25336395195035
Title IDF feature 164.3547914423473
Abstract TF-IDF feature 32.93382406105834
Title TF-IDF

Abstract TF feature 3.0
Abstract IDF feature 91.1204604127542
Title IDF feature 112.54840297426341
Abstract TF-IDF feature 15.36196929616329
Abstract BM25 feature 4.564573919990446
DL feature 192
TL feature 13
Abstract TF feature 4.0
Title TF feature 2.0
Abstract IDF feature 91.1204604127542
Title IDF feature 112.54840297426341
Abstract TF-IDF feature 18.49599832598976
Title TF-IDF feature 16.07613259455791
Abstract BM25 feature 8.000152691647328
Title BM25 feature 6.668944501786797
DL feature 70
TL feature 10
Abstract TF feature 5.0
Title TF feature 2.0
Abstract IDF feature 91.1204604127542
Title IDF feature 112.54840297426341
Abstract TF-IDF feature 24.08127549877561
Title TF-IDF feature 18.01178624344604
Abstract BM25 feature 13.661159147198626
Title BM25 feature 6.80522734842228
DL feature 144
TL feature 23
Abstract TF feature 2.0
Abstract IDF feature 91.1204604127542
Title IDF feature 112.54840297426341
Abstract TF-IDF feature 16.111101192395658
Abstract BM25 feature 5.82536427880

Title TF-IDF feature 31.068583924504665
Abstract BM25 feature 42.45468020569848
Title BM25 feature 21.828966345346743
DL feature 69
TL feature 10
Abstract TF feature 23.0
Title TF feature 6.0
Abstract IDF feature 262.85467771515056
Title IDF feature 295.50712187979116
Abstract TF-IDF feature 95.88829570998944
Title TF-IDF feature 37.70343820741362
Abstract BM25 feature 39.89560329763923
Title BM25 feature 28.672491257503957
DL feature 112
TL feature 11
Abstract TF feature 5.0
Title TF feature 4.0
Abstract IDF feature 262.85467771515056
Title IDF feature 295.50712187979116
Abstract TF-IDF feature 23.957091009734278
Title TF-IDF feature 24.192056695309155
Abstract BM25 feature 14.358633697828832
Title BM25 feature 16.38608067190173
DL feature 81
TL feature 9
Abstract TF feature 5.0
Abstract IDF feature 262.85467771515056
Title IDF feature 295.50712187979116
Abstract TF-IDF feature 23.442549227202285
Abstract BM25 feature 11.591624256476077
DL feature 49
TL feature 8
Abstract TF feature 1

Title BM25 feature 12.418257432613931
DL feature 112
TL feature 11
Abstract TF feature 3.0
Abstract IDF feature 312.42447480703026
Title IDF feature 373.65228439287216
Abstract TF-IDF feature 16.317080238361193
Abstract BM25 feature 7.578389237221787
DL feature 81
TL feature 9
Abstract TF feature 9.0
Abstract IDF feature 312.42447480703026
Title IDF feature 373.65228439287216
Abstract TF-IDF feature 49.15918245713751
Abstract BM25 feature 27.221588833579666
DL feature 75
TL feature 13
Abstract IDF feature 312.42447480703026
Title IDF feature 373.65228439287216
DL feature 100
TL feature 10
Abstract TF feature 8.0
Title TF feature 4.0
Abstract IDF feature 312.42447480703026
Title IDF feature 373.65228439287216
Abstract TF-IDF feature 46.736478889191645
Title TF-IDF feature 28.942187117995836
Abstract BM25 feature 23.4555517212766
Title BM25 feature 24.723666326080977
DL feature 101
TL feature 6
Title TF feature 3.0
Abstract IDF feature 312.42447480703026
Title IDF feature 373.65228439287

In [35]:
# Train data using Learning-To-Rank LambdaMART model
import pyltr

with open('train_f.txt') as f :
    features, rels, qids, comment = pyltr.data.letor.read_dataset(f)
    metric = pyltr.metrics.NDCG(k=10)
    model = pyltr.models.LambdaMART(
        metric=metric,
        n_estimators=800,
        learning_rate=0.02,
        max_features=0.5,
        query_subsample=0.5,
        max_leaf_nodes=10,
        min_samples_leaf=64,
        verbose=1,
    )
    model.fit(features, rels, qids)

 Iter  Train score  OOB Improve    Remaining                           Monitor Output 
    1       0.3877       0.2878       41.86s                                         
    2       0.4648       0.0531       42.88s                                         
    3       0.4629       0.0260       45.65s                                         
    4       0.5201       0.0054       48.15s                                         
    5       0.5392       0.0028       50.62s                                         
    6       0.4770       0.0067       53.02s                                         
    7       0.5034       0.0044        1.11m                                         
    8       0.4921       0.0019        1.11m                                         
    9       0.5212       0.0027        1.13m                                         
   10       0.5498      -0.0020        1.28m                                         
   15       0.5400       0.0000        1.29m         

In [36]:
# serialize the model
import pickle

with open('ltr_model.pickle', 'wb') as f:
    # Pickle the 'ltr_model' dictionary using the highest protocol available.
    pickle.dump(model, f, pickle.HIGHEST_PROTOCOL)

In [37]:
with open('ltr_model.pickle', 'rb') as f:
    # The protocol version used is detected automatically, so we do not
    # have to specify it.
    model = pickle.load(f)

In [50]:
from whoosh import qparser, query, scoring

words = "information"
with open('result_f.txt.tmp', 'w+') as wr :
    searcher = idx.searcher(weighting=scoring.BM25F(B=0.74, K1=1.52))
    query_parser = MultifieldParser(['title', 'abstract', 'area'], idx.schema, termclass=query.Variations)
    query_parser.add_plugin(FuzzyTermPlugin())
    query_parsed = query.And([query.Variations('abstract', x) for x in words.split(' ') if len(x) > 1])
    results = searcher.search(query_parsed, limit=100)
    for r in results :
        wr.write(genearte_LETOR_data('1', words, r['path'], r['abstract'], r['title'], '1'))

with open('result_f.txt.tmp', 'r') as f :
    features, _, qids, docs = pyltr.data.letor.read_dataset(f)
    p = model.predict(features)
    result = []
    for j in range(len(qids)) :
        result.append((p[j], searcher.document(path=docs[j][8:].strip())))
    result.sort(key = lambda x:x[0], reverse = True)
result

[(0.7924560110721995,
  {'abstract': 'We present a framework for the integrated analysis of the textual and prosodic characteristics of information structure in the Switchboard corpus of conversational En-glish. Information structure describes the availability, organisation and salience of entities in a discourse model. We present standards for the annotation of information status (old, mediated and new), and give guidelines for annotating information structure, i.e. theme/rheme and back-ground/kontrast. We show that information structure in English can only be analysed concurrently with prosodic prominence and phrasing. This annotation, using stand-off XML in NXT, can help establish standards for the annotation of information structure in discourse.',
   'area': '',
   'path': 'W05-0307.tei.xml',
   'title': 'A Framework for Annotating Information Structure in Discourse'}),
 (0.7603878087648669,
  {'abstract': 'This paper provides a quick summary of the following topics: enhancements 

In [59]:
docnum = searcher.document_number(path=u"W05-0307.tei.xml")
r = searcher.more_like(docnum, 'abstract', top = 30)

<Hit {'abstract': 'We demonstrate work in progress 1 using the Nite XML Toolkit on a corpus of multimodal dialogues with an MP3 player collected in a Wizard-of-Oz (WOZ) experiments and annotated with a rich feature set at several layers. We designed an NXT data model, converted experiment log file data and manual transcriptions into NXT, and are building annotation tools using NXT libraries.', 'area': '', 'path': 'W06-2711.tei.xml', 'title': 'The SAMMIE Multimodal Dialogue Corpus Meets the Nite XML Toolkit'}>
<Hit {'abstract': 'It is natural to expect phrase structure to be important in predicting prosodic phrasing. Yet there appears to be a concensus that syntactic phrases do not correspond well to prosodic phrasing, and independent structures have been proposed to account for prosody. I propose that the problem with phrase structure lies with the particular measures of boundary strength applied to syntactic structures, and with the fact that phrase structure is viewed as an immediate