forked from xapian/xapian
/
letor.h
204 lines (171 loc) · 9.33 KB
/
letor.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
/** @file letor.h
* @brief Weighting scheme based on Learning to Rank.
*
* Note: letor.h is not part of the official stable Xapian API.
*/
/* Copyright (C) 2011 Parth Gupta
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef XAPIAN_INCLUDED_LETOR_H
#define XAPIAN_INCLUDED_LETOR_H
#include <xapian.h>
#include <xapian/intrusive_ptr.h>
#include <xapian/types.h>
#include <xapian/visibility.h>
//#include <ranklist.h>
#include <string>
#include <map>
namespace Xapian {
/** Learning-to-Rank (Letor) interface.
*
* Offers methods to prepare a training file, learn a model from it, and assign Letor scores to the documents of an MSet.
* The work is delegated to the reference counted Letor::Internal object.
*
* Note: this class is not part of the official stable Xapian API.
*/
class XAPIAN_VISIBILITY_DEFAULT Letor {
public:
/// @private @internal Class representing the Letor internals.
class Internal;
/// @private @internal Reference counted internals.
Xapian::Internal::intrusive_ptr<Internal> internal;
/// Copy constructor.
Letor(const Letor & o);
/// Assignment.
Letor & operator=(const Letor & o);
/// Default constructor.
Letor();
/// Destructor.
~Letor();
/** Specify the database to use for retrieval.
*
* This database will be used directly by the methods of Xapian::Letor::Internal.
*
* @param db Database to use for retrieval.
*/
void set_database(const Xapian::Database & db);
/** Specify the query. This will be used by the internal class.
*
* @param query Query to use.
*/
void set_query(const Xapian::Query & query);
/* (Declaration currently disabled; plain comment so that it is not attached to the next live declaration.)
*
* This method finds the frequency of the query terms in the specified document. It is a helper method, and the statistics gathered through
* it are used in feature value calculation. It returns the frequency of the query terms in std::map<string, long int> form.
*
* param doc This is the document in which we want the frequency of the terms in query. Generally in Letor, this document is taken
* from the Xapian::MSet retrieved from the first retrieval.
* param query This is the query whose terms we want the frequency of in the document.
*/
// std::map<std::string, long int> termfreq(const Xapian::Document & doc, const Xapian::Query & query);
/* (Declaration currently disabled; plain comment so that it is not attached to the next live declaration.)
*
* This method calculates the inverse document frequency (idf) of query terms in the database. It returns the idf of each term in
* std::map<string, double> form.
*
* Note: idf of a term 't' is calculated as below:
*
* idf(t) = log(N/df(t))
* Where,
* N = Total number of documents in database and
* df(t) = number of documents containing term 't'
*
* param db specify the database being used for search to calculate idf values.
* param query query being used in the retrieval
*/
// std::map<std::string, double> inverse_doc_freq(const Xapian::Database & db, const Xapian::Query & query);
/* (Declaration currently disabled; plain comment so that it is not attached to the next live declaration.)
*
* This method calculates the length of the document as a number of 'terms'. It calculates the length for three different
* parts: title, body and whole document. This information is returned in the std::map<string, long int> format.
* It can be accessed as below:
*
* map<string, long int> len;
* len["title"];
* len["body"];
* len["whole"];
*
* param db Database containing that document.
* param doc The document whose length is to be found.
*/
// std::map<std::string, long int> doc_length(const Xapian::Database & db, const Xapian::Document & doc);
/* (Declaration currently disabled; plain comment so that it is not attached to the next live declaration.)
*
* This method calculates the length of the collection in number of terms for different parts like 'title', 'body' and 'whole'. The value is
* taken from user metadata stored by omindex when available; otherwise it is calculated from scratch (this might take some time depending upon
* the size of the database). Length information is stored in std::map<string, long int> format and can be accessed as below:
*
* map<string, long int> len;
* len["title"];
* len["body"];
* len["whole"];
*
* param db Database used for retrieval.
*/
// std::map<std::string, long int> collection_length(const Xapian::Database & db);
/* (Declaration currently disabled; plain comment so that it is not attached to the next live declaration.)
*
* This method calculates the frequency of query terms in the whole database. The information is stored in std::map<string, long int> format and
* used during the feature calculation methods.
*
* param db Database to be used
* param query Query being searched.
*/
// std::map<std::string, long int> collection_termfreq(const Xapian::Database & db, const Xapian::Query & query);
/* (Declarations currently disabled; plain comment so that it is not attached to the next live declaration.)
*
* These methods calculate the feature values for the query-document pair. They use the data generated by the methods described above:
* termfreq, inverse_doc_freq, doc_length, collection_termfreq and collection_length.
*
* These features are depicted in overview documentation of Letor.
*
* param query Query being used.
* param tf Map generated using Letor::termfreq() method.
* param idf, doc_length, coll_tf, coll_len / coll_length
* Presumably the maps generated by inverse_doc_freq(), doc_length(), collection_termfreq() and collection_length()
* respectively -- TODO confirm against the implementation.
* param ch Specifies the part of document for which the value needs to be calculated. Values:
* 't' Title only
* 'b' Body only
* 'w' Whole document
*/
// double calculate_f1(const Xapian::Query & query, std::map<std::string, long int> & tf, char ch);
// double calculate_f2(const Xapian::Query & query, std::map<std::string, long int> & tf, std::map<std::string, long int> & doc_length, char ch);
// double calculate_f3(const Xapian::Query & query, std::map<std::string, double> & idf, char ch);
// double calculate_f4(const Xapian::Query & query, std::map<std::string, long int> & tf, std::map<std::string, long int> & coll_len, char ch);
// double calculate_f5(const Xapian::Query & query, std::map<std::string, long int> & tf, std::map<std::string, double> & idf, std::map<std::string, long int> & doc_length, char ch);
// double calculate_f6(const Xapian::Query & query, std::map<std::string, long int> & tf, std::map<std::string, long int> & doc_length, std::map<std::string, long int> & coll_tf, std::map<std::string, long int> & coll_length, char ch);
/** Give a score to each item of the initial mset using the trained model.
*
* Note: this only assigns a score to each document, so the caller needs to sort the returned map
* in descending order of its values.
*
* @param mset The initial mset whose documents are to be scored.
*
* @return Letor score corresponding to each document in mset, in map<docid, score> format.
*/
std::map<Xapian::docid, double> letor_score(const Xapian::MSet & mset);
/** Learn the model and store it in the 'model.txt' file, using the training file 'train.txt'. It is required that libsvm is
* installed on the system. The SVM model is learnt using libsvm.
*
* NOTE(review): this method takes no arguments. The libsvm settings below were previously documented as parameters
* `s` and `k`; presumably the stated defaults are applied by the implementation -- confirm.
*
* svm_type (documented default: 4). In libsvm-3.1,
* 0 -- C-SVC
* 1 -- nu-SVC
* 2 -- one-class SVM
* 3 -- epsilon-SVR
* 4 -- nu-SVR
*
* kernel_type (documented default: 0). In libsvm-3.1,
* 0 -- linear
* 1 -- polynomial
* 2 -- radial basis function
* 3 -- sigmoid
* 4 -- precomputed kernel
*/
void letor_learn_model();
/** This method prepares the 'train.txt' file in the current working directory. This file is used to train a model which in turn will be used to
* assign scores to the documents based on the Learning-to-Rank model. File 'train.txt' is created in the standard format of a Letor training file
* as below:
*
* 0 qid:102 1:0.130742 2:0.000000 3:0.333333 4:0.000000 ... 18:0.750000 19:1.000000 #docid = 13566007
* 1 qid:102 1:0.593640 2:1.000000 3:0.000000 4:0.000000 ... 18:0.500000 19:0.000000 #docid = 0740276
*
* where the first column is the relevance judgement of the document with docid as shown in the last column. The second column is the query id and
* in between there are 19 feature values.
*
* @param query_file Here you have to give a path to the file (in free text form) containing training queries in specified format.
* @param qrel_file Here supply the path to the qrel file (in free text form) containing the relevance judgements for the
* queries in the training file. This file should be in standard format specified.
* @param msetsize This is the mset size used for the first retrieval for training queries. It should be selected depending on the qrel file
* and database size.
*/
void prepare_training_file(const std::string & query_file, const std::string & qrel_file, Xapian::doccount msetsize);
/** NOTE(review): undocumented in the original header. Judging by the name and signature this is presumably the listwise
* counterpart of prepare_training_file() -- confirm against the implementation.
*
* @param query_file Presumably the path to the file containing training queries, as for prepare_training_file() -- confirm.
* @param num_features Presumably the number of feature values per document -- confirm.
*/
void prepare_training_file_listwise(const std::string & query_file, int num_features);
/** NOTE(review): undocumented in the original header. Presumably creates or selects the ranker to be used -- confirm.
*
* @param ranker_type Integer identifying the kind of ranker; the valid values are not visible in this header -- TODO document.
*/
void create_ranker(int ranker_type);
// void test(Xapian::RankList rl);
};
}
#endif /* XAPIAN_INCLUDED_LETOR_H */