Skip to content

Commit

Permalink
Refactored UnigramLMWeight to LMWeight; fixed bug where the doclength upper bound was not calculated in LMWeight
Browse files Browse the repository at this point in the history
  • Loading branch information
gauravaror committed Jul 5, 2012
1 parent 6f51cb8 commit 91bfc05
Show file tree
Hide file tree
Showing 6 changed files with 60 additions and 50 deletions.
2 changes: 1 addition & 1 deletion xapian-core/api/registry.cc
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,7 @@ Registry::Internal::add_defaults()
wtschemes[weighting_scheme->name()] = weighting_scheme;
weighting_scheme = new Xapian::TradWeight;
wtschemes[weighting_scheme->name()] = weighting_scheme;
weighting_scheme = new Xapian::UnigramLMWeight;
weighting_scheme = new Xapian::LMWeight;
wtschemes[weighting_scheme->name()] = weighting_scheme;

Xapian::PostingSource * source;
Expand Down
5 changes: 3 additions & 2 deletions xapian-core/examples/simplesearch.cc
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ try {
}

// Open the database for searching.
Xapian::Database db(Xapian::Chert::open(argv[1]));
Xapian::Database db(Xapian::Brass::open(argv[1]));

// Start an enquire session.
Xapian::Enquire enquire(db);
Expand All @@ -69,14 +69,15 @@ try {
Xapian::QueryParser qp;
Xapian::Stem stemmer("english");
qp.set_stemmer(stemmer);
// qp.set_bigram(true);
qp.set_database(db);
qp.set_stemming_strategy(Xapian::QueryParser::STEM_SOME);
Xapian::Query query = qp.parse_query(query_string);
cout << "Parsed query is: " << query.get_description() << endl;

// Find the top 10 results for the query.
enquire.set_query(query);
enquire.set_weighting_scheme(Xapian::UnigramLMWeight(Xapian::Weight::DIRICHLET_SMOOTHING,0,0));
enquire.set_weighting_scheme(Xapian::LMWeight());
Xapian::MSet matches = enquire.get_mset(0, 10);

// Display the results.
Expand Down
30 changes: 16 additions & 14 deletions xapian-core/include/xapian/weight.h
Original file line number Diff line number Diff line change
Expand Up @@ -544,16 +544,16 @@ class XAPIAN_VISIBILITY_DEFAULT TradWeight : public Weight {
double get_maxextra() const;
};

/** Xapian::Weight subclass implementing the unigram Language Model formula.
/** Xapian::Weight subclass implementing the Language Model formula.
*
* This class implements the "unigram Language Model " Weighting scheme, as
* This class implements the "Language Model " Weighting scheme, as
* described by the early papers on LM by bruce croft generally
* gives better results.
*
* LM have no parameter as it doenot assume hueristic and work on comparing query with Language
* model of the document.
*/
class XAPIAN_VISIBILITY_DEFAULT UnigramLMWeight : public Weight {
class XAPIAN_VISIBILITY_DEFAULT LMWeight : public Weight {
/// Variable to be used to store collection frequency of the term to be used for
// calculating the smoothning factor in case the withing document frequency of term is zero.
Xapian::termcount collection_freq;
Expand All @@ -573,13 +573,13 @@ class XAPIAN_VISIBILITY_DEFAULT UnigramLMWeight : public Weight {
// Parameter for handelling negative value of log,smoothing.
double param_log,param_smoothing1,param_smoothing2;

UnigramLMWeight * clone() const;
LMWeight * clone() const;


void init(double factor);

public:
/** Construct a UnigramLMWeight.
/** Construct a LMWeight.
*
* @param_log A non-negative parameter controlling how much to clamp
* negetive value returned due to log. log is calculated by
Expand All @@ -600,9 +600,9 @@ class XAPIAN_VISIBILITY_DEFAULT UnigramLMWeight : public Weight {
* @param_smoothing2 A non-negative parameter which is used only when user select
* TWO_STAGE_SMOOTHING as parameter for DIRICHLET_SMOOTHING.(default 2000).
*/
// Unigram LM constructor to select smoothing type and select parameter for log handelling automatically
// LM constructor to select smoothing type and select parameter for log handelling automatically

UnigramLMWeight(type_smoothing select_smoothing_,double param_smoothing1_,double param_smoothing2_)
LMWeight(type_smoothing select_smoothing_,double param_smoothing1_,double param_smoothing2_)
: select_smoothing(select_smoothing_),param_log(0.0), param_smoothing1(param_smoothing1_),
param_smoothing2(param_smoothing2_)
{
Expand All @@ -621,9 +621,9 @@ class XAPIAN_VISIBILITY_DEFAULT UnigramLMWeight : public Weight {
need_stat(DOC_LENGTH_MAX);
}

// Unigram LM Constructor to specifically mention all parameters for handelling negative log value and smoothing.
// LM Constructor to specifically mention all parameters for handelling negative log value and smoothing.

UnigramLMWeight(double param_log_,type_smoothing select_smoothing_,double param_smoothing1_,double param_smoothing2_)
LMWeight(double param_log_,type_smoothing select_smoothing_,double param_smoothing1_,double param_smoothing2_)
: select_smoothing(select_smoothing_), param_log(param_log_), param_smoothing1(param_smoothing1_),
param_smoothing2(param_smoothing2_)
{
Expand All @@ -639,11 +639,12 @@ class XAPIAN_VISIBILITY_DEFAULT UnigramLMWeight : public Weight {
need_stat(WDF_MAX);
need_stat(WDF);
need_stat(COLLECTION_FREQ);
need_stat(DOC_LENGTH_MAX);
}

//Unigram LM Constructor to specifically mention parameter for handelling negetive log value
//LM Constructor to specifically mention parameter for handelling negetive log value
//and select default value for smoothing.
UnigramLMWeight(double param_log_)
LMWeight(double param_log_)
: select_smoothing(TWO_STAGE_SMOOTHING), param_log(param_log_), param_smoothing1(0.7),
param_smoothing2(2000.0)
{
Expand All @@ -659,10 +660,11 @@ class XAPIAN_VISIBILITY_DEFAULT UnigramLMWeight : public Weight {
need_stat(WDF_MAX);
need_stat(WDF);
need_stat(COLLECTION_FREQ);
need_stat(DOC_LENGTH_MAX);
}

//Unigram LM Constructure to use default value for smoothing.
UnigramLMWeight()
//LM Constructure to use default value for smoothing.
LMWeight()
: select_smoothing(TWO_STAGE_SMOOTHING), param_log(0.0), param_smoothing1(0.7),
param_smoothing2(2000.0)
{
Expand All @@ -683,7 +685,7 @@ class XAPIAN_VISIBILITY_DEFAULT UnigramLMWeight : public Weight {
std::string name() const;

std::string serialise() const;
UnigramLMWeight * unserialise(const std::string & s) const;
LMWeight * unserialise(const std::string & s) const;

double get_sumpart(Xapian::termcount wdf,
Xapian::termcount doclen) const;
Expand Down
22 changes: 11 additions & 11 deletions xapian-core/tests/api_weight.cc
Original file line number Diff line number Diff line change
Expand Up @@ -50,16 +50,16 @@ DEFINE_TESTCASE(tradweight3, !backend) {

//Test Exception for junk after serialised weight.
DEFINE_TESTCASE(unigramlmweight3, !backend) {
Xapian::UnigramLMWeight wt(79898.0,Xapian::Weight::JELINEK_MERCER_SMOOTHING,0.5,1.0);
Xapian::LMWeight wt(79898.0,Xapian::Weight::JELINEK_MERCER_SMOOTHING,0.5,1.0);
try {
Xapian::UnigramLMWeight t;
Xapian::UnigramLMWeight * t2 = t.unserialise(wt.serialise() + "X");
Xapian::LMWeight t;
Xapian::LMWeight * t2 = t.unserialise(wt.serialise() + "X");
// Make sure we actually use the weight.
bool empty = t2->name().empty();
delete t2;
if (empty)
FAIL_TEST("Serialised UnigramLMWeight with junk appended unserialised to empty name!");
FAIL_TEST("Serialised UnigramLMWeight with junk appended unserialised OK");
FAIL_TEST("Serialised LMWeight with junk appended unserialised to empty name!");
FAIL_TEST("Serialised LMWeight with junk appended unserialised OK");
} catch (const Xapian::SerialisationError &e) {
//GOOD!
}
Expand Down Expand Up @@ -117,8 +117,8 @@ DEFINE_TESTCASE(unigramlmweight4,backend) {
enquire2.set_query(Xapian::Query("paragraph"));
Xapian::MSet mset2;
//5 documents avaialble with term paragraph so mset size should be 5
enquire1.set_weighting_scheme(Xapian::UnigramLMWeight(Xapian::Weight::TWO_STAGE_SMOOTHING,1,0));
enquire2.set_weighting_scheme(Xapian::UnigramLMWeight(Xapian::Weight::JELINEK_MERCER_SMOOTHING,1,0));
enquire1.set_weighting_scheme(Xapian::LMWeight(Xapian::Weight::TWO_STAGE_SMOOTHING,1,0));
enquire2.set_weighting_scheme(Xapian::LMWeight(Xapian::Weight::JELINEK_MERCER_SMOOTHING,1,0));
mset1 = enquire1.get_mset(0,10);
mset2 = enquire2.get_mset(0,10);

Expand All @@ -144,10 +144,10 @@ DEFINE_TESTCASE(unigramlmweight5,backend) {
enquire4.set_query(Xapian::Query("paragraph"));
Xapian::MSet mset4;
//5 documents avaialble with term paragraph so mset size should be 5
enquire1.set_weighting_scheme(Xapian::UnigramLMWeight(10000.0,Xapian::Weight::TWO_STAGE_SMOOTHING,0,0));
enquire2.set_weighting_scheme(Xapian::UnigramLMWeight(10000.0,Xapian::Weight::JELINEK_MERCER_SMOOTHING,0,0));
enquire3.set_weighting_scheme(Xapian::UnigramLMWeight(10000.0,Xapian::Weight::ABSOLUTE_DISCOUNT_SMOOTHING,0,0));
enquire4.set_weighting_scheme(Xapian::UnigramLMWeight(10000.0,Xapian::Weight::DIRICHLET_SMOOTHING,0,0));
enquire1.set_weighting_scheme(Xapian::LMWeight(10000.0,Xapian::Weight::TWO_STAGE_SMOOTHING,0,0));
enquire2.set_weighting_scheme(Xapian::LMWeight(10000.0,Xapian::Weight::JELINEK_MERCER_SMOOTHING,0,0));
enquire3.set_weighting_scheme(Xapian::LMWeight(10000.0,Xapian::Weight::ABSOLUTE_DISCOUNT_SMOOTHING,0,0));
enquire4.set_weighting_scheme(Xapian::LMWeight(10000.0,Xapian::Weight::DIRICHLET_SMOOTHING,0,0));

mset1 = enquire1.get_mset(0,10);
mset2 = enquire2.get_mset(0,10);
Expand Down
2 changes: 1 addition & 1 deletion xapian-core/weight/Makefile.mk
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,4 @@ lib_src +=\
weight/tradweight.cc\
weight/weight.cc\
weight/weightinternal.cc\
weight/unigramlmweight.cc
weight/lmweight.cc
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/** @file unigramlmweight.cc
* @brief Xapian::UnigramLMWeight class - the Unigram Language Modelling formula.
/** @file lmweight.cc
* @brief Xapian::LMWeight class - the Language Modelling formula.
*/
/* Copyright (C) 2012 Gaurav Arora
*
Expand Down Expand Up @@ -34,12 +34,12 @@ using namespace std;

namespace Xapian {

UnigramLMWeight *
UnigramLMWeight::clone() const {
return new UnigramLMWeight(param_log,select_smoothing, param_smoothing1, param_smoothing2);
LMWeight *
LMWeight::clone() const {
return new LMWeight(param_log,select_smoothing, param_smoothing1, param_smoothing2);
}
void
UnigramLMWeight::init(double )
LMWeight::init(double )
{
//Storing collection frequency of current term in collection_freq to be accessed while smoothing of weights for the term,for term not present in the document.
collection_freq = get_collectionfreq();
Expand All @@ -59,7 +59,14 @@ UnigramLMWeight::init(double )
* intializing param_log to upperbound of document_length.*/

if(param_log == 0.0) {
param_log = get_doclength_upper_bound();
param_log = get_doclength_upper_bound();
if(select_smoothing == TWO_STAGE_SMOOTHING) {
/* Since we are combining result of two sommothing with factor.
* We multiply with factor < 1.Hence our doc_length_upper_bound.
* will not work in some case so multiplying by 10.
*/
param_log = param_log*10.0;
}
}

/* * since the optimal parameter for Jelinek mercer smoothing
Expand All @@ -84,13 +91,13 @@ UnigramLMWeight::init(double )
}
}
string
UnigramLMWeight::name() const
LMWeight::name() const
{
return "Xapian::UnigramLMWeight";
return "Xapian::LMWeight";
}

string
UnigramLMWeight::serialise() const
LMWeight::serialise() const
{

string result = serialise_double(param_log);
Expand All @@ -101,8 +108,8 @@ UnigramLMWeight::serialise() const
return result;
}

UnigramLMWeight *
UnigramLMWeight::unserialise(const string & s) const
LMWeight *
LMWeight::unserialise(const string & s) const
{
const char *ptr = s.data();
const char *end = ptr + s.size();
Expand All @@ -111,25 +118,25 @@ UnigramLMWeight::unserialise(const string & s) const
double param_smoothing1_ = unserialise_double(&ptr,end);
double param_smoothing2_ = unserialise_double(&ptr,end);
if(rare(ptr != end))
throw Xapian::SerialisationError("Extra data in UnigramLMWeight::unserialise()");
return new UnigramLMWeight(param_log_,select_smoothing_,param_smoothing1_,param_smoothing2_);
throw Xapian::SerialisationError("Extra data in LMWeight::unserialise()");
return new LMWeight(param_log_,select_smoothing_,param_smoothing1_,param_smoothing2_);
}

double
UnigramLMWeight::get_sumpart(Xapian::termcount wdf, Xapian::termcount len) const
LMWeight::get_sumpart(Xapian::termcount wdf, Xapian::termcount len) const
{
return get_sumpart(wdf,len,Xapian::termcount(1));
}

double
UnigramLMWeight::get_sumpart(Xapian::termcount wdf, Xapian::termcount len,Xapian::termcount uniqterm) const
LMWeight::get_sumpart(Xapian::termcount wdf, Xapian::termcount len,Xapian::termcount uniqterm) const
{
//Withing Document Frequency of the term in document being considered.
double wdf_double(wdf);
//Length of the Document in terms of number of terms.
double len_double(len);
double nouniqterm_double(uniqterm);
// varioable to store weight contribution of term in the document socring for unigram LM.
// varioable to store weight contribution of term in the document socring for LM.
double weight_collection,weight_document,weight_sum;
/* In case the within document frequency of term is zero smoothining
* will be required and should be return instead of returning zero,
Expand All @@ -154,7 +161,7 @@ UnigramLMWeight::get_sumpart(Xapian::termcount wdf, Xapian::termcount len,Xapian
}


/* Since unigram LM score is calculated with multiplication,
/* Since LM score is calculated with multiplication,
* instead of changing the current implementation log trick have been used
* to calculate the product since (sum of log is log of product and
* since aim is ranking ranking document by product or log of
Expand All @@ -165,7 +172,7 @@ UnigramLMWeight::get_sumpart(Xapian::termcount wdf, Xapian::termcount len,Xapian
}

double
UnigramLMWeight::get_maxpart() const
LMWeight::get_maxpart() const
{
// Sufficiently large bound is being returned ,to optimize the matching process this needs to be fixed and changed to good max bound
// Need to be fixed
Expand All @@ -174,13 +181,13 @@ UnigramLMWeight::get_maxpart() const
}

double
UnigramLMWeight::get_sumextra(Xapian::termcount) const
LMWeight::get_sumextra(Xapian::termcount) const
{
return 0;
}

double
UnigramLMWeight::get_maxextra() const
LMWeight::get_maxextra() const
{
return 0;
}
Expand Down

0 comments on commit 91bfc05

Please sign in to comment.