Skip to content

Commit

Permalink
Added per document statistics to the backend
Browse files Browse the repository at this point in the history
  • Loading branch information
gauravaror committed Jun 13, 2012
1 parent 094a51b commit f4e1036
Show file tree
Hide file tree
Showing 6 changed files with 287 additions and 25 deletions.
21 changes: 18 additions & 3 deletions xapian-core/backends/brass/brass_database.cc
Expand Up @@ -824,8 +824,7 @@ BrassDatabase::get_nouniqterm(Xapian::docid did) const
LOGCALL(DB, Xapian::termcount, "BrassDatabase::get_nouniqterm", did);
Assert(did != 0);
intrusive_ptr<const BrassDatabase> ptrtothis(this);
BrassTermList termlist(ptrtothis, did);
RETURN(termlist.get_approx_size());
RETURN(postlist_table.get_nouniqterms(did,ptrtothis));
}

Xapian::doccount
Expand Down Expand Up @@ -1185,15 +1184,28 @@ BrassWritableDatabase::add_document_(Xapian::docid did,
value_manager.add_document(did, document, value_stats);

brass_doclen_t new_doclen = 0;
brass_doclen_t new_nouniqterms = 0;
brass_doclen_t new_bigramdoclen = 0;
brass_doclen_t new_nouniqbigrams = 0;
{
Xapian::TermIterator term = document.termlist_begin();
for ( ; term != document.termlist_end(); ++term) {
termcount wdf = term.get_wdf();
// Calculate the new document length
string tname = *term;
if(tname.find(" ") == string::npos)
{
new_doclen += wdf;
new_nouniqterms++;
stats.check_wdf(wdf);
}
else
{
new_bigramdoclen += wdf;
new_nouniqbigrams++;
}


string tname = *term;
if (tname.size() > MAX_SAFE_TERM_LENGTH)
throw Xapian::InvalidArgumentError("Term too long (> "STRINGIZE(MAX_SAFE_TERM_LENGTH)"): " + tname);

Expand All @@ -1216,6 +1228,9 @@ BrassWritableDatabase::add_document_(Xapian::docid did,

// Set the new document length
inverter.set_doclength(did, new_doclen, true);
inverter.set_nouniqterms(did, new_nouniqterms, true);
inverter.set_bigramdoclength(did, new_bigramdoclen, true);
inverter.set_nouniqbigrams(did, new_nouniqbigrams, true);
stats.add_document(new_doclen);
} catch (...) {
// If an error occurs while adding a document, or doing any other
Expand Down
21 changes: 21 additions & 0 deletions xapian-core/backends/brass/brass_inverter.cc
Expand Up @@ -35,6 +35,24 @@ Inverter::flush_doclengths(BrassPostListTable & table)
table.merge_doclen_changes(doclen_changes);
doclen_changes.clear();
}
// Hand the buffered per-document unique-term counts to the postlist table,
// then empty the buffer so the same changes are not merged twice.
void
Inverter::flush_nouniqterms(BrassPostListTable & table)
{
    table.merge_nouniqterms_changes(nouniqterms_changes);

    // Everything buffered has now been passed on to the table.
    nouniqterms_changes.clear();
}
// Hand the buffered per-document bigram document lengths to the postlist
// table, then empty the buffer so the same changes are not merged twice.
void
Inverter::flush_bigramdoclengths(BrassPostListTable & table)
{
    table.merge_bigramdoclen_changes(bigramdoclen_changes);

    // Everything buffered has now been passed on to the table.
    bigramdoclen_changes.clear();
}
// Hand the buffered per-document unique-bigram counts to the postlist table,
// then empty the buffer so the same changes are not merged twice.
void
Inverter::flush_nouniqbigrams(BrassPostListTable & table)
{
    table.merge_nouniqbigrams_changes(nouniqbigrams_changes);

    // Everything buffered has now been passed on to the table.
    nouniqbigrams_changes.clear();
}

void
Inverter::flush_post_list(BrassPostListTable & table, const string & term)
Expand Down Expand Up @@ -82,5 +100,8 @@ void
Inverter::flush(BrassPostListTable & table)
{
flush_doclengths(table);
flush_bigramdoclengths(table);
flush_nouniqterms(table);
flush_nouniqbigrams(table);
flush_all_post_lists(table);
}
99 changes: 90 additions & 9 deletions xapian-core/backends/brass/brass_inverter.h
Expand Up @@ -112,6 +112,15 @@ class Inverter {
public:
/// Buffered changes to document lengths.
std::map<Xapian::docid, Xapian::termcount> doclen_changes;

/// Buffered changes to the number of unique terms.
std::map<Xapian::docid, Xapian::termcount> nouniqterms_changes;

/// Buffered changes to bigram document length.
std::map<Xapian::docid, Xapian::termcount> bigramdoclen_changes;

/// Buffered changes to the number of unique bigrams in a document.
std::map<Xapian::docid, Xapian::termcount> nouniqbigrams_changes;

public:
/// Add posting for term.
Expand Down Expand Up @@ -156,6 +165,9 @@ class Inverter {

/// Discard every buffered change without writing anything to a table.
void clear() {
    // Per-document statistics buffers.
    doclen_changes.clear();
    nouniqterms_changes.clear();
    bigramdoclen_changes.clear();
    nouniqbigrams_changes.clear();

    // Buffered postlist changes.
    postlist_changes.clear();
}

Expand All @@ -165,12 +177,48 @@ class Inverter {
}
doclen_changes[did] = doclen;
}

/** Buffer the number of unique terms for document @a did.
 *
 *  @param did          The document id the value belongs to.
 *  @param nouniqterms  The value to buffer.
 *  @param add          If true, assert that there is either no buffered
 *                      entry for @a did yet, or that the buffered entry is
 *                      the DELETED_POSTING marker.
 */
void set_nouniqterms(Xapian::docid did, Xapian::termcount nouniqterms, bool add) {
    if (add) {
        Assert(nouniqterms_changes.find(did) == nouniqterms_changes.end() ||
               nouniqterms_changes[did] == DELETED_POSTING);
    }

    // Insert or overwrite the buffered value.
    nouniqterms_changes[did] = nouniqterms;
}

/** Buffer the bigram document length for document @a did.
 *
 *  @param did           The document id the value belongs to.
 *  @param bigramdoclen  The value to buffer.
 *  @param add           If true, assert that there is either no buffered
 *                       entry for @a did yet, or that the buffered entry is
 *                       the DELETED_POSTING marker.
 */
void set_bigramdoclength(Xapian::docid did, Xapian::termcount bigramdoclen, bool add) {
    if (add) {
        Assert(bigramdoclen_changes.find(did) == bigramdoclen_changes.end() ||
               bigramdoclen_changes[did] == DELETED_POSTING);
    }

    // Insert or overwrite the buffered value.
    bigramdoclen_changes[did] = bigramdoclen;
}

/** Buffer the number of unique bigrams for document @a did.
 *
 *  @param did            The document id the value belongs to.
 *  @param nouniqbigrams  The value to buffer.
 *  @param add            If true, assert that there is either no buffered
 *                        entry for @a did yet, or that the buffered entry
 *                        is the DELETED_POSTING marker.
 */
void set_nouniqbigrams(Xapian::docid did, Xapian::termcount nouniqbigrams, bool add) {
    if (add) {
        Assert(nouniqbigrams_changes.find(did) == nouniqbigrams_changes.end() ||
               nouniqbigrams_changes[did] == DELETED_POSTING);
    }

    // Insert or overwrite the buffered value.
    nouniqbigrams_changes[did] = nouniqbigrams;
}

/** Buffer a deletion of the document length for document @a did.
 *
 *  Asserts that @a did is not already buffered as deleted, then stores the
 *  DELETED_POSTING marker as its buffered value.
 */
void delete_doclength(Xapian::docid did) {
    Assert(doclen_changes.find(did) == doclen_changes.end() ||
           doclen_changes[did] != DELETED_POSTING);

    doclen_changes[did] = DELETED_POSTING;
}

/** Buffer a deletion of the unique-term count for document @a did.
 *
 *  Asserts that @a did is not already buffered as deleted, then stores the
 *  DELETED_POSTING marker as its buffered value.
 */
void delete_nouniqterms(Xapian::docid did) {
    Assert(nouniqterms_changes.find(did) == nouniqterms_changes.end() ||
           nouniqterms_changes[did] != DELETED_POSTING);

    nouniqterms_changes[did] = DELETED_POSTING;
}

/** Buffer a deletion of the bigram document length for document @a did.
 *
 *  Asserts that @a did is not already buffered as deleted, then stores the
 *  DELETED_POSTING marker as its buffered value.
 */
void delete_bigramdoclength(Xapian::docid did) {
    Assert(bigramdoclen_changes.find(did) == bigramdoclen_changes.end() ||
           bigramdoclen_changes[did] != DELETED_POSTING);

    bigramdoclen_changes[did] = DELETED_POSTING;
}

/** Buffer a deletion of the unique-bigram count for document @a did.
 *
 *  Asserts that @a did is not already buffered as deleted, then stores the
 *  DELETED_POSTING marker as its buffered value.
 */
void delete_nouniqbigrams(Xapian::docid did) {
    Assert(nouniqbigrams_changes.find(did) == nouniqbigrams_changes.end() ||
           nouniqbigrams_changes[did] != DELETED_POSTING);

    nouniqbigrams_changes[did] = DELETED_POSTING;
}

bool get_doclength(Xapian::docid did, Xapian::termcount & doclen) const {
std::map<Xapian::docid, Xapian::termcount>::const_iterator i;
i = doclen_changes.find(did);
Expand All @@ -181,10 +229,52 @@ class Inverter {
doclen = i->second;
return true;
}

/** Look up a buffered unique-term count for document @a did.
 *
 *  @param did         The document id to look up.
 *  @param nouniqterm  Set to the buffered value when true is returned;
 *                     left untouched otherwise.
 *
 *  @return true if a value is buffered for @a did, false if nothing is
 *          buffered for it.
 *
 *  @exception Xapian::DocNotFoundError if the buffered entry for @a did is
 *             the DELETED_POSTING marker.
 */
bool get_nouniqterms(Xapian::docid did, Xapian::termcount & nouniqterm) const {
    typedef std::map<Xapian::docid, Xapian::termcount>::const_iterator change_iter;

    const change_iter entry = nouniqterms_changes.find(did);
    if (entry != nouniqterms_changes.end()) {
        if (rare(entry->second == DELETED_POSTING))
            throw Xapian::DocNotFoundError("Document not found: " + str(did));
        nouniqterm = entry->second;
        return true;
    }

    // No buffered change for this document.
    return false;
}

/** Look up a buffered bigram document length for document @a did.
 *
 *  @param did           The document id to look up.
 *  @param bigramdoclen  Set to the buffered value when true is returned;
 *                       left untouched otherwise.
 *
 *  @return true if a value is buffered for @a did, false if nothing is
 *          buffered for it.
 *
 *  @exception Xapian::DocNotFoundError if the buffered entry for @a did is
 *             the DELETED_POSTING marker.
 */
bool get_bigramdoclength(Xapian::docid did, Xapian::termcount & bigramdoclen) const {
    typedef std::map<Xapian::docid, Xapian::termcount>::const_iterator change_iter;

    const change_iter entry = bigramdoclen_changes.find(did);
    if (entry != bigramdoclen_changes.end()) {
        if (rare(entry->second == DELETED_POSTING))
            throw Xapian::DocNotFoundError("Document not found: " + str(did));
        bigramdoclen = entry->second;
        return true;
    }

    // No buffered change for this document.
    return false;
}

/** Look up a buffered unique-bigram count for document @a did.
 *
 *  @param did           The document id to look up.
 *  @param nouniqbigram  Set to the buffered value when true is returned;
 *                       left untouched otherwise.
 *
 *  @return true if a value is buffered for @a did, false if nothing is
 *          buffered for it.
 *
 *  @exception Xapian::DocNotFoundError if the buffered entry for @a did is
 *             the DELETED_POSTING marker.
 */
bool get_nouniqbigrams(Xapian::docid did, Xapian::termcount & nouniqbigram) const {
    typedef std::map<Xapian::docid, Xapian::termcount>::const_iterator change_iter;

    const change_iter entry = nouniqbigrams_changes.find(did);
    if (entry != nouniqbigrams_changes.end()) {
        if (rare(entry->second == DELETED_POSTING))
            throw Xapian::DocNotFoundError("Document not found: " + str(did));
        nouniqbigram = entry->second;
        return true;
    }

    // No buffered change for this document.
    return false;
}

/// Flush document length changes.
void flush_doclengths(BrassPostListTable & table);

/// Flush changes to the number of unique terms.
void flush_nouniqterms(BrassPostListTable & table);

/// Flush bigram document length.
void flush_bigramdoclengths(BrassPostListTable & table);

/// Flush changes to the number of unique bigrams.
void flush_nouniqbigrams(BrassPostListTable & table);

/// Flush postlist changes for @a term.
void flush_post_list(BrassPostListTable & table, const std::string & term);

Expand All @@ -194,15 +284,6 @@ class Inverter {
/// Flush postlist changes for all terms which start with @a pfx.
void flush_post_lists(BrassPostListTable & table, const std::string & pfx);

/// Flush postlist changes for @a bigram.
void flush_post_bigram_list(BrassPostListTable & table, const std::string & bigram);

/// Flush postlist changes for all terms.
void flush_all_post_bigram_lists(BrassPostListTable & table);

/// Flush postlist changes for all terms which start with @a pfx.
void flush_post_bigram_lists(BrassPostListTable & table, const std::string & pfx);

/// Flush all changes.
void flush(BrassPostListTable & table);

Expand Down
105 changes: 93 additions & 12 deletions xapian-core/backends/brass/brass_postlist.cc
Expand Up @@ -72,6 +72,45 @@ BrassPostListTable::get_doclength(Xapian::docid did,
return doclen_pl->get_wdf();
}

// Return the stored number of unique terms for document `did`.
//
// The value is read from the postlist keyed by "nouniqterms", which is opened
// on first use and cached in nouniqterms_pl; the figure itself is whatever
// that postlist reports as the wdf for `did`.
//
// Throws Xapian::DocNotFoundError if the postlist cannot jump to `did`.
//
// NOTE(review): the statistics postlist is opened under the literal name
// "nouniqterms" - confirm that an indexed term with the same name cannot
// collide with it.
Xapian::termcount
BrassPostListTable::get_nouniqterms(Xapian::docid did,
                                    intrusive_ptr<const BrassDatabase> db) const {
    if (nouniqterms_pl.get() == 0) {
        // Don't keep a reference back to the database, since this
        // would make a reference loop.
        nouniqterms_pl.reset(new BrassPostList(db, string("nouniqterms"), false));
    }

    const bool found = nouniqterms_pl->jump_to(did);
    if (!found) {
        throw Xapian::DocNotFoundError("Document " + str(did) + " not found");
    }
    return nouniqterms_pl->get_wdf();
}

// Return the stored bigram document length for document `did`.
//
// The value is read from the postlist keyed by "bigramdoclen", which is
// opened on first use and cached in bigramdoclen_pl; the figure itself is
// whatever that postlist reports as the wdf for `did`.
//
// Throws Xapian::DocNotFoundError if the postlist cannot jump to `did`.
//
// NOTE(review): the statistics postlist is opened under the literal name
// "bigramdoclen" - confirm that an indexed term with the same name cannot
// collide with it.
Xapian::termcount
BrassPostListTable::get_bigramdoclength(Xapian::docid did,
                                        intrusive_ptr<const BrassDatabase> db) const {
    if (bigramdoclen_pl.get() == 0) {
        // Don't keep a reference back to the database, since this
        // would make a reference loop.
        bigramdoclen_pl.reset(new BrassPostList(db, string("bigramdoclen"), false));
    }

    const bool found = bigramdoclen_pl->jump_to(did);
    if (!found) {
        throw Xapian::DocNotFoundError("Document " + str(did) + " not found");
    }
    return bigramdoclen_pl->get_wdf();
}

// Return the stored number of unique bigrams for document `did`.
//
// The value is read from the postlist keyed by "nouniqbigrams", which is
// opened on first use and cached in nouniqbigrams_pl; the figure itself is
// whatever that postlist reports as the wdf for `did`.
//
// Throws Xapian::DocNotFoundError if the postlist cannot jump to `did`.
//
// NOTE(review): the statistics postlist is opened under the literal name
// "nouniqbigrams" - confirm that an indexed term with the same name cannot
// collide with it.
Xapian::termcount
BrassPostListTable::get_nouniqbigrams(Xapian::docid did,
                                      intrusive_ptr<const BrassDatabase> db) const {
    if (nouniqbigrams_pl.get() == 0) {
        // Don't keep a reference back to the database, since this
        // would make a reference loop.
        nouniqbigrams_pl.reset(new BrassPostList(db, string("nouniqbigrams"), false));
    }

    const bool found = nouniqbigrams_pl->jump_to(did);
    if (!found) {
        throw Xapian::DocNotFoundError("Document " + str(did) + " not found");
    }
    return nouniqbigrams_pl->get_wdf();
}

bool
BrassPostListTable::document_exists(Xapian::docid did,
intrusive_ptr<const BrassDatabase> db) const
Expand Down Expand Up @@ -1078,12 +1117,54 @@ BrassPostListTable::merge_doclen_changes(const map<Xapian::docid, Xapian::termco

// The cursor in the doclen_pl will no longer be valid, so reset it.
doclen_pl.reset(0);
merge_statistics_changes(doclens,string());

}

void
BrassPostListTable::merge_nouniqterms_changes(const map<Xapian::docid, Xapian::termcount> & nouniqterms)
{
LOGCALL_VOID(DB, "BrassPostListTable::merge_nouniqterms_changes", nouniqterms);

// The cursor in the nouniqterms_pl will no longer be valid, so reset it.
nouniqterms_pl.reset(0);
merge_statistics_changes(nouniqterms,string("nouniqterms"));

}

void
BrassPostListTable::merge_bigramdoclen_changes(const map<Xapian::docid, Xapian::termcount> & bigramdoclens)
{
LOGCALL_VOID(DB, "BrassPostListTable::merge_bigramdoclen_changes", bigramdoclens);

// The cursor in the bigramdoclen_pl will no longer be valid, so reset it.
bigramdoclen_pl.reset(0);
merge_statistics_changes(bigramdoclens,string("bigramdoclen"));

}

void
BrassPostListTable::merge_nouniqbigrams_changes(const map<Xapian::docid, Xapian::termcount> & nouniqbigrams)
{
LOGCALL_VOID(DB, "BrassPostListTable::merge_nouniqbigrams_changes", nouniqbigrams);

// The cursor in the nouniqbigrams_pl will no longer be valid, so reset it.
nouniqbigrams_pl.reset(0);
merge_statistics_changes(nouniqbigrams,string("nouniqbigrams"));

}

void
BrassPostListTable::merge_statistics_changes(const map<Xapian::docid, Xapian::termcount> & statlens,string keyparam)
{
LOGCALL_VOID(DB, "BrassPostListTable::merge_statistics_changes", statlens);


LOGVALUE(DB, doclens.size());
if (doclens.empty()) return;
LOGVALUE(DB, statlens.size());
if (statlens.empty()) return;

// Ensure there's a first chunk.
string current_key = make_key(string());
string current_key = make_key(keyparam);
if (!key_exists(current_key)) {
LOGLINE(DB, "Adding dummy first chunk");
string newtag = make_start_of_first_chunk(0, 0, 0);
Expand All @@ -1092,19 +1173,19 @@ BrassPostListTable::merge_doclen_changes(const map<Xapian::docid, Xapian::termco
}

map<Xapian::docid, Xapian::termcount>::const_iterator j;
j = doclens.begin();
Assert(j != doclens.end()); // This case is caught above.
j = statlens.begin();
Assert(j != statlens.end()); // This case is caught above.

Xapian::docid max_did;
PostlistChunkReader *from;
PostlistChunkWriter *to;
max_did = get_chunk(string(), j->first, true, &from, &to);
max_did = get_chunk(keyparam, j->first, true, &from, &to);
LOGVALUE(DB, max_did);
for ( ; j != doclens.end(); ++j) {
for ( ; j != statlens.end(); ++j) {
Xapian::docid did = j->first;

next_doclen_chunk:
LOGLINE(DB, "Updating doclens, did=" << did);
LOGLINE(DB, "Updating statlens, did=" << did);
if (from) while (!from->is_at_end()) {
Xapian::docid copy_did = from->get_docid();
if (copy_did >= did) {
Expand All @@ -1118,13 +1199,13 @@ BrassPostListTable::merge_doclen_changes(const map<Xapian::docid, Xapian::termco
delete from;
to->flush(this);
delete to;
max_did = get_chunk(string(), did, false, &from, &to);
max_did = get_chunk(keyparam, did, false, &from, &to);
goto next_doclen_chunk;
}

Xapian::termcount new_doclen = j->second;
if (new_doclen != static_cast<Xapian::termcount>(-1)) {
to->append(this, did, new_doclen);
Xapian::termcount new_statlen = j->second;
if (new_statlen != static_cast<Xapian::termcount>(-1)) {
to->append(this, did, new_statlen);
}
}

Expand Down

0 comments on commit f4e1036

Please sign in to comment.