Skip to content

Commit

Permalink
EM model can be pre-initialised
Browse files Browse the repository at this point in the history
  • Loading branch information
proycon committed Apr 25, 2012
1 parent 5c89f51 commit dcdbcad
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 28 deletions.
1 change: 1 addition & 0 deletions include/alignmodel.h
Expand Up @@ -108,6 +108,7 @@ class EMAlignmentModel2: public AlignmentModel {
void save(const std::string & filename);
};


class ItEMAlignmentModel: public EMAlignmentModel {
public:
ItEMAlignmentModel(SelectivePatternModel * sourcemodel, SelectivePatternModel * targetmodel, const int MAXROUNDS=10000, const double CONVERGEDTHRESHOLD=0.001, double threshold = 0.0, const int bestn = 0, bool DONULL=true, bool DEBUG = false);
Expand Down
69 changes: 41 additions & 28 deletions src/alignmodel.cpp
Expand Up @@ -245,6 +245,7 @@ void BiAlignmentModel::simpletableoutput(ClassDecoder & sourceclassdecoder, Clas
}
}

/***************************** BEGIN EM ************************************/

EMAlignmentModel::EMAlignmentModel(SelectivePatternModel * sourcemodel, SelectivePatternModel * targetmodel, bool INIT, bool DONULL, bool DEBUG) {
this->sourcemodel = sourcemodel;
Expand Down Expand Up @@ -324,16 +325,17 @@ void EMAlignmentModel::train(const int MAXROUNDS, const double CONVERGEDTHRESHOL
vector<const EncAnyGram*> * targetpatterns = &targetmodel->reverseindex[sentence];
if ((DEBUG) || (sentence % 1000 == 0)) cerr << "@" << sentence << " (" << sourcepatterns->size() << "x" << targetpatterns->size() << ")" << endl;
//compute sentencetotal for normalisation later in count step, sum_s(p(t|s))
unordered_map<const EncAnyGram*, double> sentencetotal;
for (vector<const EncAnyGram*>::iterator targetiter = targetpatterns->begin(); targetiter != targetpatterns->end(); targetiter++) {
const EncAnyGram * targetgram = *targetiter;
if (DONULL) sentencetotal[targetgram] += alignmatrix[NULLGRAM][targetgram];
for (vector<const EncAnyGram*>::const_iterator sourceiter = sourcepatterns->begin(); sourceiter != sourcepatterns->end(); sourceiter++) {
const EncAnyGram * sourcegram = *sourceiter;
sentencetotal[targetgram] += alignmatrix[sourcegram][targetgram]; //compute sum over all source conditions for a targetgram under consideration
}
unordered_map<const EncAnyGram*, double> sentencetotal;
for (vector<const EncAnyGram*>::const_iterator sourceiter = sourcepatterns->begin(); sourceiter != sourcepatterns->end(); sourceiter++) {
const EncAnyGram * sourcegram = *sourceiter;
if (alignmatrix.count(sourcegram)) {
for (vector<const EncAnyGram*>::iterator targetiter = targetpatterns->begin(); targetiter != targetpatterns->end(); targetiter++) {
const EncAnyGram * targetgram = *targetiter;
if (alignmatrix[sourcegram].count(targetgram)) sentencetotal[targetgram] += alignmatrix[sourcegram][targetgram]; //compute sum over all source conditions for a targetgram under consideration
}
}
}


//collect counts to estimate improved model (for evidence that a targetgram is aligned to a sourcegram)
Expand All @@ -342,16 +344,20 @@ void EMAlignmentModel::train(const int MAXROUNDS, const double CONVERGEDTHRESHOL

//the null condition:
if (DONULL) {
if (alignmatrix[NULLGRAM].count(targetgram)) sentencetotal[targetgram] += alignmatrix[NULLGRAM][targetgram]; //belongs to previous step technically, but moved into this loop for efficiency

const double countvalue_null = alignmatrix[NULLGRAM][targetgram] / sentencetotal[targetgram];
count[NULLGRAM][targetgram] += countvalue_null;
total[NULLGRAM] += countvalue_null;
}

for (vector<const EncAnyGram*>::const_iterator sourceiter = sourcepatterns->begin(); sourceiter != sourcepatterns->end(); sourceiter++) {
for (vector<const EncAnyGram*>::const_iterator sourceiter = sourcepatterns->begin(); sourceiter != sourcepatterns->end(); sourceiter++) {
const EncAnyGram * sourcegram = *sourceiter;
const double countvalue = alignmatrix[sourcegram][targetgram] / sentencetotal[targetgram];
count[sourcegram][targetgram] += countvalue;
total[sourcegram] += countvalue;
if ((alignmatrix.count(sourcegram) && alignmatrix[sourcegram].count(targetgram))) {
const double countvalue = alignmatrix[sourcegram][targetgram] / sentencetotal[targetgram];
count[sourcegram][targetgram] += countvalue;
total[sourcegram] += countvalue;
}
}
}

Expand Down Expand Up @@ -495,6 +501,8 @@ void EMAlignmentModel::save(const string & filename) {
}


/************************************************* END EM ***************/

ItEMAlignmentModel::ItEMAlignmentModel(SelectivePatternModel * sourcemodel, SelectivePatternModel * targetmodel, const int MAXROUNDS, const double CONVERGEDTHRESHOLD, double probthreshold, const int bestn, bool DONULL, bool DEBUG) {
// Compute p(target|source) alignmatrix[source][target]
/*
Expand Down Expand Up @@ -1274,6 +1282,8 @@ void orderedinsert(list<double> & l, double value) {





EMAlignmentModel2::EMAlignmentModel2(SelectivePatternModel * sourcemodel, SelectivePatternModel * targetmodel, bool INIT, bool DONULL, bool DEBUG) {
this->sourcemodel = sourcemodel;
this->targetmodel = targetmodel;
Expand Down Expand Up @@ -1352,16 +1362,17 @@ void EMAlignmentModel2::train(const int MAXROUNDS, const double CONVERGEDTHRESHO
vector<const EncAnyGram*> * targetpatterns = &targetmodel->reverseindex[sentence];
if ((DEBUG) || (sentence % 1000 == 0)) cerr << "@" << sentence << " (" << sourcepatterns->size() << "x" << targetpatterns->size() << ")" << endl;
//compute sentencetotal for normalisation later in count step, sum_s(p(t|s))
unordered_map<const EncAnyGram*, double> sentencetotal;
for (vector<const EncAnyGram*>::iterator targetiter = targetpatterns->begin(); targetiter != targetpatterns->end(); targetiter++) {
const EncAnyGram * targetgram = *targetiter;
if (DONULL) sentencetotal[targetgram] += alignmatrix[NULLGRAM][targetgram];
for (vector<const EncAnyGram*>::const_iterator sourceiter = sourcepatterns->begin(); sourceiter != sourcepatterns->end(); sourceiter++) {
const EncAnyGram * sourcegram = *sourceiter;
sentencetotal[targetgram] += alignmatrix[sourcegram][targetgram]; //compute sum over all source conditions for a targetgram under consideration
}
unordered_map<const EncAnyGram*, double> sentencetotal;
for (vector<const EncAnyGram*>::const_iterator sourceiter = sourcepatterns->begin(); sourceiter != sourcepatterns->end(); sourceiter++) {
const EncAnyGram * sourcegram = *sourceiter;
if (alignmatrix.count(sourcegram)) {
for (vector<const EncAnyGram*>::iterator targetiter = targetpatterns->begin(); targetiter != targetpatterns->end(); targetiter++) {
const EncAnyGram * targetgram = *targetiter;
if (alignmatrix[sourcegram].count(targetgram)) sentencetotal[targetgram] += alignmatrix[sourcegram][targetgram]; //compute sum over all source conditions for a targetgram under consideration
}
}
}


//collect counts to estimate improved model (for evidence that a targetgram is aligned to a sourcegram)
Expand All @@ -1370,16 +1381,20 @@ void EMAlignmentModel2::train(const int MAXROUNDS, const double CONVERGEDTHRESHO

//the null condition:
if (DONULL) {
if (alignmatrix[NULLGRAM].count(targetgram)) sentencetotal[targetgram] += alignmatrix[NULLGRAM][targetgram]; //belongs to previous step technically, but moved into this loop for efficiency

const double countvalue_null = alignmatrix[NULLGRAM][targetgram] / sentencetotal[targetgram];
count[NULLGRAM][targetgram] += countvalue_null;
total[NULLGRAM] += countvalue_null;
}

for (vector<const EncAnyGram*>::const_iterator sourceiter = sourcepatterns->begin(); sourceiter != sourcepatterns->end(); sourceiter++) {
for (vector<const EncAnyGram*>::const_iterator sourceiter = sourcepatterns->begin(); sourceiter != sourcepatterns->end(); sourceiter++) {
const EncAnyGram * sourcegram = *sourceiter;
const double countvalue = alignmatrix[sourcegram][targetgram] / sentencetotal[targetgram];
count[sourcegram][targetgram] += countvalue;
total[sourcegram] += countvalue;
if ((alignmatrix.count(sourcegram) && alignmatrix[sourcegram].count(targetgram))) {
const double countvalue = alignmatrix[sourcegram][targetgram] / sentencetotal[targetgram];
count[sourcegram][targetgram] += countvalue;
total[sourcegram] += countvalue;
}
}
}

Expand Down Expand Up @@ -1522,5 +1537,3 @@ void EMAlignmentModel2::save(const string & filename) {
f.close();
}



0 comments on commit dcdbcad

Please sign in to comment.