Commit 871ccc3
similarity threshold high precision and low precision; cache similarities
Sonal Gupta authored and Stanford NLP committed Jan 23, 2015
1 parent 386c869 commit 871ccc3
Showing 2 changed files with 55 additions and 21 deletions.
src/edu/stanford/nlp/patterns/ConstantsAndVariables.java (7 changes: 5 additions & 2 deletions)
@@ -436,8 +436,11 @@ public class ConstantsAndVariables<E> implements Serializable{
   @Option(name="expandPositivesWhenSampling", gloss="when sampling for learning feature wts for learning phrases, expand the positives")
   public boolean expandPositivesWhenSampling = false;
 
-  @Option(name="positiveSimilarityThreshold")
-  public double positiveSimilarityThreshold = 0.7;
+  @Option(name="positiveSimilarityThresholdHighPrecision", gloss="used for expanding positives")
+  public double positiveSimilarityThresholdHighPrecision = 0.7;
+
+  @Option(name="positiveSimilarityThresholdLowPrecision", gloss="used for not choosing close unknowns as positives")
+  public double positiveSimilarityThresholdLowPrecision = 0.5;
 
   @Option(name="subSampleUnkAsPosUsingSimPercentage", gloss="When using expandPositivesWhenSampling, select top % after applying the threshold")
   public double subSampleUnkAsPosUsingSimPercentage = 0.05;
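The two new options replace the single positiveSimilarityThreshold with a stricter and a looser cutoff: per the glosses, the high-precision threshold (default 0.7) decides which unlabeled phrases are close enough to the known positives to be expanded into the positive set, while the low-precision threshold (default 0.5) only keeps near-positive unknowns from being sampled as negatives. A minimal sketch of that split, assuming a precomputed similarity score; the class, enum, and decide method below are illustrative and not part of the commit:

    // Illustrative only: shows how the two thresholds partition candidates by
    // their similarity to the known positive phrases.
    public class ThresholdSketch {

      enum Use { EXPAND_AS_POSITIVE, SKIP, USE_AS_NEGATIVE }

      // Defaults mirror positiveSimilarityThresholdHighPrecision / ...LowPrecision.
      static final double HIGH_PRECISION = 0.7;
      static final double LOW_PRECISION = 0.5;

      static Use decide(double simToPositives) {
        if (simToPositives > HIGH_PRECISION)
          return Use.EXPAND_AS_POSITIVE; // confident enough to treat as a positive
        if (simToPositives > LOW_PRECISION)
          return Use.SKIP;               // too close to the positives to trust as a negative
        return Use.USE_AS_NEGATIVE;      // safe to sample as a negative
      }

      public static void main(String[] args) {
        System.out.println(decide(0.8)); // EXPAND_AS_POSITIVE
        System.out.println(decide(0.6)); // SKIP
        System.out.println(decide(0.3)); // USE_AS_NEGATIVE
      }
    }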
src/edu/stanford/nlp/patterns/ScorePhrasesLearnFeatWt.java (69 changes: 50 additions & 19 deletions)
@@ -245,24 +245,30 @@ private Counter<CandidatePhrase> computeSimWithWordVectors(List<CandidatePhrase>
         donotuse = true;
         break;
       }
-      if(!wordVectors.containsKey(pos.getPhrase()))
+      if (!wordVectors.containsKey(pos.getPhrase()))
         continue;
-      double[] d2 = wordVectors.get(pos.getPhrase());
-
-      double sum = 0;
-      double d1sq = 0;
-      double d2sq = 0;
-      for (int i = 0; i < d1.length; i++) {
-        sum += d1[i] * d2[i];
-        d1sq += d1[i] * d1[i];
-        d2sq += d2[i] * d2[i];
-      }
-      double sim = sum / (Math.sqrt(d1sq) * Math.sqrt(d2sq));
-      avgSim += sim;
-    }
-
-
-    avgSim /= positivePhrases.size();
+      PhrasePair pair = new PhrasePair(p.getPhrase(), pos.getPhrase());
+      if (cacheSimilarities.containsKey(pair))
+        avgSim = cacheSimilarities.getCount(pair);
+      else {
+        double[] d2 = wordVectors.get(pos.getPhrase());
+
+        double sum = 0;
+        double d1sq = 0;
+        double d2sq = 0;
+        for (int i = 0; i < d1.length; i++) {
+          sum += d1[i] * d2[i];
+          d1sq += d1[i] * d1[i];
+          d2sq += d2[i] * d2[i];
+        }
+        double sim = sum / (Math.sqrt(d1sq) * Math.sqrt(d2sq));
+        avgSim += sim;
+      }
+
+
+      avgSim /= positivePhrases.size();
+      cacheSimilarities.setCount(pair, avgSim);
+    }
     if(!donotuse){
       sims.setCount(p, avgSim);
       if(allMaxSim.get() < avgSim)
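The arithmetic inside the else branch is plain cosine similarity between the candidate phrase's word vector and each positive phrase's vector, accumulated and then averaged over the positives. A standalone restatement of that computation, using hypothetical method and variable names rather than the ones in ScorePhrasesLearnFeatWt:

    import java.util.List;
    import java.util.Map;

    // Sketch only: the same dot-product / norm computation as the sum, d1sq, d2sq loop above.
    class CosineSimSketch {

      // cos(a, b) = (a . b) / (|a| * |b|)
      static double cosine(double[] a, double[] b) {
        double dot = 0, aSq = 0, bSq = 0;
        for (int i = 0; i < a.length; i++) {
          dot += a[i] * b[i];
          aSq += a[i] * a[i];
          bSq += b[i] * b[i];
        }
        return dot / (Math.sqrt(aSq) * Math.sqrt(bSq));
      }

      // Average similarity of one candidate vector against the known positive phrases,
      // skipping positives that have no vector, as the containsKey check above does.
      static double avgSimToPositives(double[] candidateVec, List<String> positivePhrases,
                                      Map<String, double[]> wordVectors) {
        double avgSim = 0;
        for (String pos : positivePhrases) {
          if (!wordVectors.containsKey(pos))
            continue;
          avgSim += cosine(candidateVec, wordVectors.get(pos));
        }
        return avgSim / positivePhrases.size();
      }
    }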
@@ -384,8 +390,8 @@ Set<CandidatePhrase> chooseUnknownAsNegatives(Set<CandidatePhrase> candidatePhra
     CandidatePhrase k = Counters.argmax(sims);
     System.out.println("Maximum similarity was " + sims.getCount(k) + " for word " + k);
 
-    Counter<CandidatePhrase> removed = Counters.retainBelow(sims, constVars.positiveSimilarityThreshold);
-    System.out.println("removing phrases as negative phrases that were higher that positive similarity threshold of " + constVars.positiveSimilarityThreshold + removed);
+    Counter<CandidatePhrase> removed = Counters.retainBelow(sims, constVars.positiveSimilarityThresholdLowPrecision);
+    System.out.println("removing phrases as negative phrases that were higher that positive similarity threshold of " + constVars.positiveSimilarityThresholdLowPrecision + removed);
     if(logFile != null){
       for(Entry<CandidatePhrase, Double> en: removed.entrySet())
         if(wordVectors.containsKey(en.getKey().getPhrase()))
@@ -586,7 +592,7 @@ public Quadruple<Set<CandidatePhrase>, Set<CandidatePhrase>, Set<CandidatePhrase
           sims = computeSimWithWordCluster(Arrays.asList(candidate), knownPositivePhrases, new AtomicDouble());
 
         double sim = sims.getCount(candidate);
-        if (sim > constVars.positiveSimilarityThreshold)
+        if (sim > constVars.positiveSimilarityThresholdHighPrecision)
          allCloseToPositivePhrases.setCount(candidate, sim);
       }
     }
@@ -611,13 +617,38 @@ public Quadruple<Set<CandidatePhrase>, Set<CandidatePhrase>, Set<CandidatePhrase
     }
   }
 
+  private class PhrasePair {
+    final String p1;
+    final String p2;
+    final int hashCode;
+    public PhrasePair(String p1, String p2) {
+      if (p1.compareTo(p2) <= 0) {
+        this.p1 = p1;
+        this.p2 = p2;
+      } else {
+        this.p1 = p2;
+        this.p2 = p1;
+      }
+
+      this.hashCode = p1.hashCode() + p2.hashCode() + 331;
+    }
+
+    @Override
+    public int hashCode() {
+      return hashCode;
+    }
+  }
+
+  Counter<PhrasePair> cacheSimilarities = new ConcurrentHashCounter<PhrasePair>();
+
+
   public RVFDataset<String, ScorePhraseMeasures> choosedatums(boolean forLearningPattern, String answerLabel,
       TwoDimensionalCounter<CandidatePhrase, E> wordsPatExtracted,
       Counter<E> allSelectedPatterns, boolean computeRawFreq) throws IOException {
 
     Counter<Integer> distSimClustersOfPositive = new ClassicCounter<Integer>();
-    if(constVars.expandPositivesWhenSampling){
+    if(constVars.expandPositivesWhenSampling && !constVars.useWordVectorsToComputeSim){
       for(CandidatePhrase s: CollectionUtils.union(constVars.getLearnedWords(answerLabel).keySet(), constVars.getSeedLabelDictionary().get(answerLabel))){
         String[] toks = s.getPhrase().split("\\s+");
         if(!constVars.getWordClassClusters().containsKey(s.getPhrase())){
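One note on the new cache key: in the lines added here, PhrasePair normalizes the order of the two phrases and overrides hashCode(), but it does not override equals(), so a containsKey lookup with a freshly constructed pair will generally fall back to reference equality and miss the cache. A sketch of a key with both methods under the same symmetric hashing scheme; the class name is illustrative and this is a suggestion rather than part of the commit:

    // Illustrative pair key: order-insensitive, with hashCode() and equals() kept
    // consistent so that two instances built from the same two phrases compare equal.
    final class SymmetricPhrasePair {
      final String p1;
      final String p2;

      SymmetricPhrasePair(String a, String b) {
        // Normalize order so (a, b) and (b, a) map to the same key.
        if (a.compareTo(b) <= 0) { this.p1 = a; this.p2 = b; }
        else { this.p1 = b; this.p2 = a; }
      }

      @Override
      public int hashCode() {
        return p1.hashCode() + p2.hashCode() + 331; // same symmetric scheme as above
      }

      @Override
      public boolean equals(Object o) {
        if (this == o) return true;
        if (!(o instanceof SymmetricPhrasePair)) return false;
        SymmetricPhrasePair other = (SymmetricPhrasePair) o;
        // p1/p2 are already order-normalized, so field-by-field comparison suffices.
        return p1.equals(other.p1) && p2.equals(other.p2);
      }
    }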
