diff --git a/src/edu/stanford/nlp/coref/CorefAlgorithm.java b/src/edu/stanford/nlp/coref/CorefAlgorithm.java index 7d9e377916..5f81cba197 100644 --- a/src/edu/stanford/nlp/coref/CorefAlgorithm.java +++ b/src/edu/stanford/nlp/coref/CorefAlgorithm.java @@ -5,10 +5,10 @@ import edu.stanford.nlp.coref.CorefProperties.CorefAlgorithmType; import edu.stanford.nlp.coref.data.Dictionaries; import edu.stanford.nlp.coref.data.Document; -import edu.stanford.nlp.coref.hybrid.HybridCorefSystem; +//import edu.stanford.nlp.coref.hybrid.HybridCorefSystem; import edu.stanford.nlp.coref.neural.NeuralCorefAlgorithm; -import edu.stanford.nlp.coref.statistical.ClusteringCorefAlgorithm; -import edu.stanford.nlp.coref.statistical.StatisticalCorefAlgorithm; +//import edu.stanford.nlp.coref.statistical.ClusteringCorefAlgorithm; +//import edu.stanford.nlp.coref.statistical.StatisticalCorefAlgorithm; /** * A CorefAlgorithms makes coreference decisions on the provided {@link Document} after @@ -16,9 +16,10 @@ * @author Kevin Clark */ public interface CorefAlgorithm { - public void runCoref(Document document); - public static CorefAlgorithm fromProps(Properties props, Dictionaries dictionaries) { + public void runCoref(Document document); + + /*public static CorefAlgorithm fromProps(Properties props, Dictionaries dictionaries) { CorefAlgorithmType algorithm = CorefProperties.algorithm(props); if (algorithm == CorefAlgorithmType.CLUSTERING) { return new ClusteringCorefAlgorithm(props, dictionaries); @@ -33,5 +34,10 @@ public static CorefAlgorithm fromProps(Properties props, Dictionaries dictionari throw new RuntimeException("Error creating hybrid coref system", e); } } - } + }*/ + + public static CorefAlgorithm fromProps(Properties props, Dictionaries dictionaries) { + return new NeuralCorefAlgorithm(props, dictionaries); + } + } diff --git a/src/edu/stanford/nlp/coref/CorefCoreAnnotations.java b/src/edu/stanford/nlp/coref/CorefCoreAnnotations.java index 05a8a900db..8e35410929 100644 --- a/src/edu/stanford/nlp/coref/CorefCoreAnnotations.java +++ b/src/edu/stanford/nlp/coref/CorefCoreAnnotations.java @@ -6,7 +6,6 @@ import edu.stanford.nlp.coref.data.CorefChain; import edu.stanford.nlp.coref.data.Mention; - import edu.stanford.nlp.ling.CoreAnnotation; import edu.stanford.nlp.ling.CoreLabel; import edu.stanford.nlp.util.ErasureUtils; @@ -14,7 +13,7 @@ import edu.stanford.nlp.util.Pair; /** - * Similar to {@link edu.stanford.nlp.ling.CoreAnnotations}, + * Similar to {@link edu.stanford.nlp.ling.CoreAnnotations}, * but this class contains * annotations made specifically for storing Coref data. 
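(A minimal usage sketch for the reduced CorefAlgorithm.fromProps factory above: with the clustering, statistical, and hybrid branches commented out, it now returns a NeuralCorefAlgorithm regardless of the coref.algorithm setting. The property value and the pre-built Document are illustrative assumptions, not code from this patch.)

    Properties props = new Properties();
    props.setProperty("coref.algorithm", "neural");  // any value: the reduced factory ignores it
    Dictionaries dictionaries = new Dictionaries(props);
    CorefAlgorithm algorithm = CorefAlgorithm.fromProps(props, dictionaries);
    algorithm.runCoref(document);  // makes coref decisions on the provided Document in place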
This is kept * separate from CoreAnnotations so that systems which only need @@ -80,7 +79,7 @@ public Class> getType() { return ErasureUtils.uncheckedCast(Set.class); } } - + /** * CorefChainID - CorefChain map */ diff --git a/src/edu/stanford/nlp/coref/CorefDocMaker.java b/src/edu/stanford/nlp/coref/CorefDocMaker.java new file mode 100644 index 0000000000..96ded84a11 --- /dev/null +++ b/src/edu/stanford/nlp/coref/CorefDocMaker.java @@ -0,0 +1,252 @@ +package edu.stanford.nlp.coref; + +import java.util.ArrayList; +import java.io.IOException; +import java.io.ObjectInputStream; +import java.util.List; +import java.util.Locale; +import java.util.Properties; + +import edu.stanford.nlp.classify.LogisticClassifier; +import edu.stanford.nlp.coref.data.Dictionaries; +import edu.stanford.nlp.coref.data.Document; +import edu.stanford.nlp.coref.data.InputDoc; +import edu.stanford.nlp.coref.data.Mention; +import edu.stanford.nlp.coref.docreader.CoNLLDocumentReader; +import edu.stanford.nlp.coref.docreader.DocReader; +import edu.stanford.nlp.coref.md.CorefMentionFinder; +import edu.stanford.nlp.coref.md.DependencyCorefMentionFinder; +import edu.stanford.nlp.coref.md.HybridCorefMentionFinder; +import edu.stanford.nlp.coref.md.RuleBasedCorefMentionFinder; +import edu.stanford.nlp.io.IOUtils; +import edu.stanford.nlp.io.RuntimeIOException; +import edu.stanford.nlp.ling.CoreAnnotations; +import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation; +import edu.stanford.nlp.pipeline.Annotation; +import edu.stanford.nlp.pipeline.StanfordCoreNLP; +import edu.stanford.nlp.trees.HeadFinder; +import edu.stanford.nlp.trees.SemanticHeadFinder; +import edu.stanford.nlp.trees.Tree; +import edu.stanford.nlp.trees.TreeCoreAnnotations; +import edu.stanford.nlp.trees.TreeLemmatizer; +import edu.stanford.nlp.trees.international.pennchinese.ChineseSemanticHeadFinder; +import edu.stanford.nlp.util.CoreMap; +import edu.stanford.nlp.util.logging.Redwood; + +/** + * + * make Document for coref input from Annotation and optional info + * read input (raw, conll etc) with DocReader, mention detection, and document preprocessing will be done here + * + * @author heeyoung + */ +public class CorefDocMaker { + + Properties props; + DocReader reader; + final HeadFinder headFinder; + CorefMentionFinder md; + Dictionaries dict; + StanfordCoreNLP corenlp; + final TreeLemmatizer treeLemmatizer; + LogisticClassifier singletonPredictor; + + boolean addMissingAnnotations ; + + public CorefDocMaker(Properties props, Dictionaries dictionaries) throws ClassNotFoundException, IOException { + this.props = props; + this.dict = dictionaries; + reader = getDocumentReader(props); + headFinder = getHeadFinder(props); + md = getMentionFinder(props, dictionaries, headFinder); +// corenlp = new StanfordCoreNLP(props, false); + // the property coref.addMissingAnnotations must be set to true to get the CorefDocMaker to add annotations + if (CorefProperties.addMissingAnnotations(props)) { + addMissingAnnotations = true; + corenlp = loadStanfordProcessor(props); + } else { + addMissingAnnotations = false; + } + treeLemmatizer = new TreeLemmatizer(); + singletonPredictor = (CorefProperties.useSingletonPredictor(props))? 
+ getSingletonPredictorFromSerializedFile(CorefProperties.getPathSingletonPredictor(props)) : null; + } + + /** Load Stanford Processor: skip unnecessary annotator */ + protected StanfordCoreNLP loadStanfordProcessor(Properties props) { + + Properties pipelineProps = new Properties(props); + StringBuilder annoSb = new StringBuilder(""); + if (!CorefProperties.useGoldPOS(props)) { + annoSb.append("pos, lemma"); + } else { + annoSb.append("lemma"); + } + if(CorefProperties.USE_TRUECASE) { + annoSb.append(", truecase"); + } + if (!CorefProperties.useGoldNE(props) || CorefProperties.getLanguage(props)==Locale.CHINESE) { + annoSb.append(", ner"); + } + if (!CorefProperties.useGoldParse(props)) { + if(CorefProperties.useConstituencyTree(props)) annoSb.append(", parse"); + else annoSb.append(", depparse"); + } + // need to add mentions + annoSb.append(", mention"); + String annoStr = annoSb.toString(); + Redwood.log("MentionExtractor ignores specified annotators, using annotators=" + annoStr); + pipelineProps.put("annotators", annoStr); + return new StanfordCoreNLP(pipelineProps, false); + } + + + private static DocReader getDocumentReader(Properties props) { + switch (CorefProperties.getInputType(props)) { + case CONLL: + String corpusPath = CorefProperties.getPathInput(props); + CoNLLDocumentReader.Options options = new CoNLLDocumentReader.Options(); + options.annotateTokenCoref = false; + if (CorefProperties.useCoNLLAuto(props)) options.setFilter(".*_auto_conll$"); + options.lang = CorefProperties.getLanguage(props); + return new CoNLLDocumentReader(corpusPath, options); + + case ACE: + // TODO + return null; + + case MUC: + // TODO + return null; + + case RAW: + default: // default is raw text + // TODO + return null; + } + } + + private static HeadFinder getHeadFinder(Properties props) { + Locale lang = CorefProperties.getLanguage(props); + if(lang == Locale.ENGLISH) return new SemanticHeadFinder(); + else if(lang == Locale.CHINESE) return new ChineseSemanticHeadFinder(); + else { + throw new RuntimeException("Invalid language setting: cannot load HeadFinder"); + } + } + + private static CorefMentionFinder getMentionFinder(Properties props, Dictionaries dictionaries, HeadFinder headFinder) throws ClassNotFoundException, IOException { + + switch (CorefProperties.getMDType(props)) { + case RULE: + return new RuleBasedCorefMentionFinder(headFinder, props); + + case HYBRID: + return new HybridCorefMentionFinder(headFinder, props); + + case DEPENDENCY: + default: // default is dependency + return new DependencyCorefMentionFinder(props); + } + } + + public Document makeDocument(Annotation anno) throws Exception { + return makeDocument(new InputDoc(anno, null, null)); + } + + /** + * Make Document for coref (for method coref(Document doc, StringBuilder[] outputs)). + * Mention detection and document preprocessing is done here. 
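(A sketch of the intended call pattern for CorefDocMaker, assuming CoNLL input; the two property values are illustrative, and the checked exceptions thrown by the constructor and nextDoc() are left to propagate for brevity.)

    Properties props = new Properties();
    props.setProperty("coref.input.type", "conll");           // assumed input type
    props.setProperty("coref.path.input", "/path/to/conll");  // assumed corpus location
    CorefDocMaker docMaker = new CorefDocMaker(props, new Dictionaries(props));
    docMaker.resetDocs();
    for (Document doc = docMaker.nextDoc(); doc != null; doc = docMaker.nextDoc()) {
      // each Document arrives with mentions detected and Preprocessor.preprocess applied
    }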
+ * @throws Exception + */ + public Document makeDocument(InputDoc input) throws Exception { + if (input == null) return null; + Annotation anno = input.annotation; + + if (Boolean.parseBoolean(props.getProperty("coref.useMarkedDiscourse", "false"))) { + anno.set(CoreAnnotations.UseMarkedDiscourseAnnotation.class, true); + } + + // add missing annotation + if (addMissingAnnotations) { + addMissingAnnotation(anno); + } + + // remove nested NP with same headword except newswire document for chinese + + //if(input.conllDoc != null && CorefProperties.getLanguage(props)==Locale.CHINESE){ + //CorefProperties.setRemoveNested(props, !input.conllDoc.documentID.contains("nw")); + //} + + // each sentence should have a CorefCoreAnnotations.CorefMentionsAnnotation.class which maps to List + // this is set by the mentions annotator + List> mentions = new ArrayList<>() ; + for (CoreMap sentence : anno.get(CoreAnnotations.SentencesAnnotation.class)) { + mentions.add(sentence.get(CorefCoreAnnotations.CorefMentionsAnnotation.class)); + } + + Document doc = new Document(input, mentions); + + // find headword for gold mentions + if(input.goldMentions!=null) findGoldMentionHeads(doc); + + // document preprocessing: initialization (assign ID), mention processing (gender, number, type, etc), speaker extraction, etc + Preprocessor.preprocess(doc, dict, singletonPredictor, headFinder); + + return doc; + } + + private void findGoldMentionHeads(Document doc) { + List sentences = doc.annotation.get(SentencesAnnotation.class); + for(int i=0 ; i sentences = anno.get(CoreAnnotations.SentencesAnnotation.class); + for (CoreMap sentence : sentences) { + boolean hasTree = sentence.containsKey(TreeCoreAnnotations.TreeAnnotation.class); + Tree tree = sentence.get(TreeCoreAnnotations.TreeAnnotation.class); + + if (!useConstituency) { // TODO: temp for dev: make sure we don't use constituency tree + sentence.remove(TreeCoreAnnotations.TreeAnnotation.class); + } + if (LEMMATIZE && hasTree && useConstituency) treeLemmatizer.transformTree(tree); // TODO don't need? + } + corenlp.annotate(anno); + } else { + throw new RuntimeException("Error: must set coref.addMissingAnnotations = true to call method addMissingAnnotation"); + } + } + + public void resetDocs() { + reader.reset(); + } + + public Document nextDoc() throws Exception { + InputDoc input = reader.nextDoc(); + return (input == null)? 
null : makeDocument(input); + } + + public static LogisticClassifier getSingletonPredictorFromSerializedFile(String serializedFile) { + try { + ObjectInputStream ois = IOUtils.readStreamFromString(serializedFile); + Object o = ois.readObject(); + if (o instanceof LogisticClassifier) { + return (LogisticClassifier) o; + } + throw new ClassCastException("Wanted SingletonPredictor, got " + o.getClass()); + } catch (IOException e) { + throw new RuntimeIOException(e); + } catch (ClassNotFoundException e) { + throw new RuntimeException(e); + } + } + +} diff --git a/src/edu/stanford/nlp/coref/CorefDocumentProcessor.java b/src/edu/stanford/nlp/coref/CorefDocumentProcessor.java index 4ab9c7a8a9..5451e7308f 100644 --- a/src/edu/stanford/nlp/coref/CorefDocumentProcessor.java +++ b/src/edu/stanford/nlp/coref/CorefDocumentProcessor.java @@ -40,6 +40,14 @@ public default void run(DocumentMaker docMaker) throws Exception { Document document = docMaker.nextDoc(); long time = System.currentTimeMillis(); while (document != null) { + /*if (docId < 130) { + Redwood.log(getName(), "Processed document " + docId + " in " + + (System.currentTimeMillis() - time) / 1000.0 + "s"); + time = System.currentTimeMillis(); + docId++; + document = docMaker.nextDoc(); + continue; + }*/ document.extractGoldCorefClusters(); process(docId, document); Redwood.log(getName(), "Processed document " + docId + " in " @@ -47,6 +55,10 @@ public default void run(DocumentMaker docMaker) throws Exception { time = System.currentTimeMillis(); docId++; document = docMaker.nextDoc(); + + /*if (docId > 15) { + break; + }*/ } finish(); } diff --git a/src/edu/stanford/nlp/coref/CorefPrinter.java b/src/edu/stanford/nlp/coref/CorefPrinter.java index ba5e95bef4..798a2f5014 100644 --- a/src/edu/stanford/nlp/coref/CorefPrinter.java +++ b/src/edu/stanford/nlp/coref/CorefPrinter.java @@ -14,6 +14,14 @@ import edu.stanford.nlp.util.CoreMap; import edu.stanford.nlp.util.Generics; +import edu.stanford.nlp.util.logging.Redwood; + + +import java.text.DecimalFormat; +import java.util.logging.*; +import java.util.regex.*; + + /** * Class for printing out coreference output. * @author Heeyoung Lee @@ -28,9 +36,9 @@ public static String printConllOutput(Document document, boolean gold, boolean f List> orderedMentions = gold ? 
document.goldMentions : document.predictedMentions; if (filterSingletons) { orderedMentions = orderedMentions.stream().map( - ml -> ml.stream().filter(m -> document.corefClusters.get(m.corefClusterID) != null && - document.corefClusters.get(m.corefClusterID).size() > 1) - .collect(Collectors.toList())) + ml -> ml.stream().filter(m -> document.corefClusters.get(m.corefClusterID) != null && + document.corefClusters.get(m.corefClusterID).getCorefMentions().size() > 1) + .collect(Collectors.toList())) .collect(Collectors.toList()); } return CorefPrinter.printConllOutput(document, orderedMentions, gold); @@ -105,4 +113,5 @@ public static String printConllOutput(Document document, return sb.toString(); } + } diff --git a/src/edu/stanford/nlp/coref/CorefProperties.java b/src/edu/stanford/nlp/coref/CorefProperties.java index 00a399ce48..b681acd7cc 100644 --- a/src/edu/stanford/nlp/coref/CorefProperties.java +++ b/src/edu/stanford/nlp/coref/CorefProperties.java @@ -1,107 +1,454 @@ package edu.stanford.nlp.coref; +import java.io.File; +import java.util.Arrays; +import java.util.HashSet; import java.util.Locale; import java.util.Properties; +import java.util.Set; +import edu.stanford.nlp.coref.data.Dictionaries.MentionType; +import edu.stanford.nlp.coref.sieve.Sieve.ClassifierType; +import edu.stanford.nlp.util.Generics; import edu.stanford.nlp.util.PropertiesUtils; -/** - * Manages the properties for running coref - * @author Kevin Clark - */ + public class CorefProperties { - //---------- Coreference Algorithms ---------- public enum CorefAlgorithmType {CLUSTERING, STATISTICAL, NEURAL, HYBRID} + public enum CorefInputType { RAW, CONLL, ACE, MUC } + public enum Dataset {TRAIN, DEV, TEST} + public enum MentionDetectionType { RULE, HYBRID, DEPENDENCY } + + // general + public static final String LANG_PROP = "coref.language"; + public static final String SIEVES_PROP = "coref.sieves"; + public static final String ALLOW_REPARSING_PROP = "coref.allowReparsing"; + public static final String SCORE_PROP = "coref.doScore"; + public static final String PARSER_PROP = "coref.useConstituencyTree"; + public static final String THREADS_PROP = "coref.threadCount"; + public static final String INPUT_TYPE_PROP = "coref.input.type"; + public static final String POSTPROCESSING_PROP = "coref.postprocessing"; + public static final String MD_TYPE_PROP = "coref.md.type"; + public static final String USE_SINGLETON_PREDICTOR_PROP = "coref.useSingletonPredictor"; + public static final String SEED_PROP = "coref.seed"; + public static final String CONLL_AUTO_PROP = "coref.conll.auto"; + public static final String MD_TRAIN_PROP = "coref.md.isTraining"; // train MD classifier + public static final String USE_SEMANTICS_PROP = "coref.useSemantics"; // load semantics if true + public static final String CURRENT_SIEVE_FOR_TRAIN_PROP = "coref.currentSieveForTrain"; + public static final String STORE_TRAINDATA_PROP = "coref.storeTrainData"; + public static final String USE_GOLD_NE_PROP = "coref.useGoldNE"; + public static final String USE_GOLD_PARSES_PROP = "coref.useGoldParse"; + public static final String USE_GOLD_POS_PROP = "coref.useGoldPOS"; + private static final String REMOVE_NESTED = "removeNested"; + private static final String ADD_MISSING_ANNOTATIONS = "coref.addMissingAnnotations"; + + // logging & system check & analysis + public static final String DEBUG_PROP = "coref.debug"; + public static final String LOG_PROP = "coref.logFile"; + public static final String TIMER_PROP = "coref.checkTime"; + public static final String 
MEMORY_PROP = "coref.checkMemory"; + public static final String PRINT_MDLOG_PROP = "coref.print.md.log"; + public static final String CALCULATE_IMPORTANCE_PROP = "coref.calculateFeatureImportance"; + public static final String DO_ANALYSIS_PROP = "coref.analysis.doAnalysis"; + public static final String ANALYSIS_SKIP_MTYPE_PROP = "coref.analysis.skip.mType"; + public static final String ANALYSIS_SKIP_ATYPE_PROP = "coref.analysis.skip.aType"; + + // data & io + public static final String STATES_PROP = "coref.states"; + public static final String DEMONYM_PROP = "coref.demonym"; + public static final String ANIMATE_PROP = "coref.animate"; + public static final String INANIMATE_PROP = "coref.inanimate"; + public static final String MALE_PROP = "coref.male"; + public static final String NEUTRAL_PROP = "coref.neutral"; + public static final String FEMALE_PROP = "coref.female"; + public static final String PLURAL_PROP = "coref.plural"; + public static final String SINGULAR_PROP = "coref.singular"; + public static final String GENDER_NUMBER_PROP = "coref.big.gender.number"; + public static final String COUNTRIES_PROP = "coref.countries"; + public static final String STATES_PROVINCES_PROP = "coref.states.provinces"; + public static final String DICT_LIST_PROP = "coref.dictlist"; + public static final String DICT_PMI_PROP = "coref.dictpmi"; + public static final String SIGNATURES_PROP = "coref.signatures"; + public static final String LOAD_WORD_EMBEDDING_PROP = "coref.loadWordEmbedding"; + public static final String WORD2VEC_PROP = "coref.path.word2vec"; + public static final String WORD2VEC_SERIALIZED_PROP = "coref.path.word2vecSerialized"; + + public static final String PATH_SCORER_PROP = "coref.scorer"; + + public static final String PATH_INPUT_PROP = "coref.path.input"; + public static final String PATH_OUTPUT_PROP = "coref.path.output"; + + public static final String PATH_TRAIN_PROP = "coref.path.traindata"; + public static final String PATH_EVAL_PROP = "coref.path.evaldata"; + + public static final String PATH_SERIALIZED_PROP = "coref.path.serialized"; + + // models + public static final String PATH_SINGLETON_PREDICTOR_PROP = "coref.path.singletonPredictor"; + public static final String PATH_MODEL_PROP = "coref.SIEVENAME.model"; + public static final String MENTION_DETECTION_MODEL_PROP = "coref.md.model"; + + // sieve option + public static final String CLASSIFIER_TYPE_PROP = "coref.SIEVENAME.classifierType"; + public static final String NUM_TREE_PROP = "coref.SIEVENAME.numTrees"; + public static final String NUM_FEATURES_PROP = "coref.SIEVENAME.numFeatures"; + public static final String TREE_DEPTH_PROP = "coref.SIEVENAME.treeDepth"; + public static final String MAX_SENT_DIST_PROP = "coref.SIEVENAME.maxSentDist"; + public static final String MTYPE_PROP = "coref.SIEVENAME.mType"; + public static final String ATYPE_PROP = "coref.SIEVENAME.aType"; + public static final String DOWNSAMPLE_RATE_PROP = "coref.SIEVENAME.downsamplingRate"; + public static final String THRES_FEATURECOUNT_PROP = "coref.SIEVENAME.thresFeatureCount"; + public static final String FEATURE_SELECTION_PROP = "coref.SIEVENAME.featureSelection"; + public static final String THRES_MERGE_PROP = "coref.SIEVENAME.merge.thres"; + public static final String THRES_FEATURE_SELECTION_PROP = "coref.SIEVENAME.pmi.thres"; + public static final String DEFAULT_PRONOUN_AGREEMENT_PROP = "coref.defaultPronounAgreement"; + + // features + public static final String USE_BASIC_FEATURES_PROP = "coref.SIEVENAME.useBasicFeatures"; + public static final 
String COMBINE_OBJECTROLE_PROP = "coref.SIEVENAME.combineObjectRole"; + public static final String USE_MD_FEATURES_PROP = "coref.SIEVENAME.useMentionDetectionFeatures"; + public static final String USE_DCOREFRULE_FEATURES_PROP = "coref.SIEVENAME.useDcorefRuleFeatures"; + public static final String USE_POS_FEATURES_PROP = "coref.SIEVENAME.usePOSFeatures"; + public static final String USE_LEXICAL_FEATURES_PROP = "coref.SIEVENAME.useLexicalFeatures"; + public static final String USE_WORD_EMBEDDING_FEATURES_PROP = "coref.SIEVENAME.useWordEmbeddingFeatures"; + + public static final Locale LANGUAGE_DEFAULT = Locale.ENGLISH; + public static final int MONITOR_DIST_CMD_FINISHED_WAIT_MILLIS = 60000; + + /** if true, use truecase annotator */ + public static final boolean USE_TRUECASE = false; + + /** if true, remove appositives, predicate nominatives in post processing */ + public static final boolean REMOVE_APPOSITION_PREDICATENOMINATIVES = true; + + /** if true, remove singletons in post processing */ + public static final boolean REMOVE_SINGLETONS = true; + + /** property for conll output path **/ + public static final String OUTPUT_PATH_PROP = "coref.conllOutputPath"; + + // current list of dcoref sieves + private static final Set dcorefSieveNames = new HashSet<>(Arrays.asList("MarkRole", "DiscourseMatch", + "ExactStringMatch", "RelaxedExactStringMatch", "PreciseConstructs", "StrictHeadMatch1", + "StrictHeadMatch2", "StrictHeadMatch3", "StrictHeadMatch4", "RelaxedHeadMatch", "PronounMatch", "SpeakerMatch", + "ChineseHeadMatch")); + + // return what coref algorithm the user wants to use public static CorefAlgorithmType algorithm(Properties props) { String type = PropertiesUtils.getString(props, "coref.algorithm", - getLanguage(props) == Locale.ENGLISH ? "statistical" : "neural"); + getLanguage(props) == Locale.ENGLISH ? "statistical" : "neural"); return CorefAlgorithmType.valueOf(type.toUpperCase()); } - //---------- General Coreference Options ---------- - - /** - * When conll() is true, coref models - *
- * <ul>
- *   <li>Use provided POS, NER, Parsing, etc. (instead of using CoreNLP annotators)</li>
- *   <li>Use provided speaker annotations</li>
- *   <li>Use provided document type and genre information</li>
- * </ul>
- */ public static boolean conll(Properties props) { return PropertiesUtils.getBool(props, "coref.conll", false); } + + public static boolean doScore(Properties props) { + return PropertiesUtils.getBool(props, SCORE_PROP, false); + } + public static boolean checkTime(Properties props) { + return PropertiesUtils.getBool(props, TIMER_PROP, false); + } + public static boolean checkMemory(Properties props) { + return PropertiesUtils.getBool(props, MEMORY_PROP, false); + } + + public static String conllOutputPath(Properties props) { + return props.getProperty(OUTPUT_PATH_PROP); + } + + // renaming of this property, will delete the other one soon public static boolean useConstituencyParse(Properties props) { return PropertiesUtils.getBool(props, "coref.useConstituencyParse", - algorithm(props) != CorefAlgorithmType.STATISTICAL || conll(props)); + algorithm(props) != CorefAlgorithmType.STATISTICAL || conll(props)); } - public static boolean verbose(Properties props) { - return PropertiesUtils.getBool(props, "coref.verbose", false); + public static boolean useConstituencyTree(Properties props) { + return PropertiesUtils.getBool(props, PARSER_PROP, false); } - // ---------- Heuristic Mention Filtering ---------- + /** Input data for CorefDocMaker. It is traindata for training, or testdata for evaluation */ + public static String getPathInput(Properties props) { + return PropertiesUtils.getString(props, PATH_INPUT_PROP, null); + } + public static String getPathOutput(Properties props) { + return PropertiesUtils.getString(props, PATH_OUTPUT_PROP, "/home/heeyoung/log-coref/conlloutput/"); + } + public static String getPathTrainData(Properties props) { + return PropertiesUtils.getString(props, PATH_TRAIN_PROP, "/scr/nlp/data/conll-2012/v4/data/train/data/english/annotations/"); + } + public static String getPathEvalData(Properties props) { + return PropertiesUtils.getString(props, PATH_EVAL_PROP, "/scr/nlp/data/conll-2012/v9/data/test/data/english/annotations"); + } + public static int getThreadCounts(Properties props) { + return PropertiesUtils.getInt(props, THREADS_PROP, Runtime.getRuntime().availableProcessors()); + } + public static String getPathScorer(Properties props) { + return PropertiesUtils.getString(props, PATH_SCORER_PROP, "/scr/nlp/data/conll-2012/scorer/v8.01/scorer.pl"); + } + public static CorefInputType getInputType(Properties props) { + String inputType = PropertiesUtils.getString(props, INPUT_TYPE_PROP, "raw"); + return CorefInputType.valueOf(inputType.toUpperCase()); + } + public static boolean printMDLog(Properties props) { + return PropertiesUtils.getBool(props, PRINT_MDLOG_PROP, false); + } + public static boolean doPostProcessing(Properties props) { + return PropertiesUtils.getBool(props, POSTPROCESSING_PROP, false); + } - public static int maxMentionDistance(Properties props) { - return PropertiesUtils.getInt(props, "coref.maxMentionDistance", - conll(props) ? 
Integer.MAX_VALUE : 50); + /** if true, use conll auto files, else use conll gold files */ + public static boolean useCoNLLAuto(Properties props) { + return PropertiesUtils.getBool(props, CONLL_AUTO_PROP, true); + } + public static MentionDetectionType getMDType(Properties props) { + String defaultMD; + if (getLanguage(props).equals(Locale.ENGLISH)) { + // defaultMD for English should be RULE since this is highest performing score for scoref + defaultMD = "RULE"; + } else if (getLanguage(props).equals(Locale.CHINESE)) { + // defaultMD for Chinese should be RULE for now + defaultMD = "RULE"; + } else { + // general default is "RULE" for now + defaultMD = "RULE"; + } + String type = PropertiesUtils.getString(props, MD_TYPE_PROP, defaultMD); + if(type.equalsIgnoreCase("dep")) type = "DEPENDENCY"; + return MentionDetectionType.valueOf(type.toUpperCase()); + } + public static boolean useSingletonPredictor(Properties props) { + return PropertiesUtils.getBool(props, USE_SINGLETON_PREDICTOR_PROP, false); + } + public static String getPathSingletonPredictor(Properties props) { + return PropertiesUtils.getString(props, PATH_SINGLETON_PREDICTOR_PROP, "edu/stanford/nlp/models/dcoref/singleton.predictor.ser"); + } + public static String getPathModel(Properties props, String sievename) { + return props.getProperty(PATH_SERIALIZED_PROP) + File.separator + + props.getProperty(PATH_MODEL_PROP.replace("SIEVENAME", sievename), "MISSING_MODEL_FOR_"+sievename); + } + public static boolean debug(Properties props) { + return PropertiesUtils.getBool(props, DEBUG_PROP, false); } - public static int maxMentionDistanceWithStringMatch(Properties props) { - return PropertiesUtils.getInt(props, "coref.maxMentionDistanceWithStringMatch", 500); + public static ClassifierType getClassifierType(Properties props, String sievename) { + if(dcorefSieveNames.contains(sievename)) return ClassifierType.RULE; + if(sievename.toLowerCase().endsWith("-rf")) return ClassifierType.RF; + if(sievename.toLowerCase().endsWith("-oracle")) return ClassifierType.ORACLE; + String classifierType = PropertiesUtils.getString(props, CLASSIFIER_TYPE_PROP.replace("SIEVENAME", sievename), null); + return ClassifierType.valueOf(classifierType); + } + public static double getMergeThreshold(Properties props, String sievename) { + String key = THRES_MERGE_PROP.replace("SIEVENAME", sievename); + return PropertiesUtils.getDouble(props, key, 0.3); + } + public static void setMergeThreshold(Properties props, String sievename, double value) { + String key = THRES_MERGE_PROP.replace("SIEVENAME", sievename); + props.setProperty(key, String.valueOf(value)); } - // ---------- Mention Detection ---------- + public static int getNumTrees(Properties props, String sievename) { + return PropertiesUtils.getInt(props, NUM_TREE_PROP.replace("SIEVENAME", sievename), 100); + } + public static int getSeed(Properties props) { + return PropertiesUtils.getInt(props, SEED_PROP, 1); + } + public static int getNumFeatures(Properties props, String sievename) { + return PropertiesUtils.getInt(props, NUM_FEATURES_PROP.replace("SIEVENAME", sievename), 30); + } + public static int getTreeDepth(Properties props, String sievename) { + return PropertiesUtils.getInt(props, TREE_DEPTH_PROP.replace("SIEVENAME", sievename), 0); + } + public static boolean calculateFeatureImportance(Properties props) { + return PropertiesUtils.getBool(props, CALCULATE_IMPORTANCE_PROP, false); + } - public enum MentionDetectionType { RULE, HYBRID, DEPENDENCY } - public static MentionDetectionType 
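// (Worked example of the SIEVENAME templating used by the getters above: for a sieve
//  named "pp-rf", one of the defaults listed in getSieves() below, getPathModel reads
//  "coref.pp-rf.model" and prefixes it with the coref.path.serialized directory. The
//  concrete sieve name is only an illustration.)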
mdType(Properties props) { - String type = PropertiesUtils.getString(props, "coref.md.type", - useConstituencyParse(props) ? "RULE" : "dep"); - if (type.equalsIgnoreCase("dep")) { - type = "DEPENDENCY"; - } - return MentionDetectionType.valueOf(type.toUpperCase()); + public static int getMaxSentDistForSieve(Properties props, String sievename) { + return PropertiesUtils.getInt(props, MAX_SENT_DIST_PROP.replace("SIEVENAME", sievename), 1000); } public static String getMentionDetectionModel(Properties props) { - return PropertiesUtils.getString(props, "coref.md.model", - useConstituencyParse(props) ? "edu/stanford/nlp/models/coref/hybrid/md-model.ser" : - "edu/stanford/nlp/models/coref/hybrid/md-model-dep.ser.gz"); + return PropertiesUtils.getString(props, MENTION_DETECTION_MODEL_PROP, + useConstituencyParse(props) ? "edu/stanford/nlp/models/coref/md-model.ser" : + "edu/stanford/nlp/models/coref/md-model-dep.ser.gz"); } - public static boolean isMentionDetectionTraining(Properties props) { - return PropertiesUtils.getBool(props, "coref.md.isTraining", false); + public static Set getMentionType(Properties props, String sievename) { + return getMentionTypes(props, MTYPE_PROP.replace("SIEVENAME", sievename)); } + public static Set getAntecedentType(Properties props, String sievename) { + return getMentionTypes(props, ATYPE_PROP.replace("SIEVENAME", sievename)); + } + + private static Set getMentionTypes(Properties props, String propKey) { + if(!props.containsKey(propKey) || props.getProperty(propKey).equalsIgnoreCase("all")){ + return new HashSet<>(Arrays.asList(MentionType.values())); + } + Set types = new HashSet<>(); + for(String type : props.getProperty(propKey).trim().split(",\\s*")) { + if(type.toLowerCase().matches("i|you|we|they|it|she|he")) type = "PRONOMINAL"; + types.add(MentionType.valueOf(type)); + } + return types; + } + public static double getDownsamplingRate(Properties props, String sievename) { + return PropertiesUtils.getDouble(props, DOWNSAMPLE_RATE_PROP.replace("SIEVENAME", sievename), 1); + } + public static int getFeatureCountThreshold(Properties props, String sievename) { + return PropertiesUtils.getInt(props, THRES_FEATURECOUNT_PROP.replace("SIEVENAME", sievename), 20); + } + public static boolean useBasicFeatures(Properties props, String sievename) { + return PropertiesUtils.getBool(props, USE_BASIC_FEATURES_PROP.replace("SIEVENAME", sievename), true); + } + public static boolean combineObjectRoles(Properties props, String sievename) { + return PropertiesUtils.getBool(props, COMBINE_OBJECTROLE_PROP.replace("SIEVENAME", sievename), true); + } + public static boolean useMentionDetectionFeatures(Properties props, String sievename) { + return PropertiesUtils.getBool(props, USE_MD_FEATURES_PROP.replace("SIEVENAME", sievename), true); + } + public static boolean useDcorefRules(Properties props, String sievename) { + return PropertiesUtils.getBool(props, USE_DCOREFRULE_FEATURES_PROP.replace("SIEVENAME", sievename), true); + } + public static boolean usePOSFeatures(Properties props, String sievename) { + return PropertiesUtils.getBool(props, USE_POS_FEATURES_PROP.replace("SIEVENAME", sievename), true); + } + public static boolean useLexicalFeatures(Properties props, String sievename) { + return PropertiesUtils.getBool(props, USE_LEXICAL_FEATURES_PROP.replace("SIEVENAME", sievename), true); + } + public static boolean useWordEmbedding(Properties props, String sievename) { + return PropertiesUtils.getBool(props, USE_WORD_EMBEDDING_FEATURES_PROP.replace("SIEVENAME", sievename), 
true); + } + + private static Set getMentionTypeStr(Properties props, String sievename, String whichMention) { + Set strs = Generics.newHashSet(); + String propKey = whichMention; + if (!props.containsKey(propKey)) { + String prefix = "coref." + sievename + "."; + propKey = prefix + propKey; + } + if(props.containsKey(propKey)) strs.addAll(Arrays.asList(props.getProperty(propKey).split(","))); + return strs; + } + public static Set getMentionTypeStr(Properties props, String sievename) { + return getMentionTypeStr(props, sievename, "mType"); + } + public static Set getAntecedentTypeStr(Properties props, String sievename) { + return getMentionTypeStr(props, sievename, "aType"); + } + public static String getSieves(Properties props) { + return PropertiesUtils.getString(props, SIEVES_PROP, "SpeakerMatch,PreciseConstructs,pp-rf,cc-rf,pc-rf,ll-rf,pr-rf"); + } + public static String getPathSerialized(Properties props) { + return props.getProperty(PATH_SERIALIZED_PROP); + } + public static boolean doPMIFeatureSelection(Properties props, String sievename) { + return PropertiesUtils.getString(props, FEATURE_SELECTION_PROP.replace("SIEVENAME", sievename), "pmi").equalsIgnoreCase("pmi"); + } + public static double getPMIThres(Properties props, String sievename) { + return PropertiesUtils.getDouble(props, THRES_FEATURE_SELECTION_PROP.replace("SIEVENAME", sievename), 0.0001); + } + public static boolean doAnalysis(Properties props) { + return PropertiesUtils.getBool(props, DO_ANALYSIS_PROP, false); + } + public static String getSkipMentionType(Properties props) { + return PropertiesUtils.getString(props, ANALYSIS_SKIP_MTYPE_PROP, null); + } + public static String getSkipAntecedentType(Properties props) { + return PropertiesUtils.getString(props, ANALYSIS_SKIP_ATYPE_PROP, null); + } + public static boolean useSemantics(Properties props) { + return PropertiesUtils.getBool(props, USE_SEMANTICS_PROP, true); + } + public static String getPathSerializedWordVectors(Properties props) { + return PropertiesUtils.getString(props, WORD2VEC_SERIALIZED_PROP, "/scr/nlp/data/coref/wordvectors/en/vector.ser.gz"); + } + public static String getCurrentSieveForTrain(Properties props) { + return PropertiesUtils.getString(props, CURRENT_SIEVE_FOR_TRAIN_PROP, null); + } +// public static String getCurrentSieve(Properties props) { +// return PropertiesUtils.getString(props, CURRENT_SIEVE_PROP, null); +// } + public static boolean loadWordEmbedding(Properties props) { + return PropertiesUtils.getBool(props, LOAD_WORD_EMBEDDING_PROP, true); + } + public static String getPathWord2Vec(Properties props) { + return PropertiesUtils.getString(props, WORD2VEC_PROP, null); + } + + public static String getGenderNumber(Properties props) { + return PropertiesUtils.getString(props, GENDER_NUMBER_PROP, "edu/stanford/nlp/models/dcoref/gender.data.gz"); + } + + public static boolean storeTrainData(Properties props) { + return PropertiesUtils.getBool(props, STORE_TRAINDATA_PROP, false); + } + + public static boolean allowReparsing(Properties props) { + return PropertiesUtils.getBool(props, ALLOW_REPARSING_PROP, true); + } + + public static boolean useGoldNE(Properties props) { + return PropertiesUtils.getBool(props, USE_GOLD_NE_PROP, true); + } + public static boolean useGoldParse(Properties props) { + return PropertiesUtils.getBool(props, USE_GOLD_PARSES_PROP, true); + } + public static boolean useGoldPOS(Properties props) { + return PropertiesUtils.getBool(props, USE_GOLD_POS_PROP, true); + } + public static boolean 
isMentionDetectionTraining(Properties props) { + return PropertiesUtils.getBool(props, CorefProperties.MD_TRAIN_PROP, false); + } public static void setMentionDetectionTraining(Properties props, boolean val) { - props.put("coref.md.isTraining", val); + props.put(CorefProperties.MD_TRAIN_PROP, val); + } + public static void setRemoveNestedMentions(Properties props,boolean bool){ + props.setProperty(CorefProperties.REMOVE_NESTED, String.valueOf(bool)); + } + public static boolean removeNestedMentions(Properties props){ + return PropertiesUtils.getBool(props, CorefProperties.REMOVE_NESTED, true); } - public static boolean removeNestedMentions(Properties props) { - return PropertiesUtils.getBool(props, "removeNestedMentions", true); + public static boolean useDefaultPronounAgreement(Properties props){ + return PropertiesUtils.getBool(props, CorefProperties.DEFAULT_PRONOUN_AGREEMENT_PROP,false); } - public static void setRemoveNestedMentions(Properties props, boolean val) { - props.put("removeNestedMentions", val); + public static boolean addMissingAnnotations(Properties props) { + return PropertiesUtils.getBool(props, ADD_MISSING_ANNOTATIONS, false); } - public static boolean liberalChineseMD(Properties props) { - return PropertiesUtils.getBool(props, "coref.md.liberalChineseMD", true); + // heuristic mention filtering + public static int maxMentionDistance(Properties props) { + return PropertiesUtils.getInt(props, "coref.maxMentionDistance", + conll(props) ? Integer.MAX_VALUE : 50); } - // ---------- Input and Output Data ---------- + public static int maxMentionDistanceWithStringMatch(Properties props) { + return PropertiesUtils.getInt(props, "coref.maxMentionDistanceWithStringMatch", 500); + } - public static final String OUTPUT_PATH_PROP = "coref.conllOutputPath"; - public static String conllOutputPath(Properties props) { - return props.getProperty("coref.conllOutputPath"); + // type of algorithm for mention detection + public static MentionDetectionType mdType(Properties props) { + String type = PropertiesUtils.getString(props, "coref.md.type", + useConstituencyParse(props) ? "RULE" : "dep"); + if (type.equalsIgnoreCase("dep")) { + type = "DEPENDENCY"; + } + return MentionDetectionType.valueOf(type.toUpperCase()); + } + + // use a more liberal policy for Chinese mention detection + public static boolean liberalChineseMD(Properties props) { + return PropertiesUtils.getBool(props, "coref.md.liberalChineseMD", true); } - public enum Dataset {TRAIN, DEV, TEST}; public static void setInput(Properties props, Dataset d) { props.setProperty("coref.inputPath", d == Dataset.TRAIN ? getTrainDataPath(props) : - (d == Dataset.DEV ? getDevDataPath(props) : getTestDataPath(props))); + (d == Dataset.DEV ? 
getDevDataPath(props) : getTestDataPath(props))); } public static String getDataPath(Properties props) { @@ -110,29 +457,25 @@ public static String getDataPath(Properties props) { public static String getTrainDataPath(Properties props) { return props.getProperty("coref.trainData", - getDataPath(props) + "v4/data/train/data/" + getLanguageStr(props) + "/annotations/"); + getDataPath(props) + "v4/data/train/data/" + getLanguageStr(props) + "/annotations/"); } public static String getDevDataPath(Properties props) { return props.getProperty("coref.devData", - getDataPath(props) + "v4/data/development/data/" + getLanguageStr(props) + "/annotations/"); + getDataPath(props) + "v4/data/dev/data/" + getLanguageStr(props) + "/annotations/"); } public static String getTestDataPath(Properties props) { return props.getProperty("coref.testData", - getDataPath(props) + "v9/data/test/data/" + getLanguageStr(props) + "/annotations"); + getDataPath(props) + "v9/data/test/data/" + getLanguageStr(props) + "/annotations"); } public static String getInputPath(Properties props) { String input = props.getProperty("coref.inputPath", - props.containsKey("coref.data") ? getTestDataPath(props) : null); + props.containsKey("coref.data") ? getTestDataPath(props) : null); return input; } - public static String getScorerPath(Properties props) { - return props.getProperty("coref.scorer"); - } - public static Locale getLanguage(Properties props) { String lang = PropertiesUtils.getString(props, "coref.language", "en"); if (lang.equalsIgnoreCase("en") || lang.equalsIgnoreCase("english")) { @@ -147,4 +490,8 @@ public static Locale getLanguage(Properties props) { private static String getLanguageStr(Properties props) { return getLanguage(props).getDisplayName().toLowerCase(); } + + public static String getScorerPath(Properties props) { + return props.getProperty("coref.scorer"); + } } diff --git a/src/edu/stanford/nlp/coref/CorefRules.java b/src/edu/stanford/nlp/coref/CorefRules.java index e5bbbf5ba4..17c21a48d9 100644 --- a/src/edu/stanford/nlp/coref/CorefRules.java +++ b/src/edu/stanford/nlp/coref/CorefRules.java @@ -10,16 +10,15 @@ import edu.stanford.nlp.coref.data.CorefCluster; import edu.stanford.nlp.coref.data.Dictionaries; -import edu.stanford.nlp.coref.data.Document; -import edu.stanford.nlp.coref.data.Mention; -import edu.stanford.nlp.coref.data.Semantics; -import edu.stanford.nlp.coref.data.SpeakerInfo; import edu.stanford.nlp.coref.data.Dictionaries.Animacy; import edu.stanford.nlp.coref.data.Dictionaries.Gender; import edu.stanford.nlp.coref.data.Dictionaries.MentionType; import edu.stanford.nlp.coref.data.Dictionaries.Number; import edu.stanford.nlp.coref.data.Dictionaries.Person; - +import edu.stanford.nlp.coref.data.Document; +import edu.stanford.nlp.coref.data.Mention; +import edu.stanford.nlp.coref.data.Semantics; +import edu.stanford.nlp.coref.data.SpeakerInfo; import edu.stanford.nlp.ling.CoreAnnotations; import edu.stanford.nlp.ling.CoreLabel; import edu.stanford.nlp.math.NumberMatchingRegex; @@ -31,18 +30,21 @@ /** - * Rules for coref system (mention detection, entity coref, event coref) + * CorefRules for coref system (mention detection, entity coref, event coref) * The name of the method for mention detection starts with detection, * for entity coref starts with entity, and for event coref starts with event. 
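(A hedged illustration of that naming convention, combining two entity-level predicates whose signatures appear in this patch; m, ant, dict, roleSet, and the two clusters are assumed to come from an already-processed Document.)

    boolean sameSpan  = CorefRules.entityExactStringMatch(m, ant, dict, roleSet);
    boolean agreement = CorefRules.entityAttributesAgree(mentionCluster, potentialAntecedent);
    if (sameSpan && agreement) {
      // a sieve would treat the antecedent's cluster as a merge candidate here
    }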
- * + * * @author heeyoung, recasens */ public class CorefRules { + + private static final boolean DEBUG = true; + public static boolean entityBothHaveProper(CorefCluster mentionCluster, CorefCluster potentialAntecedent) { boolean mentionClusterHaveProper = false; boolean potentialAntecedentHaveProper = false; - + for (Mention m : mentionCluster.corefMentions) { if (m.mentionType==MentionType.PROPER) { mentionClusterHaveProper = true; @@ -66,15 +68,15 @@ public static boolean entitySameProperHeadLastWord(CorefCluster mentionCluster, } return false; } - + public static boolean entityAlias(CorefCluster mentionCluster, CorefCluster potentialAntecedent, Semantics semantics, Dictionaries dict) throws Exception { - + Mention mention = mentionCluster.getRepresentativeMention(); Mention antecedent = potentialAntecedent.getRepresentativeMention(); if(mention.mentionType!=MentionType.PROPER || antecedent.mentionType!=MentionType.PROPER) return false; - + Method meth = semantics.wordnet.getClass().getMethod("alias", new Class[]{Mention.class, Mention.class}); if((Boolean) meth.invoke(semantics.wordnet, new Object[]{mention, antecedent})) { return true; @@ -154,7 +156,7 @@ public static boolean isAcronym(List first, List second) { if (first.size() == 0 && second.size() == 0) { return false; } List longer; List shorter; - + if (first.size() == second.size()) { String firstWord = first.get(0).get(CoreAnnotations.TextAnnotation.class); String secondWord = second.get(0).get(CoreAnnotations.TextAnnotation.class); @@ -198,7 +200,7 @@ public static boolean isAcronym(List first, List second) { return false; } } - + return true; } @@ -223,7 +225,7 @@ public static boolean entityAttributesAgree(CorefCluster mentionCluster, CorefCl } public static boolean entityAttributesAgree(CorefCluster mentionCluster, CorefCluster potentialAntecedent, boolean ignoreGender){ - + boolean hasExtraAnt = false; boolean hasExtraThis = false; @@ -324,30 +326,30 @@ private static boolean attributeSetDisagree(Set s1,Set s2){ return true; return false; } - + private static void pruneAttributes(Set attrs, Set unknown) { if (attrs.size() > unknown.size()) attrs.removeAll(unknown); } - + private static void pruneAttributes(Set attrs, E unknown) { if (attrs.size() > 1) attrs.remove(unknown); } - + private static final Set UNKNOWN_NER = new HashSet<>(Arrays.asList("MISC","O")); private static boolean entityAttributesAgreeChinese(CorefCluster mentionCluster, CorefCluster potentialAntecedent){ - + pruneAttributes(mentionCluster.numbers,Number.UNKNOWN); pruneAttributes(mentionCluster.genders,Gender.UNKNOWN); pruneAttributes(mentionCluster.animacies,Animacy.UNKNOWN); pruneAttributes(mentionCluster.nerStrings,UNKNOWN_NER); - + pruneAttributes(potentialAntecedent.numbers,Number.UNKNOWN); pruneAttributes(potentialAntecedent.genders,Gender.UNKNOWN); pruneAttributes(potentialAntecedent.animacies,Animacy.UNKNOWN); pruneAttributes(potentialAntecedent.nerStrings,UNKNOWN_NER); - + if(attributeSetDisagree(mentionCluster.numbers,potentialAntecedent.numbers) || attributeSetDisagree(mentionCluster.genders,potentialAntecedent.genders) || attributeSetDisagree(mentionCluster.animacies,potentialAntecedent.animacies) @@ -356,7 +358,7 @@ private static boolean entityAttributesAgreeChinese(CorefCluster mentionCluster, return true; } - + public static boolean entityAttributesAgree(CorefCluster mentionCluster, CorefCluster potentialAntecedent, Locale lang) { if (lang == Locale.CHINESE ) { return entityAttributesAgreeChinese(mentionCluster,potentialAntecedent); @@ 
-407,13 +409,13 @@ public static boolean entityExactStringMatch(Mention m, Mention ant, Dictionarie boolean matched = false; if(roleSet!=null && roleSet.contains(m)) return false; if(m.isPronominal() || ant.isPronominal()) return false; - + String mSpan = m.lowercaseNormalizedSpanString(); if(dict.allPronouns.contains(mSpan)) return false; - + String antSpan = ant.lowercaseNormalizedSpanString(); if(dict.allPronouns.contains(antSpan)) return false; - + if(mSpan.equals(antSpan)) matched = true; if(mSpan.equals(antSpan+" 's") || antSpan.equals(mSpan+" 's")) matched = true; return matched; @@ -458,7 +460,7 @@ public static boolean entityIWithinI(Mention m1, Mention m2, Dictionaries dict){ } return false; } - + /** Check whether later mention has incompatible modifier */ public static boolean entityHaveIncompatibleModifier(Mention m, Mention ant) { @@ -670,6 +672,11 @@ public static boolean antecedentIsMentionSpeaker(Document document, return false; } + /** Is the speaker for mention the same entity as the ant entity? */ + private static boolean antecedentMatchesMentionSpeakerAnnotation(Mention mention, Mention ant) { + return antecedentMatchesMentionSpeakerAnnotation(mention, ant, null); + } + public static final Pattern WHITESPACE_PATTERN = Pattern.compile(" +"); /** @@ -818,7 +825,7 @@ public static boolean entitySameSpeaker(Document document, Mention m, Mention an * Given the name of a speaker, returns the coref cluster id it belongs to (-1 if no cluster) * @param document The document to search in * @param speakerString The name to search for - * @return cluster id + * @return cluster id */ public static int getSpeakerClusterId(Document document, String speakerString) { int speakerClusterId = -1; @@ -863,12 +870,12 @@ public static boolean entityTokenDistance(Mention m1, Mention m2) { } // COREF_DICT strict: all the mention pairs between the two clusters must match in the dict - public static boolean entityClusterAllCorefDictionary(CorefCluster menCluster, CorefCluster antCluster, + public static boolean entityClusterAllCorefDictionary(CorefCluster menCluster, CorefCluster antCluster, Dictionaries dict, int dictColumn, int freq){ boolean ret = false; for(Mention men : menCluster.getCorefMentions()){ if(men.isPronominal()) continue; - for(Mention ant : antCluster.getCorefMentions()){ + for(Mention ant : antCluster.getCorefMentions()){ if(ant.isPronominal() || men.headWord.lemma().equals(ant.headWord.lemma())) continue; if(entityCorefDictionary(men, ant, dict, dictColumn, freq)){ ret = true; @@ -877,18 +884,18 @@ public static boolean entityClusterAllCorefDictionary(CorefCluster menCluster, C } } } - return ret; + return ret; } - + // COREF_DICT pairwise: the two mentions match in the dict - public static boolean entityCorefDictionary(Mention men, Mention ant, Dictionaries dict, int dictVersion, int freq){ - + public static boolean entityCorefDictionary(Mention men, Mention ant, Dictionaries dict, int dictVersion, int freq){ + Pair mention_pair = new Pair<>( men.getSplitPattern()[dictVersion - 1].toLowerCase(), ant.getSplitPattern()[dictVersion - 1].toLowerCase()); - + int high_freq = -1; - if(dictVersion == 1){ + if(dictVersion == 1){ high_freq = 75; } else if(dictVersion == 2){ high_freq = 16; @@ -897,21 +904,21 @@ public static boolean entityCorefDictionary(Mention men, Mention ant, Dictionari } else if(dictVersion == 4){ high_freq = 16; } - + if(dict.corefDict.get(dictVersion-1).getCount(mention_pair) > high_freq) return true; if(dict.corefDict.get(dictVersion-1).getCount(mention_pair) 
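// (Reading of the surrounding threshold logic: the pair counts as a dictionary match if
//  its raw count clears the per-dictVersion high_freq cutoff above, or if it clears the
//  caller-supplied freq cutoff and its PMI is either above 0.18 or absent from
//  corefDictPMI altogether.)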
> freq){ if(dict.corefDictPMI.getCount(mention_pair) > 0.18) return true; if(!dict.corefDictPMI.containsKey(mention_pair)) return true; - } - return false; + } + return false; } - + public static boolean contextIncompatible(Mention men, Mention ant, Dictionaries dict) { String antHead = ant.headWord.word(); - if ( (ant.mentionType == MentionType.PROPER) - && ant.sentNum != men.sentNum - && !isContextOverlapping(ant,men) + if ( (ant.mentionType == MentionType.PROPER) + && ant.sentNum != men.sentNum + && !isContextOverlapping(ant,men) && dict.NE_signatures.containsKey(antHead)) { IntCounter ranks = Counters.toRankCounter(dict.NE_signatures.get(antHead)); List context; @@ -943,7 +950,7 @@ public static boolean contextIncompatible(Mention men, Mention ant, Dictionaries public static boolean sentenceContextIncompatible(Mention men, Mention ant, Dictionaries dict) { if ( (ant.mentionType != MentionType.PROPER) && (ant.sentNum != men.sentNum) - && (men.mentionType != MentionType.PROPER) + && (men.mentionType != MentionType.PROPER) && !isContextOverlapping(ant,men)) { List context1 = !ant.getPremodifierContext().isEmpty() ? ant.getPremodifierContext() : ant.getContext(); List context2 = !men.getPremodifierContext().isEmpty() ? men.getPremodifierContext() : men.getContext(); @@ -972,7 +979,7 @@ public static boolean sentenceContextIncompatible(Mention men, Mention ant, Dict } return false; } - + private static boolean isContextOverlapping(Mention m1, Mention m2) { Set context1 = Generics.newHashSet(); Set context2 = Generics.newHashSet(); diff --git a/src/edu/stanford/nlp/coref/CorefSystem.java b/src/edu/stanford/nlp/coref/CorefSystem.java index 264543488e..b943abe61a 100644 --- a/src/edu/stanford/nlp/coref/CorefSystem.java +++ b/src/edu/stanford/nlp/coref/CorefSystem.java @@ -2,36 +2,28 @@ import java.io.FileOutputStream; import java.io.PrintWriter; -import java.util.Calendar; -import java.util.Map; -import java.util.Properties; +import java.util.*; import java.util.logging.Logger; -import edu.stanford.nlp.coref.data.CorefChain; -import edu.stanford.nlp.coref.data.CorefCluster; -import edu.stanford.nlp.coref.data.Dictionaries; -import edu.stanford.nlp.coref.data.Document; -import edu.stanford.nlp.coref.data.DocumentMaker; +import edu.stanford.nlp.coref.data.*; import edu.stanford.nlp.pipeline.Annotation; import edu.stanford.nlp.util.Generics; import edu.stanford.nlp.util.StringUtils; -import edu.stanford.nlp.util.logging.Redwood; /** * Class for running coreference algorithms * @author Kevin Clark */ public class CorefSystem { - private final DocumentMaker docMaker; + public final Dictionaries dictionaries; + public final DocumentMaker docMaker; private final CorefAlgorithm corefAlgorithm; - private final boolean verbose; public CorefSystem(Properties props) { try { - Dictionaries dictionaries = new Dictionaries(props); + dictionaries = new Dictionaries(props); docMaker = new DocumentMaker(props, dictionaries); corefAlgorithm = CorefAlgorithm.fromProps(props, dictionaries); - verbose = CorefProperties.verbose(props); } catch (Exception e) { throw new RuntimeException("Error initializing coref system", e); } @@ -78,12 +70,7 @@ public void runOnConll(Properties props) throws Exception { public void process(int id, Document document) { writerGold.print(CorefPrinter.printConllOutput(document, true)); writerBeforeCoref.print(CorefPrinter.printConllOutput(document, false)); - long time = System.currentTimeMillis(); corefAlgorithm.runCoref(document); - if (verbose) { - Redwood.log(getName(), "Coref took 
" - + (System.currentTimeMillis() - time) / 1000.0 + "s"); - } CorefUtils.removeSingletonClusters(document); writerAfterCoref.print(CorefPrinter.printConllOutput(document, false, true)); } @@ -111,8 +98,28 @@ public String getName() { writerAfterCoref.close(); } + /** Remove singleton clusters */ + public static List> filterMentionsWithSingletonClusters( + Document document, List> mentions) { + + List> res = new ArrayList<>(mentions.size()); + for (List ml:mentions) { + List filtered = new ArrayList<>(); + for (Mention m:ml) { + CorefCluster cluster = document.corefClusters.get(m.corefClusterID); + if (cluster != null && cluster.getCorefMentions().size() > 1) { + filtered.add(m); + } + } + res.add(filtered); + } + return res; + } + public static void main(String[] args) throws Exception { Properties props = StringUtils.argsToProperties(new String[] {"-props", args[0]}); + System.out.println("---\ncoref props"); + System.out.println(props); CorefSystem coref = new CorefSystem(props); coref.runOnConll(props); } diff --git a/src/edu/stanford/nlp/coref/CorefUtils.java b/src/edu/stanford/nlp/coref/CorefUtils.java index 9467de6d18..92b1398512 100644 --- a/src/edu/stanford/nlp/coref/CorefUtils.java +++ b/src/edu/stanford/nlp/coref/CorefUtils.java @@ -112,9 +112,7 @@ public static Map> heuristicFilter(List sortedMe for (Mention match : withStringMatch) { if (match.mentionNum < m.mentionNum && match.mentionNum >= m.mentionNum - maxMentionDistanceWithStringMatch) { - if (!candidateAntecedents.contains(match.mentionID)) { - candidateAntecedents.add(match.mentionID); - } + candidateAntecedents.add(match.mentionID); } } } diff --git a/src/edu/stanford/nlp/coref/hybrid/sieve/MentionMatcher.java b/src/edu/stanford/nlp/coref/MentionMatcher.java similarity index 90% rename from src/edu/stanford/nlp/coref/hybrid/sieve/MentionMatcher.java rename to src/edu/stanford/nlp/coref/MentionMatcher.java index 5d0d563289..e585e07816 100644 --- a/src/edu/stanford/nlp/coref/hybrid/sieve/MentionMatcher.java +++ b/src/edu/stanford/nlp/coref/MentionMatcher.java @@ -1,4 +1,4 @@ -package edu.stanford.nlp.coref.hybrid.sieve; +package edu.stanford.nlp.coref; import edu.stanford.nlp.coref.data.Mention; diff --git a/src/edu/stanford/nlp/coref/hybrid/HybridCorefPrinter.java b/src/edu/stanford/nlp/coref/OldCorefPrinter.java similarity index 77% rename from src/edu/stanford/nlp/coref/hybrid/HybridCorefPrinter.java rename to src/edu/stanford/nlp/coref/OldCorefPrinter.java index bbf7f5c72a..8c008071e4 100644 --- a/src/edu/stanford/nlp/coref/hybrid/HybridCorefPrinter.java +++ b/src/edu/stanford/nlp/coref/OldCorefPrinter.java @@ -1,38 +1,44 @@ -package edu.stanford.nlp.coref.hybrid; +package edu.stanford.nlp.coref; import java.io.FileNotFoundException; import java.text.DecimalFormat; import java.util.ArrayDeque; import java.util.ArrayList; import java.util.Deque; +import java.util.LinkedHashSet; import java.util.List; import java.util.Map; import java.util.Properties; +import java.util.Set; +import java.util.logging.Logger; +import java.util.regex.Matcher; +import java.util.regex.Pattern; -import edu.stanford.nlp.coref.hybrid.sieve.DiscourseMatch; -import edu.stanford.nlp.coref.hybrid.sieve.ExactStringMatch; -import edu.stanford.nlp.coref.hybrid.sieve.PreciseConstructs; -import edu.stanford.nlp.coref.hybrid.sieve.PronounMatch; -import edu.stanford.nlp.coref.hybrid.sieve.RFSieve; -import edu.stanford.nlp.coref.hybrid.sieve.RelaxedExactStringMatch; -import edu.stanford.nlp.coref.hybrid.sieve.RelaxedHeadMatch; -import 
edu.stanford.nlp.coref.hybrid.sieve.Sieve; -import edu.stanford.nlp.coref.hybrid.sieve.SpeakerMatch; -import edu.stanford.nlp.coref.hybrid.sieve.StrictHeadMatch1; -import edu.stanford.nlp.coref.hybrid.sieve.StrictHeadMatch2; -import edu.stanford.nlp.coref.hybrid.sieve.StrictHeadMatch3; -import edu.stanford.nlp.coref.hybrid.sieve.StrictHeadMatch4; import edu.stanford.nlp.coref.data.CorefCluster; import edu.stanford.nlp.coref.data.Dictionaries; +import edu.stanford.nlp.coref.data.Dictionaries.MentionType; import edu.stanford.nlp.coref.data.Document; import edu.stanford.nlp.coref.data.Mention; -import edu.stanford.nlp.coref.data.Dictionaries.MentionType; +import edu.stanford.nlp.coref.sieve.DiscourseMatch; +import edu.stanford.nlp.coref.sieve.ExactStringMatch; +import edu.stanford.nlp.coref.sieve.PreciseConstructs; +import edu.stanford.nlp.coref.sieve.PronounMatch; +import edu.stanford.nlp.coref.sieve.RFSieve; +import edu.stanford.nlp.coref.sieve.RelaxedExactStringMatch; +import edu.stanford.nlp.coref.sieve.RelaxedHeadMatch; +import edu.stanford.nlp.coref.sieve.Sieve; +import edu.stanford.nlp.coref.sieve.SpeakerMatch; +import edu.stanford.nlp.coref.sieve.StrictHeadMatch1; +import edu.stanford.nlp.coref.sieve.StrictHeadMatch2; +import edu.stanford.nlp.coref.sieve.StrictHeadMatch3; +import edu.stanford.nlp.coref.sieve.StrictHeadMatch4; import edu.stanford.nlp.ling.CoreAnnotations; import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation; import edu.stanford.nlp.ling.CoreAnnotations.SpeakerAnnotation; import edu.stanford.nlp.ling.CoreAnnotations.UtteranceAnnotation; import edu.stanford.nlp.ling.CoreLabel; import edu.stanford.nlp.math.NumberMatchingRegex; +import edu.stanford.nlp.pipeline.Annotation; import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations.BasicDependenciesAnnotation; import edu.stanford.nlp.stats.ClassicCounter; import edu.stanford.nlp.stats.Counter; @@ -43,17 +49,13 @@ import edu.stanford.nlp.util.StringUtils; import edu.stanford.nlp.util.logging.Redwood; -/** - * Prints CoNLL-style output from a {@link Document} - * @author heeyoung - */ -public class HybridCorefPrinter { +public class OldCorefPrinter { /** A logger for this class */ - private static Redwood.RedwoodChannels log = Redwood.channels(HybridCorefPrinter.class); + private static Redwood.RedwoodChannels log = Redwood.channels(CorefPrinter.class); public static final DecimalFormat df = new DecimalFormat("#.####"); - + // for debug // public static final ChineseHeadMatch dcorefChineseHeadMatch = new ChineseHeadMatch(StringUtils.argsToProperties(new String[]{"-coref.language", "zh"})); public static final SpeakerMatch dcorefSpeaker = new SpeakerMatch(); @@ -67,7 +69,7 @@ public class HybridCorefPrinter { public static final StrictHeadMatch4 dcorefHead4 = new StrictHeadMatch4(); public static final RelaxedHeadMatch dcorefRelaxedHead = new RelaxedHeadMatch(); public static final PronounMatch dcorefPronounSieve = new PronounMatch(); - + /** Print raw document for analysis */ public static String printRawDoc(Document document, boolean gold, boolean printClusterID) throws FileNotFoundException { StringBuilder sb = new StringBuilder(); @@ -96,15 +98,15 @@ public static String printErrorLog(Mention m, Document document, Counter orderedAnts = new ArrayList<>(); - + List orderedAnts = new ArrayList<>(); + sb.append("\nGOLD CLUSTER ID\n"); for(int sentDist=m.sentNum ; sentDist >= 0 ; sentDist--) { if(sentDist == sieve.maxSentDist) sb.append("\tstart compare from here-------------\n"); int sentIdx = 
m.sentNum-sentDist; sb.append("\tSENT "+sentIdx+"\t"+sentenceStringWithMention(sentIdx, document, true, true)).append("\n"); } - + sb.append("\nMENTION ID\n"); for(int sentDist=m.sentNum ; sentDist >= 0 ; sentDist--) { if(sentDist == sieve.maxSentDist) sb.append("\tstart compare from here-------------\n"); @@ -122,12 +124,12 @@ public static String printErrorLog(Mention m, Document document, Counter<Integer> boolean foundCorefAnt = (probs.size() > 0 && Counters.max(probs) > sieve.thresMerge); - boolean correctDecision = ( (isFirstMention && !foundCorefAnt) + boolean correctDecision = ( (isFirstMention && !foundCorefAnt) || (foundCorefAnt && Sieve.isReallyCoref(document, m.mentionID, Counters.argmax(probs))) ); boolean barePlural = (m.originalSpan.size()==1 && m.headWord.tag().equals("NNS")); if(correctDecision) return ""; @@ -136,7 +138,7 @@ public static String printErrorLog(Mention m, Document document, Counter<Integer> List<Mention> mentions = allMentions.get(i); @@ -243,29 +245,117 @@ public static String sentenceStringWithMention(int i, Document document, boolean } } // sentStr.append("\n"); - + return sentStr.toString(); } + + public static String printConllOutput(Document document, boolean gold) { + return printConllOutput(document, gold, false); + } + + public static String printConllOutput(Document document, boolean gold, boolean filterSingletons) { + List<List<Mention>> orderedMentions; + if (gold) { + orderedMentions = document.goldMentions; + } else { + orderedMentions = document.predictedMentions; + } + if (filterSingletons) { + orderedMentions = CorefSystem.filterMentionsWithSingletonClusters(document, orderedMentions); + } + return printConllOutput(document, orderedMentions, gold); + } + + public static String printConllOutput(Document document, List<List<Mention>> orderedMentions, boolean gold) + { + Annotation anno = document.annotation; + List<List<String[]>> conllDocSentences = document.conllDoc.sentenceWordLists; + String docID = anno.get(CoreAnnotations.DocIDAnnotation.class); + StringBuilder sb = new StringBuilder(); + sb.append("#begin document ").append(docID).append("\n"); + List<CoreMap> sentences = anno.get(CoreAnnotations.SentencesAnnotation.class); + for(int sentNum = 0 ; sentNum < sentences.size() ; sentNum++){ + List<CoreLabel> sentence = sentences.get(sentNum).get(CoreAnnotations.TokensAnnotation.class); + List<String[]> conllSentence = conllDocSentences.get(sentNum); + Map<Integer, Set<Mention>> mentionBeginOnly = Generics.newHashMap(); + Map<Integer, Set<Mention>> mentionEndOnly = Generics.newHashMap(); + Map<Integer, Set<Mention>> mentionBeginEnd = Generics.newHashMap(); + + for(int i=0 ; i<sentence.size() ; i++){ + mentionBeginOnly.put(i, new LinkedHashSet<>()); + mentionEndOnly.put(i, new LinkedHashSet<>()); + mentionBeginEnd.put(i, new LinkedHashSet<>()); + } + + for(Mention m : orderedMentions.get(sentNum)) { + if(m.startIndex==m.endIndex-1) { + mentionBeginEnd.get(m.startIndex).add(m); + } else { + mentionBeginOnly.get(m.startIndex).add(m); + mentionEndOnly.get(m.endIndex-1).add(m); + } + } + + for(int i=0 ; i<sentence.size() ; i++){ + StringBuilder sb2 = new StringBuilder(); + for(Mention m : mentionBeginOnly.get(i)){ + if (sb2.length() > 0) { + sb2.append("|"); + } + int corefClusterId = (gold)? m.goldCorefClusterID:m.corefClusterID; + sb2.append("(").append(corefClusterId); + } + for(Mention m : mentionBeginEnd.get(i)){ + if (sb2.length() > 0) { + sb2.append("|"); + } + int corefClusterId = (gold)? m.goldCorefClusterID:m.corefClusterID; + sb2.append("(").append(corefClusterId).append(")"); + } + for(Mention m : mentionEndOnly.get(i)){ + if (sb2.length() > 0) { + sb2.append("|"); + } + int corefClusterId = (gold)?
m.goldCorefClusterID:m.corefClusterID; + sb2.append(corefClusterId).append(")"); + } + if(sb2.length() == 0) sb2.append("-"); + + String[] columns = conllSentence.get(i); + for(int j = 0 ; j < columns.length-1 ; j++){ + String column = columns[j]; + sb.append(column).append("\t"); + } + sb.append(sb2).append("\n"); + } + sb.append("\n"); + } + + sb.append("#end document").append("\n"); + // sb.append("#end document ").append(docID).append("\n"); + + return sb.toString(); + } public static String printMentionDetectionLog(Document document) { StringBuilder sbLog = new StringBuilder(); List sentences = document.annotation.get(SentencesAnnotation.class); sbLog.append("\nERROR START-----------------------------------------------------------------------\n"); for(int i=0 ; i < sentences.size() ; i++) { - sbLog.append("\nSENT ").append(i).append(" GOLD : ").append(HybridCorefPrinter.sentenceStringWithMention(i, document, true, false)).append("\n"); - sbLog.append("SENT ").append(i).append(" PREDICT: ").append(HybridCorefPrinter.sentenceStringWithMention(i, document, false, false)).append("\n"); - + sbLog.append("\nSENT ").append(i).append(" GOLD : ").append(OldCorefPrinter.sentenceStringWithMention(i, document, true, false)).append("\n"); + sbLog.append("SENT ").append(i).append(" PREDICT: ").append(OldCorefPrinter.sentenceStringWithMention(i, document, false, false)).append("\n"); + // for(CoreLabel cl : sentences.get(i).get(TokensAnnotation.class)) { // sbLog.append(cl.word()).append("-").append(cl.get(UtteranceAnnotation.class)).append("-").append(cl.get(SpeakerAnnotation.class)).append(" "); // } - + for(Mention p : document.predictedMentions.get(i)) { sbLog.append("\n"); if(!p.hasTwin) sbLog.append("\tSPURIOUS"); sbLog.append("\tmention: ").append(p.spanToString()).append("\t\t\theadword: ").append(p.headString).append("\tPOS: ").append(p.headWord.tag()).append("\tmentiontype: ").append(p.mentionType).append("\tnumber: ").append(p.number).append("\tgender: ").append(p.gender).append("\tanimacy: ").append(p.animacy).append("\tperson: ").append(p.person).append("\tNE: ").append(p.nerString); } sbLog.append("\n"); - + for(Mention g : document.goldMentions.get(i)){ if(!g.hasTwin) { sbLog.append("\tmissed gold: ").append(g.spanToString()).append("\tPOS: ").append(g.headWord.tag()).append("\tmentiontype: ").append(g.mentionType).append("\theadword: ").append(g.headString).append("\tnumber: ").append(g.number).append("\tgender: ").append(g.gender).append("\tanimacy: ").append(g.animacy).append("\tperson: ").append(g.person).append("\tNE: ").append(g.nerString).append("\n"); @@ -287,14 +377,14 @@ public static String printErrorLogDcoref(Mention m, Mention found, Document docu sb.append("RESOLVER TYPE: ").append(whichResolver).append("\n"); sb.append("DOCUMENT: "+document.docInfo.get("DOC_ID")+", "+document.docInfo.get("DOC_PART")).append("\n"); - List orderedAnts = new ArrayList<>(); - + List orderedAnts = new ArrayList<>(); + sb.append("\nGOLD CLUSTER ID\n"); for(int sentDist=m.sentNum ; sentDist >= 0 ; sentDist--) { int sentIdx = m.sentNum-sentDist; sb.append("\tSENT "+sentIdx+"\t"+sentenceStringWithMention(sentIdx, document, true, true)).append("\n"); } - + sb.append("\nMENTION ID\n"); for(int sentDist=m.sentNum ; sentDist >= 0 ; sentDist--) { int sentIdx = m.sentNum-sentDist; @@ -311,9 +401,9 @@ public static String printErrorLogDcoref(Mention m, Mention found, Document docu Mention ant = orderedAnts.get(i); orders.put(ant.mentionID, i); } - + CorefCluster mC = 
document.corefClusters.get(m.corefClusterID); - + boolean isFirstMention = isFirstMention(m, document); boolean foundCorefAnt = true; // we're printing only mentions that found coref antecedent boolean correctDecision = document.isCoref(m, found); @@ -323,14 +413,14 @@ public static String printErrorLogDcoref(Mention m, Mention found, Document docu +"\t\tfoundAnt? "+foundCorefAnt+"\t\tcorrectDecision? "+correctDecision); sb.append("\n\ttype: "+m.mentionType+"\tHeadword: "+m.headWord.word()+"\tNEtype: "+m.nerString+"\tnumber: "+m.number+"\tgender: "+m.gender+"\tanimacy: "+m.animacy).append("\n"); if(m.contextParseTree!=null) sb.append(m.contextParseTree.pennString()); - + sb.append("\n\n\t\tOracle\t\tDcoref\t\t\tRF\t\tAntecedent\n"); for(Mention ant : orderedAnts) { int antID = ant.mentionID; CorefCluster aC = document.corefClusters.get(ant.corefClusterID); boolean oracle = Sieve.isReallyCoref(document, m.mentionID, antID); int order = orders.get(antID); - + String oracleStr = (oracle)? "coref " : "notcoref"; // String dcorefStr = (dcoref)? "coref " : "notcoref"; String dcorefStr = "notcoref"; @@ -347,12 +437,12 @@ public static String printErrorLogDcoref(Mention m, Mention found, Document docu else if(dcorefRelaxedHead.coreferent(document, mC, aC, m, ant, dict, null)) dcorefStr = "coref-relaxedHead"; else if(dcorefPronounSieve.coreferent(document, mC, aC, m, ant, dict, null)) dcorefStr = "coref-pronounSieve"; - + dcorefStr += "\t"+String.valueOf(order); - + sb.append("\t\t"+oracleStr+"\t"+dcorefStr+"\t\t"+ant.spanToString()+" ("+ant.mentionID+")\n"); } - + sb.append("ERROR END -----------------------------------------------------------------------\n"); return sb.toString(); } @@ -360,26 +450,26 @@ public static String printErrorLogDcoref(Mention m, Mention found, Document docu public static void linkDistanceAnalysis(String[] args) throws Exception { Properties props = StringUtils.argsToProperties(args); - HybridCorefSystem cs = new HybridCorefSystem(props); + CorefSystem cs = new CorefSystem(props); cs.docMaker.resetDocs(); - + Counter proper = new ClassicCounter<>(); Counter common = new ClassicCounter<>(); Counter pronoun = new ClassicCounter<>(); Counter list = new ClassicCounter<>(); - - + + while(true) { Document document = cs.docMaker.nextDoc(); if(document==null) break; document.extractGoldCorefClusters(); - + for(int sentIdx=0 ; sentIdx < document.predictedMentions.size() ; sentIdx++) { List predictedInSent = document.predictedMentions.get(sentIdx); - + for(int mIdx = 0 ; mIdx < predictedInSent.size() ; mIdx++) { Mention m = predictedInSent.get(mIdx); - + loop: for(int distance=0 ; distance <= sentIdx ; distance++) { List candidates = Sieve.getOrderedAntecedents(m, sentIdx-distance, mIdx, document.predictedMentions, cs.dictionaries); @@ -387,7 +477,7 @@ public static void linkDistanceAnalysis(String[] args) throws Exception { for(Mention candidate : candidates) { if(candidate == m) continue; if(distance==0 && m.appearEarlierThan(candidate)) continue; // ignore cataphora - + if(candidate.goldCorefClusterID == m.goldCorefClusterID) { switch(m.mentionType) { case NOMINAL: @@ -415,9 +505,9 @@ public static void linkDistanceAnalysis(String[] args) throws Exception { break; } } - } + } } - } + } } } System.out.println("PROPER -------------------------------------------"); @@ -428,11 +518,52 @@ public static void linkDistanceAnalysis(String[] args) throws Exception { Counters.printCounterSortedByKeys(pronoun); System.out.println("LIST -------------------------------------------"); 
Counters.printCounterSortedByKeys(list); - + log.info(); } - + + public static void printScoreSummary(String summary, Logger logger, boolean afterPostProcessing) { + String[] lines = summary.split("\n"); + if(!afterPostProcessing) { + for(String line : lines) { + if(line.startsWith("Identification of Mentions")) { + Redwood.log(line); + return; + } + } + } else { + StringBuilder sb = new StringBuilder(); + for(String line : lines) { + if(line.startsWith("METRIC")) sb.append(line); + if(!line.startsWith("Identification of Mentions") && line.contains("Recall")) { + sb.append(line).append("\n"); + } + } + Redwood.log(sb.toString()); + } + } + /** Print average F1 of MUC, B^3, CEAF_E */ + public static void printFinalConllScore(String summary) { + Pattern f1 = Pattern.compile("Coreference:.*F1: (.*)%"); + Matcher f1Matcher = f1.matcher(summary); + double[] F1s = new double[5]; + int i = 0; + while (f1Matcher.find()) { + F1s[i++] = Double.parseDouble(f1Matcher.group(1)); + } + double finalScore = (F1s[0]+F1s[1]+F1s[3])/3; + Redwood.log("Final conll score ((muc+bcub+ceafe)/3) = " + (new DecimalFormat("#.##")).format(finalScore)); + } + + public static void printMentionDetection(Map goldMentionsByID) { + int foundGoldCount = 0; + for(Mention g : goldMentionsByID.values()) { + if(g.hasTwin) foundGoldCount++; + } + Redwood.log("debug-md", "# of found gold mentions: "+foundGoldCount + " / # of gold mentions: "+goldMentionsByID.size()); + } + public static void main(String[] args) throws Exception { linkDistanceAnalysis(args); } diff --git a/src/edu/stanford/nlp/coref/Preprocessor.java b/src/edu/stanford/nlp/coref/Preprocessor.java new file mode 100644 index 0000000000..2ad6bd5336 --- /dev/null +++ b/src/edu/stanford/nlp/coref/Preprocessor.java @@ -0,0 +1,921 @@ +package edu.stanford.nlp.coref; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.Comparator; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.TreeSet; + +import edu.stanford.nlp.classify.LogisticClassifier; +import edu.stanford.nlp.coref.data.CorefCluster; +import edu.stanford.nlp.coref.data.Dictionaries; +import edu.stanford.nlp.coref.data.Dictionaries.Number; +import edu.stanford.nlp.coref.data.Document; +import edu.stanford.nlp.coref.data.Document.DocType; +import edu.stanford.nlp.coref.data.Mention; +import edu.stanford.nlp.coref.data.SpeakerInfo; +import edu.stanford.nlp.ling.CoreAnnotations; +import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation; +import edu.stanford.nlp.ling.CoreAnnotations.SpeakerAnnotation; +import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation; +import edu.stanford.nlp.ling.CoreAnnotations.UtteranceAnnotation; +import edu.stanford.nlp.ling.CoreLabel; +import edu.stanford.nlp.ling.IndexedWord; +import edu.stanford.nlp.math.NumberMatchingRegex; +import edu.stanford.nlp.semgraph.SemanticGraph; +import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations; +import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations.BasicDependenciesAnnotation; +import edu.stanford.nlp.semgraph.SemanticGraphEdge; +import edu.stanford.nlp.trees.GrammaticalRelation; +import edu.stanford.nlp.trees.HeadFinder; +import edu.stanford.nlp.trees.Tree; +import edu.stanford.nlp.trees.TreeCoreAnnotations.TreeAnnotation; +import edu.stanford.nlp.trees.UniversalEnglishGrammaticalRelations; +import edu.stanford.nlp.util.CollectionValuedMap; +import edu.stanford.nlp.util.CoreMap; +import 
edu.stanford.nlp.util.Generics; +import edu.stanford.nlp.util.IntPair; +import edu.stanford.nlp.util.IntTuple; +import edu.stanford.nlp.util.Pair; +import edu.stanford.nlp.util.logging.Redwood; + +/** + * Coref document preprocessor. + * + * @author heeyoung + */ + +public class Preprocessor { + + /** A logger for this class */ + private static Redwood.RedwoodChannels log = Redwood.channels(Preprocessor.class); + + private Preprocessor() {} + + /** + * Fill missing information in document including mention ID, mention attributes, syntactic relation, etc. + * + * @throws Exception + */ + public static void preprocess(Document doc, Dictionaries dict, LogisticClassifier<String, String> singletonPredictor, HeadFinder headFinder) throws Exception { + + // assign mention IDs, find twin mentions, fill mention positions, sentNum, headpositions + initializeMentions(doc, dict, singletonPredictor, headFinder); + + // mention reordering + mentionReordering(doc, headFinder); + + // find syntactic information + fillSyntacticInfo(doc); + + // process discourse (speaker info etc) + setParagraphAnnotation(doc); + processDiscourse(doc, dict); + + // initialize cluster info + initializeClusters(doc); + + // extract gold clusters if we have them + if(doc.goldMentions!=null) { + extractGoldClusters(doc); + OldCorefPrinter.printMentionDetection(doc.goldMentionsByID); + } + } + + /** Extract gold coref cluster information. */ + public static void extractGoldClusters(Document doc){ + doc.goldCorefClusters = Generics.newHashMap(); + for (List<Mention> mentions : doc.goldMentions) { + for (Mention m : mentions) { + int id = m.goldCorefClusterID; + if (id == -1) { + throw new RuntimeException("No gold info"); + } + CorefCluster c = doc.goldCorefClusters.get(id); + if (c == null) { + c = new CorefCluster(id); + doc.goldCorefClusters.put(id, c); + } + c.corefMentions.add(m); + } + } + } + + + private static void mentionReordering(Document doc, HeadFinder headFinder) throws Exception { + List<List<Mention>> mentions = doc.predictedMentions; + List<CoreMap> sentences = doc.annotation.get(SentencesAnnotation.class); + + for (int i=0 ; i<mentions.size() ; i++) { + List<Mention> mentionsInSent = mentions.get(i); + mentions.set(i, mentionReorderingBySpan(mentionsInSent)); + } + } + + protected static int getHeadIndex(Tree t, HeadFinder headFinder) { + // The trees passed in do not have the CoordinationTransformer + // applied, but that just means the SemanticHeadFinder results are + // slightly worse. + Tree ht = t.headTerminal(headFinder); + if(ht==null) return -1; // temporary: a key which is matched to nothing + CoreLabel l = (CoreLabel) ht.label(); + return l.get(CoreAnnotations.IndexAnnotation.class); + } + + private static List<Mention> mentionReorderingBySpan(List<Mention> mentionsInSent) { + TreeSet<Mention> ordering = new TreeSet<>(new Comparator<Mention>() { + @Override + public int compare(Mention m1, Mention m2) { + return (m1.appearEarlierThan(m2)) ? -1 : (m2.appearEarlierThan(m1)) ?
1 : 0; + } + }); + ordering.addAll(mentionsInSent); + List<Mention> orderedMentions = Generics.newArrayList(ordering); + return orderedMentions; + } + + private static void fillSyntacticInfo(Document doc) { + + List<List<Mention>> mentions = doc.predictedMentions; + List<CoreMap> sentences = doc.annotation.get(SentencesAnnotation.class); + + for (int i=0 ; i<mentions.size() ; i++) { + List<Mention> mentionsInSent = mentions.get(i); + findSyntacticRelationsFromDependency(mentionsInSent); + } + } + + /** assign mention IDs, find twin mentions, fill mention positions, initialize coref clusters, etc + * @throws Exception */ + private static void initializeMentions(Document doc, Dictionaries dict, LogisticClassifier<String, String> singletonPredictor, HeadFinder headFinder) throws Exception { + boolean hasGold = (doc.goldMentions != null); + assignMentionIDs(doc); + if(hasGold) findTwinMentions(doc, true); + fillMentionInfo(doc, dict, singletonPredictor, headFinder); + doc.allPositions = Generics.newHashMap(doc.positions); // allPositions retain all mentions even after postprocessing + } + + private static void assignMentionIDs(Document doc) { + boolean hasGold = (doc.goldMentions != null); + int maxID = 0; + if(hasGold) { + for (List<Mention> golds : doc.goldMentions) { + for (Mention g : golds) { + g.mentionID = maxID++; + } + } + } + for (List<Mention> predicted : doc.predictedMentions) { + for (Mention p : predicted) { + p.mentionID = maxID++; + } + } + } + + /** Mark twin mentions in gold and predicted mentions */ + protected static void findTwinMentions(Document doc, boolean strict){ + if(strict) findTwinMentionsStrict(doc); + else findTwinMentionsRelaxed(doc); + } + + /** Mark twin mentions: All mention boundaries should be matched */ + private static void findTwinMentionsStrict(Document doc){ + for(int sentNum = 0; sentNum < doc.goldMentions.size(); sentNum++) { + List<Mention> golds = doc.goldMentions.get(sentNum); + List<Mention> predicts = doc.predictedMentions.get(sentNum); + + // For CoNLL training there are some documents with gold mentions with the same position offsets + // See /scr/nlp/data/conll-2011/v2/data/train/data/english/annotations/nw/wsj/09/wsj_0990.v2_auto_conll + // (Packwood - Roth) + CollectionValuedMap<IntPair, Mention> goldMentionPositions = new CollectionValuedMap<>(); + for(Mention g : golds) { + IntPair ip = new IntPair(g.startIndex, g.endIndex); + if (goldMentionPositions.containsKey(ip)) { + StringBuilder existingMentions = new StringBuilder(); + for (Mention eg: goldMentionPositions.get(ip)) { + if (existingMentions.length() > 0) { + existingMentions.append(","); + } + existingMentions.append(eg.mentionID); + } + Redwood.log("debug-preprocessor", "WARNING: gold mentions with the same offsets: " + ip + + " mentions=" + g.mentionID + "," + existingMentions + ", " + g.spanToString()); + } + //assert(!goldMentionPositions.containsKey(ip)); + goldMentionPositions.add(new IntPair(g.startIndex, g.endIndex), g); + } + for(Mention p : predicts) { + IntPair pos = new IntPair(p.startIndex, p.endIndex); + if(goldMentionPositions.containsKey(pos)) { + Collection<Mention> cm = goldMentionPositions.get(pos); + int minId = Integer.MAX_VALUE; + Mention g = null; + for (Mention m : cm) { + if (m.mentionID < minId) { + g = m; + minId = m.mentionID; + } + } + cm.remove(g); + p.mentionID = g.mentionID; + p.hasTwin = true; + g.hasTwin = true; + } + } + } + } + + /** Mark twin mentions: heads of the mentions are matched */ + private static void findTwinMentionsRelaxed(Document doc) { + for(int sentNum = 0; sentNum < doc.goldMentions.size(); sentNum++) { + List<Mention> golds = doc.goldMentions.get(sentNum); + List<Mention> predicts =
doc.predictedMentions.get(sentNum); + + Map goldMentionPositions = Generics.newHashMap(); + Map> goldMentionHeadPositions = Generics.newHashMap(); + for(Mention g : golds) { + goldMentionPositions.put(new IntPair(g.startIndex, g.endIndex), g); + if(!goldMentionHeadPositions.containsKey(g.headIndex)) { + goldMentionHeadPositions.put(g.headIndex, new LinkedList<>()); + } + goldMentionHeadPositions.get(g.headIndex).add(g); + } + + List remains = new ArrayList<>(); + for (Mention p : predicts) { + IntPair pos = new IntPair(p.startIndex, p.endIndex); + if(goldMentionPositions.containsKey(pos)) { + Mention g = goldMentionPositions.get(pos); + p.mentionID = g.mentionID; + p.hasTwin = true; + g.hasTwin = true; + goldMentionHeadPositions.get(g.headIndex).remove(g); + if(goldMentionHeadPositions.get(g.headIndex).isEmpty()) { + goldMentionHeadPositions.remove(g.headIndex); + } + } + else remains.add(p); + } + for (Mention r : remains){ + if(goldMentionHeadPositions.containsKey(r.headIndex)) { + Mention g = goldMentionHeadPositions.get(r.headIndex).poll(); + r.mentionID = g.mentionID; + r.hasTwin = true; + g.hasTwin = true; + if(goldMentionHeadPositions.get(g.headIndex).isEmpty()) { + goldMentionHeadPositions.remove(g.headIndex); + } + } + } + } + } + + /** initialize several variables for mentions + * @throws Exception + */ + private static void fillMentionInfo(Document doc, Dictionaries dict, + LogisticClassifier singletonPredictor, HeadFinder headFinder) throws Exception { + List sentences = doc.annotation.get(SentencesAnnotation.class); + + for(int i = 0; i < doc.predictedMentions.size(); i ++){ + CoreMap sentence = sentences.get(i); + for(int j = 0; j < doc.predictedMentions.get(i).size(); j ++){ + Mention m = doc.predictedMentions.get(i).get(j); + doc.predictedMentionsByID.put(m.mentionID, m); // mentionsByID + + IntTuple pos = new IntTuple(2); + pos.set(0, i); + pos.set(1, j); + doc.positions.put(m, pos); // positions + m.sentNum = i; // sentNum + + IntTuple headPosition = new IntTuple(2); + headPosition.set(0, i); + headPosition.set(1, m.headIndex); + doc.mentionheadPositions.put(headPosition, m); // headPositions + + m.contextParseTree = sentence.get(TreeAnnotation.class); +// m.sentenceWords = sentence.get(TokensAnnotation.class); + m.basicDependency = sentence.get(BasicDependenciesAnnotation.class); + m.enhancedDependency = sentence.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class); + if (m.enhancedDependency == null) { + m.enhancedDependency = sentence.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class); + } + + // mentionSubTree (highest NP that has the same head) if constituency tree available + if (m.contextParseTree != null) { + Tree headTree = m.contextParseTree.getLeaves().get(m.headIndex); + if (headTree == null) { throw new RuntimeException("Missing head tree for a mention!"); } + Tree t = headTree; + while ((t = t.parent(m.contextParseTree)) != null) { + if (t.headTerminal(headFinder) == headTree && t.value().equals("NP")) { + m.mentionSubTree = t; + } else if(m.mentionSubTree != null){ + break; + } + } + if (m.mentionSubTree == null) { + m.mentionSubTree = headTree; + } + } + + m.process(dict, null, singletonPredictor); + } + } + + + boolean hasGold = (doc.goldMentions != null); + if(hasGold) { + doc.goldMentionsByID = Generics.newHashMap(); + int sentNum = 0; + for(List golds : doc.goldMentions) { + for(Mention g : golds) { + doc.goldMentionsByID.put(g.mentionID, g); + g.sentNum = sentNum; + } + sentNum++; + } + } + } + + private static void 
findSyntacticRelationsFromDependency(List<Mention> orderedMentions) { + if(orderedMentions.size()==0) return; + markListMemberRelation(orderedMentions); + SemanticGraph dependency = orderedMentions.get(0).enhancedDependency; + + // apposition + Set<Pair<Integer, Integer>> appos = Generics.newHashSet(); + List<SemanticGraphEdge> appositions = dependency.findAllRelns(UniversalEnglishGrammaticalRelations.APPOSITIONAL_MODIFIER); + for(SemanticGraphEdge edge : appositions) { + int sIdx = edge.getSource().index()-1; + int tIdx = edge.getTarget().index()-1; + appos.add(Pair.makePair(sIdx, tIdx)); + } + markMentionRelation(orderedMentions, appos, "APPOSITION"); + + // predicate nominatives + Set<Pair<Integer, Integer>> preNomi = Generics.newHashSet(); + List<SemanticGraphEdge> copula = dependency.findAllRelns(UniversalEnglishGrammaticalRelations.COPULA); + for(SemanticGraphEdge edge : copula) { + IndexedWord source = edge.getSource(); + IndexedWord target = dependency.getChildWithReln(source, UniversalEnglishGrammaticalRelations.NOMINAL_SUBJECT); + if(target==null) target = dependency.getChildWithReln(source, UniversalEnglishGrammaticalRelations.CLAUSAL_SUBJECT); + // TODO + if(target == null) continue; + + // to handle relative clause: e.g., Tim who is a student, + if(target.tag().startsWith("W")) { + IndexedWord parent = dependency.getParent(source); + if(parent!=null && dependency.reln(parent, source).equals(UniversalEnglishGrammaticalRelations.RELATIVE_CLAUSE_MODIFIER)) { + target = parent; + } + } + int sIdx = source.index()-1; + int tIdx = target.index()-1; + preNomi.add(Pair.makePair(tIdx, sIdx)); + } + markMentionRelation(orderedMentions, preNomi, "PREDICATE_NOMINATIVE"); + + + // relative pronouns TODO + Set<Pair<Integer, Integer>> relativePronounPairs = Generics.newHashSet(); + markMentionRelation(orderedMentions, relativePronounPairs, "RELATIVE_PRONOUN"); + } + + private static void initializeClusters(Document doc) { + for (List<Mention> predicted : doc.predictedMentions) { + for (Mention p : predicted) { + doc.corefClusters.put(p.mentionID, new CorefCluster(p.mentionID, Generics.newHashSet(Arrays.asList(p)))); + p.corefClusterID = p.mentionID; + } + } + boolean hasGold = (doc.goldMentions != null); + if(hasGold) { + for(List<Mention> golds : doc.goldMentions) { + for(Mention g : golds) { + doc.goldMentionsByID.put(g.mentionID, g); + } + } + } + } + + /** Find document type: Conversation or article */ + private static DocType findDocType(Document doc) { + boolean speakerChange = false; + + for(CoreMap sent : doc.annotation.get(CoreAnnotations.SentencesAnnotation.class)) { + for(CoreLabel w : sent.get(CoreAnnotations.TokensAnnotation.class)) { + int utterIndex = w.get(CoreAnnotations.UtteranceAnnotation.class); + if(utterIndex!=0) speakerChange = true; + if(speakerChange && utterIndex==0) return DocType.ARTICLE; + if(doc.maxUtter < utterIndex) doc.maxUtter = utterIndex; + } + } + if(!speakerChange) return DocType.ARTICLE; + return DocType.CONVERSATION; // in a conversation, the utterance index keeps increasing.
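[Editor's note: the findDocType heuristic above is easy to misread in diff form. Below is a minimal, self-contained sketch of the same decision rule, not part of the patch; it assumes utterance indices are supplied as a plain int array instead of being read from CoreAnnotations.UtteranceAnnotation on each token, and the class and method names are illustrative only.]

  // Illustrative sketch of the findDocType decision rule (not patch code).
  class DocTypeSketch {
    enum DocType { ARTICLE, CONVERSATION }

    static DocType docTypeOf(int[] utterIndices) {
      boolean speakerChange = false;
      for (int utterIndex : utterIndices) {
        if (utterIndex != 0) speakerChange = true;
        // a return to utterance 0 after a speaker change means narration
        // resumed, i.e. an article quoting speech rather than a conversation
        if (speakerChange && utterIndex == 0) return DocType.ARTICLE;
      }
      // in a conversation the utterance index only ever increases
      return speakerChange ? DocType.CONVERSATION : DocType.ARTICLE;
    }

    public static void main(String[] args) {
      System.out.println(docTypeOf(new int[]{0, 0, 1, 1, 0})); // ARTICLE
      System.out.println(docTypeOf(new int[]{0, 1, 2, 3}));    // CONVERSATION
    }
  }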
+ } + + /** Set paragraph index */ + private static void setParagraphAnnotation(Document doc) { + int paragraphIndex = 0; + int previousOffset = -10; + for(CoreMap sent : doc.annotation.get(CoreAnnotations.SentencesAnnotation.class)) { + for(CoreLabel w : sent.get(CoreAnnotations.TokensAnnotation.class)) { + if(w.containsKey(CoreAnnotations.CharacterOffsetBeginAnnotation.class)) { + if(w.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class) > previousOffset+2) paragraphIndex++; + w.set(CoreAnnotations.ParagraphAnnotation.class, paragraphIndex); + previousOffset = w.get(CoreAnnotations.CharacterOffsetEndAnnotation.class); + } else { + w.set(CoreAnnotations.ParagraphAnnotation.class, -1); + } + } + } + for(List l : doc.predictedMentions) { + for(Mention m : l){ + m.paragraph = m.headWord.get(CoreAnnotations.ParagraphAnnotation.class); + } + } + doc.numParagraph = paragraphIndex; + } + + /** Process discourse information */ + protected static void processDiscourse(Document doc, Dictionaries dict) { + Boolean useMarkedDiscourse = + doc.annotation.get(CoreAnnotations.UseMarkedDiscourseAnnotation.class); + if (useMarkedDiscourse == null || !useMarkedDiscourse) { + for (CoreLabel l : doc.annotation.get(CoreAnnotations.TokensAnnotation.class)) { + l.remove(CoreAnnotations.SpeakerAnnotation.class); + l.remove(CoreAnnotations.UtteranceAnnotation.class); + } + } + + setUtteranceAndSpeakerAnnotation(doc); +// markQuotations(this.annotation.get(CoreAnnotations.SentencesAnnotation.class), false); + + // mention utter setting + for(Mention m : doc.predictedMentionsByID.values()) { + m.utter = m.headWord.get(CoreAnnotations.UtteranceAnnotation.class); + } + + doc.docType = findDocType(doc); + findSpeakers(doc, dict); + + boolean debug = false; + if(debug) { + for(CoreMap sent : doc.annotation.get(CoreAnnotations.SentencesAnnotation.class)) { + for(CoreLabel cl : sent.get(TokensAnnotation.class)) { + log.info(" "+cl.word()+"-"+cl.get(UtteranceAnnotation.class)+"-"+cl.get(SpeakerAnnotation.class)); + } + } + + + for(Integer utter : doc.speakers.keySet()) { + String speakerID = doc.speakers.get(utter); + log.info("utterance: "+utter); + log.info("speakers value: " + speakerID); + log.info("mention for it: "+ + ( (NumberMatchingRegex.isDecimalInteger(speakerID))? 
+ doc.predictedMentionsByID.get(Integer.parseInt(doc.speakers.get(utter))) + : "no mention for this speaker yet") ); + } + log.info("AA SPEAKERS: "+ doc.speakers); + } + + // build 'speakerInfo' from 'speakers' + for(Integer utter : doc.speakers.keySet()) { + String speaker = doc.speakers.get(utter); + SpeakerInfo speakerInfo = doc.speakerInfoMap.get(speaker); + if (speakerInfo == null) { + doc.speakerInfoMap.put(speaker, speakerInfo = new SpeakerInfo(speaker)); + } + } + if(debug){ + log.info("BB SPEAKER INFO MAP: "+doc.speakerInfoMap); + } + + // mention -> to its speakerID: m.headWord.get(SpeakerAnnotation.class) + // speakerID -> more info: speakerInfoMap.get(speakerID) + // if exists, set(mentionID, its speakerID pair): speakerPairs + + // for speakerInfo with real speaker name, find corresponding mention by strict/loose matching + Map speakerConversion = Generics.newHashMap(); + for(String speaker : doc.speakerInfoMap.keySet()) { + SpeakerInfo speakerInfo = doc.speakerInfoMap.get(speaker); + if (speakerInfo.hasRealSpeakerName()) { // do only for real name speaker, not mention ID + boolean found = false; + for(Mention m : doc.predictedMentionsByID.values()) { + if (CorefRules.mentionMatchesSpeaker(m, speakerInfo, true)) { + speakerConversion.put(speaker, m.mentionID); + found = true; + break; + } + } + if(!found) { + for(Mention m : doc.predictedMentionsByID.values()) { + if (CorefRules.mentionMatchesSpeaker(m, speakerInfo, false)) { + speakerConversion.put(speaker, m.mentionID); + break; + } + } + } + } + } + + if(debug) log.info("CC speaker conversion: " + speakerConversion); + + // convert real name speaker to speaker mention id + for(Integer utter : doc.speakers.keySet()) { + String speaker = doc.speakers.get(utter); + if(speakerConversion.containsKey(speaker)) { + int speakerID = speakerConversion.get(speaker); + doc.speakers.put(utter, Integer.toString(speakerID)); + } + } + for(String speaker : speakerConversion.keySet()) { + doc.speakerInfoMap.put( Integer.toString(speakerConversion.get(speaker)), doc.speakerInfoMap.get(speaker)); + doc.speakerInfoMap.remove(speaker); + } + + // fix SpeakerAnnotation + for(CoreLabel cl : doc.annotation.get(TokensAnnotation.class)) { + int utter = cl.get(UtteranceAnnotation.class); + if(doc.speakers.containsKey(utter)) { + cl.set(CoreAnnotations.SpeakerAnnotation.class, doc.speakers.get(utter)); + } + } + + // find speakerPairs + for(Mention m : doc.predictedMentionsByID.values()) { + String speaker = m.headWord.get(CoreAnnotations.SpeakerAnnotation.class); + if(debug) log.info("DD: "+speaker); + if (NumberMatchingRegex.isDecimalInteger(speaker)) { + int speakerMentionID = Integer.parseInt(speaker); + doc.speakerPairs.add(new Pair<>(m.mentionID, speakerMentionID)); + } + } + + if(debug) { + log.info("=========================================================================="); + for(Integer utter : doc.speakers.keySet()) { + String speakerID = doc.speakers.get(utter); + log.info("utterance: "+utter); + log.info("speakers value: " + speakerID); + log.info("mention for it: "+ + ( (NumberMatchingRegex.isDecimalInteger(speakerID))? 
+ doc.predictedMentionsByID.get(Integer.parseInt(doc.speakers.get(utter))) + : "no mention for this speaker yet") ); + } + log.info(doc.speakers); + } + } + + private static void setUtteranceAndSpeakerAnnotation(Document doc) { + doc.speakerInfoGiven = false; + int utterance = 0; + int outsideQuoteUtterance = 0; // the utterance of outside of quotation + boolean insideQuotation = false; + List tokens = doc.annotation.get(CoreAnnotations.TokensAnnotation.class); + String preSpeaker = (tokens.size() > 0)? tokens.get(0).get(CoreAnnotations.SpeakerAnnotation.class) : null; + + for (CoreLabel l : tokens) { + String curSpeaker = l.get(CoreAnnotations.SpeakerAnnotation.class); + String w = l.get(CoreAnnotations.TextAnnotation.class); + + if (curSpeaker!=null && !curSpeaker.equals("-")) doc.speakerInfoGiven = true; + + boolean speakerChange = doc.speakerInfoGiven && curSpeaker!=null && !curSpeaker.equals(preSpeaker); + boolean quoteStart = w.equals("``") || (!insideQuotation && w.equals("\"")); + boolean quoteEnd = w.equals("''") || (insideQuotation && w.equals("\"")); + + if(speakerChange) { + if(quoteStart) { + utterance = doc.maxUtter + 1; + outsideQuoteUtterance = utterance+1; + } else { + utterance = doc.maxUtter + 1; + outsideQuoteUtterance = utterance; + } + preSpeaker = curSpeaker; + } else { + if(quoteStart) { + utterance = doc.maxUtter + 1; + } + } + if(quoteEnd) { + utterance = outsideQuoteUtterance; + insideQuotation = false; + } + if(doc.maxUtter < utterance) doc.maxUtter = utterance; + + l.set(CoreAnnotations.UtteranceAnnotation.class, utterance); + if(quoteStart) l.set(CoreAnnotations.UtteranceAnnotation.class, outsideQuoteUtterance); // quote start got outside utterance idx + + boolean noSpeakerInfo = !l.containsKey(CoreAnnotations.SpeakerAnnotation.class) + || l.get(CoreAnnotations.SpeakerAnnotation.class).equals("") + || l.get(CoreAnnotations.SpeakerAnnotation.class).startsWith("PER"); + + if(noSpeakerInfo || insideQuotation){ + l.set(CoreAnnotations.SpeakerAnnotation.class, "PER"+utterance); + } + if(quoteStart) insideQuotation = true; + } + } + + /** Speaker extraction */ + private static void findSpeakers(Document doc, Dictionaries dict) { + Boolean useMarkedDiscourseBoolean = doc.annotation.get(CoreAnnotations.UseMarkedDiscourseAnnotation.class); + boolean useMarkedDiscourse = (useMarkedDiscourseBoolean != null)? 
useMarkedDiscourseBoolean: false; + + if(!useMarkedDiscourse) { + if(doc.docType==DocType.CONVERSATION) findSpeakersInConversation(doc, dict); + else if (doc.docType==DocType.ARTICLE) findSpeakersInArticle(doc, dict); + } + + for(CoreMap sent : doc.annotation.get(CoreAnnotations.SentencesAnnotation.class)) { + for(CoreLabel w : sent.get(CoreAnnotations.TokensAnnotation.class)) { + int utterIndex = w.get(CoreAnnotations.UtteranceAnnotation.class); + if(!doc.speakers.containsKey(utterIndex)) { + doc.speakers.put(utterIndex, w.get(CoreAnnotations.SpeakerAnnotation.class)); + } + } + } + } + + private static void findSpeakersInArticle(Document doc, Dictionaries dict) { + List sentences = doc.annotation.get(CoreAnnotations.SentencesAnnotation.class); + IntPair beginQuotation = null; + IntPair endQuotation = null; + boolean insideQuotation = false; + int utterNum = -1; + + for (int i = 0 ; i < sentences.size(); i++) { + List sent = sentences.get(i).get(CoreAnnotations.TokensAnnotation.class); + for(int j = 0 ; j < sent.size() ; j++) { + int utterIndex = sent.get(j).get(CoreAnnotations.UtteranceAnnotation.class); + + if(utterIndex != 0 && !insideQuotation) { + utterNum = utterIndex; + insideQuotation = true; + beginQuotation = new IntPair(i,j); + } else if (utterIndex == 0 && insideQuotation) { + insideQuotation = false; + endQuotation = new IntPair(i,j); + findQuotationSpeaker(doc, utterNum, sentences, beginQuotation, endQuotation, dict); + } + } + } + if(insideQuotation) { + endQuotation = new IntPair(sentences.size()-1, sentences.get(sentences.size()-1).get(CoreAnnotations.TokensAnnotation.class).size()-1); + findQuotationSpeaker(doc, utterNum, sentences, beginQuotation, endQuotation, dict); + } + } + + private static void findQuotationSpeaker(Document doc, int utterNum, List sentences, + IntPair beginQuotation, IntPair endQuotation, Dictionaries dict) { + + if(findSpeaker(doc, utterNum, beginQuotation.get(0), sentences, 0, beginQuotation.get(1), dict)) + return ; + + if(findSpeaker(doc, utterNum, endQuotation.get(0), sentences, endQuotation.get(1), + sentences.get(endQuotation.get(0)).get(CoreAnnotations.TokensAnnotation.class).size(), dict)) + return; + + if(beginQuotation.get(1) <= 1 && beginQuotation.get(0) > 0) { + if(findSpeaker(doc, utterNum, beginQuotation.get(0)-1, sentences, 0, + sentences.get(beginQuotation.get(0)-1).get(CoreAnnotations.TokensAnnotation.class).size(), dict)) + return; + } + + if(endQuotation.get(1) >= sentences.get(endQuotation.get(0)).size()-2 + && sentences.size() > endQuotation.get(0)+1) { + if(findSpeaker(doc, utterNum, endQuotation.get(0)+1, sentences, 0, + sentences.get(endQuotation.get(0)+1).get(CoreAnnotations.TokensAnnotation.class).size(), dict)) + return; + } + } + + private static boolean findSpeaker(Document doc, int utterNum, int sentNum, List sentences, + int startIndex, int endIndex, Dictionaries dict) { + List sent = sentences.get(sentNum).get(CoreAnnotations.TokensAnnotation.class); + for(int i = startIndex ; i < endIndex ; i++) { + CoreLabel cl = sent.get(i); + if(cl.get(CoreAnnotations.UtteranceAnnotation.class)!=0) continue; + String lemma = cl.lemma(); + String word = cl.word(); + if(dict.reportVerb.contains(lemma) && cl.tag().startsWith("V")) { + // find subject + SemanticGraph dependency = sentences.get(sentNum).get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class); + if (dependency == null) { + dependency = sentences.get(sentNum).get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class); + } + IndexedWord w = 
dependency.getNodeByWordPattern(word); + + if (w != null) { + if(findSubject(doc, dependency, w, sentNum, utterNum)) return true; + for(IndexedWord p : dependency.getPathToRoot(w)) { + if(!p.tag().startsWith("V") && !p.tag().startsWith("MD")) break; + if(findSubject(doc, dependency, p, sentNum, utterNum)) return true; // handling something like "was talking", "can tell" + } + } else { + Redwood.log("debug-preprocessor", "Cannot find node in dependency for word " + word); + } + } + } + return false; + } + + private static boolean findSubject(Document doc, SemanticGraph dependency, IndexedWord w, int sentNum, int utterNum) { + for(Pair child : dependency.childPairs(w)){ + if(child.first().getShortName().equals("nsubj")) { + String subjectString = child.second().word(); + int subjectIndex = child.second().index(); // start from 1 + IntTuple headPosition = new IntTuple(2); + headPosition.set(0, sentNum); + headPosition.set(1, subjectIndex-1); + String speaker; + if(doc.mentionheadPositions.containsKey(headPosition)) { + speaker = Integer.toString(doc.mentionheadPositions.get(headPosition).mentionID); + } else { + speaker = subjectString; + } + doc.speakers.put(utterNum, speaker); + return true; + } + } + return false; + } + + private static void findSpeakersInConversation(Document doc, Dictionaries dict) { + for(List l : doc.predictedMentions) { + for(Mention m : l){ + if(m.predicateNominatives == null) continue; + for (Mention a : m.predicateNominatives){ + if(a.spanToString().toLowerCase().equals("i")) { + doc.speakers.put(m.headWord.get(CoreAnnotations.UtteranceAnnotation.class), Integer.toString(m.mentionID)); + } + } + } + } + List paragraph = new ArrayList<>(); + int paragraphUtterIndex = 0; + String nextParagraphSpeaker = ""; + int paragraphOffset = 0; + for(CoreMap sent : doc.annotation.get(CoreAnnotations.SentencesAnnotation.class)) { + paragraph.add(sent); + int currentUtter = sent.get(CoreAnnotations.TokensAnnotation.class).get(0).get(CoreAnnotations.UtteranceAnnotation.class); + if(paragraphUtterIndex!=currentUtter) { + nextParagraphSpeaker = findParagraphSpeaker(doc, paragraph, paragraphUtterIndex, nextParagraphSpeaker, paragraphOffset, dict); + paragraphUtterIndex = currentUtter; + paragraphOffset += paragraph.size(); + paragraph = new ArrayList<>(); + } + } + findParagraphSpeaker(doc, paragraph, paragraphUtterIndex, nextParagraphSpeaker, paragraphOffset, dict); + } + + private static String findParagraphSpeaker(Document doc, List paragraph, + int paragraphUtterIndex, String nextParagraphSpeaker, int paragraphOffset, Dictionaries dict) { + if ( ! doc.speakers.containsKey(paragraphUtterIndex)) { + if ( ! 
nextParagraphSpeaker.isEmpty()) { + doc.speakers.put(paragraphUtterIndex, nextParagraphSpeaker); + } else { // find the speaker of this paragraph (John, nbc news) + // cdm [Sept 2015] added this check to try to avoid crash + if (paragraph.isEmpty()) { + Redwood.log("debug-preprocessor", "Empty paragraph; skipping findParagraphSpeaker"); + return ""; + } + CoreMap lastSent = paragraph.get(paragraph.size()-1); + String speaker = ""; + boolean hasVerb = false; + for(int i = 0 ; i < lastSent.get(CoreAnnotations.TokensAnnotation.class).size() ; i++){ + CoreLabel w = lastSent.get(CoreAnnotations.TokensAnnotation.class).get(i); + String pos = w.get(CoreAnnotations.PartOfSpeechAnnotation.class); + String ner = w.get(CoreAnnotations.NamedEntityTagAnnotation.class); + if(pos.startsWith("V")) { + hasVerb = true; + break; + } + if(ner.startsWith("PER")) { + IntTuple headPosition = new IntTuple(2); + headPosition.set(0, paragraph.size()-1 + paragraphOffset); + headPosition.set(1, i); + if(doc.mentionheadPositions.containsKey(headPosition)) { + speaker = Integer.toString(doc.mentionheadPositions.get(headPosition).mentionID); + } + } + } + if(!hasVerb && !speaker.equals("")) { + doc.speakers.put(paragraphUtterIndex, speaker); + } + } + } + return findNextParagraphSpeaker(doc, paragraph, paragraphOffset, dict); + } + + private static String findNextParagraphSpeaker(Document doc, List paragraph, int paragraphOffset, Dictionaries dict) { + if (paragraph.isEmpty()) { + return ""; + } + CoreMap lastSent = paragraph.get(paragraph.size()-1); + String speaker = ""; + for(CoreLabel w : lastSent.get(CoreAnnotations.TokensAnnotation.class)) { + if(w.get(CoreAnnotations.LemmaAnnotation.class).equals("report") || w.get(CoreAnnotations.LemmaAnnotation.class).equals("say")) { + String word = w.get(CoreAnnotations.TextAnnotation.class); + SemanticGraph dependency = lastSent.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class); + if (dependency == null) { + dependency = lastSent.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class); + } + IndexedWord t = dependency.getNodeByWordPattern(word); + + for(Pair child : dependency.childPairs(t)){ + if(child.first().getShortName().equals("nsubj")) { + int subjectIndex = child.second().index(); // start from 1 + IntTuple headPosition = new IntTuple(2); + headPosition.set(0, paragraph.size()-1 + paragraphOffset); + headPosition.set(1, subjectIndex-1); + if(doc.mentionheadPositions.containsKey(headPosition) + && doc.mentionheadPositions.get(headPosition).nerString.startsWith("PER")) { + speaker = Integer.toString(doc.mentionheadPositions.get(headPosition).mentionID); + } + } + } + } + } + return speaker; + } + + /** Check one mention is the speaker of the other mention */ + public static boolean isSpeaker(Mention m, Mention ant, Dictionaries dict) { + + if(!dict.firstPersonPronouns.contains(ant.spanToString().toLowerCase()) + || ant.number==Number.PLURAL || ant.sentNum!=m.sentNum) return false; + + int countQuotationMark = 0; + for(int i = Math.min(m.headIndex, ant.headIndex)+1 ; i < Math.max(m.headIndex, ant.headIndex) ; i++) { + String word = m.sentenceWords.get(i).get(CoreAnnotations.TextAnnotation.class); + if(word.equals("``") || word.equals("''")) countQuotationMark++; + } + if(countQuotationMark!=1) return false; + + IndexedWord w = m.enhancedDependency.getNodeByWordPattern(m.sentenceWords.get(m.headIndex).get(CoreAnnotations.TextAnnotation.class)); + if(w== null) return false; + + for(Pair parent : m.enhancedDependency.parentPairs(w)){ + 
if(parent.first().getShortName().equals("nsubj") + && dict.reportVerb.contains(parent.second().get(CoreAnnotations.LemmaAnnotation.class))) { + return true; + } + } + return false; + } + + private static void markListMemberRelation(List<Mention> orderedMentions) { + for(Mention m1 : orderedMentions){ + for(Mention m2 : orderedMentions){ + // Mark if m2 and m1 are in list relationship + if (m1.isListMemberOf(m2)) { + m2.addListMember(m1); + m1.addBelongsToList(m2); + } else if (m2.isListMemberOf(m1)) { + m1.addListMember(m2); + m2.addBelongsToList(m1); + } + } + } + } + private static void markMentionRelation(List<Mention> orderedMentions, Set<Pair<Integer, Integer>> foundPairs, String flag) { + for(Mention m1 : orderedMentions){ + for(Mention m2 : orderedMentions){ + if(m1==m2) continue; + // Ignore if m2 and m1 are in list relationship + if (m1.isListMemberOf(m2) || m2.isListMemberOf(m1) || m1.isMemberOfSameList(m2)) { + //Redwood.log("debug-preprocessor", "Not checking '" + m1 + "' and '" + m2 + "' for " + flag + ": in list relationship"); + continue; + } + for(Pair<Integer, Integer> foundPair: foundPairs){ + if (foundPair.first() == m1.headIndex && foundPair.second() == m2.headIndex) { + if(flag.equals("APPOSITION")) { + if ( ! foundPair.first().equals(foundPair.second()) || m2.insideIn(m1)) { + m2.addApposition(m1); + } + } + else if(flag.equals("PREDICATE_NOMINATIVE")) { + m2.addPredicateNominatives(m1); + } + else if(flag.equals("RELATIVE_PRONOUN")) m2.addRelativePronoun(m1); + else throw new RuntimeException("check flag in markMentionRelation (dcoref/MentionExtractor.java)"); + } + } + } + } + } + +// private static final TregexPattern relativePronounPattern = TregexPattern.compile("NP < (NP=m1 $.. (SBAR < (WHNP < WP|WDT=m2)))"); +// private static void findRelativePronouns(Tree tree, Set<Pair<Integer, Integer>> relativePronounPairs) { +// findTreePattern(tree, relativePronounPattern, relativePronounPairs); +// } +} diff --git a/src/edu/stanford/nlp/coref/hybrid/README.txt b/src/edu/stanford/nlp/coref/README.txt similarity index 84% rename from src/edu/stanford/nlp/coref/hybrid/README.txt rename to src/edu/stanford/nlp/coref/README.txt index 71cfe063d1..1bf79a76da 100644 --- a/src/edu/stanford/nlp/coref/hybrid/README.txt +++ b/src/edu/stanford/nlp/coref/README.txt @@ -21,10 +21,10 @@ Dependency tree based models will give about 1 point lower score. Here are the commands for system training and evaluation. How to train models - all sieves will be trained contiguously. -$ java -Xmx30g edu.stanford.nlp.hcoref.train.CorefTrainer -props <properties-file> >& log-train.txt +$ java -Xmx30g edu.stanford.nlp.coref.train.CorefTrainer -props <properties-file> >& log-train.txt How to evaluate the system -$ java -Xmx30g edu.stanford.nlp.hcoref.CorefSystem -props <properties-file> >& output.txt +$ java -Xmx30g edu.stanford.nlp.coref.CorefSystem -props <properties-file> >& output.txt Several properties files are in hcoref.properties. - coref-default-dep.properties: default settings for using hcoref in pipeline. @@ -48,11 +48,11 @@ Here's the brief workflow. Here is example code to use the system (see pipeline.HybridCorefAnnotator).
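[Editor's note: before the README's own snippet, whose imports the hunk below updates, here is a fuller, self-contained version of the same usage. Only the two CorefSystem calls come from the README itself; the pipeline setup, annotator list, and example text are illustrative assumptions, and the property set needed by CorefSystem may differ in practice.]

  import java.util.Properties;
  import edu.stanford.nlp.coref.CorefSystem;
  import edu.stanford.nlp.coref.data.Document;
  import edu.stanford.nlp.pipeline.Annotation;
  import edu.stanford.nlp.pipeline.StanfordCoreNLP;

  public class CorefUsageSketch {
    public static void main(String[] args) throws Exception {
      Properties props = new Properties();
      // assumed annotator configuration; adjust to the models you have installed
      props.setProperty("annotators", "tokenize,ssplit,pos,lemma,ner,parse");
      StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
      Annotation annotation = new Annotation("John said he would come.");
      pipeline.annotate(annotation);

      // the two calls shown in the README
      CorefSystem corefSystem = new CorefSystem(props);
      Document corefDoc = corefSystem.docMaker.makeDocument(annotation);
      // corefDoc now carries predicted mentions, ready for a CorefAlgorithm
    }
  }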
- import edu.stanford.nlp.hcoref.CorefCoreAnnotations; - import edu.stanford.nlp.hcoref.CorefSystem; - import edu.stanford.nlp.hcoref.data.CorefChain; - import edu.stanford.nlp.hcoref.data.CorefChain.CorefMention; - import edu.stanford.nlp.hcoref.data.Document; + import edu.stanford.nlp.coref.CorefCoreAnnotations; + import edu.stanford.nlp.coref.CorefSystem; + import edu.stanford.nlp.coref.data.CorefChain; + import edu.stanford.nlp.coref.data.CorefChain.CorefMention; + import edu.stanford.nlp.coref.data.Document; CorefSystem corefSystem = new CorefSystem(props); Document corefDoc = corefSystem.docMaker.makeDocument(annotation); diff --git a/src/edu/stanford/nlp/coref/Scorer.java b/src/edu/stanford/nlp/coref/Scorer.java new file mode 100644 index 0000000000..bbe7aeceea --- /dev/null +++ b/src/edu/stanford/nlp/coref/Scorer.java @@ -0,0 +1,37 @@ +package edu.stanford.nlp.coref; + +import java.io.IOException; +import java.io.PrintWriter; +import java.text.DecimalFormat; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import edu.stanford.nlp.io.StringOutputStream; +import edu.stanford.nlp.util.SystemUtils; + +public class Scorer { + public static String getEvalSummary(String evalScript, + String goldFile, String predictFile) throws IOException { + ProcessBuilder process = new ProcessBuilder(evalScript, "all", goldFile, predictFile, "none"); + StringOutputStream errSos = new StringOutputStream(); + StringOutputStream outSos = new StringOutputStream(); + PrintWriter out = new PrintWriter(outSos); + PrintWriter err = new PrintWriter(errSos); + SystemUtils.run(process, out, err); + out.close(); + err.close(); + String summary = outSos.toString(); + String errStr = errSos.toString(); + if ( ! errStr.isEmpty()) { + summary += "\nERROR: " + errStr; + } + Pattern pattern = Pattern.compile("\\d+\\.\\d\\d\\d+"); + DecimalFormat df = new DecimalFormat("#.##"); + Matcher matcher = pattern.matcher(summary); + while(matcher.find()) { + String number = matcher.group(); + summary = summary.replaceFirst(number, df.format(Double.parseDouble(number))); + } + return summary; + } +} diff --git a/src/edu/stanford/nlp/coref/misc/SingletonPredictor.java b/src/edu/stanford/nlp/coref/SingletonPredictor.java similarity index 88% rename from src/edu/stanford/nlp/coref/misc/SingletonPredictor.java rename to src/edu/stanford/nlp/coref/SingletonPredictor.java index ac03d2b0d4..b4edc8b85a 100644 --- a/src/edu/stanford/nlp/coref/misc/SingletonPredictor.java +++ b/src/edu/stanford/nlp/coref/SingletonPredictor.java @@ -1,6 +1,8 @@ // StanfordCoreNLP -- a suite of NLP tools -package edu.stanford.nlp.coref.misc; +package edu.stanford.nlp.coref; +import edu.stanford.nlp.util.logging.Redwood; + import java.io.IOException; import java.io.ObjectOutputStream; import java.util.ArrayList; @@ -12,23 +14,18 @@ import edu.stanford.nlp.classify.LogisticClassifier; import edu.stanford.nlp.classify.LogisticClassifierFactory; import edu.stanford.nlp.coref.data.CorefCluster; -import edu.stanford.nlp.coref.data.DocumentMaker; import edu.stanford.nlp.coref.data.Dictionaries; import edu.stanford.nlp.coref.data.Document; import edu.stanford.nlp.coref.data.Mention; import edu.stanford.nlp.io.IOUtils; import edu.stanford.nlp.io.RuntimeIOException; +import edu.stanford.nlp.ling.CoreAnnotations.*; import edu.stanford.nlp.ling.BasicDatum; -import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation; -import edu.stanford.nlp.ling.CoreAnnotations.TokenBeginAnnotation; -import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation; 
import edu.stanford.nlp.ling.CoreLabel; import edu.stanford.nlp.ling.IndexedWord; import edu.stanford.nlp.semgraph.SemanticGraph; import edu.stanford.nlp.util.CoreMap; -import edu.stanford.nlp.util.PropertiesUtils; import edu.stanford.nlp.util.StringUtils; -import edu.stanford.nlp.util.logging.Redwood; /** * Train the singleton predictor using a logistic regression classifier as @@ -66,7 +63,7 @@ public static void setTokenIndices(Document doc){ } } } - + /** * Generate the training features from the CoNLL input file. * @return Dataset of feature vectors * @throws Exception */ public GeneralDataset<String, String> generateFeatureVectors(Properties props) throws Exception { GeneralDataset<String, String> dataset = new Dataset<>(); - + Dictionaries dict = new Dictionaries(props); - DocumentMaker docMaker = new DocumentMaker(props, dict); - + CorefDocMaker docMaker = new CorefDocMaker(props, dict); + Document document; while((document = docMaker.nextDoc()) != null){ setTokenIndices(document); @@ -93,18 +90,18 @@ public GeneralDataset<String, String> generateFeatureVectors(Properties props) t IndexedWord head = mention.enhancedDependency. getNodeByIndexSafe(mention.headWord.index()); - if(head == null) continue; + if(head == null) continue; ArrayList<String> feats = mention.getSingletonFeatures(dict); dataset.add(new BasicDatum<>( feats, "1")); - } + } } - // Generate features for singletons with class label 0 + // Generate features for singletons with class label 0 ArrayList<CoreLabel> gold_heads = new ArrayList<>(); for(Mention gold_men : document.goldMentionsByID.values()){ gold_heads.add(gold_men.headWord); - } + } for(Mention predicted_men : document.predictedMentionsByID.values()){ SemanticGraph dep = predicted_men.enhancedDependency; IndexedWord head = dep.getNodeByIndexSafe(predicted_men.headWord.index()); @@ -119,11 +116,11 @@ public GeneralDataset<String, String> generateFeatureVectors(Properties props) t predicted_men.getSingletonFeatures(dict), "0")); } } - + dataset.summaryStatistics(); return dataset; } - + /** * Train the singleton predictor using a logistic regression classifier. * @param Dataset of features * @return Singleton predictor */ @@ -136,7 +133,7 @@ public LogisticClassifier<String, String> train(GeneralDataset<String, String> p return classifier; } - + /** * Saves the singleton predictor model to the given filename. * If there is an error, a RuntimeIOException is thrown.
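[Editor's note: a hedged sketch of how a trained predictor from this class can be applied at mention-processing time. The probabilityOf call on LogisticClassifier and the in-scope objects (classifier, mention, dict) are assumptions made for illustration, not code from this patch; the real call site lives in Mention.process.]

  import java.util.ArrayList;
  import edu.stanford.nlp.classify.LogisticClassifier;
  import edu.stanford.nlp.coref.data.Dictionaries;
  import edu.stanford.nlp.coref.data.Mention;
  import edu.stanford.nlp.ling.BasicDatum;

  class SingletonScoringSketch {
    /** Score for the mention being coreferent (label "1"), per the trained predictor. */
    static double singletonScore(LogisticClassifier<String, String> classifier,
                                 Mention mention, Dictionaries dict) {
      // same feature extraction used for training in generateFeatureVectors above
      ArrayList<String> feats = mention.getSingletonFeatures(dict);
      return classifier.probabilityOf(new BasicDatum<>(feats, "1"));
    }
  }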
@@ -153,11 +150,7 @@ public void saveToSerialized(LogisticClassifier predictor, throw new RuntimeIOException(ioe); } } - - public static String getPathSingletonPredictor(Properties props) { - return PropertiesUtils.getString(props, "coref.path.singletonPredictor", "edu/stanford/nlp/models/dcoref/singleton.predictor.ser"); - } - + public static void main(String[] args) throws Exception { Properties props = null; if (args.length > 0) props = StringUtils.argsToProperties(args); @@ -169,12 +162,12 @@ public static void main(String[] args) throws Exception { log.info("-singleton.predictor.output [output_model_file]: was not specified"); return; } - + SingletonPredictor predictor = new SingletonPredictor(); - + GeneralDataset data = predictor.generateFeatureVectors(props); LogisticClassifier classifier = predictor.train(data); - predictor.saveToSerialized(classifier, getPathSingletonPredictor(props)); + predictor.saveToSerialized(classifier, CorefProperties.getPathSingletonPredictor(props)); } } diff --git a/src/edu/stanford/nlp/coref/data/CorefChain.java b/src/edu/stanford/nlp/coref/data/CorefChain.java index 4c5083bce9..9459898d7b 100644 --- a/src/edu/stanford/nlp/coref/data/CorefChain.java +++ b/src/edu/stanford/nlp/coref/data/CorefChain.java @@ -35,7 +35,6 @@ import java.util.Set; import edu.stanford.nlp.coref.CorefCoreAnnotations; - import edu.stanford.nlp.coref.data.Dictionaries.Animacy; import edu.stanford.nlp.coref.data.Dictionaries.Gender; import edu.stanford.nlp.coref.data.Dictionaries.MentionType; diff --git a/src/edu/stanford/nlp/coref/data/CorefCluster.java b/src/edu/stanford/nlp/coref/data/CorefCluster.java index ae2e8755f3..766645bfc5 100644 --- a/src/edu/stanford/nlp/coref/data/CorefCluster.java +++ b/src/edu/stanford/nlp/coref/data/CorefCluster.java @@ -37,7 +37,6 @@ import edu.stanford.nlp.coref.data.Dictionaries.Animacy; import edu.stanford.nlp.coref.data.Dictionaries.Gender; import edu.stanford.nlp.coref.data.Dictionaries.Number; - import edu.stanford.nlp.ling.CoreAnnotations; import edu.stanford.nlp.ling.CoreLabel; import edu.stanford.nlp.util.Generics; @@ -75,7 +74,6 @@ public class CorefCluster implements Serializable{ public int getClusterID(){ return clusterID; } public Set getCorefMentions() { return corefMentions; } - public int size() { return corefMentions.size(); } public Mention getFirstMention() { return firstMention; } public Mention getRepresentativeMention() { return representative; } diff --git a/src/edu/stanford/nlp/coref/data/Dictionaries.java b/src/edu/stanford/nlp/coref/data/Dictionaries.java index 7faaa4791d..1ed7e3c247 100644 --- a/src/edu/stanford/nlp/coref/data/Dictionaries.java +++ b/src/edu/stanford/nlp/coref/data/Dictionaries.java @@ -14,7 +14,7 @@ import java.util.Properties; import java.util.Set; -import edu.stanford.nlp.coref.hybrid.HybridCorefProperties; +import edu.stanford.nlp.coref.CorefProperties; import edu.stanford.nlp.io.IOUtils; import edu.stanford.nlp.io.RuntimeIOException; import edu.stanford.nlp.neural.VectorMap; @@ -25,11 +25,6 @@ import edu.stanford.nlp.util.Pair; import edu.stanford.nlp.util.PropertiesUtils; -/** - * Stores various data used for coreference. 
- * TODO: get rid of dependence on HybridCorefProperties - * @author Heeyoung Lee - */ public class Dictionaries { /** A logger for this class */ @@ -94,7 +89,7 @@ public enum Person { I, YOU, HE, SHE, WE, THEY, IT, UNKNOWN} public Set titleWords; public Set removeWords; public Set removeChars; - + public final Set personPronouns = Generics.newHashSet(); public final Set allPronouns = Generics.newHashSet(); @@ -163,7 +158,7 @@ private void readWordLists(Locale lang) { neg_relations = WordLists.neg_relationsEn; modals = WordLists.modalsEn; break; - + case "zh": reportVerb = WordLists.reportVerbZh; reportNoun = WordLists.reportNounZh; @@ -211,12 +206,12 @@ private void readWordLists(Locale lang) { } public int dimVector; - + public VectorMap vectors = new VectorMap(); public Map strToEntity = Generics.newHashMap(); public Counter dictScore = new ClassicCounter<>(); - + private void setPronouns() { for(String s: animatePronouns){ personPronouns.add(s); @@ -380,7 +375,7 @@ private void loadCountriesLists(String file) { * Load Bergsma and Lin (2006) gender and number list. *
* The list is converted from raw text and numbers to a serialized - * map, which saves quite a bit of time loading. + * map, which saves quite a bit of time loading. * See edu.stanford.nlp.dcoref.util.ConvertGenderFile */ /* @@ -459,13 +454,13 @@ public void loadChineseGenderNumberAnimacy(String file) { } else if (neutral * 0.5 > male + female && neutral > 2) { neutralWords.add(word); } - + if (animate * 0.5 > inanimate && animate > 2) { animateWords.add(word); } else if (inanimate * 0.5 > animate && inanimate > 2) { inanimateWords.add(word); } - + if (singular * 0.5 > plural && singular >2) { singularWords.add(word); } else if (plural * 0.5 > singular && plural > 2) { @@ -538,17 +533,17 @@ private static void loadSignatures(String file, Map> sigs IOUtils.closeIgnoringExceptions(reader); } } - + public void loadSemantics(Properties props) throws ClassNotFoundException, IOException { log.info("LOADING SEMANTICS"); // wordnet = new WordNet(); - + // load word vector - if(HybridCorefProperties.loadWordEmbedding(props)) { + if(CorefProperties.loadWordEmbedding(props)) { log.info("LOAD: WordVectors"); - String wordvectorFile = HybridCorefProperties.getPathSerializedWordVectors(props); - String word2vecFile = HybridCorefProperties.getPathWord2Vec(props); + String wordvectorFile = CorefProperties.getPathSerializedWordVectors(props); + String word2vecFile = CorefProperties.getPathWord2Vec(props); try { // Try to read the serialized vectors vectors = VectorMap.deserialize(wordvectorFile); @@ -565,7 +560,7 @@ public void loadSemantics(Properties props) throws ClassNotFoundException, IOExc } } dimVector = vectors.entrySet().iterator().next().getValue().length; - + // if(Boolean.parseBoolean(props.getProperty("useValDictionary"))) { // log.info("LOAD: ValDictionary"); // for(String line : IOUtils.readLines(valDict)) { @@ -578,25 +573,25 @@ public void loadSemantics(Properties props) throws ClassNotFoundException, IOExc } public Dictionaries(Properties props) throws ClassNotFoundException, IOException { - this(props.getProperty(HybridCorefProperties.LANG_PROP, HybridCorefProperties.LANGUAGE_DEFAULT.toLanguageTag()), - props.getProperty(HybridCorefProperties.DEMONYM_PROP, DefaultPaths.DEFAULT_DCOREF_DEMONYM), - props.getProperty(HybridCorefProperties.ANIMATE_PROP, DefaultPaths.DEFAULT_DCOREF_ANIMATE), - props.getProperty(HybridCorefProperties.INANIMATE_PROP, DefaultPaths.DEFAULT_DCOREF_INANIMATE), - props.getProperty(HybridCorefProperties.MALE_PROP), - props.getProperty(HybridCorefProperties.NEUTRAL_PROP), - props.getProperty(HybridCorefProperties.FEMALE_PROP), - props.getProperty(HybridCorefProperties.PLURAL_PROP), - props.getProperty(HybridCorefProperties.SINGULAR_PROP), - props.getProperty(HybridCorefProperties.STATES_PROP, DefaultPaths.DEFAULT_DCOREF_STATES), - props.getProperty(HybridCorefProperties.GENDER_NUMBER_PROP, HybridCorefProperties.getGenderNumber(props)), - props.getProperty(HybridCorefProperties.COUNTRIES_PROP, DefaultPaths.DEFAULT_DCOREF_COUNTRIES), - props.getProperty(HybridCorefProperties.STATES_PROVINCES_PROP, DefaultPaths.DEFAULT_DCOREF_STATES_AND_PROVINCES), - HybridCorefProperties.getSieves(props).contains("CorefDictionaryMatch"), - PropertiesUtils.getStringArray(props, HybridCorefProperties.DICT_LIST_PROP, + this(props.getProperty(CorefProperties.LANG_PROP, CorefProperties.LANGUAGE_DEFAULT.toLanguageTag()), + props.getProperty(CorefProperties.DEMONYM_PROP, DefaultPaths.DEFAULT_DCOREF_DEMONYM), + props.getProperty(CorefProperties.ANIMATE_PROP, DefaultPaths.DEFAULT_DCOREF_ANIMATE), 
+ props.getProperty(CorefProperties.INANIMATE_PROP, DefaultPaths.DEFAULT_DCOREF_INANIMATE), + props.getProperty(CorefProperties.MALE_PROP), + props.getProperty(CorefProperties.NEUTRAL_PROP), + props.getProperty(CorefProperties.FEMALE_PROP), + props.getProperty(CorefProperties.PLURAL_PROP), + props.getProperty(CorefProperties.SINGULAR_PROP), + props.getProperty(CorefProperties.STATES_PROP, DefaultPaths.DEFAULT_DCOREF_STATES), + props.getProperty(CorefProperties.GENDER_NUMBER_PROP, CorefProperties.getGenderNumber(props)), + props.getProperty(CorefProperties.COUNTRIES_PROP, DefaultPaths.DEFAULT_DCOREF_COUNTRIES), + props.getProperty(CorefProperties.STATES_PROVINCES_PROP, DefaultPaths.DEFAULT_DCOREF_STATES_AND_PROVINCES), + CorefProperties.getSieves(props).contains("CorefDictionaryMatch"), + PropertiesUtils.getStringArray(props, CorefProperties.DICT_LIST_PROP, new String[]{DefaultPaths.DEFAULT_DCOREF_DICT1, DefaultPaths.DEFAULT_DCOREF_DICT2, DefaultPaths.DEFAULT_DCOREF_DICT3, DefaultPaths.DEFAULT_DCOREF_DICT4}), - props.getProperty(HybridCorefProperties.DICT_PMI_PROP, DefaultPaths.DEFAULT_DCOREF_DICT1), - props.getProperty(HybridCorefProperties.SIGNATURES_PROP, DefaultPaths.DEFAULT_DCOREF_NE_SIGNATURES)); + props.getProperty(CorefProperties.DICT_PMI_PROP, DefaultPaths.DEFAULT_DCOREF_DICT1), + props.getProperty(CorefProperties.SIGNATURES_PROP, DefaultPaths.DEFAULT_DCOREF_NE_SIGNATURES)); /*if(CorefProperties.useSemantics(props)) { loadSemantics(props); } else { @@ -609,46 +604,46 @@ public Dictionaries(Properties props) throws ClassNotFoundException, IOException public static String signature(Properties props) { StringBuilder os = new StringBuilder(); - os.append(HybridCorefProperties.DEMONYM_PROP + ":" + - props.getProperty(HybridCorefProperties.DEMONYM_PROP, + os.append(CorefProperties.DEMONYM_PROP + ":" + + props.getProperty(CorefProperties.DEMONYM_PROP, DefaultPaths.DEFAULT_DCOREF_DEMONYM)); - os.append(HybridCorefProperties.ANIMATE_PROP + ":" + - props.getProperty(HybridCorefProperties.ANIMATE_PROP, + os.append(CorefProperties.ANIMATE_PROP + ":" + + props.getProperty(CorefProperties.ANIMATE_PROP, DefaultPaths.DEFAULT_DCOREF_ANIMATE)); - os.append(HybridCorefProperties.INANIMATE_PROP + ":" + - props.getProperty(HybridCorefProperties.INANIMATE_PROP, + os.append(CorefProperties.INANIMATE_PROP + ":" + + props.getProperty(CorefProperties.INANIMATE_PROP, DefaultPaths.DEFAULT_DCOREF_INANIMATE)); - if(props.containsKey(HybridCorefProperties.MALE_PROP)) { - os.append(HybridCorefProperties.MALE_PROP + ":" + - props.getProperty(HybridCorefProperties.MALE_PROP)); + if(props.containsKey(CorefProperties.MALE_PROP)) { + os.append(CorefProperties.MALE_PROP + ":" + + props.getProperty(CorefProperties.MALE_PROP)); } - if(props.containsKey(HybridCorefProperties.NEUTRAL_PROP)) { - os.append(HybridCorefProperties.NEUTRAL_PROP + ":" + - props.getProperty(HybridCorefProperties.NEUTRAL_PROP)); + if(props.containsKey(CorefProperties.NEUTRAL_PROP)) { + os.append(CorefProperties.NEUTRAL_PROP + ":" + + props.getProperty(CorefProperties.NEUTRAL_PROP)); } - if(props.containsKey(HybridCorefProperties.FEMALE_PROP)) { - os.append(HybridCorefProperties.FEMALE_PROP + ":" + - props.getProperty(HybridCorefProperties.FEMALE_PROP)); + if(props.containsKey(CorefProperties.FEMALE_PROP)) { + os.append(CorefProperties.FEMALE_PROP + ":" + + props.getProperty(CorefProperties.FEMALE_PROP)); } - if(props.containsKey(HybridCorefProperties.PLURAL_PROP)) { - os.append(HybridCorefProperties.PLURAL_PROP + ":" + - 
props.getProperty(HybridCorefProperties.PLURAL_PROP)); + if(props.containsKey(CorefProperties.PLURAL_PROP)) { + os.append(CorefProperties.PLURAL_PROP + ":" + + props.getProperty(CorefProperties.PLURAL_PROP)); } - if(props.containsKey(HybridCorefProperties.SINGULAR_PROP)) { - os.append(HybridCorefProperties.SINGULAR_PROP + ":" + - props.getProperty(HybridCorefProperties.SINGULAR_PROP)); + if(props.containsKey(CorefProperties.SINGULAR_PROP)) { + os.append(CorefProperties.SINGULAR_PROP + ":" + + props.getProperty(CorefProperties.SINGULAR_PROP)); } - os.append(HybridCorefProperties.STATES_PROP + ":" + - props.getProperty(HybridCorefProperties.STATES_PROP, + os.append(CorefProperties.STATES_PROP + ":" + + props.getProperty(CorefProperties.STATES_PROP, DefaultPaths.DEFAULT_DCOREF_STATES)); - os.append(HybridCorefProperties.GENDER_NUMBER_PROP + ":" + - props.getProperty(HybridCorefProperties.GENDER_NUMBER_PROP, + os.append(CorefProperties.GENDER_NUMBER_PROP + ":" + + props.getProperty(CorefProperties.GENDER_NUMBER_PROP, DefaultPaths.DEFAULT_DCOREF_GENDER_NUMBER)); - os.append(HybridCorefProperties.COUNTRIES_PROP + ":" + - props.getProperty(HybridCorefProperties.COUNTRIES_PROP, + os.append(CorefProperties.COUNTRIES_PROP + ":" + + props.getProperty(CorefProperties.COUNTRIES_PROP, DefaultPaths.DEFAULT_DCOREF_COUNTRIES)); - os.append(HybridCorefProperties.STATES_PROVINCES_PROP + ":" + - props.getProperty(HybridCorefProperties.STATES_PROVINCES_PROP, + os.append(CorefProperties.STATES_PROVINCES_PROP + ":" + + props.getProperty(CorefProperties.STATES_PROVINCES_PROP, DefaultPaths.DEFAULT_DCOREF_STATES_AND_PROVINCES)); return os.toString(); } diff --git a/src/edu/stanford/nlp/coref/data/Document.java b/src/edu/stanford/nlp/coref/data/Document.java index 11082b01b5..7734a83de2 100644 --- a/src/edu/stanford/nlp/coref/data/Document.java +++ b/src/edu/stanford/nlp/coref/data/Document.java @@ -33,7 +33,6 @@ import java.util.Set; import edu.stanford.nlp.coref.docreader.CoNLLDocumentReader; - import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation; import edu.stanford.nlp.pipeline.Annotation; import edu.stanford.nlp.stats.ClassicCounter; @@ -93,33 +92,33 @@ public List> getOrderedMentions() { /** List of gold links in a document by positions */ private List> goldLinks; - /** UtteranceAnnotation -> String (speaker): mention ID or speaker string + /** UtteranceAnnotation -> String (speaker): mention ID or speaker string * e.g., the value can be "34" (mentionID), "Larry" (speaker string), or "PER3" (autoassigned speaker string) * */ public Map speakers; - /** Pair of mention id, and the mention's speaker id - * the second value is the "speaker mention"'s id. + /** Pair of mention id, and the mention's speaker id + * the second value is the "speaker mention"'s id. 
* e.g., Larry said, "San Francisco is a city.": (id(Larry), id(San Francisco)) * */ public Set> speakerPairs; - + public boolean speakerInfoGiven; - + public int maxUtter; public int numParagraph; public int numSentences; /** Set of incompatible clusters pairs */ - private final Set> incompatibles; - private final Set> incompatibleClusters; - + private Set> incompatibles; + private Set> incompatibleClusters; + public Map, Boolean> acronymCache; - /** Map of speaker name/id to speaker info + /** Map of speaker name/id to speaker info * the key is the value of the variable 'speakers' */ public Map speakerInfoMap = Generics.newHashMap(); - + public Counter properNouns = new ClassicCounter<>(); public Counter phraseCounter = new ClassicCounter<>(); public Counter headwordCounter = new ClassicCounter<>(); @@ -139,7 +138,7 @@ public Document() { speakerPairs = Generics.newHashSet(); incompatibles = Generics.newHashSet(); incompatibleClusters = Generics.newHashSet(); - acronymCache = Generics.newHashMap(); + acronymCache = Generics.newHashMap(); } public Document(Annotation anno, List> predictedMentions, List> goldMentions) { @@ -338,8 +337,8 @@ public int numberOfSpeakers() { } public boolean isCoref(Mention m1, Mention m2) { - return this.goldMentionsByID.containsKey(m1.mentionID) - && this.goldMentionsByID.containsKey(m2.mentionID) + return this.goldMentionsByID.containsKey(m1.mentionID) + && this.goldMentionsByID.containsKey(m2.mentionID) && this.goldMentionsByID.get(m1.mentionID).goldCorefClusterID == this.goldMentionsByID.get(m2.mentionID).goldCorefClusterID; } } diff --git a/src/edu/stanford/nlp/coref/data/DocumentMaker.java b/src/edu/stanford/nlp/coref/data/DocumentMaker.java index 80181a7d7a..18b13fa2f4 100644 --- a/src/edu/stanford/nlp/coref/data/DocumentMaker.java +++ b/src/edu/stanford/nlp/coref/data/DocumentMaker.java @@ -55,8 +55,8 @@ private static DocReader getDocumentReader(Properties props) { private static HeadFinder getHeadFinder(Properties props) { Locale lang = CorefProperties.getLanguage(props); - if (lang == Locale.ENGLISH) return new SemanticHeadFinder(); - else if (lang == Locale.CHINESE) return new ChineseSemanticHeadFinder(); + if(lang == Locale.ENGLISH) return new SemanticHeadFinder(); + else if(lang == Locale.CHINESE) return new ChineseSemanticHeadFinder(); else { throw new RuntimeException("Invalid language setting: cannot load HeadFinder"); } diff --git a/src/edu/stanford/nlp/coref/data/DocumentPreprocessor.java b/src/edu/stanford/nlp/coref/data/DocumentPreprocessor.java index 5812f5c644..ab320967b5 100644 --- a/src/edu/stanford/nlp/coref/data/DocumentPreprocessor.java +++ b/src/edu/stanford/nlp/coref/data/DocumentPreprocessor.java @@ -1,6 +1,14 @@ package edu.stanford.nlp.coref.data; -import java.util.*; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.Comparator; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.TreeSet; import edu.stanford.nlp.classify.LogisticClassifier; import edu.stanford.nlp.coref.CorefRules; @@ -83,7 +91,7 @@ public static void preprocess(Document doc, Dictionaries dict, LogisticClassifie } /** Extract gold coref cluster information. 
*/ - private static void extractGoldClusters(Document doc){ + public static void extractGoldClusters(Document doc){ doc.goldCorefClusters = Generics.newHashMap(); for (List mentions : doc.goldMentions) { for (Mention m : mentions) { @@ -130,8 +138,12 @@ protected static int getHeadIndex(Tree t, HeadFinder headFinder) { } private static List mentionReorderingBySpan(List mentionsInSent) { - TreeSet ordering = new TreeSet<>((m1, m2) -> (m1.appearEarlierThan(m2)) ? -1 : - (m2.appearEarlierThan(m1)) ? 1 : 0); + TreeSet ordering = new TreeSet<>(new Comparator() { + @Override + public int compare(Mention m1, Mention m2) { + return (m1.appearEarlierThan(m2)) ? -1 : (m2.appearEarlierThan(m1)) ? 1 : 0; + } + }); ordering.addAll(mentionsInSent); List orderedMentions = Generics.newArrayList(ordering); return orderedMentions; @@ -148,9 +160,8 @@ private static void fillSyntacticInfo(Document doc) { } } - /** Assign mention IDs, find twin mentions, fill mention positions, initialize coref clusters, etc. - * @throws Exception - */ + /** assign mention IDs, find twin mentions, fill mention positions, initialize coref clusters, etc + * @throws Exception */ private static void initializeMentions(Document doc, Dictionaries dict, LogisticClassifier singletonPredictor, HeadFinder headFinder) throws Exception { boolean hasGold = (doc.goldMentions != null); assignMentionIDs(doc); @@ -177,7 +188,7 @@ private static void assignMentionIDs(Document doc) { } /** Mark twin mentions in gold and predicted mentions */ - private static void findTwinMentions(Document doc, boolean strict){ + protected static void findTwinMentions(Document doc, boolean strict){ if(strict) findTwinMentionsStrict(doc); else findTwinMentionsRelaxed(doc); } @@ -343,7 +354,7 @@ private static void fillMentionInfo(Document doc, Dictionaries dict, } private static void findSyntacticRelationsFromDependency(List orderedMentions) { - if (orderedMentions.isEmpty()) return; + if(orderedMentions.size()==0) return; markListMemberRelation(orderedMentions); SemanticGraph dependency = orderedMentions.get(0).enhancedDependency; @@ -389,7 +400,7 @@ private static void findSyntacticRelationsFromDependency(List orderedMe private static void initializeClusters(Document doc) { for (List predicted : doc.predictedMentions) { for (Mention p : predicted) { - doc.corefClusters.put(p.mentionID, new CorefCluster(p.mentionID, Generics.newHashSet(Collections.singletonList(p)))); + doc.corefClusters.put(p.mentionID, new CorefCluster(p.mentionID, Generics.newHashSet(Arrays.asList(p)))); p.corefClusterID = p.mentionID; } } @@ -443,7 +454,7 @@ private static void setParagraphAnnotation(Document doc) { } /** Process discourse information */ - private static void processDiscourse(Document doc, Dictionaries dict) { + protected static void processDiscourse(Document doc, Dictionaries dict) { Boolean useMarkedDiscourse = doc.annotation.get(CoreAnnotations.UseMarkedDiscourseAnnotation.class); if (useMarkedDiscourse == null || !useMarkedDiscourse) { @@ -535,9 +546,9 @@ private static void processDiscourse(Document doc, Dictionaries dict) { doc.speakers.put(utter, Integer.toString(speakerID)); } } - for (Map.Entry stringIntegerEntry : speakerConversion.entrySet()) { - doc.speakerInfoMap.put(Integer.toString(stringIntegerEntry.getValue()), doc.speakerInfoMap.get(stringIntegerEntry.getKey())); - doc.speakerInfoMap.remove(stringIntegerEntry.getKey()); + for(String speaker : speakerConversion.keySet()) { + doc.speakerInfoMap.put( Integer.toString(speakerConversion.get(speaker)), 
doc.speakerInfoMap.get(speaker)); + doc.speakerInfoMap.remove(speaker); } // fix SpeakerAnnotation @@ -615,7 +626,7 @@ private static void setUtteranceAndSpeakerAnnotation(Document doc) { if(quoteStart) l.set(CoreAnnotations.UtteranceAnnotation.class, outsideQuoteUtterance); // quote start got outside utterance idx boolean noSpeakerInfo = !l.containsKey(CoreAnnotations.SpeakerAnnotation.class) - || l.get(CoreAnnotations.SpeakerAnnotation.class).isEmpty() + || l.get(CoreAnnotations.SpeakerAnnotation.class).equals("") || l.get(CoreAnnotations.SpeakerAnnotation.class).startsWith("PER"); if(noSpeakerInfo || insideQuotation){ @@ -898,7 +909,7 @@ private static void markMentionRelation(List orderedMentions, Set foundPair: foundPairs){ if (foundPair.first() == m1.headIndex && foundPair.second() == m2.headIndex) { - if (flag.equals("APPOSITION")) { + if(flag.equals("APPOSITION")) { if ( ! foundPair.first().equals(foundPair.second()) || m2.insideIn(m1)) { m2.addApposition(m1); } diff --git a/src/edu/stanford/nlp/coref/data/InputDoc.java b/src/edu/stanford/nlp/coref/data/InputDoc.java index a9e05f9563..e2da16bd18 100644 --- a/src/edu/stanford/nlp/coref/data/InputDoc.java +++ b/src/edu/stanford/nlp/coref/data/InputDoc.java @@ -4,37 +4,36 @@ import java.util.Map; import edu.stanford.nlp.coref.docreader.CoNLLDocumentReader.CoNLLDocument; - import edu.stanford.nlp.pipeline.Annotation; /** * Input document read from input source (CoNLL, ACE, MUC, or raw text) - * Stores Annotation, gold info (optional) and additional document information (optional) - * + * Stores Annotation, gold info (optional) and additional document information (optional) + * * @author heeyoung * */ public class InputDoc { - + public Annotation annotation; - - /** + + /** * Additional document information possibly useful for coref. * (e.g., is this dialog? the source of article, etc) * We can use this as features for coref system. - * This is optional. + * This is optional. */ public Map docInfo; - /** + /** * Gold mentions with coreference information for evaluation. - * This is optional. + * This is optional. 
 */
  public List<List<Mention>> goldMentions;
-
+ 
  /** optional for CoNLL document */
  public CoNLLDocument conllDoc;
-
+ 
  public InputDoc(Annotation anno) {
    this(anno, null, null, null);
  }
diff --git a/src/edu/stanford/nlp/coref/data/Mention.java b/src/edu/stanford/nlp/coref/data/Mention.java
index cf53c70bca..3ce83f9145 100644
--- a/src/edu/stanford/nlp/coref/data/Mention.java
+++ b/src/edu/stanford/nlp/coref/data/Mention.java
@@ -36,13 +36,12 @@
 import java.util.Objects;
 import java.util.Set;
+import edu.stanford.nlp.classify.LogisticClassifier;
 import edu.stanford.nlp.coref.data.Dictionaries.Animacy;
 import edu.stanford.nlp.coref.data.Dictionaries.Gender;
 import edu.stanford.nlp.coref.data.Dictionaries.MentionType;
 import edu.stanford.nlp.coref.data.Dictionaries.Number;
 import edu.stanford.nlp.coref.data.Dictionaries.Person;
-
-import edu.stanford.nlp.classify.LogisticClassifier;
 import edu.stanford.nlp.ling.AbstractCoreLabel;
 import edu.stanford.nlp.ling.BasicDatum;
 import edu.stanford.nlp.ling.CoreAnnotation;
@@ -1618,8 +1617,8 @@ public boolean equals(Object obj) {
     if (isSubject != rhs.isSubject) { return false; }
     if (isDirectObject != rhs.isDirectObject) { return false; }
-    if (isIndirectObject != rhs.isIndirectObject) { return false; }
-    if (isPrepositionObject != rhs.isPrepositionObject) { return false; }
+    if (isIndirectObject != rhs.isIndirectObject) { return false; }
+    if (isPrepositionObject != rhs.isPrepositionObject) { return false; }
     if (hasTwin != rhs.hasTwin) { return false; }
     if (generic != rhs.generic) { return false; }
diff --git a/src/edu/stanford/nlp/coref/data/Semantics.java b/src/edu/stanford/nlp/coref/data/Semantics.java
index 96a39218f6..91013e09bc 100644
--- a/src/edu/stanford/nlp/coref/data/Semantics.java
+++ b/src/edu/stanford/nlp/coref/data/Semantics.java
@@ -5,11 +5,11 @@
 /** Semantic knowledge: currently WordNet is available */
 public class Semantics {
   public Object wordnet;
-
+ 
   public Semantics() {}
   public Semantics(Dictionaries dict) throws Exception{
-    Constructor wordnetConstructor = (Class.forName("edu.stanford.nlp.hcoref.WordNet")).getConstructor();
+    Constructor wordnetConstructor = (Class.forName("edu.stanford.nlp.coref.WordNet")).getConstructor();
     wordnet = wordnetConstructor.newInstance();
   }
 }
diff --git a/src/edu/stanford/nlp/coref/data/SpeakerInfo.java b/src/edu/stanford/nlp/coref/data/SpeakerInfo.java
index 3e10c540df..8d3f289ee0 100644
--- a/src/edu/stanford/nlp/coref/data/SpeakerInfo.java
+++ b/src/edu/stanford/nlp/coref/data/SpeakerInfo.java
@@ -14,15 +14,15 @@
  */
 public class SpeakerInfo implements Serializable {
   private static final long serialVersionUID = 7776098967746458031L;
-
+ 
-  private final String speakerId;
+  private String speakerId;
   private String speakerName;
   private String[] speakerNameStrings; // tokenized speaker name
   private String speakerDesc;
-  private final Set<Mention> mentions = new LinkedHashSet<>(); // Mentions that corresponds to the speaker...
+  private Set<Mention> mentions = new LinkedHashSet<>(); // Mentions that corresponds to the speaker...
   // private Mention originalMention; // the mention used when creating this SpeakerInfo
-  private final boolean speakerIdIsNumber; // speaker id is a number (probably mention id)
+  private boolean speakerIdIsNumber; // speaker id is a number (probably mention id)
-  private final boolean speakerIdIsAutoDetermined; // speaker id was auto determined by system
+  private boolean speakerIdIsAutoDetermined; // speaker id was auto determined by system
   // private Mention mainMention; // TODO: keep track of speaker utterances?
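
Aside: the `rhs.` qualifiers in the Mention.equals hunk above matter. A field compared against itself can never be unequal, so a guard written as `isIndirectObject != isIndirectObject` is dead code and equals() would silently ignore those two fields. A minimal illustration (not CoreNLP code):

    // Self-comparison is always false; only the rhs-qualified form compares two objects.
    class EqualsGuardDemo {
      boolean isIndirectObject;
      boolean deadGuard(EqualsGuardDemo rhs) { return isIndirectObject != isIndirectObject; }     // always false
      boolean realGuard(EqualsGuardDemo rhs) { return isIndirectObject != rhs.isIndirectObject; } // this vs rhs
    }
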
diff --git a/src/edu/stanford/nlp/coref/hybrid/demo/ChineseHcorefDemo.java b/src/edu/stanford/nlp/coref/demo/ChineseHcorefDemo.java
similarity index 89%
rename from src/edu/stanford/nlp/coref/hybrid/demo/ChineseHcorefDemo.java
rename to src/edu/stanford/nlp/coref/demo/ChineseHcorefDemo.java
index 1c5eb51ad3..c07bdcfaba 100644
--- a/src/edu/stanford/nlp/coref/hybrid/demo/ChineseHcorefDemo.java
+++ b/src/edu/stanford/nlp/coref/demo/ChineseHcorefDemo.java
@@ -1,5 +1,8 @@
-package edu.stanford.nlp.coref.hybrid.demo;
+package edu.stanford.nlp.coref.demo;
+import edu.stanford.nlp.coref.CorefCoreAnnotations;
+import edu.stanford.nlp.coref.data.CorefChain;
+import edu.stanford.nlp.coref.data.Mention;
 import edu.stanford.nlp.ling.CoreAnnotations;
 import edu.stanford.nlp.pipeline.Annotation;
 import edu.stanford.nlp.pipeline.StanfordCoreNLP;
@@ -8,28 +11,23 @@
 import java.util.Properties;
-import edu.stanford.nlp.coref.CorefCoreAnnotations;
-
-import edu.stanford.nlp.coref.data.CorefChain;
-import edu.stanford.nlp.coref.data.Mention;
-
 /**
  * A simple example of Stanford Chinese coreference resolution
- *
- * When I use originAPI code, using the properties file in path edu/stanford/nlp/hcoref/properties/zh-dcoref-default.properties
- * the code could not run correctly in Chinese.
- *
- * What I did is extracting the right properties file from stanford-chinese-corenlp-2015-12-08-models.jar
- * and replace edu/stanford/nlp/hcoref/properties/zh-coref-default.properties to our originAPI code
- * which finally run correctly.
+ *
+ * With the original API code, using the properties file at edu/stanford/nlp/coref/properties/zh-dcoref-default.properties,
+ * the demo did not run correctly for Chinese.
+ *
+ * The fix was to extract the correct properties file from stanford-chinese-corenlp-2015-12-08-models.jar
+ * and use it to replace edu/stanford/nlp/coref/properties/zh-coref-default.properties,
+ * after which the demo runs correctly.
- * - * @originAPI http://stanfordnlp.github.io/CoreNLP/coref.html + * + * @originAPI http://stanfordnlp.github.io/CoreNLP/coref.html * @modify_author zkli */ public class ChineseHcorefDemo { public static void main(String[] args) throws Exception { long startTime=System.currentTimeMillis(); - + String text = "俄罗斯 航空 公司 一 名 官员 在 9号 说 , " + "米洛舍维奇 的 儿子 马可·米洛舍维奇 9号 早上 持 外交 护照 从 俄国 首都 莫斯科 搭机 飞往 中国 大陆 北京 , " + "可是 就 在 稍后 就 返回 莫斯科 。 " + @@ -50,7 +48,7 @@ public static void main(String[] args) throws Exception { pipeline.annotate(document); System.out.println("---"); System.out.println("coref chains"); - + for (CorefChain cc : document.get(CorefCoreAnnotations.CorefChainAnnotation.class).values()) { System.out.println("\t" + cc); } @@ -61,8 +59,8 @@ public static void main(String[] args) throws Exception { System.out.println("\t" + m); } } - - long endTime=System.currentTimeMillis(); + + long endTime=System.currentTimeMillis(); long time = (endTime-startTime)/1000; System.out.println("Running time "+time/60+"min "+time%60+"s"); } diff --git a/src/edu/stanford/nlp/coref/docreader/CoNLLDocumentReader.java b/src/edu/stanford/nlp/coref/docreader/CoNLLDocumentReader.java index db50122ff5..171d116f5c 100644 --- a/src/edu/stanford/nlp/coref/docreader/CoNLLDocumentReader.java +++ b/src/edu/stanford/nlp/coref/docreader/CoNLLDocumentReader.java @@ -21,6 +21,7 @@ import java.util.regex.Pattern; import edu.stanford.nlp.coref.CorefCoreAnnotations; +import edu.stanford.nlp.coref.CorefProperties; import edu.stanford.nlp.coref.data.InputDoc; import edu.stanford.nlp.coref.data.Mention; import edu.stanford.nlp.io.IOUtils; @@ -217,7 +218,7 @@ public static class Options { public boolean annotateTreeCoref = false; // Annotate tree with CorefMentionAnnotation public boolean annotateTreeNer = false; // Annotate tree with NamedEntityAnnotation - + public Locale lang = Locale.ENGLISH; public String backgroundNerTag = "O"; // Background NER tag @@ -248,7 +249,7 @@ public static class CoNLLDocument implements Serializable { public String documentID; String partNo; public String filename; - + public List> sentenceWordLists = new ArrayList<>(); Annotation annotation; @@ -1019,7 +1020,7 @@ public String toString() /** Reads and dumps output, mainly for debugging. 
*/ public static void main(String[] args) throws IOException { Properties props = StringUtils.argsToProperties(args); - boolean debug = false; + boolean debug = CorefProperties.debug(props); String filepath = props.getProperty("i"); String outfile = props.getProperty("o"); if (filepath == null || outfile == null) { @@ -1076,12 +1077,12 @@ public InputDoc nextDoc() { if (conllDoc == null) return null; Annotation anno = conllDoc.getAnnotation(); - + // conll doc has constituency tree but doesn't have dependency tree setDependencyTree(anno); - + List> allGoldMentions = extractGoldMentions(conllDoc); - + // store some useful information in docInfo for later Map docInfo = makeDocInfo(conllDoc); @@ -1096,20 +1097,20 @@ private Map makeDocInfo(CoNLLDocument conllDoc) { docInfo.put("DOC_PART", conllDoc.partNo); docInfo.put("DOC_ID_PART", conllDoc.documentIdPart); docInfo.put("DOC_FILE", conllDoc.filename); - + return docInfo; } private void setDependencyTree(Annotation anno) { List sentences = anno.get(SentencesAnnotation.class); - + for(CoreMap sentence : sentences) { Tree tree = sentence.get(TreeAnnotation.class); if (tree==null) continue; - + SemanticGraph deps = null; SemanticGraph basicDeps = null; - + if (options.lang == Locale.CHINESE) { final boolean threadSafe = true; @@ -1119,7 +1120,7 @@ private void setDependencyTree(Annotation anno) { GrammaticalStructure.Extras.NONE, threadSafe, null); - + basicDeps = SemanticGraphFactory.makeFromTree( new ChineseGrammaticalStructure(tree, Filters.acceptFilter(), chineseHeadFinder), SemanticGraphFactory.Mode.BASIC, @@ -1130,7 +1131,7 @@ private void setDependencyTree(Annotation anno) { deps = SemanticGraphFactory.generateEnhancedDependencies(tree); basicDeps = SemanticGraphFactory.generateUncollapsedDependencies(tree); } - + sentence.set(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class, basicDeps); sentence.set(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class, deps); } diff --git a/src/edu/stanford/nlp/coref/docreader/DocReader.java b/src/edu/stanford/nlp/coref/docreader/DocReader.java index 0e5ffd5c91..341264fd97 100644 --- a/src/edu/stanford/nlp/coref/docreader/DocReader.java +++ b/src/edu/stanford/nlp/coref/docreader/DocReader.java @@ -3,7 +3,7 @@ import edu.stanford.nlp.coref.data.InputDoc; public interface DocReader { - + /** Read raw, CoNLL, ACE, or MUC document and return InputDoc */ public InputDoc nextDoc(); diff --git a/src/edu/stanford/nlp/coref/hybrid/HybridCorefProperties.java b/src/edu/stanford/nlp/coref/hybrid/HybridCorefProperties.java deleted file mode 100644 index a2ae5a6476..0000000000 --- a/src/edu/stanford/nlp/coref/hybrid/HybridCorefProperties.java +++ /dev/null @@ -1,306 +0,0 @@ -package edu.stanford.nlp.coref.hybrid; - -import java.io.File; -import java.util.Arrays; -import java.util.HashSet; -import java.util.Locale; -import java.util.Properties; -import java.util.Set; - -import edu.stanford.nlp.coref.hybrid.sieve.Sieve.ClassifierType; - -import edu.stanford.nlp.coref.data.Dictionaries.MentionType; -import edu.stanford.nlp.util.Generics; -import edu.stanford.nlp.util.PropertiesUtils; - -/** - * Properties for the hybrid coref system - * @author Heeyoung Lee - * @author Kevin Clark - */ -public class HybridCorefProperties { - public enum CorefInputType { RAW, CONLL, ACE, MUC } - - // general - public static final String LANG_PROP = "coref.language"; - private static final String SIEVES_PROP = "coref.sieves"; - private static final String SCORE_PROP = "coref.doScore"; - private static final String 
THREADS_PROP = "coref.threadCount"; - private static final String POSTPROCESSING_PROP = "coref.postprocessing"; - private static final String SEED_PROP = "coref.seed"; - private static final String CONLL_AUTO_PROP = "coref.conll.auto"; - private static final String USE_SEMANTICS_PROP = "coref.useSemantics"; // load semantics if true - public static final String CURRENT_SIEVE_FOR_TRAIN_PROP = "coref.currentSieveForTrain"; - private static final String STORE_TRAINDATA_PROP = "coref.storeTrainData"; - private static final String ADD_MISSING_ANNOTATIONS = "coref.addMissingAnnotations"; - - // logging & system check & analysis - private static final String DEBUG_PROP = "coref.debug"; - public static final String LOG_PROP = "coref.logFile"; - private static final String TIMER_PROP = "coref.checkTime"; - private static final String MEMORY_PROP = "coref.checkMemory"; - private static final String PRINT_MDLOG_PROP = "coref.print.md.log"; - private static final String CALCULATE_IMPORTANCE_PROP = "coref.calculateFeatureImportance"; - private static final String DO_ANALYSIS_PROP = "coref.analysis.doAnalysis"; - private static final String ANALYSIS_SKIP_MTYPE_PROP = "coref.analysis.skip.mType"; - private static final String ANALYSIS_SKIP_ATYPE_PROP = "coref.analysis.skip.aType"; - - // data & io - public static final String STATES_PROP = "coref.states"; - public static final String DEMONYM_PROP = "coref.demonym"; - public static final String ANIMATE_PROP = "coref.animate"; - public static final String INANIMATE_PROP = "coref.inanimate"; - public static final String MALE_PROP = "coref.male"; - public static final String NEUTRAL_PROP = "coref.neutral"; - public static final String FEMALE_PROP = "coref.female"; - public static final String PLURAL_PROP = "coref.plural"; - public static final String SINGULAR_PROP = "coref.singular"; - public static final String GENDER_NUMBER_PROP = "coref.big.gender.number"; - public static final String COUNTRIES_PROP = "coref.countries"; - public static final String STATES_PROVINCES_PROP = "coref.states.provinces"; - public static final String DICT_LIST_PROP = "coref.dictlist"; - public static final String DICT_PMI_PROP = "coref.dictpmi"; - public static final String SIGNATURES_PROP = "coref.signatures"; - public static final String LOAD_WORD_EMBEDDING_PROP = "coref.loadWordEmbedding"; - private static final String WORD2VEC_PROP = "coref.path.word2vec"; - private static final String WORD2VEC_SERIALIZED_PROP = "coref.path.word2vecSerialized"; - - private static final String PATH_SERIALIZED_PROP = "coref.path.serialized"; - - // models - private static final String PATH_MODEL_PROP = "coref.SIEVENAME.model"; - - // sieve option - private static final String CLASSIFIER_TYPE_PROP = "coref.SIEVENAME.classifierType"; - private static final String NUM_TREE_PROP = "coref.SIEVENAME.numTrees"; - private static final String NUM_FEATURES_PROP = "coref.SIEVENAME.numFeatures"; - private static final String TREE_DEPTH_PROP = "coref.SIEVENAME.treeDepth"; - private static final String MAX_SENT_DIST_PROP = "coref.SIEVENAME.maxSentDist"; - private static final String MTYPE_PROP = "coref.SIEVENAME.mType"; - private static final String ATYPE_PROP = "coref.SIEVENAME.aType"; - private static final String DOWNSAMPLE_RATE_PROP = "coref.SIEVENAME.downsamplingRate"; - private static final String THRES_FEATURECOUNT_PROP = "coref.SIEVENAME.thresFeatureCount"; - private static final String FEATURE_SELECTION_PROP = "coref.SIEVENAME.featureSelection"; - private static final String THRES_MERGE_PROP = 
"coref.SIEVENAME.merge.thres"; - private static final String THRES_FEATURE_SELECTION_PROP = "coref.SIEVENAME.pmi.thres"; - private static final String DEFAULT_PRONOUN_AGREEMENT_PROP = "coref.defaultPronounAgreement"; - - // features - private static final String USE_BASIC_FEATURES_PROP = "coref.SIEVENAME.useBasicFeatures"; - private static final String COMBINE_OBJECTROLE_PROP = "coref.SIEVENAME.combineObjectRole"; - private static final String USE_MD_FEATURES_PROP = "coref.SIEVENAME.useMentionDetectionFeatures"; - private static final String USE_DCOREFRULE_FEATURES_PROP = "coref.SIEVENAME.useDcorefRuleFeatures"; - private static final String USE_POS_FEATURES_PROP = "coref.SIEVENAME.usePOSFeatures"; - private static final String USE_LEXICAL_FEATURES_PROP = "coref.SIEVENAME.useLexicalFeatures"; - private static final String USE_WORD_EMBEDDING_FEATURES_PROP = "coref.SIEVENAME.useWordEmbeddingFeatures"; - - public static final Locale LANGUAGE_DEFAULT = Locale.ENGLISH; - - /** if true, remove appositives, predicate nominatives in post processing */ - public static final boolean REMOVE_APPOSITION_PREDICATENOMINATIVES = true; - - /** if true, remove singletons in post processing */ - public static final boolean REMOVE_SINGLETONS = true; - - // current list of dcoref sieves - private static final Set dcorefSieveNames = new HashSet<>(Arrays.asList("MarkRole", "DiscourseMatch", - "ExactStringMatch", "RelaxedExactStringMatch", "PreciseConstructs", "StrictHeadMatch1", - "StrictHeadMatch2", "StrictHeadMatch3", "StrictHeadMatch4", "RelaxedHeadMatch", "PronounMatch", "SpeakerMatch", - "ChineseHeadMatch")); - - - public static boolean doScore(Properties props) { - return PropertiesUtils.getBool(props, SCORE_PROP, false); - } - public static boolean checkTime(Properties props) { - return PropertiesUtils.getBool(props, TIMER_PROP, false); - } - public static boolean checkMemory(Properties props) { - return PropertiesUtils.getBool(props, MEMORY_PROP, false); - } - - public static int getThreadCounts(Properties props) { - return PropertiesUtils.getInt(props, THREADS_PROP, Runtime.getRuntime().availableProcessors()); - } - public static Locale getLanguage(Properties props) { - String lang = PropertiesUtils.getString(props, LANG_PROP, "en"); - if(lang.equalsIgnoreCase("en") || lang.equalsIgnoreCase("english")) return Locale.ENGLISH; - else if(lang.equalsIgnoreCase("zh") || lang.equalsIgnoreCase("chinese")) return Locale.CHINESE; - else throw new RuntimeException("unsupported language"); - } - public static boolean printMDLog(Properties props) { - return PropertiesUtils.getBool(props, PRINT_MDLOG_PROP, false); - } - public static boolean doPostProcessing(Properties props) { - return PropertiesUtils.getBool(props, POSTPROCESSING_PROP, false); - } - - /** if true, use conll auto files, else use conll gold files */ - public static boolean useCoNLLAuto(Properties props) { - return PropertiesUtils.getBool(props, CONLL_AUTO_PROP, true); - } - - public static String getPathModel(Properties props, String sievename) { - return props.getProperty(PATH_SERIALIZED_PROP) + File.separator + - props.getProperty(PATH_MODEL_PROP.replace("SIEVENAME", sievename), "MISSING_MODEL_FOR_"+sievename); - } - public static boolean debug(Properties props) { - return PropertiesUtils.getBool(props, DEBUG_PROP, false); - } - - public static ClassifierType getClassifierType(Properties props, String sievename) { - if(dcorefSieveNames.contains(sievename)) return ClassifierType.RULE; - if(sievename.toLowerCase().endsWith("-rf")) return 
ClassifierType.RF; - if(sievename.toLowerCase().endsWith("-oracle")) return ClassifierType.ORACLE; - String classifierType = PropertiesUtils.getString(props, CLASSIFIER_TYPE_PROP.replace("SIEVENAME", sievename), null); - return ClassifierType.valueOf(classifierType); - } - public static double getMergeThreshold(Properties props, String sievename) { - String key = THRES_MERGE_PROP.replace("SIEVENAME", sievename); - return PropertiesUtils.getDouble(props, key, 0.3); - } - public static void setMergeThreshold(Properties props, String sievename, double value) { - String key = THRES_MERGE_PROP.replace("SIEVENAME", sievename); - props.setProperty(key, String.valueOf(value)); - } - - public static int getNumTrees(Properties props, String sievename) { - return PropertiesUtils.getInt(props, NUM_TREE_PROP.replace("SIEVENAME", sievename), 100); - } - public static int getSeed(Properties props) { - return PropertiesUtils.getInt(props, SEED_PROP, 1); - } - public static int getNumFeatures(Properties props, String sievename) { - return PropertiesUtils.getInt(props, NUM_FEATURES_PROP.replace("SIEVENAME", sievename), 30); - } - public static int getTreeDepth(Properties props, String sievename) { - return PropertiesUtils.getInt(props, TREE_DEPTH_PROP.replace("SIEVENAME", sievename), 0); - } - public static boolean calculateFeatureImportance(Properties props) { - return PropertiesUtils.getBool(props, CALCULATE_IMPORTANCE_PROP, false); - } - - public static int getMaxSentDistForSieve(Properties props, String sievename) { - return PropertiesUtils.getInt(props, MAX_SENT_DIST_PROP.replace("SIEVENAME", sievename), 1000); - } - - public static Set getMentionType(Properties props, String sievename) { - return getMentionTypes(props, MTYPE_PROP.replace("SIEVENAME", sievename)); - } - public static Set getAntecedentType(Properties props, String sievename) { - return getMentionTypes(props, ATYPE_PROP.replace("SIEVENAME", sievename)); - } - - private static Set getMentionTypes(Properties props, String propKey) { - if(!props.containsKey(propKey) || props.getProperty(propKey).equalsIgnoreCase("all")){ - return new HashSet<>(Arrays.asList(MentionType.values())); - } - - Set types = new HashSet<>(); - for(String type : props.getProperty(propKey).trim().split(",\\s*")) { - if(type.toLowerCase().matches("i|you|we|they|it|she|he")) type = "PRONOMINAL"; - types.add(MentionType.valueOf(type)); - } - return types; - } - public static double getDownsamplingRate(Properties props, String sievename) { - return PropertiesUtils.getDouble(props, DOWNSAMPLE_RATE_PROP.replace("SIEVENAME", sievename), 1); - } - public static int getFeatureCountThreshold(Properties props, String sievename) { - return PropertiesUtils.getInt(props, THRES_FEATURECOUNT_PROP.replace("SIEVENAME", sievename), 20); - } - public static boolean useBasicFeatures(Properties props, String sievename) { - return PropertiesUtils.getBool(props, USE_BASIC_FEATURES_PROP.replace("SIEVENAME", sievename), true); - } - public static boolean combineObjectRoles(Properties props, String sievename) { - return PropertiesUtils.getBool(props, COMBINE_OBJECTROLE_PROP.replace("SIEVENAME", sievename), true); - } - public static boolean useMentionDetectionFeatures(Properties props, String sievename) { - return PropertiesUtils.getBool(props, USE_MD_FEATURES_PROP.replace("SIEVENAME", sievename), true); - } - public static boolean useDcorefRules(Properties props, String sievename) { - return PropertiesUtils.getBool(props, USE_DCOREFRULE_FEATURES_PROP.replace("SIEVENAME", sievename), true); - 
} - public static boolean usePOSFeatures(Properties props, String sievename) { - return PropertiesUtils.getBool(props, USE_POS_FEATURES_PROP.replace("SIEVENAME", sievename), true); - } - public static boolean useLexicalFeatures(Properties props, String sievename) { - return PropertiesUtils.getBool(props, USE_LEXICAL_FEATURES_PROP.replace("SIEVENAME", sievename), true); - } - public static boolean useWordEmbedding(Properties props, String sievename) { - return PropertiesUtils.getBool(props, USE_WORD_EMBEDDING_FEATURES_PROP.replace("SIEVENAME", sievename), true); - } - - private static Set getMentionTypeStr(Properties props, String sievename, String whichMention) { - Set strs = Generics.newHashSet(); - String propKey = whichMention; - if (!props.containsKey(propKey)) { - String prefix = "coref." + sievename + "."; - propKey = prefix + propKey; - } - if(props.containsKey(propKey)) strs.addAll(Arrays.asList(props.getProperty(propKey).split(","))); - return strs; - } - public static Set getMentionTypeStr(Properties props, String sievename) { - return getMentionTypeStr(props, sievename, "mType"); - } - public static Set getAntecedentTypeStr(Properties props, String sievename) { - return getMentionTypeStr(props, sievename, "aType"); - } - public static String getSieves(Properties props) { - return PropertiesUtils.getString(props, SIEVES_PROP, "SpeakerMatch,PreciseConstructs,pp-rf,cc-rf,pc-rf,ll-rf,pr-rf"); - } - public static String getPathSerialized(Properties props) { - return props.getProperty(PATH_SERIALIZED_PROP); - } - public static boolean doPMIFeatureSelection(Properties props, String sievename) { - return PropertiesUtils.getString(props, FEATURE_SELECTION_PROP.replace("SIEVENAME", sievename), "pmi").equalsIgnoreCase("pmi"); - } - public static double getPMIThres(Properties props, String sievename) { - return PropertiesUtils.getDouble(props, THRES_FEATURE_SELECTION_PROP.replace("SIEVENAME", sievename), 0.0001); - } - public static boolean doAnalysis(Properties props) { - return PropertiesUtils.getBool(props, DO_ANALYSIS_PROP, false); - } - public static String getSkipMentionType(Properties props) { - return PropertiesUtils.getString(props, ANALYSIS_SKIP_MTYPE_PROP, null); - } - public static String getSkipAntecedentType(Properties props) { - return PropertiesUtils.getString(props, ANALYSIS_SKIP_ATYPE_PROP, null); - } - public static boolean useSemantics(Properties props) { - return PropertiesUtils.getBool(props, USE_SEMANTICS_PROP, false); - } - public static String getPathSerializedWordVectors(Properties props) { - return PropertiesUtils.getString(props, WORD2VEC_SERIALIZED_PROP, "/scr/nlp/data/coref/wordvectors/en/vector.ser.gz"); - } - public static String getCurrentSieveForTrain(Properties props) { - return PropertiesUtils.getString(props, CURRENT_SIEVE_FOR_TRAIN_PROP, null); - } -// public static String getCurrentSieve(Properties props) { -// return PropertiesUtils.getString(props, CURRENT_SIEVE_PROP, null); -// } - public static boolean loadWordEmbedding(Properties props) { - return PropertiesUtils.getBool(props, LOAD_WORD_EMBEDDING_PROP, true); - } - public static String getPathWord2Vec(Properties props) { - return PropertiesUtils.getString(props, WORD2VEC_PROP, null); - } - - public static String getGenderNumber(Properties props) { - return PropertiesUtils.getString(props, GENDER_NUMBER_PROP, "edu/stanford/nlp/models/dcoref/gender.data.gz"); - } - - public static boolean storeTrainData(Properties props) { - return PropertiesUtils.getBool(props, STORE_TRAINDATA_PROP, false); - } - 
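
Aside: every per-sieve getter in this deleted class follows the same pattern, substituting the sieve name into a SIEVENAME template key before the Properties lookup. A minimal self-contained sketch of how getMergeThreshold (defined earlier in this file, default 0.3) resolves, using "pp-rf" from the getSieves default above; the demo class itself is not part of the patch:

    import java.util.Properties;
    import edu.stanford.nlp.util.PropertiesUtils;

    public class SieveKeyDemo {
      public static void main(String[] args) {
        Properties props = new Properties();
        props.setProperty("coref.pp-rf.merge.thres", "0.45");
        // Same substitution the deleted getters perform on their template keys:
        String key = "coref.SIEVENAME.merge.thres".replace("SIEVENAME", "pp-rf");
        System.out.println(PropertiesUtils.getDouble(props, key, 0.3)); // prints 0.45; 0.3 if unset
      }
    }
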
- public static boolean useDefaultPronounAgreement(Properties props){ - return PropertiesUtils.getBool(props, HybridCorefProperties.DEFAULT_PRONOUN_AGREEMENT_PROP,false); - } - - public static boolean addMissingAnnotations(Properties props) { - return PropertiesUtils.getBool(props, ADD_MISSING_ANNOTATIONS, false); - } - -} diff --git a/src/edu/stanford/nlp/coref/hybrid/HybridCorefSystem.java b/src/edu/stanford/nlp/coref/hybrid/HybridCorefSystem.java index ebddff4b7f..ba7fcdd3b1 100644 --- a/src/edu/stanford/nlp/coref/hybrid/HybridCorefSystem.java +++ b/src/edu/stanford/nlp/coref/hybrid/HybridCorefSystem.java @@ -3,6 +3,7 @@ import java.io.File; import java.io.FileOutputStream; import java.io.PrintWriter; +import java.util.ArrayList; import java.util.Calendar; import java.util.Date; import java.util.List; @@ -10,31 +11,26 @@ import java.util.Properties; import java.util.Set; import java.util.logging.Logger; - -import edu.stanford.nlp.coref.CorefAlgorithm; -import edu.stanford.nlp.coref.CorefPrinter; -import edu.stanford.nlp.coref.CorefProperties; -import edu.stanford.nlp.coref.CorefScorer; -import edu.stanford.nlp.coref.CorefUtils; +import edu.stanford.nlp.coref.*; import edu.stanford.nlp.coref.data.CorefChain; import edu.stanford.nlp.coref.data.CorefCluster; import edu.stanford.nlp.coref.data.Dictionaries; import edu.stanford.nlp.coref.data.Document; -import edu.stanford.nlp.coref.data.DocumentMaker; import edu.stanford.nlp.coref.data.Mention; -import edu.stanford.nlp.coref.hybrid.sieve.Sieve; -import edu.stanford.nlp.coref.hybrid.sieve.Sieve.ClassifierType; +import edu.stanford.nlp.coref.sieve.Sieve; +import edu.stanford.nlp.coref.sieve.Sieve.ClassifierType; import edu.stanford.nlp.io.IOUtils; import edu.stanford.nlp.pipeline.Annotation; import edu.stanford.nlp.util.Generics; import edu.stanford.nlp.util.Pair; +import edu.stanford.nlp.util.RuntimeInterruptedException; import edu.stanford.nlp.util.StringUtils; import edu.stanford.nlp.util.concurrent.MulticoreWrapper; import edu.stanford.nlp.util.concurrent.ThreadsafeProcessor; import edu.stanford.nlp.util.logging.Redwood; import edu.stanford.nlp.util.logging.RedwoodConfiguration; -public class HybridCorefSystem implements CorefAlgorithm { +public class HybridCorefSystem { /** A logger for this class */ private static Redwood.RedwoodChannels log = Redwood.channels(HybridCorefSystem.class); @@ -42,21 +38,7 @@ public class HybridCorefSystem implements CorefAlgorithm { public Properties props; public List sieves; public Dictionaries dictionaries; - public DocumentMaker docMaker = null; - - public HybridCorefSystem(Properties props, Dictionaries dictionaries) throws Exception { - this.props = props; - this.dictionaries = dictionaries; - sieves = Sieve.loadSieves(props); - - // set semantics loading - for(Sieve sieve : sieves) { - if(sieve.classifierType == ClassifierType.RULE) continue; - if(HybridCorefProperties.useWordEmbedding(props, sieve.sievename)) { - props.setProperty(HybridCorefProperties.LOAD_WORD_EMBEDDING_PROP, "true"); - } - } - } + public CorefDocMaker docMaker = null; public HybridCorefSystem(Properties props) throws Exception { this.props = props; @@ -65,38 +47,37 @@ public HybridCorefSystem(Properties props) throws Exception { // set semantics loading for(Sieve sieve : sieves) { if(sieve.classifierType == ClassifierType.RULE) continue; - if(HybridCorefProperties.useWordEmbedding(props, sieve.sievename)) { - props.setProperty(HybridCorefProperties.LOAD_WORD_EMBEDDING_PROP, "true"); + if(CorefProperties.useWordEmbedding(props, 
sieve.sievename)) { + props.setProperty(CorefProperties.LOAD_WORD_EMBEDDING_PROP, "true"); } } dictionaries = new Dictionaries(props); - docMaker = new DocumentMaker(props, dictionaries); + docMaker = new CorefDocMaker(props, dictionaries); } public Dictionaries dictionaries() { return dictionaries; } public static void runCoref(String[] args) throws Exception { - runCoref(StringUtils.argsToProperties(args)); - } + Redwood.hideChannelsEverywhere( + "debug-cluster", "debug-mention", "debug-preprocessor", "debug-docreader", "debug-mergethres", + "debug-featureselection", "debug-md" + ); - public static void runCoref(Properties props) throws Exception { - /* - * property, environment setting + /* + property, environment setting */ - Redwood.hideChannelsEverywhere( - "debug-cluster", "debug-mention", "debug-preprocessor", "debug-docreader", "debug-mergethres", - "debug-featureselection", "debug-md" - ); - int nThreads = HybridCorefProperties.getThreadCounts(props); + Properties props = StringUtils.argsToProperties(args); + int nThreads = CorefProperties.getThreadCounts(props); String timeStamp = Calendar.getInstance().getTime().toString().replaceAll("\\s", "-").replaceAll(":", "-"); + props.put(CorefProperties.PATH_INPUT_PROP, CorefProperties.getPathEvalData(props)); Logger logger = Logger.getLogger(HybridCorefSystem.class.getName()); // set log file path - if(props.containsKey(HybridCorefProperties.LOG_PROP)){ - File logFile = new File(props.getProperty(HybridCorefProperties.LOG_PROP)); + if(props.containsKey(CorefProperties.LOG_PROP)){ + File logFile = new File(props.getProperty(CorefProperties.LOG_PROP)); RedwoodConfiguration.current().handlers( RedwoodConfiguration.Handlers.file(logFile)).apply(); Redwood.log("Starting coref log"); @@ -104,7 +85,7 @@ public static void runCoref(Properties props) throws Exception { log.info(props.toString()); - if(HybridCorefProperties.checkMemory(props)) checkMemoryUsage(); + if(CorefProperties.checkMemory(props)) checkMemoryUsage(); HybridCorefSystem cs = new HybridCorefSystem(props); @@ -118,8 +99,8 @@ public static void runCoref(Properties props) throws Exception { PrintWriter writerGold = null; PrintWriter writerBeforeCoref = null; PrintWriter writerAfterCoref = null; - if (HybridCorefProperties.doScore(props)) { - String pathOutput = CorefProperties.conllOutputPath(props); + if (CorefProperties.doScore(props)) { + String pathOutput = CorefProperties.getPathOutput(props); (new File(pathOutput)).mkdir(); goldOutput = pathOutput + "output-" + timeStamp + ".gold.txt"; beforeCorefOutput = pathOutput + "output-" + timeStamp + ".predicted.txt"; @@ -156,7 +137,7 @@ public ThreadsafeProcessor, StringBuilder[]> n }); Date startTime = null; - if(HybridCorefProperties.checkTime(props)) { + if(CorefProperties.checkTime(props)) { startTime = new Date(); System.err.printf("END-TO-END COREF Start time: %s\n", startTime); } @@ -177,20 +158,20 @@ public ThreadsafeProcessor, StringBuilder[]> n IOUtils.closeIgnoringExceptions(writerBeforeCoref); IOUtils.closeIgnoringExceptions(writerAfterCoref); - if(HybridCorefProperties.checkTime(props)) { + if(CorefProperties.checkTime(props)) { System.err.printf("END-TO-END COREF Elapsed time: %.3f seconds\n", (((new Date()).getTime() - startTime.getTime()) / 1000F)); // System.err.printf("CORENLP PROCESS TIME TOTAL: %.3f seconds\n", cs.mentionExtractor.corenlpProcessTime); } - if(HybridCorefProperties.checkMemory(props)) checkMemoryUsage(); + if(CorefProperties.checkMemory(props)) checkMemoryUsage(); // scoring - if 
(HybridCorefProperties.doScore(props)) { - String summary = CorefScorer.getEvalSummary(CorefProperties.getScorerPath(props), goldOutput, beforeCorefOutput); - CorefScorer.printScoreSummary(summary, logger, false); + if (CorefProperties.doScore(props)) { + String summary = Scorer.getEvalSummary(CorefProperties.getPathScorer(props), goldOutput, beforeCorefOutput); + OldCorefPrinter.printScoreSummary(summary, logger, false); - summary = CorefScorer.getEvalSummary(CorefProperties.getScorerPath(props), goldOutput, afterCorefOutput); - CorefScorer.printScoreSummary(summary, logger, true); - CorefScorer.printFinalConllScore(summary); + summary = Scorer.getEvalSummary(CorefProperties.getPathScorer(props), goldOutput, afterCorefOutput); + OldCorefPrinter.printScoreSummary(summary, logger, true); + OldCorefPrinter.printFinalConllScore(summary); } } @@ -215,15 +196,6 @@ private static int logOutput(MulticoreWrapper, return docCnt; } - @Override - public void runCoref(Document document) { - try { - coref(document); - } catch (Exception e) { - throw new RuntimeException("Error running hybrid coref system", e); - } - } - /** * main entry of coreference system. * @@ -233,26 +205,28 @@ public void runCoref(Document document) { * @throws Exception */ public Map coref(Document document, StringBuilder[] output) throws Exception { - if(HybridCorefProperties.printMDLog(props)) { - Redwood.log(HybridCorefPrinter.printMentionDetectionLog(document)); + + if(CorefProperties.printMDLog(props)) { + Redwood.log(OldCorefPrinter.printMentionDetectionLog(document)); } - if(HybridCorefProperties.doScore(props)) { + if(CorefProperties.doScore(props)) { output[0] = (new StringBuilder()).append(CorefPrinter.printConllOutput(document, true)); // gold output[1] = (new StringBuilder()).append(CorefPrinter.printConllOutput(document, false)); // before coref } output[3] = new StringBuilder(); // log from sieves for(Sieve sieve : sieves){ - CorefUtils.checkForInterrupt(); + if (Thread.interrupted()) { // Allow interrupting + throw new RuntimeInterruptedException(); + } output[3].append(sieve.resolveMention(document, dictionaries, props)); } // post processing - if(HybridCorefProperties.doPostProcessing(props)) postProcessing(document); - - if(HybridCorefProperties.doScore(props)) { + if(CorefProperties.doPostProcessing(props)) postProcessing(document); + if(CorefProperties.doScore(props)) { output[2] = (new StringBuilder()).append(CorefPrinter.printConllOutput(document, false, true)); // after coref } @@ -298,7 +272,7 @@ private static void postProcessing(Document document) { for(CorefCluster c : document.corefClusters.values()){ Set removeMentions = Generics.newHashSet(); for(Mention m : c.getCorefMentions()) { - if(HybridCorefProperties.REMOVE_APPOSITION_PREDICATENOMINATIVES + if(CorefProperties.REMOVE_APPOSITION_PREDICATENOMINATIVES && ((m.appositions!=null && m.appositions.size() > 0) || (m.predicateNominatives!=null && m.predicateNominatives.size() > 0) || (m.relativePronouns!=null && m.relativePronouns.size() > 0))){ @@ -309,7 +283,7 @@ private static void postProcessing(Document document) { } c.corefMentions.removeAll(removeMentions); - if(HybridCorefProperties.REMOVE_SINGLETONS && c.getCorefMentions().size()==1) { + if(CorefProperties.REMOVE_SINGLETONS && c.getCorefMentions().size()==1) { removeClusterSet.add(c.clusterID); } } @@ -327,7 +301,23 @@ private static void checkMemoryUsage() { long memory = runtime.totalMemory() - runtime.freeMemory(); log.info("USED MEMORY (bytes): " + memory); } - + /** Remove singleton 
clusters */ + public static List> filterMentionsWithSingletonClusters(Document document, List> mentions) + { + + List> res = new ArrayList<>(mentions.size()); + for (List ml:mentions) { + List filtered = new ArrayList<>(); + for (Mention m:ml) { + CorefCluster cluster = document.corefClusters.get(m.corefClusterID); + if (cluster != null && cluster.getCorefMentions().size() > 1) { + filtered.add(m); + } + } + res.add(filtered); + } + return res; + } public static void main(String[] args) throws Exception { Date startTime = new Date(); System.err.printf("Start time: %s\n", startTime); diff --git a/src/edu/stanford/nlp/coref/md/CorefMentionFinder.java b/src/edu/stanford/nlp/coref/md/CorefMentionFinder.java index bc08960178..50b53eb9ec 100644 --- a/src/edu/stanford/nlp/coref/md/CorefMentionFinder.java +++ b/src/edu/stanford/nlp/coref/md/CorefMentionFinder.java @@ -12,7 +12,6 @@ import edu.stanford.nlp.coref.data.Dictionaries; import edu.stanford.nlp.coref.data.Mention; - import edu.stanford.nlp.ling.CoreAnnotations; import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation; import edu.stanford.nlp.ling.CoreLabel; diff --git a/src/edu/stanford/nlp/coref/md/DependencyCorefMentionFinder.java b/src/edu/stanford/nlp/coref/md/DependencyCorefMentionFinder.java index 7c4d5dcd81..3cb962d7a0 100644 --- a/src/edu/stanford/nlp/coref/md/DependencyCorefMentionFinder.java +++ b/src/edu/stanford/nlp/coref/md/DependencyCorefMentionFinder.java @@ -1,4 +1,6 @@ package edu.stanford.nlp.coref.md; +import edu.stanford.nlp.util.logging.Redwood; + import java.io.IOException; import java.util.ArrayList; import java.util.List; @@ -20,14 +22,13 @@ import edu.stanford.nlp.semgraph.SemanticGraphEdge; import edu.stanford.nlp.semgraph.SemanticGraphUtils; import edu.stanford.nlp.trees.GrammaticalRelation; +import edu.stanford.nlp.trees.UniversalEnglishGrammaticalRelations; import edu.stanford.nlp.trees.Tree; import edu.stanford.nlp.trees.TreeCoreAnnotations.TreeAnnotation; -import edu.stanford.nlp.trees.UniversalEnglishGrammaticalRelations; import edu.stanford.nlp.util.CoreMap; import edu.stanford.nlp.util.Generics; import edu.stanford.nlp.util.IntPair; import edu.stanford.nlp.util.Pair; -import edu.stanford.nlp.util.logging.Redwood; public class DependencyCorefMentionFinder extends CorefMentionFinder { @@ -41,7 +42,7 @@ public DependencyCorefMentionFinder(Properties props) throws ClassNotFoundExcept } public MentionDetectionClassifier mdClassifier = null; - + /** Main method of mention detection. * Extract all NP, PRP or NE, and filter out by manually written patterns. 
*/ @@ -51,7 +52,7 @@ public List> findMentions(Annotation doc, Dictionaries dict, Prope Set neStrings = Generics.newHashSet(); List> mentionSpanSetList = Generics.newArrayList(); List sentences = doc.get(CoreAnnotations.SentencesAnnotation.class); - + for (CoreMap s : sentences) { List mentions = new ArrayList<>(); predictedMentions.add(mentions); @@ -61,12 +62,12 @@ public List> findMentions(Annotation doc, Dictionaries dict, Prope extractPremarkedEntityMentions(s, mentions, mentionSpanSet, namedEntitySpanSet); HybridCorefMentionFinder.extractNamedEntityMentions(s, mentions, mentionSpanSet, namedEntitySpanSet); extractNPorPRPFromDependency(s, mentions, mentionSpanSet, namedEntitySpanSet); - + addNamedEntityStrings(s, neStrings, namedEntitySpanSet); mentionSpanSetList.add(mentionSpanSet); } // extractNamedEntityModifiers(sentences, mentionSpanSetList, predictedMentions, neStrings); - + for(int i=0 ; i mentions) { private void extractNPorPRPFromDependency(CoreMap s, List mentions, Set mentionSpanSet, Set namedEntitySpanSet) { List sent = s.get(CoreAnnotations.TokensAnnotation.class); - + SemanticGraph basic = s.get(BasicDependenciesAnnotation.class); - + List nounsOrPrp = basic.getAllNodesByPartOfSpeechPattern("N.*|PRP.*|DT"); // DT is for "this, these, etc" Tree tree = s.get(TreeAnnotation.class); - + for(IndexedWord w : nounsOrPrp) { SemanticGraphEdge edge = basic.getEdge(basic.getParent(w), w); GrammaticalRelation rel = null; @@ -115,7 +116,7 @@ private void extractNPorPRPFromDependency(CoreMap s, List mentions, Set // TODO: what to remove? remove more? if(shortname.matches("det|compound")) { - + // // for debug --------------- // Tree t = tree.getLeaves().get(w.index()-1); // for(Tree p : tree.pathNodeToNode(t, tree)) { @@ -128,14 +129,14 @@ private void extractNPorPRPFromDependency(CoreMap s, List mentions, Set // break; // } // } // for debug ------------- - + continue; } else { extractMentionForHeadword(w, basic, s, mentions, mentionSpanSet, namedEntitySpanSet); } } } - + private void extractMentionForHeadword(IndexedWord headword, SemanticGraph dep, CoreMap s, List mentions, Set mentionSpanSet, Set namedEntitySpanSet) { List sent = s.get(CoreAnnotations.TokensAnnotation.class); SemanticGraph basic = s.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class); @@ -149,19 +150,19 @@ private void extractMentionForHeadword(IndexedWord headword, SemanticGraph dep, extractPronounForHeadword(headword, dep, s, mentions, mentionSpanSet, namedEntitySpanSet); return; } - + // add NP mention - IntPair npSpan = getNPSpan(headword, dep, sent); + IntPair npSpan = getNPSpan(headword, dep, sent); int beginIdx = npSpan.get(0); int endIdx = npSpan.get(1)+1; if (",".equals(sent.get(endIdx-1).word())) { endIdx--; } // try not to have span that ends with , if ("IN".equals(sent.get(beginIdx).tag())) { beginIdx++; } // try to remove first IN. 
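// e.g., for the tokens [in the red car ,] with head "car", the span is
// trimmed to "the red car": the trailing comma is dropped and the leading
// preposition (IN) is skipped before the mention is registered below.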
addMention(beginIdx, endIdx, headword, mentions, mentionSpanSet, namedEntitySpanSet, sent, basic, enhanced); - + // // extract the first element in conjunction (A and B -> extract A here "A and B", "B" will be extracted above) // - + // to make sure we find the first conjunction Set conjChildren = dep.getChildrenWithReln(headword, UniversalEnglishGrammaticalRelations.CONJUNCT); if(conjChildren.size() > 0) { @@ -187,17 +188,17 @@ private void extractMentionForHeadword(IndexedWord headword, SemanticGraph dep, */ private IntPair getNPSpan(IndexedWord headword, SemanticGraph dep, List sent) { int headwordIdx = headword.index()-1; - + List children = dep.getChildList(headword); // if(children.size()==0) return new IntPair(headwordIdx, headwordIdx); // the headword is the only word - + // check if we have copula relation IndexedWord cop = dep.getChildWithReln(headword, UniversalEnglishGrammaticalRelations.COPULA); int startIdx = (cop==null)? 0 : children.indexOf(cop)+1; - + // children which will be inside of NP List insideNP = Generics.newArrayList(); - + for(int i=startIdx ; i < children.size() ; i++) { IndexedWord child = children.get(i); SemanticGraphEdge edge = dep.getEdge(headword, child); @@ -207,40 +208,40 @@ private IntPair getNPSpan(IndexedWord headword, SemanticGraph dep, List firstChildLeftRight = SemanticGraphUtils.leftRightMostChildVertices(insideNP.get(0), dep); Pair lastChildLeftRight = SemanticGraphUtils.leftRightMostChildVertices(insideNP.get(insideNP.size()-1), dep); - - // headword can be first or last word + + // headword can be first or last word int beginIdx = Math.min(headwordIdx, firstChildLeftRight.first.index()-1); int endIdx = Math.max(headwordIdx, lastChildLeftRight.second.index()-1); - + return new IntPair(beginIdx, endIdx); } - + private IntPair getNPSpanOld(IndexedWord headword, SemanticGraph dep, List sent) { IndexedWord cop = dep.getChildWithReln(headword, UniversalEnglishGrammaticalRelations.COPULA); Pair leftRight = SemanticGraphUtils.leftRightMostChildVertices(headword, dep); - - // headword can be first or last word + + // headword can be first or last word int beginIdx = Math.min(headword.index()-1, leftRight.first.index()-1); int endIdx = Math.max(headword.index()-1, leftRight.second.index()-1); - + // no copula relation if(cop==null) return new IntPair(beginIdx, endIdx); - + // if we have copula relation List children = dep.getChildList(headword); int copIdx = children.indexOf(cop); - + if(copIdx+1 < children.size()) { beginIdx = Math.min(headword.index()-1, SemanticGraphUtils.leftMostChildVertice(children.get(copIdx+1), dep).index()-1); } else { beginIdx = headword.index()-1; } - + return new IntPair(beginIdx, endIdx); } @@ -266,14 +267,14 @@ private void extractPronounForHeadword(IndexedWord headword, SemanticGraph dep, } int beginIdx = headword.index()-1; int endIdx = headword.index(); - + // handle "you all", "they both" etc if(sent.size() > headword.index() && sent.get(headword.index()).word().matches("all|both")) { IndexedWord c = dep.getNodeByIndex(headword.index()+1); SemanticGraphEdge edge = dep.getEdge(headword, c); if(edge!=null) endIdx++; } - + IntPair mSpan = new IntPair(beginIdx, endIdx); if(!mentionSpanSet.contains(mSpan) && (!insideNE(mSpan, namedEntitySpanSet)) ) { int dummyMentionId = -1; @@ -284,7 +285,7 @@ private void extractPronounForHeadword(IndexedWord headword, SemanticGraph dep, mentions.add(m); mentionSpanSet.add(mSpan); } - + // when pronoun is a part of conjunction (e.g., you and I) Set conjChildren = 
dep.getChildrenWithReln(headword, UniversalEnglishGrammaticalRelations.CONJUNCT); if(conjChildren.size() > 0) { @@ -305,20 +306,20 @@ public void findHead(CoreMap s, List mentions) { findHeadInDependency(s, m); } } - + // TODO: still errors in head finder public static void findHeadInDependency(CoreMap s, Mention m) { List sent = s.get(CoreAnnotations.TokensAnnotation.class); SemanticGraph basicDep = s.get(BasicDependenciesAnnotation.class); if(m.headWord == null) { - + // when there's punctuation, no node found in the dependency tree int curIdx; IndexedWord cur = null; for(curIdx = m.endIndex-1 ; curIdx >= m.startIndex ; curIdx--) { - if((cur = basicDep.getNodeByIndexSafe(curIdx+1)) != null) break; + if((cur = basicDep.getNodeByIndexSafe(curIdx+1)) != null) break; } - + if(cur==null) curIdx = m.endIndex-1; while(cur!=null) { IndexedWord p = basicDep.getParent(cur); @@ -338,59 +339,59 @@ public static void findHeadInDependency(CoreMap s, Mention m) { } } -// /** Filter out all spurious mentions +// /** Filter out all spurious mentions // * @param goldMentionsByID */ // @Override // public void removeSpuriousMentionsEn(Annotation doc, List> predictedMentions, Dictionaries dict) { -// +// // Set standAlones = new HashSet(); // List sentences = doc.get(CoreAnnotations.SentencesAnnotation.class); -// +// // for(int i=0 ; i < predictedMentions.size() ; i++) { // CoreMap s = sentences.get(i); // List mentions = predictedMentions.get(i); -// +// // Tree tree = s.get(TreeCoreAnnotations.TreeAnnotation.class); // List sent = s.get(CoreAnnotations.TokensAnnotation.class); // Set remove = Generics.newHashSet(); -// +// // for(Mention m : mentions){ // String headPOS = m.headWord.get(CoreAnnotations.PartOfSpeechAnnotation.class); // String headNE = m.headWord.get(CoreAnnotations.NamedEntityTagAnnotation.class); // // // non word such as 'hmm' // if(dict.nonWords.contains(m.headString)) remove.add(m); -// +// // // quantRule : not starts with 'any', 'all' etc -// if (m.originalSpan.size() > 0) { +// if (m.originalSpan.size() > 0) { // String firstWord = m.originalSpan.get(0).get(CoreAnnotations.TextAnnotation.class).toLowerCase(Locale.ENGLISH); // if(firstWord.matches("none|no|nothing|not")) { // remove.add(m); // } //// if(dict.quantifiers.contains(firstWord)) remove.add(m); // } -// +// // // partitiveRule // if (partitiveRule(m, sent, dict)) { // remove.add(m); // } -// +// // // bareNPRule // if (headPOS.equals("NN") && !dict.temporals.contains(m.headString) // && (m.originalSpan.size()==1 || m.originalSpan.get(0).get(CoreAnnotations.PartOfSpeechAnnotation.class).equals("JJ"))) { // remove.add(m); // } -// +// // // remove generic rule //// if(m.generic==true) remove.add(m); -// +// // if (m.headString.equals("%")) { // remove.add(m); // } // if (headNE.equals("PERCENT") || headNE.equals("MONEY")) { // remove.add(m); // } -// +// // // adjective form of nations // // the [American] policy -> not mention // // speak in [Japanese] -> mention @@ -398,11 +399,11 @@ public static void findHeadInDependency(CoreMap s, Mention m) { // if (dict.isAdjectivalDemonym(m.spanToString())) { // remove.add(m); // } -// +// // // stop list (e.g., U.S., there) // if (inStopList(m)) remove.add(m); // } -// +// // // nested mention with shared headword (except apposition, enumeration): pick larger one // for (Mention m1 : mentions){ // for (Mention m2 : mentions){ @@ -422,33 +423,33 @@ public static void findHeadInDependency(CoreMap s, Mention m) { /** Filter out all spurious mentions */ @Override public void 
removeSpuriousMentionsEn(Annotation doc, List> predictedMentions, Dictionaries dict) { - + List sentences = doc.get(CoreAnnotations.SentencesAnnotation.class); - + for(int i=0 ; i < predictedMentions.size() ; i++) { CoreMap s = sentences.get(i); List mentions = predictedMentions.get(i); - + List sent = s.get(CoreAnnotations.TokensAnnotation.class); Set remove = Generics.newHashSet(); - + for(Mention m : mentions){ String headPOS = m.headWord.get(CoreAnnotations.PartOfSpeechAnnotation.class); - + // non word such as 'hmm' if(dict.nonWords.contains(m.headString)) remove.add(m); - + // adjective form of nations // the [American] policy -> not mention // speak in [Japanese] -> mention // check if the mention is noun and the next word is not noun if (dict.isAdjectivalDemonym(m.spanToString())) { - if(!headPOS.startsWith("N") + if(!headPOS.startsWith("N") || (m.endIndex < sent.size() && sent.get(m.endIndex).tag().startsWith("N")) ) { remove.add(m); } } - + // stop list (e.g., U.S., there) if (inStopList(m)) remove.add(m); } diff --git a/src/edu/stanford/nlp/coref/md/MentionDetectionClassifier.java b/src/edu/stanford/nlp/coref/md/MentionDetectionClassifier.java index 4175aa770b..1cf38eeb53 100644 --- a/src/edu/stanford/nlp/coref/md/MentionDetectionClassifier.java +++ b/src/edu/stanford/nlp/coref/md/MentionDetectionClassifier.java @@ -8,10 +8,9 @@ import java.util.Properties; import java.util.Set; -import edu.stanford.nlp.coref.hybrid.rf.RandomForest; - import edu.stanford.nlp.coref.data.Dictionaries; import edu.stanford.nlp.coref.data.Mention; +import edu.stanford.nlp.coref.rf.RandomForest; import edu.stanford.nlp.io.IOUtils; import edu.stanford.nlp.ling.CoreLabel; import edu.stanford.nlp.ling.RVFDatum; @@ -20,7 +19,7 @@ import edu.stanford.nlp.stats.Counters; import edu.stanford.nlp.util.Generics; -public class MentionDetectionClassifier implements Serializable { +public class MentionDetectionClassifier implements Serializable { /** A logger for this class */ private static Redwood.RedwoodChannels log = Redwood.channels(MentionDetectionClassifier.class); @@ -28,14 +27,14 @@ public class MentionDetectionClassifier implements Serializable { private static final long serialVersionUID = -4100580709477023158L; public RandomForest rf; - + public MentionDetectionClassifier(RandomForest rf) { this.rf = rf; } public static Counter extractFeatures(Mention p, Set shares, Set neStrings, Dictionaries dict, Properties props) { Counter features = new ClassicCounter<>(); - + String span = p.lowercaseNormalizedSpanString(); String ner = p.headWord.ner(); int sIdx = p.startIndex; @@ -45,8 +44,8 @@ public static Counter extractFeatures(Mention p, Set shares, Se CoreLabel nextWord = (eIdx == sent.size())? 
null : sent.get(eIdx); CoreLabel firstWord = p.originalSpan.get(0); CoreLabel lastWord = p.originalSpan.get(p.originalSpan.size()-1); - - + + features.incrementCount("B-NETYPE-"+ner); if(neStrings.contains(span)) { features.incrementCount("B-NE-STRING-EXIST"); @@ -56,16 +55,16 @@ public static Counter extractFeatures(Mention p, Set shares, Se } if(preWord!=null) features.incrementCount("B-PRECEDINGWORD-"+preWord.word()); if(nextWord!=null) features.incrementCount("B-FOLLOWINGWORD-"+nextWord.word()); - + if(preWord!=null) features.incrementCount("B-PRECEDINGPOS-"+preWord.tag()); if(nextWord!=null) features.incrementCount("B-FOLLOWINGPOS-"+nextWord.tag()); - + features.incrementCount("B-FIRSTWORD-"+firstWord.word()); features.incrementCount("B-FIRSTPOS-"+firstWord.tag()); - + features.incrementCount("B-LASTWORD-"+lastWord.word()); features.incrementCount("B-LASTWORD-"+lastWord.tag()); - + for(Mention s : shares) { if(s==p) continue; if(s.insideIn(p)) { @@ -80,17 +79,17 @@ public static Counter extractFeatures(Mention p, Set shares, Se break; } } - + return features; } - + public static MentionDetectionClassifier loadMentionDetectionClassifier(String filename) throws ClassNotFoundException, IOException { log.info("loading MentionDetectionClassifier ..."); MentionDetectionClassifier mdc = IOUtils.readObjectFromURLOrClasspathOrFileSystem(filename); log.info("done"); return mdc; } - + public double probabilityOf(Mention p, Set shares, Set neStrings, Dictionaries dict, Properties props) { try { boolean dummyLabel = false; @@ -142,5 +141,5 @@ public void classifyMentions(List> predictedMentions, Dictionaries } } } - + } diff --git a/src/edu/stanford/nlp/coref/md/RuleBasedCorefMentionFinder.java b/src/edu/stanford/nlp/coref/md/RuleBasedCorefMentionFinder.java index d6ba93ae37..3f9fc4031e 100644 --- a/src/edu/stanford/nlp/coref/md/RuleBasedCorefMentionFinder.java +++ b/src/edu/stanford/nlp/coref/md/RuleBasedCorefMentionFinder.java @@ -1,4 +1,3 @@ - package edu.stanford.nlp.coref.md; import java.util.ArrayList; @@ -28,7 +27,7 @@ public class RuleBasedCorefMentionFinder extends CorefMentionFinder { public RuleBasedCorefMentionFinder(HeadFinder headFinder, Properties props) { - this(true, headFinder, CorefProperties.getLanguage(props)); + this(CorefProperties.allowReparsing(props), headFinder, CorefProperties.getLanguage(props)); } public RuleBasedCorefMentionFinder(boolean allowReparsing, HeadFinder headFinder, Locale lang) { @@ -64,7 +63,7 @@ public List> filterPredictedMentions(List> allGoldMe removeSpuriousMentions(doc, predictedMentions, dict, CorefProperties.removeNestedMentions(props), lang); return predictedMentions; } - + /** Main method of mention detection. * Extract all NP, PRP or NE, and filter out by manually written patterns. 
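* Candidates come from premarked entities, named entities, NP/PRP subtrees of
* the constituency parse, and enumerations; heads are then assigned and
* spurious mentions are removed by the language-specific rules below.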
*/ @@ -74,7 +73,7 @@ public List> findMentions(Annotation doc, Dictionaries dict, Prope Set neStrings = Generics.newHashSet(); List> mentionSpanSetList = Generics.newArrayList(); List sentences = doc.get(CoreAnnotations.SentencesAnnotation.class); - + // extract premarked mentions, NP/PRP, named entity, enumerations for (CoreMap s : sentences) { List mentions = new ArrayList<>(); @@ -86,33 +85,22 @@ public List> findMentions(Annotation doc, Dictionaries dict, Prope extractNamedEntityMentions(s, mentions, mentionSpanSet, namedEntitySpanSet); extractNPorPRP(s, mentions, mentionSpanSet, namedEntitySpanSet); extractEnumerations(s, mentions, mentionSpanSet, namedEntitySpanSet); - + addNamedEntityStrings(s, neStrings, namedEntitySpanSet); mentionSpanSetList.add(mentionSpanSet); } - - if (lang == Locale.CHINESE && CorefProperties.liberalChineseMD(props)) { - extractNamedEntityModifiers(sentences, mentionSpanSetList, predictedMentions, neStrings); - } - + // find head for (int i=0, sz = sentences.size(); i < sz; i++) { findHead(sentences.get(i), predictedMentions.get(i)); setBarePlural(predictedMentions.get(i)); } - + // mention selection based on document-wise info - if (lang == Locale.ENGLISH) { - removeSpuriousMentionsEn(doc, predictedMentions, dict); - } else if (lang == Locale.CHINESE) { - if (CorefProperties.liberalChineseMD(props)) { - removeSpuriousMentionsZhSimple(doc, predictedMentions, dict, - CorefProperties.removeNestedMentions(props)); - } else { - removeSpuriousMentionsZh(doc, predictedMentions, dict, - CorefProperties.removeNestedMentions(props)); - } - } + removeSpuriousMentions(doc, predictedMentions, dict, CorefProperties.removeNestedMentions(props), lang); + +// // assign mention IDs +// if(assignIds) assignMentionIDs(predictedMentions, maxID); return predictedMentions; } @@ -141,7 +129,7 @@ public void extractNPorPRP(CoreMap s, List mentions, Set menti List mLeaves = t.getLeaves(); int beginIdx = ((CoreLabel)mLeaves.get(0).label()).get(CoreAnnotations.IndexAnnotation.class)-1; int endIdx = ((CoreLabel)mLeaves.get(mLeaves.size()-1).label()).get(CoreAnnotations.IndexAnnotation.class); - //if (",".equals(sent.get(endIdx-1).word())) { endIdx--; } // try not to have span that ends with , + if (",".equals(sent.get(endIdx-1).word())) { endIdx--; } // try not to have span that ends with , IntPair mSpan = new IntPair(beginIdx, endIdx); if(!mentionSpanSet.contains(mSpan) && ( lang==Locale.CHINESE || !insideNE(mSpan, namedEntitySpanSet)) ) { // if(!mentionSpanSet.contains(mSpan) && (!insideNE(mSpan, namedEntitySpanSet) || t.value().startsWith("PRP")) ) { @@ -149,11 +137,11 @@ public void extractNPorPRP(CoreMap s, List mentions, Set menti Mention m = new Mention(dummyMentionId, beginIdx, endIdx, sent, basicDependency, enhancedDependency, new ArrayList<>(sent.subList(beginIdx, endIdx)), t); mentions.add(m); mentionSpanSet.add(mSpan); - + // if(m.originalSpan.size() > 1) { // boolean isNE = true; // for(CoreLabel cl : m.originalSpan) { -// if(!cl.tag().startsWith("NNP")) isNE = false; +// if(!cl.tag().startsWith("NNP")) isNE = false; // } // if(isNE) { // namedEntitySpanSet.add(mSpan); @@ -208,39 +196,22 @@ protected static void extractNamedEntityMentions(CoreMap s, List mentio } } - private static void removeSpuriousMentionsZhSimple(Annotation doc, - List> predictedMentions, Dictionaries dict, boolean removeNested) { - for(int i=0 ; i < predictedMentions.size() ; i++) { - List mentions = predictedMentions.get(i); - Set remove = Generics.newHashSet(); - for(Mention m : mentions){ - if 
(m.originalSpan.size()==1 && m.headWord.tag().equals("CD")) { - remove.add(m); - } - if (m.spanToString().contains("quot")) { - remove.add(m); - } - } - mentions.removeAll(remove); - } - } - - /** Filter out all spurious mentions + /** Filter out all spurious mentions */ @Override public void removeSpuriousMentionsEn(Annotation doc, List> predictedMentions, Dictionaries dict) { - + Set standAlones = new HashSet<>(); List sentences = doc.get(CoreAnnotations.SentencesAnnotation.class); - + for(int i=0 ; i < predictedMentions.size() ; i++) { CoreMap s = sentences.get(i); List mentions = predictedMentions.get(i); - + Tree tree = s.get(TreeCoreAnnotations.TreeAnnotation.class); List sent = s.get(CoreAnnotations.TokensAnnotation.class); Set remove = Generics.newHashSet(); - + for(Mention m : mentions){ String headPOS = m.headWord.get(CoreAnnotations.PartOfSpeechAnnotation.class); String headNE = m.headWord.get(CoreAnnotations.NamedEntityTagAnnotation.class); @@ -248,40 +219,40 @@ public void removeSpuriousMentionsEn(Annotation doc, List> predict if(isPleonastic(m, tree)) { remove.add(m); } - + // non word such as 'hmm' if(dict.nonWords.contains(m.headString)) remove.add(m); - + // quantRule : not starts with 'any', 'all' etc - if (m.originalSpan.size() > 0) { + if (m.originalSpan.size() > 0) { String firstWord = m.originalSpan.get(0).get(CoreAnnotations.TextAnnotation.class).toLowerCase(Locale.ENGLISH); if(firstWord.matches("none|no|nothing|not")) { remove.add(m); } // if(dict.quantifiers.contains(firstWord)) remove.add(m); } - + // partitiveRule if (partitiveRule(m, sent, dict)) { remove.add(m); } - + // bareNPRule if (headPOS.equals("NN") && !dict.temporals.contains(m.headString) && (m.originalSpan.size()==1 || m.originalSpan.get(0).get(CoreAnnotations.PartOfSpeechAnnotation.class).equals("JJ"))) { remove.add(m); } - + // remove generic rule // if(m.generic==true) remove.add(m); - + if (m.headString.equals("%")) { remove.add(m); } if (headNE.equals("PERCENT") || headNE.equals("MONEY")) { remove.add(m); } - + // adjective form of nations // the [American] policy -> not mention // speak in [Japanese] -> mention @@ -289,11 +260,11 @@ public void removeSpuriousMentionsEn(Annotation doc, List> predict if (dict.isAdjectivalDemonym(m.spanToString())) { remove.add(m); } - + // stop list (e.g., U.S., there) if (inStopList(m)) remove.add(m); } - + // nested mention with shared headword (except apposition, enumeration): pick larger one for (Mention m1 : mentions){ for (Mention m2 : mentions){ diff --git a/src/edu/stanford/nlp/coref/misc/MentionDetectionEvaluator.java b/src/edu/stanford/nlp/coref/misc/MentionDetectionEvaluator.java deleted file mode 100644 index c4e0579126..0000000000 --- a/src/edu/stanford/nlp/coref/misc/MentionDetectionEvaluator.java +++ /dev/null @@ -1,48 +0,0 @@ -package edu.stanford.nlp.coref.misc; - -import java.util.Properties; - -import edu.stanford.nlp.coref.CorefDocumentProcessor; -import edu.stanford.nlp.coref.CorefProperties; -import edu.stanford.nlp.coref.CorefProperties.Dataset; -import edu.stanford.nlp.coref.data.CorefCluster; -import edu.stanford.nlp.coref.data.Dictionaries; -import edu.stanford.nlp.coref.data.Document; -import edu.stanford.nlp.coref.data.Mention; -import edu.stanford.nlp.util.StringUtils; -import edu.stanford.nlp.util.logging.Redwood; - -public class MentionDetectionEvaluator implements CorefDocumentProcessor { - private static Redwood.RedwoodChannels log = Redwood.channels(MentionDetectionEvaluator.class); - private int correctSystemMentions = 0; - 
private int systemMentions = 0; - private int goldMentions = 0; - - @Override - public void process(int id, Document document) { - for (CorefCluster gold : document.goldCorefClusters.values()) { - for (Mention m : gold.corefMentions) { - if (document.predictedMentionsByID.containsKey(m.mentionID)) { - correctSystemMentions += 1; - } - goldMentions += 1; - } - } - systemMentions += document.predictedMentionsByID.size(); - - log.info("Precision: " + correctSystemMentions + " / " + systemMentions + " = " + - String.format("%.4f", correctSystemMentions / (double) systemMentions)); - log.info("Recall: " + correctSystemMentions + " / " + goldMentions + " = " + - String.format("%.4f", correctSystemMentions / (double) goldMentions)); - } - - @Override - public void finish() throws Exception {} - - public static void main(String[] args) throws Exception { - Properties props = StringUtils.argsToProperties(new String[] {"-props", args[0]}); - Dictionaries dictionaries = new Dictionaries(props); - CorefProperties.setInput(props, Dataset.TRAIN); - new MentionDetectionEvaluator().run(props, dictionaries); - } -} diff --git a/src/edu/stanford/nlp/coref/neural/CategoricalFeatureExtractor.java b/src/edu/stanford/nlp/coref/neural/CategoricalFeatureExtractor.java index 7a6927d538..cb511a8883 100644 --- a/src/edu/stanford/nlp/coref/neural/CategoricalFeatureExtractor.java +++ b/src/edu/stanford/nlp/coref/neural/CategoricalFeatureExtractor.java @@ -43,9 +43,9 @@ public SimpleMatrix getPairFeatures(Pair pair, Document docume Mention m1 = document.predictedMentionsByID.get(pair.first); Mention m2 = document.predictedMentionsByID.get(pair.second); List featureVals = pairwiseFeatures(document, m1, m2, dictionaries, conll); - SimpleMatrix features = new SimpleMatrix(featureVals.size(), 1); + SimpleMatrix features = new SimpleMatrix(featureVals.size() + 1, 1); for (int i = 0; i < featureVals.size(); i++) { - features.set(i, featureVals.get(i)); + features.set(i + 1, featureVals.get(i)); } features = NeuralUtils.concatenate(features, encodeDistance(m2.sentNum - m1.sentNum), @@ -54,8 +54,11 @@ public SimpleMatrix getPairFeatures(Pair pair, Document docume m1.sentNum == m2.sentNum && m1.endIndex > m2.startIndex ? 1 : 0}}), getMentionFeatures(m1, document, mentionsByHeadIndex), getMentionFeatures(m2, document, mentionsByHeadIndex), - encodeGenre(document)); + featurizeGenre(document)); + // replicating a bug in the python code + features.set(0, 0, features.get(features.numRows() - 1, 0)); + features = features.extractMatrix(0, features.numRows() - 1, 0, 1); return features; } @@ -81,7 +84,7 @@ public SimpleMatrix getAnaphoricityFeatures(Mention m, Document document, Map> mentionsByHeadIndex) { return NeuralUtils.concatenate( getMentionFeatures(m, document, mentionsByHeadIndex), - encodeGenre(document) + featurizeGenre(document) ); } @@ -116,9 +119,8 @@ private static SimpleMatrix encodeDistance(int d) { return m; } - private SimpleMatrix encodeGenre(Document document) { - return conll ? NeuralUtils.oneHot( - genres.get(document.docInfo.get("DOC_ID").split("/")[0]), genres.size()) : - new SimpleMatrix(1, 1); + private SimpleMatrix featurizeGenre(Document document) { + return NeuralUtils.oneHot( + conll ? 
genres.get(document.docInfo.get("DOC_ID").split("/")[0]) : 3, genres.size()); } } diff --git a/src/edu/stanford/nlp/coref/neural/EmbeddingExtractor.java b/src/edu/stanford/nlp/coref/neural/EmbeddingExtractor.java index 7e9e3729f2..8dfa0279b7 100644 --- a/src/edu/stanford/nlp/coref/neural/EmbeddingExtractor.java +++ b/src/edu/stanford/nlp/coref/neural/EmbeddingExtractor.java @@ -28,9 +28,9 @@ public EmbeddingExtractor(boolean conll, Embedding staticWordEmbeddings, } public SimpleMatrix getDocumentEmbedding(Document document) { - if (!conll) { + /*if (!conll) { return new SimpleMatrix(staticWordEmbeddings.getEmbeddingSize(), 1); - } + }*/ List words = new ArrayList<>(); Set seenSentences = new HashSet<>(); for (Mention m : document.predictedMentionsByID.values()) { @@ -51,7 +51,7 @@ public SimpleMatrix getMentionEmbeddings(Mention m, SimpleMatrix docEmbedding) { getAverageEmbedding(m.sentenceWords, m.startIndex, m.endIndex), getAverageEmbedding(m.sentenceWords, m.startIndex - 5, m.startIndex), getAverageEmbedding(m.sentenceWords, m.endIndex, m.endIndex + 5), - getAverageEmbedding(m.sentenceWords.subList(0, m.sentenceWords.size() - 1)), + getAverageEmbedding(m.sentenceWords.subList(0, m.sentenceWords.size() - 1)), // -1 is a bug in original python docEmbedding, getWordEmbedding(m.sentenceWords, m.headIndex), getWordEmbedding(m.sentenceWords, m.startIndex), diff --git a/src/edu/stanford/nlp/coref/neural/NeuralCorefDataExporter.java b/src/edu/stanford/nlp/coref/neural/NeuralCorefDataExporter.java index 7cfaf40662..15d43a2b3c 100644 --- a/src/edu/stanford/nlp/coref/neural/NeuralCorefDataExporter.java +++ b/src/edu/stanford/nlp/coref/neural/NeuralCorefDataExporter.java @@ -129,8 +129,7 @@ public void process(int id, Document document) { String key = m1.mentionNum + " " + m2.mentionNum; JsonArrayBuilder builder = Json.createArrayBuilder(); - for (int val : CategoricalFeatureExtractor.pairwiseFeatures( - document, m1, m2, dictionaries, conll)) { + for (int val : CategoricalFeatureExtractor.pairwiseFeatures(document, m1, m2, dictionaries, conll)) { builder.add(val); } features.add(key, builder.build()); @@ -158,7 +157,6 @@ public static void exportData(String outputPath, Dataset dataset, Properties pro CorefProperties.setInput(props, dataset); String dataPath = outputPath + "/data_raw/"; String goldClusterPath = outputPath + "/gold/"; - IOUtils.ensureDir(new File(outputPath)); IOUtils.ensureDir(new File(dataPath)); IOUtils.ensureDir(new File(goldClusterPath)); new NeuralCorefDataExporter(props, dictionaries, diff --git a/src/edu/stanford/nlp/coref/neural/NeuralCorefProperties.java b/src/edu/stanford/nlp/coref/neural/NeuralCorefProperties.java index 33716f0c5b..dd641bc264 100644 --- a/src/edu/stanford/nlp/coref/neural/NeuralCorefProperties.java +++ b/src/edu/stanford/nlp/coref/neural/NeuralCorefProperties.java @@ -14,7 +14,7 @@ public static double greedyness(Properties props) { public static String modelPath(Properties props) { String defaultPath = "edu/stanford/nlp/models/coref/neural/" + (CorefProperties.getLanguage(props) == Locale.CHINESE ? "chinese" : "english") + - (CorefProperties.conll(props) ? "-model-conll" : "-model-default") + ".ser.gz"; + (CorefProperties.conll(props) ? 
"-model-conll" : "-model-conll") + ".ser.gz"; return PropertiesUtils.getString(props, "coref.neural.modelPath", defaultPath); } diff --git a/src/edu/stanford/nlp/coref/neural/properties/chinese-default.properties b/src/edu/stanford/nlp/coref/neural/properties/chinese-default.properties index 0490ff60f9..5322df1847 100644 --- a/src/edu/stanford/nlp/coref/neural/properties/chinese-default.properties +++ b/src/edu/stanford/nlp/coref/neural/properties/chinese-default.properties @@ -25,4 +25,4 @@ ssplit.boundaryTokenRegex = [.]|[!?]+|[。]|[!?]+ pos.model = edu/stanford/nlp/models/pos-tagger/chinese-distsim/chinese-distsim.tagger #parse -parse.model = edu/stanford/nlp/models/srparser/chineseSR.ser.gz +parse.model = edu/stanford/nlp/models/lexparser/chinesePCFG.ser.gz diff --git a/src/edu/stanford/nlp/coref/hybrid/properties/coref-conll-dep.properties b/src/edu/stanford/nlp/coref/properties/coref-conll-dep.properties similarity index 97% rename from src/edu/stanford/nlp/coref/hybrid/properties/coref-conll-dep.properties rename to src/edu/stanford/nlp/coref/properties/coref-conll-dep.properties index bedc405118..217b1aa7d3 100644 --- a/src/edu/stanford/nlp/coref/hybrid/properties/coref-conll-dep.properties +++ b/src/edu/stanford/nlp/coref/properties/coref-conll-dep.properties @@ -2,29 +2,29 @@ # general coref.annotators = pos, lemma, ner, parse -coref.algorithm = hybrid #coref.sieves = MarkRole, DiscourseMatch, ExactStringMatch, RelaxedExactStringMatch, PreciseConstructs, StrictHeadMatch1, StrictHeadMatch2, StrictHeadMatch3, StrictHeadMatch4, RelaxedHeadMatch, PronounMatch coref.sieves = SpeakerMatch, PreciseConstructs, pp-rf, cc-rf, pc-rf, ll-rf, pr-rf +coref.input.type = conll coref.doScore = true coref.postprocessing = true coref.calculateFeatureImportance = false -coref.useConstituencyParse = false +coref.useConstituencyTree = false coref.useSemantics = true coref.loadWordEmbedding = true -coref.conll = true +coref.useMarkedDiscourse = true ############################################################################# # data & model path ## conll 2012 ## train -coref.path.trainData = /scr/nlp/data/conll-2012/v4/data/train/data/english/annotations/ +coref.path.traindata = /scr/nlp/data/conll-2012/v4/data/train/data/english/annotations/ #coref.path.traindata = /scr/nlp/data/conll-2012/v4/data/train+dev/ ## eval #coref.path.evaldata = /scr/nlp/data/conll-2012/v4/data/development/data/english/annotations/ -coref.path.testData = /scr/nlp/data/conll-2012/v9/data/test/data/english/annotations +coref.path.evaldata = /scr/nlp/data/conll-2012/v9/data/test/data/english/annotations ## models diff --git a/src/edu/stanford/nlp/coref/hybrid/properties/coref-conll.properties b/src/edu/stanford/nlp/coref/properties/coref-conll.properties similarity index 97% rename from src/edu/stanford/nlp/coref/hybrid/properties/coref-conll.properties rename to src/edu/stanford/nlp/coref/properties/coref-conll.properties index b734e5eb9d..e4a0c9129f 100644 --- a/src/edu/stanford/nlp/coref/hybrid/properties/coref-conll.properties +++ b/src/edu/stanford/nlp/coref/properties/coref-conll.properties @@ -2,29 +2,29 @@ # general coref.annotators = pos, lemma, ner, parse -coref.algorithm = hybrid #coref.sieves = MarkRole, DiscourseMatch, ExactStringMatch, RelaxedExactStringMatch, PreciseConstructs, StrictHeadMatch1, StrictHeadMatch2, StrictHeadMatch3, StrictHeadMatch4, RelaxedHeadMatch, PronounMatch coref.sieves = SpeakerMatch, PreciseConstructs, pp-rf, cc-rf, pc-rf, ll-rf, pr-rf +coref.input.type = conll coref.doScore = true 
coref.postprocessing = true coref.calculateFeatureImportance = false -coref.useConstituencyParse = true +coref.useConstituencyTree = true coref.useSemantics = true coref.loadWordEmbedding = true -coref.conll = true +coref.useMarkedDiscourse = true ############################################################################# # data & model path ## conll 2012 ## train -coref.path.trainData = /scr/nlp/data/conll-2012/v4/data/train/data/english/annotations/ +coref.path.traindata = /scr/nlp/data/conll-2012/v4/data/train/data/english/annotations/ #coref.path.traindata = /scr/nlp/data/conll-2012/v4/data/train+dev/ ## eval #coref.path.evaldata = /scr/nlp/data/conll-2012/v4/data/development/data/english/annotations/ -coref.path.testData = /scr/nlp/data/conll-2012/v9/data/test/data/english/annotations +coref.path.evaldata = /scr/nlp/data/conll-2012/v9/data/test/data/english/annotations ## models diff --git a/src/edu/stanford/nlp/coref/hybrid/properties/coref-default-dep.properties b/src/edu/stanford/nlp/coref/properties/coref-default-dep.properties similarity index 92% rename from src/edu/stanford/nlp/coref/hybrid/properties/coref-default-dep.properties rename to src/edu/stanford/nlp/coref/properties/coref-default-dep.properties index 80a8920015..4004ded12f 100644 --- a/src/edu/stanford/nlp/coref/hybrid/properties/coref-default-dep.properties +++ b/src/edu/stanford/nlp/coref/properties/coref-default-dep.properties @@ -2,12 +2,15 @@ # general coref.annotators = pos, lemma, ner, parse -coref.algorithm = hybrid coref.sieves = SpeakerMatch, PreciseConstructs, pp-rf, cc-rf, pc-rf, ll-rf, pr-rf -coref.useConstituencyParse = false -coref.conll = false -coref.path.trainData = /scr/nlp/data/conll-2012/v4/data/train/data/english/annotations/ +coref.useConstituencyTree = false +coref.useGoldNE = false +coref.useGoldParse = false +coref.useGoldPOS = false + +coref.input.type = conll +coref.path.traindata = /scr/nlp/data/conll-2012/v4/data/train/data/english/annotations/ ############################################################################# # data & model path diff --git a/src/edu/stanford/nlp/coref/hybrid/properties/dcoref-conll-dep.properties b/src/edu/stanford/nlp/coref/properties/dcoref-conll-dep.properties similarity index 96% rename from src/edu/stanford/nlp/coref/hybrid/properties/dcoref-conll-dep.properties rename to src/edu/stanford/nlp/coref/properties/dcoref-conll-dep.properties index b9effe461d..971527d2d4 100644 --- a/src/edu/stanford/nlp/coref/hybrid/properties/dcoref-conll-dep.properties +++ b/src/edu/stanford/nlp/coref/properties/dcoref-conll-dep.properties @@ -2,15 +2,15 @@ # general coref.annotators = pos, lemma, ner, parse -coref.algorithm = hybrid coref.sieves = MarkRole, DiscourseMatch, ExactStringMatch, RelaxedExactStringMatch, PreciseConstructs, StrictHeadMatch1, StrictHeadMatch2, StrictHeadMatch3, StrictHeadMatch4, RelaxedHeadMatch, PronounMatch +coref.input.type = conll coref.doScore = true coref.postprocessing = true coref.calculateFeatureImportance = false -coref.useConstituencyParse = false +coref.useConstituencyTree = false coref.useSemantics = true -coref.conll = true +coref.useMarkedDiscourse = true ############################################################################# # data & model path @@ -19,12 +19,12 @@ coref.conll = true ## train #coref.path.traindata = /scr/nlp/data/conll-2012/v4/data/train/data/english/annotations/ #coref.path.traindata = /scr/nlp/data/conll-2012/v4/data/train+dev/ -coref.path.trainData = /220/log-hcoref/forthesis/cleanup/small/train 
+coref.path.traindata = /220/log-hcoref/forthesis/cleanup/small/train ## eval #coref.path.evaldata = /scr/nlp/data/conll-2012/v4/data/development/data/english/annotations/ -coref.path.evalData = /scr/nlp/data/conll-2012/v9/data/test/data/english/annotations -#coref.path.evaldata = /220/log-hcoref/forthesis/cleanup/small/eval +coref.path.evaldata = /scr/nlp/data/conll-2012/v9/data/test/data/english/annotations +#coref.path.evaldata = /220/log-coref/forthesis/cleanup/small/eval ## scorer diff --git a/src/edu/stanford/nlp/coref/hybrid/properties/dcoref-conll.properties b/src/edu/stanford/nlp/coref/properties/dcoref-conll.properties similarity index 96% rename from src/edu/stanford/nlp/coref/hybrid/properties/dcoref-conll.properties rename to src/edu/stanford/nlp/coref/properties/dcoref-conll.properties index 9705b94033..e54ed96c47 100644 --- a/src/edu/stanford/nlp/coref/hybrid/properties/dcoref-conll.properties +++ b/src/edu/stanford/nlp/coref/properties/dcoref-conll.properties @@ -2,15 +2,15 @@ # general coref.annotators = pos, lemma, ner, parse -coref.algorithm = hybrid coref.sieves = MarkRole, DiscourseMatch, ExactStringMatch, RelaxedExactStringMatch, PreciseConstructs, StrictHeadMatch1, StrictHeadMatch2, StrictHeadMatch3, StrictHeadMatch4, RelaxedHeadMatch, PronounMatch +coref.input.type = conll coref.doScore = true coref.postprocessing = true coref.calculateFeatureImportance = false -coref.useConstituencyParse = true +coref.useConstituencyTree = true coref.useSemantics = false -coref.conll = true +coref.useMarkedDiscourse = true ############################################################################# # data & model path @@ -19,12 +19,12 @@ coref.conll = true ## train #coref.path.traindata = /scr/nlp/data/conll-2012/v4/data/train/data/english/annotations/ #coref.path.traindata = /scr/nlp/data/conll-2012/v4/data/train+dev/ -coref.path.trainData = /220/log-hcoref/forthesis/cleanup/small/train +coref.path.traindata = /220/log-hcoref/forthesis/cleanup/small/train ## eval #coref.path.evaldata = /scr/nlp/data/conll-2012/v4/data/development/data/english/annotations/ -coref.path.testData = /scr/nlp/data/conll-2012/v9/data/test/data/english/annotations -#coref.path.evaldata = /220/log-hcoref/forthesis/cleanup/small/eval +coref.path.evaldata = /scr/nlp/data/conll-2012/v9/data/test/data/english/annotations +#coref.path.evaldata = /220/log-coref/forthesis/cleanup/small/eval ## models diff --git a/src/edu/stanford/nlp/coref/hybrid/properties/zh-coref-default.properties b/src/edu/stanford/nlp/coref/properties/zh-coref-default.properties similarity index 78% rename from src/edu/stanford/nlp/coref/hybrid/properties/zh-coref-default.properties rename to src/edu/stanford/nlp/coref/properties/zh-coref-default.properties index 5fe1fef2bc..ebfee432f3 100644 --- a/src/edu/stanford/nlp/coref/hybrid/properties/zh-coref-default.properties +++ b/src/edu/stanford/nlp/coref/properties/zh-coref-default.properties @@ -1,22 +1,15 @@ -# pipeline options +# Pipeline options annotators = segment, ssplit, pos, lemma, ner, parse, mention, coref -coref.algorithm = hybrid -# data paths -coref.data = /scr/nlp/data/conll-2012/ -coref.conllOutputPath = /scr/nlp/coref/logs/ -coref.scorer = /scr/nlp/data/conll-2012/scorer/v8.01/scorer.pl - -# coref options coref.sieves = ChineseHeadMatch, ExactStringMatch, PreciseConstructs, StrictHeadMatch1, StrictHeadMatch2, StrictHeadMatch3, StrictHeadMatch4, PronounMatch coref.input.type = raw coref.postprocessing = true coref.calculateFeatureImportance = false 
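# useConstituencyTree = true extracts mentions from constituency parses; the
# -dep property files set it to false to use the dependency-based mention finder.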
-coref.useConstituencyParse= true +coref.useConstituencyTree = true #coref.useConstituencyTree = false coref.useSemantics = false coref.md.type = RULE -coref.md.liberalChineseMD = false +coref.mode = hybrid coref.path.word2vec = coref.language = zh @@ -45,5 +38,4 @@ ssplit.boundaryTokenRegex = [.]|[!?]+|[。]|[!?]+ pos.model = edu/stanford/nlp/models/pos-tagger/chinese-distsim/chinese-distsim.tagger #parse -parse.model = edu/stanford/nlp/models/srparser/chineseSR.ser.gz - +parse.model = edu/stanford/nlp/models/lexparser/chinesePCFG.ser.gz diff --git a/src/edu/stanford/nlp/coref/hybrid/properties/zh-dcoref-conll-no-output.properties b/src/edu/stanford/nlp/coref/properties/zh-dcoref-conll-no-output.properties similarity index 68% rename from src/edu/stanford/nlp/coref/hybrid/properties/zh-dcoref-conll-no-output.properties rename to src/edu/stanford/nlp/coref/properties/zh-dcoref-conll-no-output.properties index 402a872d96..467d072d78 100644 --- a/src/edu/stanford/nlp/coref/hybrid/properties/zh-dcoref-conll-no-output.properties +++ b/src/edu/stanford/nlp/coref/properties/zh-dcoref-conll-no-output.properties @@ -1,30 +1,21 @@ -# Pipeline options annotators = pos, lemma, ner, parse -coref.algorithm = hybrid - -coref.sieves = ChineseHeadMatch, ExactStringMatch, PreciseConstructs, StrictHeadMatch1, StrictHeadMatch2, StrictHeadMatch3, StrictHeadMatch4, PronounMatch -coref.doScore = true -coref.postprocessing = true +coref.addMissingAnnotations = true +coref.big.gender.number = edu/stanford/nlp/models/dcoref/gender.data.gz coref.calculateFeatureImportance = false -coref.useConstituencyParse = true -coref.useSemantics = false +coref.doScore = true +coref.input.type = conll +coref.language = zh coref.md.type = RULE -coref.md.liberalChineseMD = false +coref.path.evaldata = /Users/jebolton/conll-coref-data/chinese-development/annotations/ +coref.path.scorer.conll = /Users/jebolton/conll-scoring-lib/reference-coreference-scorers/v8.01/scorer.pl coref.path.word2vec = -coref.language = zh +coref.postprocessing = true coref.print.md.log = false -coref.big.gender.number = edu/stanford/nlp/models/dcoref/gender.data.gz -coref.zh.dict = edu/stanford/nlp/models/dcoref/zh-attributes.txt.gz - -coref.addMissingAnnotations = true +coref.sieves = ChineseHeadMatch, ExactStringMatch, PreciseConstructs, StrictHeadMatch1, StrictHeadMatch2, StrictHeadMatch3, StrictHeadMatch4, PronounMatch coref.specialCaseNewswire = true - -# Evaluation -coref.path.scorer.conll = /scr/nlp/data/conll-2012/scorer/v8.01/scorer.pl -#coref.path.output = /scr/nlp/coref/output/ -coref.path.testData = /scr/nlp/data/conll-2012/v4/data/development/data/chinese/annotations/ - -# NER -ner.model = edu/stanford/nlp/models/ner/chinese.misc.distsim.crf.ser.gz +coref.useConstituencyTree = true +coref.useSemantics = false +coref.zh.dict = edu/stanford/nlp/models/dcoref/zh-attributes.txt.gz ner.applyNumericClassifiers = false +ner.model = edu/stanford/nlp/models/ner/chinese.misc.distsim.crf.ser.gz ner.useSUTime = false diff --git a/src/edu/stanford/nlp/coref/hybrid/properties/zh-dcoref-conll.properties b/src/edu/stanford/nlp/coref/properties/zh-dcoref-conll.properties similarity index 86% rename from src/edu/stanford/nlp/coref/hybrid/properties/zh-dcoref-conll.properties rename to src/edu/stanford/nlp/coref/properties/zh-dcoref-conll.properties index 4fa07806ce..a2da64cc25 100644 --- a/src/edu/stanford/nlp/coref/hybrid/properties/zh-dcoref-conll.properties +++ b/src/edu/stanford/nlp/coref/properties/zh-dcoref-conll.properties @@ -1,27 +1,26 @@ # Pipeline 
options annotators = pos, lemma, ner, parse -coref.algorithm = hybrid coref.sieves = ChineseHeadMatch, ExactStringMatch, PreciseConstructs, StrictHeadMatch1, StrictHeadMatch2, StrictHeadMatch3, StrictHeadMatch4, PronounMatch +coref.input.type = conll coref.doScore = true coref.postprocessing = true coref.calculateFeatureImportance = false coref.useConstituencyTree = true coref.useSemantics = false coref.md.type = RULE -coref.md.liberalChineseMD = false coref.path.word2vec = coref.language = zh coref.print.md.log = false coref.big.gender.number = edu/stanford/nlp/models/dcoref/gender.data.gz coref.zh.dict = edu/stanford/nlp/models/dcoref/zh-attributes.txt.gz -coref.conll = true +coref.addMissingAnnotations = true # Evaluation coref.path.scorer.conll = /scr/nlp/data/conll-2012/scorer/v8.01/scorer.pl coref.path.output = /scr/nlp/coref/output/ -coref.path.testData = /scr/nlp/data/conll-2012/v4/data/development/data/chinese/annotations/ +coref.path.evaldata = /scr/nlp/data/conll-2012/v4/data/development/data/chinese/annotations/ # NER ner.model = edu/stanford/nlp/models/ner/chinese.misc.distsim.crf.ser.gz diff --git a/src/edu/stanford/nlp/coref/hybrid/rf/DecisionTree.java b/src/edu/stanford/nlp/coref/rf/DecisionTree.java similarity index 95% rename from src/edu/stanford/nlp/coref/hybrid/rf/DecisionTree.java rename to src/edu/stanford/nlp/coref/rf/DecisionTree.java index e4e744b283..29f673ed63 100644 --- a/src/edu/stanford/nlp/coref/hybrid/rf/DecisionTree.java +++ b/src/edu/stanford/nlp/coref/rf/DecisionTree.java @@ -1,4 +1,4 @@ -package edu.stanford.nlp.coref.hybrid.rf; +package edu.stanford.nlp.coref.rf; import java.io.Serializable; @@ -11,23 +11,23 @@ public class DecisionTree implements Serializable { public DecisionTreeNode root; public Index featureIndex; - + public DecisionTree(Index featureIndex) { this.featureIndex = featureIndex; this.root = null; } - + public double probabilityOfTrue(RVFDatum datum) { return probabilityOfTrue(datum.asFeaturesCounter()); } public double probabilityOfTrue(Counter features) { DecisionTreeNode cur = root; - + while(!cur.isLeaf()) { double value = features.getCount(featureIndex.get(cur.idx)); cur = (value < cur.split)? cur.children[0] : cur.children[1]; } - + return (cur.split); // at the leaf node, idx represents true or false. 1: true, 0: false, split represents probability of true. } } diff --git a/src/edu/stanford/nlp/coref/hybrid/rf/DecisionTreeNode.java b/src/edu/stanford/nlp/coref/rf/DecisionTreeNode.java similarity index 94% rename from src/edu/stanford/nlp/coref/hybrid/rf/DecisionTreeNode.java rename to src/edu/stanford/nlp/coref/rf/DecisionTreeNode.java index 8057909ac2..a88c4b003f 100644 --- a/src/edu/stanford/nlp/coref/hybrid/rf/DecisionTreeNode.java +++ b/src/edu/stanford/nlp/coref/rf/DecisionTreeNode.java @@ -1,4 +1,4 @@ -package edu.stanford.nlp.coref.hybrid.rf; +package edu.stanford.nlp.coref.rf; import java.io.Serializable; @@ -8,13 +8,13 @@ public class DecisionTreeNode implements Serializable { int idx; // if not leaf, feature index. if leaf, idx=1 -> true, idx=0 -> false. float split; // if not leaf, split point. if leaf, true probability. 
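// e.g., an internal node with idx=3, split=0.5f sends a datum to children[0]
// when feature 3 scores below 0.5; a leaf with idx=1, split=0.8f labels the
// datum true with probability 0.8.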
DecisionTreeNode[] children; // go left if value is less than split - + DecisionTreeNode() { idx = -1; split = Float.NaN; children = null; } - + public DecisionTreeNode(int label, float prob) { this(); idx = label; diff --git a/src/edu/stanford/nlp/coref/hybrid/rf/RandomForest.java b/src/edu/stanford/nlp/coref/rf/RandomForest.java similarity index 95% rename from src/edu/stanford/nlp/coref/hybrid/rf/RandomForest.java rename to src/edu/stanford/nlp/coref/rf/RandomForest.java index 6897a47d30..7ca106e471 100644 --- a/src/edu/stanford/nlp/coref/hybrid/rf/RandomForest.java +++ b/src/edu/stanford/nlp/coref/rf/RandomForest.java @@ -1,4 +1,4 @@ -package edu.stanford.nlp.coref.hybrid.rf; +package edu.stanford.nlp.coref.rf; import java.io.Serializable; @@ -8,15 +8,15 @@ public class RandomForest implements Serializable { private static final long serialVersionUID = -2736377471905671276L; - + public final DecisionTree[] trees; public final Index featureIndex; - + public RandomForest(Index featureIndex, int numTrees) { this.featureIndex = featureIndex; this.trees = new DecisionTree[numTrees]; } - + public double probabilityOfTrue(RVFDatum datum) { return probabilityOfTrue(datum.asFeaturesCounter()); } diff --git a/src/edu/stanford/nlp/coref/hybrid/sieve/AliasMatch.java b/src/edu/stanford/nlp/coref/sieve/AliasMatch.java similarity index 86% rename from src/edu/stanford/nlp/coref/hybrid/sieve/AliasMatch.java rename to src/edu/stanford/nlp/coref/sieve/AliasMatch.java index d8f5dad051..e0b9240d0b 100644 --- a/src/edu/stanford/nlp/coref/hybrid/sieve/AliasMatch.java +++ b/src/edu/stanford/nlp/coref/sieve/AliasMatch.java @@ -1,4 +1,4 @@ -package edu.stanford.nlp.coref.hybrid.sieve; +package edu.stanford.nlp.coref.sieve; public class AliasMatch extends DeterministicCorefSieve { public AliasMatch() { diff --git a/src/edu/stanford/nlp/coref/hybrid/sieve/ChineseHeadMatch.java b/src/edu/stanford/nlp/coref/sieve/ChineseHeadMatch.java similarity index 87% rename from src/edu/stanford/nlp/coref/hybrid/sieve/ChineseHeadMatch.java rename to src/edu/stanford/nlp/coref/sieve/ChineseHeadMatch.java index e7b08e91b0..70059ab82b 100644 --- a/src/edu/stanford/nlp/coref/hybrid/sieve/ChineseHeadMatch.java +++ b/src/edu/stanford/nlp/coref/sieve/ChineseHeadMatch.java @@ -1,4 +1,4 @@ -package edu.stanford.nlp.coref.hybrid.sieve; +package edu.stanford.nlp.coref.sieve; import java.util.Properties; diff --git a/src/edu/stanford/nlp/coref/hybrid/sieve/CorefDictionaryMatch.java b/src/edu/stanford/nlp/coref/sieve/CorefDictionaryMatch.java similarity index 91% rename from src/edu/stanford/nlp/coref/hybrid/sieve/CorefDictionaryMatch.java rename to src/edu/stanford/nlp/coref/sieve/CorefDictionaryMatch.java index a599945ec0..a5b32852c7 100644 --- a/src/edu/stanford/nlp/coref/hybrid/sieve/CorefDictionaryMatch.java +++ b/src/edu/stanford/nlp/coref/sieve/CorefDictionaryMatch.java @@ -1,4 +1,4 @@ -package edu.stanford.nlp.coref.hybrid.sieve; +package edu.stanford.nlp.coref.sieve; /** * Sieve that uses the coreference dictionary for the technical domain @@ -7,6 +7,7 @@ * @author recasens */ public class CorefDictionaryMatch extends DeterministicCorefSieve { + public CorefDictionaryMatch(){ super(); flags.USE_iwithini = true; @@ -16,4 +17,5 @@ public CorefDictionaryMatch(){ flags.USE_ATTRIBUTES_AGREE = true; flags.USE_COREF_DICT = true; } + } diff --git a/src/edu/stanford/nlp/coref/hybrid/sieve/DcorefSieveOptions.java b/src/edu/stanford/nlp/coref/sieve/DcorefSieveOptions.java similarity index 97% rename from 
src/edu/stanford/nlp/coref/hybrid/sieve/DcorefSieveOptions.java rename to src/edu/stanford/nlp/coref/sieve/DcorefSieveOptions.java index 5b35ba4f34..914aeee17d 100644 --- a/src/edu/stanford/nlp/coref/hybrid/sieve/DcorefSieveOptions.java +++ b/src/edu/stanford/nlp/coref/sieve/DcorefSieveOptions.java @@ -1,4 +1,4 @@ -package edu.stanford.nlp.coref.hybrid.sieve; +package edu.stanford.nlp.coref.sieve; public class DcorefSieveOptions { public boolean DO_PRONOUN; @@ -8,7 +8,7 @@ public class DcorefSieveOptions { public boolean USE_PREDICATENOMINATIVES; public boolean USE_ACRONYM; public boolean USE_RELATIVEPRONOUN; - public boolean USE_ROLEAPPOSITION; + public boolean USE_ROLEAPPOSITION; public boolean USE_EXACTSTRINGMATCH; public boolean USE_NAME_MATCH; public boolean USE_INCLUSION_HEADMATCH; @@ -32,7 +32,7 @@ public class DcorefSieveOptions { public boolean USE_COREF_DICT; public boolean USE_SPEAKERMATCH; public boolean USE_CHINESE_HEAD_MATCH; - + public String toString() { StringBuilder os = new StringBuilder(); os.append("{"); @@ -70,7 +70,7 @@ public String toString() { os.append("}"); return os.toString(); } - + public DcorefSieveOptions() { DO_PRONOUN= false; USE_INCOMPATIBLES = true; @@ -79,7 +79,7 @@ public DcorefSieveOptions() { USE_PREDICATENOMINATIVES = false; USE_ACRONYM = false; USE_RELATIVEPRONOUN = false; - USE_ROLEAPPOSITION = false; + USE_ROLEAPPOSITION = false; USE_EXACTSTRINGMATCH = false; USE_INCLUSION_HEADMATCH = false; USE_RELAXED_HEADMATCH = false; diff --git a/src/edu/stanford/nlp/coref/hybrid/sieve/DeterministicCorefSieve.java b/src/edu/stanford/nlp/coref/sieve/DeterministicCorefSieve.java similarity index 95% rename from src/edu/stanford/nlp/coref/hybrid/sieve/DeterministicCorefSieve.java rename to src/edu/stanford/nlp/coref/sieve/DeterministicCorefSieve.java index 70cded4d85..ad8f2317d5 100644 --- a/src/edu/stanford/nlp/coref/hybrid/sieve/DeterministicCorefSieve.java +++ b/src/edu/stanford/nlp/coref/sieve/DeterministicCorefSieve.java @@ -24,7 +24,8 @@ // USA // -package edu.stanford.nlp.coref.hybrid.sieve; +package edu.stanford.nlp.coref.sieve; +import edu.stanford.nlp.coref.CorefRules; import edu.stanford.nlp.util.logging.Redwood; import java.util.ArrayList; @@ -35,17 +36,17 @@ import java.util.Properties; import java.util.Set; -import edu.stanford.nlp.coref.CorefRules; +import edu.stanford.nlp.coref.CorefPrinter; +import edu.stanford.nlp.coref.CorefProperties; import edu.stanford.nlp.coref.data.CorefCluster; import edu.stanford.nlp.coref.data.Dictionaries; -import edu.stanford.nlp.coref.data.Document; -import edu.stanford.nlp.coref.data.Mention; import edu.stanford.nlp.coref.data.Dictionaries.MentionType; import edu.stanford.nlp.coref.data.Dictionaries.Number; import edu.stanford.nlp.coref.data.Dictionaries.Person; +import edu.stanford.nlp.coref.data.Document; import edu.stanford.nlp.coref.data.Document.DocType; -import edu.stanford.nlp.coref.hybrid.HybridCorefPrinter; -import edu.stanford.nlp.coref.hybrid.HybridCorefProperties; +import edu.stanford.nlp.coref.data.Mention; +import edu.stanford.nlp.coref.OldCorefPrinter; import edu.stanford.nlp.ling.CoreAnnotations; import edu.stanford.nlp.ling.CoreAnnotations.SpeakerAnnotation; import edu.stanford.nlp.trees.Tree; @@ -66,15 +67,15 @@ public abstract class DeterministicCorefSieve extends Sieve { public DeterministicCorefSieve() { super(); - this.classifierType = ClassifierType.RULE; + this.classifierType = ClassifierType.RULE; flags = new DcorefSieveOptions(); } public DeterministicCorefSieve(Properties props) { 
super(props); - this.classifierType = ClassifierType.RULE; + this.classifierType = ClassifierType.RULE; flags = new DcorefSieveOptions(); } - + public void findCoreferentAntecedent(Mention m, int mIdx, Document document, Dictionaries dict, Properties props, StringBuilder sbLog) throws Exception { // check for skip: first mention only, discourse salience @@ -82,7 +83,7 @@ public void findCoreferentAntecedent(Mention m, int mIdx, Document document, Dic && this.skipThisMention(document, m, document.corefClusters.get(m.corefClusterID), dict)) { return; } - + Set roleSet = document.roleSet; for (int sentJ = m.sentNum; sentJ >= 0; sentJ--) { List l = Sieve.getOrderedAntecedents(m, sentJ, mIdx, document.predictedMentions, dict); @@ -104,15 +105,15 @@ public void findCoreferentAntecedent(Mention m, int mIdx, Document document, Dic for (Mention ant : l) { if(skipForAnalysis(ant, m, props)) continue; - - // m2 - antecedent of m1 + + // m2 - antecedent of m1 // Skip singletons according to the singleton predictor // (only for non-NE mentions) // Recasens, de Marneffe, and Potts (NAACL 2013) if (m.isSingleton && m.mentionType != MentionType.PROPER && ant.isSingleton && ant.mentionType != MentionType.PROPER) continue; if (m.corefClusterID == ant.corefClusterID) continue; - + if(!mType.contains(m.mentionType) || !aType.contains(ant.mentionType)) continue; if(m.mentionType == MentionType.PRONOMINAL) { if(!matchedMentionType(m, mTypeStr)) continue; @@ -138,10 +139,10 @@ public void findCoreferentAntecedent(Mention m, int mIdx, Document document, Dic // } // print dcoref log - if(HybridCorefProperties.debug(props)) { - sbLog.append(HybridCorefPrinter.printErrorLogDcoref(m, ant, document, dict, mIdx, this.getClass().getName())); + if(CorefProperties.debug(props)) { + sbLog.append(OldCorefPrinter.printErrorLogDcoref(m, ant, document, dict, mIdx, this.getClass().getName())); } - + int removeID = c1.clusterID; // System.out.println("Merging ant "+c2+" with "+c1); CorefCluster.mergeClusters(c2, c1); @@ -209,7 +210,7 @@ public boolean coreferent(Document document, CorefCluster mentionCluster, Mention ant, Dictionaries dict, Set roleSet) throws Exception { - + boolean ret = false; Mention mention = mentionCluster.getRepresentativeMention(); if (flags.USE_INCOMPATIBLES) { @@ -219,7 +220,7 @@ public boolean coreferent(Document document, CorefCluster mentionCluster, return false; } } - if (flags.DO_PRONOUN && Math.abs(mention2.sentNum-ant.sentNum) > 3 && + if (flags.DO_PRONOUN && Math.abs(mention2.sentNum-ant.sentNum) > 3 && mention2.person!=Person.I && mention2.person!=Person.YOU) { return false; } @@ -243,13 +244,13 @@ public boolean coreferent(Document document, CorefCluster mentionCluster, if(flags.USE_SPEAKERMATCH) { String mSpeaker = mention2.headWord.get(SpeakerAnnotation.class); String aSpeaker = ant.headWord.get(SpeakerAnnotation.class); - + // from same speaker if(mention2.person == Person.I && ant.person == Person.I) return (mSpeaker.equals(aSpeaker)); - + // - speaker - if( (mention2.person == Person.I && mSpeaker.equals(Integer.toString(ant.mentionID))) - || (ant.person == Person.I && aSpeaker.equals(Integer.toString(mention2.mentionID))) ) return true; + if( (mention2.person == Person.I && mSpeaker.equals(Integer.toString(ant.mentionID))) + || (ant.person == Person.I && aSpeaker.equals(Integer.toString(mention2.mentionID))) ) return true; } if(flags.USE_DISCOURSEMATCH) { String mString = mention.lowercaseNormalizedSpanString(); @@ -349,7 +350,7 @@ public boolean coreferent(Document document, 
CorefCluster mentionCluster, if(flags.USE_EXACTSTRINGMATCH && CorefRules.entityExactStringMatch(mention, ant, dict, roleSet)){ return true; } -// if(flags.USE_EXACTSTRINGMATCH && Rules.entityExactStringMatch(mentionCluster, potentialAntecedent, dict, roleSet)){ +// if(flags.USE_EXACTSTRINGMATCH && CorefRules.entityExactStringMatch(mentionCluster, potentialAntecedent, dict, roleSet)){ // return true; // } if (flags.USE_NAME_MATCH && checkEntityMatch(document, mentionCluster, potentialAntecedent, dict, roleSet)) { @@ -415,29 +416,29 @@ public boolean coreferent(Document document, CorefCluster mentionCluster, } return false; } - + if(flags.USE_DISTANCE && CorefRules.entityTokenDistance(mention2, ant)){ return false; } - + if(flags.USE_COREF_DICT){ // Head match if(ant.headWord.lemma().equals(mention2.headWord.lemma())) return false; - + // Constraint: ignore pairs commonNoun - properNoun - if(ant.mentionType != MentionType.PROPER && - ( mention2.headWord.get(CoreAnnotations.PartOfSpeechAnnotation.class).startsWith("NNP") - || !mention2.headWord.word().substring(1).equals(mention2.headWord.word().substring(1).toLowerCase()) ) ) return false; - + if(ant.mentionType != MentionType.PROPER && + ( mention2.headWord.get(CoreAnnotations.PartOfSpeechAnnotation.class).startsWith("NNP") + || !mention2.headWord.word().substring(1).equals(mention2.headWord.word().substring(1).toLowerCase()) ) ) return false; + // Constraint: ignore plurals if(ant.headWord.get(CoreAnnotations.PartOfSpeechAnnotation.class).equals("NNS") && mention2.headWord.get(CoreAnnotations.PartOfSpeechAnnotation.class).equals("NNS")) return false; - + // Constraint: ignore mentions with indefinite determiners - if(dict.indefinitePronouns.contains(ant.originalSpan.get(0).lemma()) - || dict.indefinitePronouns.contains(mention2.originalSpan.get(0).lemma())) return false; - + if(dict.indefinitePronouns.contains(ant.originalSpan.get(0).lemma()) + || dict.indefinitePronouns.contains(mention2.originalSpan.get(0).lemma())) return false; + // Constraint: ignore coordinated mentions if(ant.isCoordinated() || mention2.isCoordinated()) return false; @@ -446,13 +447,13 @@ public boolean coreferent(Document document, CorefCluster mentionCluster, // Constraint: sentence context incompatibility when the mentions are common nouns if(CorefRules.sentenceContextIncompatible(mention2, ant, dict)) return false; - + if(CorefRules.entityClusterAllCorefDictionary(mentionCluster, potentialAntecedent, dict, 1, 8)) return true; if(CorefRules.entityCorefDictionary(mention, ant, dict, 2, 2)) return true; if(CorefRules.entityCorefDictionary(mention, ant, dict, 3, 2)) return true; if(CorefRules.entityCorefDictionary(mention, ant, dict, 4, 2)) return true; } - + if(flags.DO_PRONOUN){ Mention m; if (mention.predicateNominatives!=null && mention.predicateNominatives.contains(mention2)) { @@ -462,10 +463,10 @@ public boolean coreferent(Document document, CorefCluster mentionCluster, } boolean mIsPronoun = (m.isPronominal() || dict.allPronouns.contains(m.toString())); - boolean attrAgree = HybridCorefProperties.useDefaultPronounAgreement(props)? + boolean attrAgree = CorefProperties.useDefaultPronounAgreement(props)? 
CorefRules.entityAttributesAgree(mentionCluster, potentialAntecedent): CorefRules.entityAttributesAgree(mentionCluster, potentialAntecedent, lang); - + if(mIsPronoun && attrAgree){ if(dict.demonymSet.contains(ant.lowercaseNormalizedSpanString()) && dict.notOrganizationPRP.contains(m.headString)){ @@ -479,12 +480,12 @@ public boolean coreferent(Document document, CorefCluster mentionCluster, return true; } } - + if(flags.USE_CHINESE_HEAD_MATCH) { if (mention2.headWord == ant.headWord && mention2.insideIn(ant)) { if(!document.isCoref(mention2, ant)) { // TODO: exclude conjunction - // log.info("error in chinese head match: "+mention2.spanToString()+"\t"+ant.spanToString()); + log.info("error in chinese head match: "+mention2.spanToString()+"\t"+ant.spanToString()); } return true; } diff --git a/src/edu/stanford/nlp/coref/hybrid/sieve/DiscourseMatch.java b/src/edu/stanford/nlp/coref/sieve/DiscourseMatch.java similarity index 76% rename from src/edu/stanford/nlp/coref/hybrid/sieve/DiscourseMatch.java rename to src/edu/stanford/nlp/coref/sieve/DiscourseMatch.java index c61516c23d..e05042762d 100644 --- a/src/edu/stanford/nlp/coref/hybrid/sieve/DiscourseMatch.java +++ b/src/edu/stanford/nlp/coref/sieve/DiscourseMatch.java @@ -1,4 +1,4 @@ -package edu.stanford.nlp.coref.hybrid.sieve; +package edu.stanford.nlp.coref.sieve; public class DiscourseMatch extends DeterministicCorefSieve { public DiscourseMatch() { diff --git a/src/edu/stanford/nlp/coref/hybrid/sieve/ExactStringMatch.java b/src/edu/stanford/nlp/coref/sieve/ExactStringMatch.java similarity index 77% rename from src/edu/stanford/nlp/coref/hybrid/sieve/ExactStringMatch.java rename to src/edu/stanford/nlp/coref/sieve/ExactStringMatch.java index 26650f98b1..e5177959a8 100644 --- a/src/edu/stanford/nlp/coref/hybrid/sieve/ExactStringMatch.java +++ b/src/edu/stanford/nlp/coref/sieve/ExactStringMatch.java @@ -1,4 +1,4 @@ -package edu.stanford.nlp.coref.hybrid.sieve; +package edu.stanford.nlp.coref.sieve; public class ExactStringMatch extends DeterministicCorefSieve { public ExactStringMatch() { diff --git a/src/edu/stanford/nlp/coref/hybrid/sieve/LexicalChainMatch.java b/src/edu/stanford/nlp/coref/sieve/LexicalChainMatch.java similarity index 87% rename from src/edu/stanford/nlp/coref/hybrid/sieve/LexicalChainMatch.java rename to src/edu/stanford/nlp/coref/sieve/LexicalChainMatch.java index 55ceec8366..83100c02a2 100644 --- a/src/edu/stanford/nlp/coref/hybrid/sieve/LexicalChainMatch.java +++ b/src/edu/stanford/nlp/coref/sieve/LexicalChainMatch.java @@ -1,10 +1,10 @@ -package edu.stanford.nlp.coref.hybrid.sieve; +package edu.stanford.nlp.coref.sieve; public class LexicalChainMatch extends DeterministicCorefSieve { public LexicalChainMatch() { super(); - + flags.USE_iwithini = true; flags.USE_ATTRIBUTES_AGREE = true; flags.USE_WN_HYPERNYM = true; diff --git a/src/edu/stanford/nlp/coref/hybrid/sieve/MarkRole.java b/src/edu/stanford/nlp/coref/sieve/MarkRole.java similarity index 74% rename from src/edu/stanford/nlp/coref/hybrid/sieve/MarkRole.java rename to src/edu/stanford/nlp/coref/sieve/MarkRole.java index 7bb49b8fb6..b0416b1152 100644 --- a/src/edu/stanford/nlp/coref/hybrid/sieve/MarkRole.java +++ b/src/edu/stanford/nlp/coref/sieve/MarkRole.java @@ -1,4 +1,4 @@ -package edu.stanford.nlp.coref.hybrid.sieve; +package edu.stanford.nlp.coref.sieve; public class MarkRole extends DeterministicCorefSieve { public MarkRole() { diff --git a/src/edu/stanford/nlp/coref/hybrid/sieve/NameMatch.java b/src/edu/stanford/nlp/coref/sieve/NameMatch.java similarity 
index 94% rename from src/edu/stanford/nlp/coref/hybrid/sieve/NameMatch.java rename to src/edu/stanford/nlp/coref/sieve/NameMatch.java index 2f3e441950..13861eaa0f 100644 --- a/src/edu/stanford/nlp/coref/hybrid/sieve/NameMatch.java +++ b/src/edu/stanford/nlp/coref/sieve/NameMatch.java @@ -1,15 +1,15 @@ -package edu.stanford.nlp.coref.hybrid.sieve; +package edu.stanford.nlp.coref.sieve; -import edu.stanford.nlp.util.Generics; -import edu.stanford.nlp.util.ReflectionLoading; -import java.util.Properties; -import java.util.Set; - -import edu.stanford.nlp.coref.CorefRules; +import edu.stanford.nlp.coref.*; import edu.stanford.nlp.coref.data.CorefCluster; import edu.stanford.nlp.coref.data.Dictionaries; import edu.stanford.nlp.coref.data.Document; import edu.stanford.nlp.coref.data.Mention; +import edu.stanford.nlp.util.Generics; +import edu.stanford.nlp.util.ReflectionLoading; + +import java.util.Properties; +import java.util.Set; /** * Use name matcher - match full names only @@ -21,7 +21,7 @@ public class NameMatch extends DeterministicCorefSieve { protected int minTokens = 0; // Minimum number of tokens in name before attempting match protected boolean ignoreGender = true; - private final Set supportedNerTypes = Generics.newHashSet(); + private Set supportedNerTypes = Generics.newHashSet(); public NameMatch() { super(); diff --git a/src/edu/stanford/nlp/coref/hybrid/sieve/NameMatchPrecise.java b/src/edu/stanford/nlp/coref/sieve/NameMatchPrecise.java similarity index 83% rename from src/edu/stanford/nlp/coref/hybrid/sieve/NameMatchPrecise.java rename to src/edu/stanford/nlp/coref/sieve/NameMatchPrecise.java index 59d585315d..f51f8ad6ee 100644 --- a/src/edu/stanford/nlp/coref/hybrid/sieve/NameMatchPrecise.java +++ b/src/edu/stanford/nlp/coref/sieve/NameMatchPrecise.java @@ -1,4 +1,4 @@ -package edu.stanford.nlp.coref.hybrid.sieve; +package edu.stanford.nlp.coref.sieve; /** * Use name matcher - more precise match @@ -14,4 +14,6 @@ public NameMatchPrecise() { minTokens = 2; } + + } diff --git a/src/edu/stanford/nlp/coref/hybrid/sieve/OracleSieve.java b/src/edu/stanford/nlp/coref/sieve/OracleSieve.java similarity index 96% rename from src/edu/stanford/nlp/coref/hybrid/sieve/OracleSieve.java rename to src/edu/stanford/nlp/coref/sieve/OracleSieve.java index 6a68667740..d42c7ecebc 100644 --- a/src/edu/stanford/nlp/coref/hybrid/sieve/OracleSieve.java +++ b/src/edu/stanford/nlp/coref/sieve/OracleSieve.java @@ -1,4 +1,4 @@ -package edu.stanford.nlp.coref.hybrid.sieve; +package edu.stanford.nlp.coref.sieve; import edu.stanford.nlp.util.logging.Redwood; import java.util.List; @@ -38,8 +38,8 @@ public void findCoreferentAntecedent(Mention m, int mIdx, Document document, Dic } Sieve.merge(document, m.mentionID, candidate.mentionID); return; - } - } + } + } } } } diff --git a/src/edu/stanford/nlp/coref/hybrid/sieve/PreciseConstructs.java b/src/edu/stanford/nlp/coref/sieve/PreciseConstructs.java similarity index 88% rename from src/edu/stanford/nlp/coref/hybrid/sieve/PreciseConstructs.java rename to src/edu/stanford/nlp/coref/sieve/PreciseConstructs.java index f5634bd5ae..a431f03a55 100644 --- a/src/edu/stanford/nlp/coref/hybrid/sieve/PreciseConstructs.java +++ b/src/edu/stanford/nlp/coref/sieve/PreciseConstructs.java @@ -1,4 +1,4 @@ -package edu.stanford.nlp.coref.hybrid.sieve; +package edu.stanford.nlp.coref.sieve; public class PreciseConstructs extends DeterministicCorefSieve { public PreciseConstructs() { diff --git a/src/edu/stanford/nlp/coref/hybrid/sieve/PronounMatch.java 
b/src/edu/stanford/nlp/coref/sieve/PronounMatch.java similarity index 78% rename from src/edu/stanford/nlp/coref/hybrid/sieve/PronounMatch.java rename to src/edu/stanford/nlp/coref/sieve/PronounMatch.java index 9875314d67..355ac8e280 100644 --- a/src/edu/stanford/nlp/coref/hybrid/sieve/PronounMatch.java +++ b/src/edu/stanford/nlp/coref/sieve/PronounMatch.java @@ -1,4 +1,4 @@ -package edu.stanford.nlp.coref.hybrid.sieve; +package edu.stanford.nlp.coref.sieve; public class PronounMatch extends DeterministicCorefSieve { public PronounMatch() { diff --git a/src/edu/stanford/nlp/coref/hybrid/sieve/RFSieve.java b/src/edu/stanford/nlp/coref/sieve/RFSieve.java similarity index 94% rename from src/edu/stanford/nlp/coref/hybrid/sieve/RFSieve.java rename to src/edu/stanford/nlp/coref/sieve/RFSieve.java index 463b426139..5b03f8cb97 100644 --- a/src/edu/stanford/nlp/coref/hybrid/sieve/RFSieve.java +++ b/src/edu/stanford/nlp/coref/sieve/RFSieve.java @@ -1,4 +1,7 @@ -package edu.stanford.nlp.coref.hybrid.sieve; +package edu.stanford.nlp.coref.sieve; +import edu.stanford.nlp.coref.CorefRules; +import edu.stanford.nlp.util.logging.Redwood; + import java.util.ArrayList; import java.util.Collections; import java.util.HashSet; @@ -7,8 +10,8 @@ import java.util.Properties; import java.util.Set; +import edu.stanford.nlp.coref.CorefPrinter; import edu.stanford.nlp.coref.CorefProperties; -import edu.stanford.nlp.coref.CorefRules; import edu.stanford.nlp.coref.data.CorefChain; import edu.stanford.nlp.coref.data.CorefCluster; import edu.stanford.nlp.coref.data.Dictionaries; @@ -20,10 +23,9 @@ import edu.stanford.nlp.coref.data.Document; import edu.stanford.nlp.coref.data.Document.DocType; import edu.stanford.nlp.coref.data.Mention; -import edu.stanford.nlp.coref.hybrid.HybridCorefPrinter; -import edu.stanford.nlp.coref.hybrid.HybridCorefProperties; -import edu.stanford.nlp.coref.hybrid.rf.RandomForest; import edu.stanford.nlp.coref.md.RuleBasedCorefMentionFinder; +import edu.stanford.nlp.coref.OldCorefPrinter; +import edu.stanford.nlp.coref.rf.RandomForest; import edu.stanford.nlp.ling.CoreAnnotations; import edu.stanford.nlp.ling.CoreAnnotations.SpeakerAnnotation; import edu.stanford.nlp.ling.CoreLabel; @@ -34,21 +36,20 @@ import edu.stanford.nlp.stats.Counters; import edu.stanford.nlp.trees.Tree; import edu.stanford.nlp.util.Generics; -import edu.stanford.nlp.util.logging.Redwood; public class RFSieve extends Sieve { /** A logger for this class */ private static Redwood.RedwoodChannels log = Redwood.channels(RFSieve.class); - + private static final long serialVersionUID = -4090017054885920527L; - + // for RF sieve public RandomForest rf; - + /** the probability threshold for merging two mentions */ public double thresMerge; - + // constructor for RF sieve public RFSieve(RandomForest rf, Properties props, String sievename) { super(props, sievename); @@ -61,11 +62,11 @@ public void findCoreferentAntecedent(Mention m, int mIdx, Document document, Dic int sentIdx = m.sentNum; Counter probs = new ClassicCounter<>(); - + int mentionDist = 0; for(int sentDist=0 ; sentDist <= Math.min(this.maxSentDist, sentIdx) ; sentDist++) { List candidates = getOrderedAntecedents(m, sentIdx-sentDist, mIdx, document.predictedMentions, dict); - + for(Mention candidate : candidates) { if(skipForAnalysis(candidate, m, props)) continue; @@ -75,60 +76,60 @@ public void findCoreferentAntecedent(Mention m, int mIdx, Document document, Dic if(!matchedMentionType(m, mTypeStr)) continue; if(!matchedMentionType(candidate, aTypeStr)) continue; } - 
+ if(sentDist==0 && m.appearEarlierThan(candidate)) continue; // ignore cataphora mentionDist++; RVFDatum datum = extractDatum(m, candidate, document, mentionDist, dict, props, sievename); - + double probTrue = 0; if(this.classifierType == ClassifierType.RF) { probTrue = this.rf.probabilityOfTrue(datum); } - + probs.setCount(candidate.mentionID, probTrue); } } - - if(HybridCorefProperties.debug(props)) { - sbLog.append(HybridCorefPrinter.printErrorLog(m, document, probs, mIdx, dict, this)); + + if(CorefProperties.debug(props)) { + sbLog.append(OldCorefPrinter.printErrorLog(m, document, probs, mIdx, dict, this)); } - + if(probs.size() > 0 && Counters.max(probs) > this.thresMerge) { // merge highest prob candidate int antID = Counters.argmax(probs); - + Sieve.merge(document, m.mentionID, antID); } } - public static RVFDatum extractDatum(Mention m, Mention candidate, + public static RVFDatum extractDatum(Mention m, Mention candidate, Document document, int mentionDist, Dictionaries dict, Properties props, String sievename) { try { - + boolean label = (document.goldMentions==null)? false : document.isCoref(m, candidate); Counter features = new ClassicCounter<>(); CorefCluster mC = document.corefClusters.get(m.corefClusterID); CorefCluster aC = document.corefClusters.get(candidate.corefClusterID); - + CoreLabel mFirst = m.sentenceWords.get(m.startIndex); CoreLabel mLast = m.sentenceWords.get(m.endIndex-1); CoreLabel mPreceding = (m.startIndex>0)? m.sentenceWords.get(m.startIndex-1) : null; CoreLabel mFollowing = (m.endIndex < m.sentenceWords.size())? m.sentenceWords.get(m.endIndex) : null; - + CoreLabel aFirst = candidate.sentenceWords.get(candidate.startIndex); CoreLabel aLast = candidate.sentenceWords.get(candidate.endIndex-1); CoreLabel aPreceding = (candidate.startIndex>0)? candidate.sentenceWords.get(candidate.startIndex-1) : null; CoreLabel aFollowing = (candidate.endIndex < candidate.sentenceWords.size())? 
candidate.sentenceWords.get(candidate.endIndex) : null; - - + + //////////////////////////////////////////////////////////////////////////////// /////// basic features: distance, doctype, mention length, roles //////////// //////////////////////////////////////////////////////////////////////////////// - if(HybridCorefProperties.useBasicFeatures(props, sievename)) { + if(CorefProperties.useBasicFeatures(props, sievename)) { int sentDist = m.sentNum - candidate.sentNum; features.incrementCount("SENTDIST", sentDist); features.incrementCount("MENTIONDIST", mentionDist); - + int minSentDist = sentDist; for(Mention a : aC.corefMentions) { minSentDist = Math.min(minSentDist, Math.abs(m.sentNum - a.sentNum)); @@ -136,12 +137,12 @@ public static RVFDatum extractDatum(Mention m, Mention candidat features.incrementCount("MINSENTDIST", minSentDist); // When they are in the same sentence, divides a sentence into clauses and add such feature - if(CorefProperties.useConstituencyParse(props)) { + if(CorefProperties.useConstituencyTree(props)) { if(m.sentNum == candidate.sentNum) { int clauseCount = 0; Tree tree = m.contextParseTree; Tree current = m.mentionSubTree; - + while(true){ current = current.ancestor(1, tree); if(current.label().value().startsWith("S")) { @@ -153,16 +154,16 @@ public static RVFDatum extractDatum(Mention m, Mention candidat features.incrementCount("CLAUSECOUNT", clauseCount); } } - + if(document.docType == DocType.CONVERSATION) features.incrementCount("B-DOCTYPE-"+document.docType); if(m.headWord.get(SpeakerAnnotation.class).equalsIgnoreCase("PER0")) { features.incrementCount("B-SPEAKER-PER0"); } - + if(document.docInfo!=null && document.docInfo.containsKey("DOC_ID")) { features.incrementCount("B-DOCSOURCE-"+document.docInfo.get("DOC_ID").split("/")[1]); } - + features.incrementCount("M-LENGTH", m.originalSpan.size()); features.incrementCount("A-LENGTH", candidate.originalSpan.size()); if(m.originalSpan.size() < candidate.originalSpan.size()) features.incrementCount("B-A-ISLONGER"); @@ -171,22 +172,22 @@ public static RVFDatum extractDatum(Mention m, Mention candidat String antRole = "A-NOROLE"; String mRole = "M-NOROLE"; - + if(m.isSubject) mRole = "M-SUBJ"; if(m.isDirectObject) mRole = "M-DOBJ"; if(m.isIndirectObject) mRole = "M-IOBJ"; if(m.isPrepositionObject) mRole = "M-POBJ"; - + if(candidate.isSubject) antRole = "A-SUBJ"; if(candidate.isDirectObject) antRole = "A-DOBJ"; if(candidate.isIndirectObject) antRole = "A-IOBJ"; if(candidate.isPrepositionObject) antRole = "A-POBJ"; - + features.incrementCount("B-"+mRole); features.incrementCount("B-"+antRole); features.incrementCount("B-"+antRole+"-"+mRole); - if(HybridCorefProperties.combineObjectRoles(props, sievename)) { + if(CorefProperties.combineObjectRoles(props, sievename)) { // combine all objects if(m.isDirectObject || m.isIndirectObject || m.isPrepositionObject || candidate.isDirectObject || candidate.isIndirectObject || candidate.isPrepositionObject) { @@ -201,7 +202,7 @@ public static RVFDatum extractDatum(Mention m, Mention candidat features.incrementCount("B-"+antRole+"-"+mRole); } } - + if(mFirst.word().toLowerCase().matches("a|an")) { features.incrementCount("B-M-START-WITH-INDEFINITE"); } @@ -214,7 +215,7 @@ public static RVFDatum extractDatum(Mention m, Mention candidat if(aFirst.word().equalsIgnoreCase("the")) { features.incrementCount("B-A-START-WITH-DEFINITE"); } - + if(dict.indefinitePronouns.contains(m.lowercaseNormalizedSpanString())) { features.incrementCount("B-M-INDEFINITE-PRONOUN"); } @@ -233,7 +234,7 @@ 
public static RVFDatum extractDatum(Mention m, Mention candidat if(dict.reflexivePronouns.contains(candidate.headString)) { features.incrementCount("B-A-REFLEXIVE"); } - + if(m.headIndex == m.endIndex-1) features.incrementCount("B-M-HEADEND"); if(m.headIndex < m.endIndex-1) { CoreLabel headnext = m.sentenceWords.get(m.headIndex+1); @@ -253,7 +254,7 @@ public static RVFDatum extractDatum(Mention m, Mention candidat sb.append(a.mentionType).append("-"); } features.incrementCount("B-A-SHAPE-"+sb.toString()); - + sb = new StringBuilder(); sortedMentions = new ArrayList<>(mC.corefMentions.size()); sortedMentions.addAll(mC.corefMentions); @@ -262,8 +263,8 @@ public static RVFDatum extractDatum(Mention m, Mention candidat sb.append(men.mentionType).append("-"); } features.incrementCount("B-M-SHAPE-"+sb.toString()); - - if(CorefProperties.useConstituencyParse(props)) { + + if(CorefProperties.useConstituencyTree(props)) { sb = new StringBuilder(); Tree mTree = m.contextParseTree; Tree mHead = mTree.getLeaves().get(m.headIndex).ancestor(1, mTree); @@ -282,57 +283,57 @@ public static RVFDatum extractDatum(Mention m, Mention candidat } features.incrementCount("B-A-SYNPATH-"+sb.toString()); } - - + + features.incrementCount("A-FIRSTAPPEAR", aC.representative.sentNum); features.incrementCount("M-FIRSTAPPEAR", mC.representative.sentNum); int docSize = document.predictedMentions.size(); // document size in # of sentences features.incrementCount("A-FIRSTAPPEAR-NORMALIZED", aC.representative.sentNum/docSize); features.incrementCount("M-FIRSTAPPEAR-NORMALIZED", mC.representative.sentNum/docSize); } - + //////////////////////////////////////////////////////////////////////////////// /////// mention detection features //////////// //////////////////////////////////////////////////////////////////////////////// - if(HybridCorefProperties.useMentionDetectionFeatures(props, sievename)) { + if(CorefProperties.useMentionDetectionFeatures(props, sievename)) { // bare plurals if(m.originalSpan.size()==1 && m.headWord.tag().equals("NNS")) features.incrementCount("B-M-BAREPLURAL"); if(candidate.originalSpan.size()==1 && candidate.headWord.tag().equals("NNS")) features.incrementCount("B-A-BAREPLURAL"); - + // pleonastic it - if(CorefProperties.useConstituencyParse(props)) { - if(RuleBasedCorefMentionFinder.isPleonastic(m, m.contextParseTree) + if(CorefProperties.useConstituencyTree(props)) { + if(RuleBasedCorefMentionFinder.isPleonastic(m, m.contextParseTree) || RuleBasedCorefMentionFinder.isPleonastic(candidate, candidate.contextParseTree)) { features.incrementCount("B-PLEONASTICIT"); } } - + // quantRule if(dict.quantifiers.contains(mFirst.word().toLowerCase(Locale.ENGLISH))) features.incrementCount("B-M-QUANTIFIER"); if(dict.quantifiers.contains(aFirst.word().toLowerCase(Locale.ENGLISH))) features.incrementCount("B-A-QUANTIFIER"); - - // starts with negation + + // starts with negation if(mFirst.word().toLowerCase(Locale.ENGLISH).matches("none|no|nothing|not") || aFirst.word().toLowerCase(Locale.ENGLISH).matches("none|no|nothing|not")) { features.incrementCount("B-NEGATIVE-START"); } - + // parititive rule if(RuleBasedCorefMentionFinder.partitiveRule(m, m.sentenceWords, dict)) features.incrementCount("B-M-PARTITIVE"); if(RuleBasedCorefMentionFinder.partitiveRule(candidate, candidate.sentenceWords, dict)) features.incrementCount("B-A-PARTITIVE"); - + // % if(m.headString.equals("%")) features.incrementCount("B-M-HEAD%"); if(candidate.headString.equals("%")) features.incrementCount("B-A-HEAD%"); - + // adjective form 
of nations if(dict.isAdjectivalDemonym(m.spanToString())) features.incrementCount("B-M-ADJ-DEMONYM"); if(dict.isAdjectivalDemonym(candidate.spanToString())) features.incrementCount("B-A-ADJ-DEMONYM"); - + // ends with "etc." if(m.lowercaseNormalizedSpanString().endsWith("etc.")) features.incrementCount("B-M-ETC-END"); if(candidate.lowercaseNormalizedSpanString().endsWith("etc.")) features.incrementCount("B-A-ETC-END"); - + } //////////////////////////////////////////////////////////////////////////////// @@ -348,14 +349,14 @@ public static RVFDatum extractDatum(Mention m, Mention candidat features.incrementCount("B-A-PERSON-"+candidate.person); features.incrementCount("B-M-NETYPE-"+m.nerString); features.incrementCount("B-A-NETYPE-"+candidate.nerString); - + features.incrementCount("B-BOTH-NUMBER-"+candidate.number+"-"+m.number); features.incrementCount("B-BOTH-GENDER-"+candidate.gender+"-"+m.gender); features.incrementCount("B-BOTH-ANIMACY-"+candidate.animacy+"-"+m.animacy); features.incrementCount("B-BOTH-PERSON-"+candidate.person+"-"+m.person); features.incrementCount("B-BOTH-NETYPE-"+candidate.nerString+"-"+m.nerString); - - + + Set mcNumber = Generics.newHashSet(); for(Number n : mC.numbers) { features.incrementCount("B-MC-NUMBER-"+n); @@ -368,7 +369,7 @@ public static RVFDatum extractDatum(Mention m, Mention candidat if(mcNumber.size() == 1) features.incrementCount("B-MC-CLUSTERNUMBER-"+mcNumber.iterator().next()); else features.incrementCount("B-MC-CLUSTERNUMBER-CONFLICT"); } - + Set mcGender = Generics.newHashSet(); for(Gender g : mC.genders) { features.incrementCount("B-MC-GENDER-"+g); @@ -381,7 +382,7 @@ public static RVFDatum extractDatum(Mention m, Mention candidat if(mcGender.size() == 1) features.incrementCount("B-MC-CLUSTERGENDER-"+mcGender.iterator().next()); else features.incrementCount("B-MC-CLUSTERGENDER-CONFLICT"); } - + Set mcAnimacy = Generics.newHashSet(); for(Animacy a : mC.animacies) { features.incrementCount("B-MC-ANIMACY-"+a); @@ -394,7 +395,7 @@ public static RVFDatum extractDatum(Mention m, Mention candidat if(mcAnimacy.size() == 1) features.incrementCount("B-MC-CLUSTERANIMACY-"+mcAnimacy.iterator().next()); else features.incrementCount("B-MC-CLUSTERANIMACY-CONFLICT"); } - + Set mcNER = Generics.newHashSet(); for(String t : mC.nerStrings) { features.incrementCount("B-MC-NETYPE-"+t); @@ -407,7 +408,7 @@ public static RVFDatum extractDatum(Mention m, Mention candidat if(mcNER.size() == 1) features.incrementCount("B-MC-CLUSTERNETYPE-"+mcNER.iterator().next()); else features.incrementCount("B-MC-CLUSTERNETYPE-CONFLICT"); } - + Set acNumber = Generics.newHashSet(); for(Number n : aC.numbers) { features.incrementCount("B-AC-NUMBER-"+n); @@ -420,7 +421,7 @@ public static RVFDatum extractDatum(Mention m, Mention candidat if(acNumber.size() == 1) features.incrementCount("B-AC-CLUSTERNUMBER-"+acNumber.iterator().next()); else features.incrementCount("B-AC-CLUSTERNUMBER-CONFLICT"); } - + Set acGender = Generics.newHashSet(); for(Gender g : aC.genders) { features.incrementCount("B-AC-GENDER-"+g); @@ -433,7 +434,7 @@ public static RVFDatum extractDatum(Mention m, Mention candidat if(acGender.size() == 1) features.incrementCount("B-AC-CLUSTERGENDER-"+acGender.iterator().next()); else features.incrementCount("B-AC-CLUSTERGENDER-CONFLICT"); } - + Set acAnimacy = Generics.newHashSet(); for(Animacy a : aC.animacies) { features.incrementCount("B-AC-ANIMACY-"+a); @@ -446,7 +447,7 @@ public static RVFDatum extractDatum(Mention m, Mention candidat if(acAnimacy.size() == 1) 
features.incrementCount("B-AC-CLUSTERANIMACY-"+acAnimacy.iterator().next()); else features.incrementCount("B-AC-CLUSTERANIMACY-CONFLICT"); } - + Set acNER = Generics.newHashSet(); for(String t : aC.nerStrings) { features.incrementCount("B-AC-NETYPE-"+t); @@ -459,18 +460,18 @@ public static RVFDatum extractDatum(Mention m, Mention candidat if(acNER.size() == 1) features.incrementCount("B-AC-CLUSTERNETYPE-"+acNER.iterator().next()); else features.incrementCount("B-AC-CLUSTERNETYPE-CONFLICT"); } - - + + if(m.numbersAgree(candidate)) features.incrementCount("B-NUMBER-AGREE"); if(m.gendersAgree(candidate)) features.incrementCount("B-GENDER-AGREE"); if(m.animaciesAgree(candidate)) features.incrementCount("B-ANIMACY-AGREE"); if(CorefRules.entityAttributesAgree(mC, aC)) features.incrementCount("B-ATTRIBUTES-AGREE"); if(CorefRules.entityPersonDisagree(document, m, candidate, dict)) features.incrementCount("B-PERSON-DISAGREE"); - + //////////////////////////////////////////////////////////////////////////////// /////// dcoref rules //////////// //////////////////////////////////////////////////////////////////////////////// - if(HybridCorefProperties.useDcorefRules(props, sievename)) { + if(CorefProperties.useDcorefRules(props, sievename)) { if(CorefRules.entityIWithinI(m, candidate, dict)) features.incrementCount("B-i-within-i"); if(CorefRules.antecedentIsMentionSpeaker(document, m, candidate, dict)) features.incrementCount("B-ANT-IS-SPEAKER"); if(CorefRules.entitySameSpeaker(document, m, candidate)) features.incrementCount("B-SAME-SPEAKER"); @@ -478,14 +479,14 @@ public static RVFDatum extractDatum(Mention m, Mention candidat for(Mention a : aC.corefMentions) { if(CorefRules.entitySubjectObject(m, a)) features.incrementCount("B-CLUSTER-SUBJ-OBJ"); } - + if(CorefRules.entityPersonDisagree(document, m, candidate, dict) && CorefRules.entitySameSpeaker(document, m, candidate)) features.incrementCount("B-PERSON-DISAGREE-SAME-SPEAKER"); - + if(CorefRules.entityIWithinI(mC, aC, dict)) features.incrementCount("B-ENTITY-IWITHINI"); if(CorefRules.antecedentMatchesMentionSpeakerAnnotation(m, candidate, document)) features.incrementCount("B-ANT-IS-SPEAKER-OF-MENTION"); - - Set mType = HybridCorefProperties.getMentionType(props, sievename); + + Set mType = CorefProperties.getMentionType(props, sievename); if(mType.contains(MentionType.PROPER) || mType.contains(MentionType.NOMINAL)) { if(m.headString.equals(candidate.headString)) features.incrementCount("B-HEADMATCH"); if(CorefRules.entityHeadsAgree(mC, aC, m, candidate, dict)) features.incrementCount("B-HEADSAGREE"); @@ -509,17 +510,17 @@ public static RVFDatum extractDatum(Mention m, Mention candidat features.incrementCount("NUM-LIST-", numEntitiesInList(m)); if(m.spanToString().contains("two") || m.spanToString().contains("2") || m.spanToString().contains("both")) features.incrementCount("LIST-M-TWO"); if(m.spanToString().contains("three") || m.spanToString().contains("3")) features.incrementCount("LIST-M-THREE"); - if(candidate.spanToString().contains("two") - || candidate.spanToString().contains("2") + if(candidate.spanToString().contains("two") + || candidate.spanToString().contains("2") || candidate.spanToString().contains("both")) { features.incrementCount("B-LIST-A-TWO"); } - if(candidate.spanToString().contains("three") + if(candidate.spanToString().contains("three") || candidate.spanToString().contains("3")) { features.incrementCount("B-LIST-A-THREE"); } } - + if(mType.contains(MentionType.PRONOMINAL)) { 
if(dict.firstPersonPronouns.contains(m.headString)) features.incrementCount("B-M-I"); if(dict.secondPersonPronouns.contains(m.headString)) features.incrementCount("B-M-YOU"); @@ -528,7 +529,7 @@ public static RVFDatum extractDatum(Mention m, Mention candidat if(dict.neutralPronouns.contains(m.headString)) features.incrementCount("B-M-NEUTRAL"); if(dict.malePronouns.contains(m.headString)) features.incrementCount("B-M-MALE"); if(dict.femalePronouns.contains(m.headString)) features.incrementCount("B-M-FEMALE"); - + if(dict.firstPersonPronouns.contains(candidate.headString)) features.incrementCount("B-A-I"); if(dict.secondPersonPronouns.contains(candidate.headString)) features.incrementCount("B-A-YOU"); if(dict.thirdPersonPronouns.contains(candidate.headString)) features.incrementCount("B-A-3RDPERSON"); @@ -536,14 +537,14 @@ public static RVFDatum extractDatum(Mention m, Mention candidat if(dict.neutralPronouns.contains(candidate.headString)) features.incrementCount("B-A-NEUTRAL"); if(dict.malePronouns.contains(candidate.headString)) features.incrementCount("B-A-MALE"); if(dict.femalePronouns.contains(candidate.headString)) features.incrementCount("B-A-FEMALE"); - + features.incrementCount("B-M-GENERIC-"+m.generic); features.incrementCount("B-A-GENERIC-"+candidate.generic); - if(HybridCorefPrinter.dcorefPronounSieve.skipThisMention(document, m, mC, dict)) { + if(OldCorefPrinter.dcorefPronounSieve.skipThisMention(document, m, mC, dict)) { features.incrementCount("B-SKIPTHISMENTION-true"); } - + if(m.spanToString().equalsIgnoreCase("you") && mFollowing!=null && mFollowing.word().equalsIgnoreCase("know")) { features.incrementCount("B-YOUKNOW-PRECEDING-POS-" + ((mPreceding==null)? "NULL" : mPreceding.tag()) ); features.incrementCount("B-YOUKNOW-PRECEDING-WORD-"+ ((mPreceding==null)? "NULL" : mPreceding.word().toLowerCase()) ); @@ -559,29 +560,29 @@ public static RVFDatum extractDatum(Mention m, Mention candidat features.incrementCount("B-YOUKNOW-FOLLOWING-WORD-"+ ((nextword==null)? 
"NULL" : nextword.word().toLowerCase()) ); } } - + // discourse match features if(m.person==Person.YOU && document.docType==DocType.ARTICLE && m.headWord.get(CoreAnnotations.SpeakerAnnotation.class).equals("PER0")) { features.incrementCount("B-DISCOURSE-M-YOU-GENERIC?"); } if(candidate.generic && candidate.person==Person.YOU) features.incrementCount("B-DISCOURSE-A-YOU-GENERIC?"); - + String mString = m.lowercaseNormalizedSpanString(); String antString = candidate.lowercaseNormalizedSpanString(); - + // I-I if(m.number==Number.SINGULAR && dict.firstPersonPronouns.contains(mString) && candidate.number==Number.SINGULAR && dict.firstPersonPronouns.contains(antString) && CorefRules.entitySameSpeaker(document, m, candidate)) { features.incrementCount("B-DISCOURSE-I-I-SAMESPEAKER"); } - + // (speaker - I) if ((m.number==Number.SINGULAR && dict.firstPersonPronouns.contains(mString)) && CorefRules.antecedentIsMentionSpeaker(document, m, candidate, dict)) { features.incrementCount("B-DISCOURSE-SPEAKER-I"); } - + // (I - speaker) if ((candidate.number==Number.SINGULAR && dict.firstPersonPronouns.contains(antString)) && CorefRules.antecedentIsMentionSpeaker(document, candidate, m, dict)) { @@ -613,28 +614,28 @@ public static RVFDatum extractDatum(Mention m, Mention candidat features.incrementCount("B-DISCOURSE-WE-WE-DIFFSPEAKER"); } } - + //////////////////////////////////////////////////////////////////////////////// /////// POS features //////////// //////////////////////////////////////////////////////////////////////////////// - if(HybridCorefProperties.usePOSFeatures(props, sievename)) { + if(CorefProperties.usePOSFeatures(props, sievename)) { features.incrementCount("B-LEXICAL-M-HEADPOS-"+m.headWord.tag()); features.incrementCount("B-LEXICAL-A-HEADPOS-"+candidate.headWord.tag()); features.incrementCount("B-LEXICAL-M-FIRSTPOS-"+mFirst.tag()); features.incrementCount("B-LEXICAL-A-FIRSTPOS-"+aFirst.tag()); features.incrementCount("B-LEXICAL-M-LASTPOS-"+mLast.tag()); features.incrementCount("B-LEXICAL-A-LASTPOS-"+aLast.tag()); - + features.incrementCount("B-LEXICAL-M-PRECEDINGPOS-"+ ((mPreceding==null)? "NULL" : mPreceding.tag()) ); features.incrementCount("B-LEXICAL-A-PRECEDINGPOS-"+ ((aPreceding==null)? "NULL" : aPreceding.tag()) ); features.incrementCount("B-LEXICAL-M-FOLLOWINGPOS-"+ ((mFollowing==null)? "NULL" : mFollowing.tag()) ); features.incrementCount("B-LEXICAL-A-FOLLOWINGPOS-"+ ((aFollowing==null)? "NULL" : aFollowing.tag()) ); } - + //////////////////////////////////////////////////////////////////////////////// /////// lexical features //////////// //////////////////////////////////////////////////////////////////////////////// - if(HybridCorefProperties.useLexicalFeatures(props, sievename)) { + if(CorefProperties.useLexicalFeatures(props, sievename)) { features.incrementCount("B-LEXICAL-M-HEADWORD-"+m.headString.toLowerCase()); features.incrementCount("B-LEXICAL-A-HEADWORD-"+candidate.headString.toLowerCase()); @@ -642,12 +643,12 @@ public static RVFDatum extractDatum(Mention m, Mention candidat features.incrementCount("B-LEXICAL-A-FIRSTWORD-"+aFirst.word().toLowerCase()); features.incrementCount("B-LEXICAL-M-LASTWORD-"+mLast.word().toLowerCase()); features.incrementCount("B-LEXICAL-A-LASTWORD-"+aLast.word().toLowerCase()); - + features.incrementCount("B-LEXICAL-M-PRECEDINGWORD-"+ ((mPreceding==null)? "NULL" : mPreceding.word().toLowerCase()) ); features.incrementCount("B-LEXICAL-A-PRECEDINGWORD-"+ ((aPreceding==null)? 
"NULL" : aPreceding.word().toLowerCase()) ); features.incrementCount("B-LEXICAL-M-FOLLOWINGWORD-"+ ((mFollowing==null)? "NULL" : mFollowing.word().toLowerCase()) ); features.incrementCount("B-LEXICAL-A-FOLLOWINGWORD-"+ ((aFollowing==null)? "NULL" : aFollowing.word().toLowerCase()) ); - + //extra headword, modifiers lexical features for(String mHead : mC.heads) { if(!aC.heads.contains(mHead)) features.incrementCount("B-LEXICAL-MC-EXTRAHEAD-"+mHead); @@ -656,35 +657,35 @@ public static RVFDatum extractDatum(Mention m, Mention candidat if(!aC.words.contains(mWord)) features.incrementCount("B-LEXICAL-MC-EXTRAWORD-"+mWord); } } - + //////////////////////////////////////////////////////////////////////////////// /////// word vector features //////////// //////////////////////////////////////////////////////////////////////////////// // cosine - if(HybridCorefProperties.useWordEmbedding(props, sievename)) { + if(CorefProperties.useWordEmbedding(props, sievename)) { // dimension int dim = dict.vectors.entrySet().iterator().next().getValue().length; - + // distance between headword float[] mV = dict.vectors.get(m.headString.toLowerCase()); float[] aV = dict.vectors.get(candidate.headString.toLowerCase()); if(mV!=null && aV!=null) { features.incrementCount("WORDVECTOR-DIFF-HEADWORD", cosine(mV, aV)); } - + mV = dict.vectors.get(mFirst.word().toLowerCase()); aV = dict.vectors.get(aFirst.word().toLowerCase()); if(mV!=null && aV!=null) { features.incrementCount("WORDVECTOR-DIFF-FIRSTWORD", cosine(mV, aV)); } - + mV = dict.vectors.get(mLast.word().toLowerCase()); aV = dict.vectors.get(aLast.word().toLowerCase()); if(mV!=null && aV!=null) { features.incrementCount("WORDVECTOR-DIFF-LASTWORD", cosine(mV, aV)); } - + if(mPreceding!=null && aPreceding!=null) { mV = dict.vectors.get(mPreceding.word().toLowerCase()); aV = dict.vectors.get(aPreceding.word().toLowerCase()); @@ -699,10 +700,10 @@ public static RVFDatum extractDatum(Mention m, Mention candidat features.incrementCount("WORDVECTOR-DIFF-FOLLOWINGWORD", cosine(mV, aV)); } } - + float[] aggreM = new float[dim]; float[] aggreA = new float[dim]; - + for(CoreLabel cl : m.originalSpan) { float[] v = dict.vectors.get(cl.word().toLowerCase()); if(v==null) continue; @@ -716,7 +717,7 @@ public static RVFDatum extractDatum(Mention m, Mention candidat if(ArrayMath.L2Norm(aggreM)!=0 && ArrayMath.L2Norm(aggreA)!=0) { features.incrementCount("WORDVECTOR-AGGREGATE-DIFF", cosine(aggreM, aggreA)); } - + int cnt = 0; double dist = 0; for(CoreLabel mcl : m.originalSpan) { @@ -730,16 +731,16 @@ public static RVFDatum extractDatum(Mention m, Mention candidat } features.incrementCount("WORDVECTOR-AVG-DIFF", dist/cnt); } - + return new RVFDatum<>(features, label); } catch (Exception e) { log.info("Datum Extraction failed in Sieve.java while processing document: "+document.docInfo.get("DOC_ID")+" part: "+document.docInfo.get("DOC_PART")); throw new RuntimeException(e); } } - + // assume the input vectors are normalized - private static double cosine(float[] normalizedVector1, float[] normalizedVector2) { + private static double cosine(float[] normalizedVector1, float[] normalizedVector2) { double inner = ArrayMath.innerProduct(normalizedVector1, normalizedVector2); return inner; } @@ -751,7 +752,7 @@ public static int numEntitiesInList(Mention m) { if((cl.word().equalsIgnoreCase("and") || cl.word().equalsIgnoreCase("or")) && !m.originalSpan.get(i-1).word().equals(",")) num++; } - + return num; } } diff --git 
a/src/edu/stanford/nlp/coref/hybrid/sieve/RelaxedExactStringMatch.java b/src/edu/stanford/nlp/coref/sieve/RelaxedExactStringMatch.java similarity index 79% rename from src/edu/stanford/nlp/coref/hybrid/sieve/RelaxedExactStringMatch.java rename to src/edu/stanford/nlp/coref/sieve/RelaxedExactStringMatch.java index c0ed79e740..f6d2315ca3 100644 --- a/src/edu/stanford/nlp/coref/hybrid/sieve/RelaxedExactStringMatch.java +++ b/src/edu/stanford/nlp/coref/sieve/RelaxedExactStringMatch.java @@ -1,4 +1,4 @@ -package edu.stanford.nlp.coref.hybrid.sieve; +package edu.stanford.nlp.coref.sieve; public class RelaxedExactStringMatch extends DeterministicCorefSieve { public RelaxedExactStringMatch() { diff --git a/src/edu/stanford/nlp/coref/hybrid/sieve/RelaxedHeadMatch.java b/src/edu/stanford/nlp/coref/sieve/RelaxedHeadMatch.java similarity index 85% rename from src/edu/stanford/nlp/coref/hybrid/sieve/RelaxedHeadMatch.java rename to src/edu/stanford/nlp/coref/sieve/RelaxedHeadMatch.java index 212a34b495..be54d82946 100644 --- a/src/edu/stanford/nlp/coref/hybrid/sieve/RelaxedHeadMatch.java +++ b/src/edu/stanford/nlp/coref/sieve/RelaxedHeadMatch.java @@ -1,4 +1,4 @@ -package edu.stanford.nlp.coref.hybrid.sieve; +package edu.stanford.nlp.coref.sieve; public class RelaxedHeadMatch extends DeterministicCorefSieve { public RelaxedHeadMatch() { diff --git a/src/edu/stanford/nlp/coref/hybrid/sieve/Sieve.java b/src/edu/stanford/nlp/coref/sieve/Sieve.java similarity index 80% rename from src/edu/stanford/nlp/coref/hybrid/sieve/Sieve.java rename to src/edu/stanford/nlp/coref/sieve/Sieve.java index 07fe385ddd..028bca1f85 100644 --- a/src/edu/stanford/nlp/coref/hybrid/sieve/Sieve.java +++ b/src/edu/stanford/nlp/coref/sieve/Sieve.java @@ -1,4 +1,4 @@ -package edu.stanford.nlp.coref.hybrid.sieve; +package edu.stanford.nlp.coref.sieve; import edu.stanford.nlp.util.logging.Redwood; import java.io.Serializable; @@ -11,14 +11,15 @@ import java.util.Properties; import java.util.Set; +import edu.stanford.nlp.coref.CorefPrinter; +import edu.stanford.nlp.coref.CorefProperties; import edu.stanford.nlp.coref.data.CorefCluster; import edu.stanford.nlp.coref.data.Dictionaries; -import edu.stanford.nlp.coref.data.Document; -import edu.stanford.nlp.coref.data.Mention; import edu.stanford.nlp.coref.data.Dictionaries.MentionType; import edu.stanford.nlp.coref.data.Dictionaries.Person; -import edu.stanford.nlp.coref.hybrid.HybridCorefPrinter; -import edu.stanford.nlp.coref.hybrid.HybridCorefProperties; +import edu.stanford.nlp.coref.data.Document; +import edu.stanford.nlp.coref.data.Mention; +import edu.stanford.nlp.coref.OldCorefPrinter; import edu.stanford.nlp.io.IOUtils; import edu.stanford.nlp.ling.CoreLabel; import edu.stanford.nlp.trees.Tree; @@ -28,31 +29,31 @@ public abstract class Sieve implements Serializable { /** A logger for this class */ private static Redwood.RedwoodChannels log = Redwood.channels(Sieve.class); - + private static final long serialVersionUID = 3986463332365306868L; - + public enum ClassifierType {RULE, RF, ORACLE} - + public ClassifierType classifierType = null; - + protected Locale lang; - + public final String sievename; - + /** the maximum sentence distance for linking two mentions */ public int maxSentDist = -1; - + /** type of mention we want to resolve. e.g., if mType is PRONOMINAL, we only resolve pronoun mentions */ public final Set mType; - + /** type of mention we want to compare to. 
e.g., if aType is PROPER, the resolution can be done only with PROPER antecedent */ public final Set aType; - + public final Set mTypeStr; public final Set aTypeStr; - + public Properties props = null; - + public Sieve() { this.lang = Locale.ENGLISH; this.sievename = this.getClass().getSimpleName(); @@ -62,35 +63,35 @@ public Sieve() { this.mTypeStr = Generics.newHashSet(); this.aTypeStr = Generics.newHashSet(); } - + public Sieve(Properties props){ - this.lang = HybridCorefProperties.getLanguage(props); + this.lang = CorefProperties.getLanguage(props); this.sievename = this.getClass().getSimpleName(); - this.aType = HybridCorefProperties.getAntecedentType(props, sievename); - this.mType = HybridCorefProperties.getMentionType(props, sievename); - this.maxSentDist = HybridCorefProperties.getMaxSentDistForSieve(props, sievename); - this.mTypeStr = HybridCorefProperties.getMentionTypeStr(props, sievename); - this.aTypeStr = HybridCorefProperties.getAntecedentTypeStr(props, sievename); + this.aType = CorefProperties.getAntecedentType(props, sievename); + this.mType = CorefProperties.getMentionType(props, sievename); + this.maxSentDist = CorefProperties.getMaxSentDistForSieve(props, sievename); + this.mTypeStr = CorefProperties.getMentionTypeStr(props, sievename); + this.aTypeStr = CorefProperties.getAntecedentTypeStr(props, sievename); } public Sieve(Properties props, String sievename) { - this.lang = HybridCorefProperties.getLanguage(props); + this.lang = CorefProperties.getLanguage(props); this.sievename = sievename; - this.aType = HybridCorefProperties.getAntecedentType(props, sievename); - this.mType = HybridCorefProperties.getMentionType(props, sievename); - this.maxSentDist = HybridCorefProperties.getMaxSentDistForSieve(props, sievename); - this.mTypeStr = HybridCorefProperties.getMentionTypeStr(props, sievename); - this.aTypeStr = HybridCorefProperties.getAntecedentTypeStr(props, sievename); + this.aType = CorefProperties.getAntecedentType(props, sievename); + this.mType = CorefProperties.getMentionType(props, sievename); + this.maxSentDist = CorefProperties.getMaxSentDistForSieve(props, sievename); + this.mTypeStr = CorefProperties.getMentionTypeStr(props, sievename); + this.aTypeStr = CorefProperties.getAntecedentTypeStr(props, sievename); } public String resolveMention(Document document, Dictionaries dict, Properties props) throws Exception { StringBuilder sbLog = new StringBuilder(); - - if(HybridCorefProperties.debug(props)) { + + if(CorefProperties.debug(props)) { sbLog.append("======================================================="); - sbLog.append(HybridCorefPrinter.printRawDoc(document, true, true)); + sbLog.append(OldCorefPrinter.printRawDoc(document, true, true)); } - + for(List mentionsInSent : document.predictedMentions) { for(int mIdx = 0 ; mIdx < mentionsInSent.size() ; mIdx++) { Mention m = mentionsInSent.get(mIdx); @@ -100,43 +101,43 @@ public String resolveMention(Document document, Dictionaries dict, Properties pr } return sbLog.toString(); } - + public abstract void findCoreferentAntecedent(Mention m, int mIdx, Document document, Dictionaries dict, Properties props, StringBuilder sbLog) throws Exception; - + // load sieve (from file or make a deterministic sieve) public static Sieve loadSieve(Properties props, String sievename) throws Exception { // log.info("Loading sieve: "+sievename+" ..."); - switch(HybridCorefProperties.getClassifierType(props, sievename)) { + switch(CorefProperties.getClassifierType(props, sievename)) { case RULE: - DeterministicCorefSieve 
sieve = (DeterministicCorefSieve) Class.forName("edu.stanford.nlp.coref.hybrid.sieve."+sievename).getConstructor().newInstance(); + DeterministicCorefSieve sieve = (DeterministicCorefSieve) Class.forName("edu.stanford.nlp.coref.sieve."+sievename).getConstructor().newInstance(); sieve.props = props; - sieve.lang = HybridCorefProperties.getLanguage(props); + sieve.lang = CorefProperties.getLanguage(props); return sieve; - + case RF: - log.info("Loading sieve: " + sievename + " from " + HybridCorefProperties.getPathModel(props, sievename) + " ... "); - RFSieve rfsieve = IOUtils.readObjectFromURLOrClasspathOrFileSystem(HybridCorefProperties.getPathModel(props, sievename)); - rfsieve.thresMerge = HybridCorefProperties.getMergeThreshold(props, sievename); + log.info("Loading sieve: " + sievename + " from " + CorefProperties.getPathModel(props, sievename) + " ... "); + RFSieve rfsieve = IOUtils.readObjectFromURLOrClasspathOrFileSystem(CorefProperties.getPathModel(props, sievename)); + rfsieve.thresMerge = CorefProperties.getMergeThreshold(props, sievename); log.info("done. Merging threshold: " + rfsieve.thresMerge); return rfsieve; - + case ORACLE: OracleSieve oracleSieve = new OracleSieve(props, sievename); oracleSieve.props = props; return oracleSieve; - + default: throw new RuntimeException("no sieve type specified"); } } - - + + public static List loadSieves(Properties props) throws Exception { List sieves = new ArrayList<>(); - String sieveProp = HybridCorefProperties.getSieves(props); - String currentSieveForTrain = HybridCorefProperties.getCurrentSieveForTrain(props); - String[] sievenames = (currentSieveForTrain==null)? + String sieveProp = CorefProperties.getSieves(props); + String currentSieveForTrain = CorefProperties.getCurrentSieveForTrain(props); + String[] sievenames = (currentSieveForTrain==null)? 
sieveProp.trim().split(",\\s*") : sieveProp.split(currentSieveForTrain)[0].trim().split(",\\s*"); for(String sievename : sievenames) { Sieve sieve = loadSieve(props, sievename); @@ -149,35 +150,35 @@ public static boolean hasThat(List words) { for(CoreLabel cl : words) { if(cl.word().equalsIgnoreCase("that") && cl.tag().equalsIgnoreCase("IN")) { return true; - } - } + } + } return false; } public static boolean hasToVerb(List words) { for(int i=0 ; i types) { @@ -215,12 +216,12 @@ protected static boolean matchedMentionType(Mention m, String type) { if(type.equalsIgnoreCase("it") && m.isPronominal() && m.person == Person.IT) return true; if(type.equalsIgnoreCase("they") && m.isPronominal() && m.person == Person.THEY) return true; if(type.equalsIgnoreCase("we") && m.isPronominal() && m.person == Person.WE) return true; - + // check named entity type if(type.toLowerCase().startsWith("ne:")) { if(type.toLowerCase().substring(3).startsWith(m.nerString.toLowerCase().substring(0, Math.min(3, m.nerString.length())))) return true; } - + return false; } @@ -234,12 +235,12 @@ public static List getOrderedAntecedents( // ordering antecedents if (antecedentSentence == m.sentNum) { // same sentence orderedAntecedents.addAll(orderedMentionsBySentence.get(m.sentNum).subList(0, mPosition)); - + if(dict.relativePronouns.contains(m.spanToString())) Collections.reverse(orderedAntecedents); else { orderedAntecedents = sortMentionsByClause(orderedAntecedents, m); } - + } else { // previous sentence orderedAntecedents.addAll(orderedMentionsBySentence.get(antecedentSentence)); } diff --git a/src/edu/stanford/nlp/coref/hybrid/sieve/SpeakerMatch.java b/src/edu/stanford/nlp/coref/sieve/SpeakerMatch.java similarity index 75% rename from src/edu/stanford/nlp/coref/hybrid/sieve/SpeakerMatch.java rename to src/edu/stanford/nlp/coref/sieve/SpeakerMatch.java index 71de772bb0..bd27157c2d 100644 --- a/src/edu/stanford/nlp/coref/hybrid/sieve/SpeakerMatch.java +++ b/src/edu/stanford/nlp/coref/sieve/SpeakerMatch.java @@ -1,4 +1,4 @@ -package edu.stanford.nlp.coref.hybrid.sieve; +package edu.stanford.nlp.coref.sieve; public class SpeakerMatch extends DeterministicCorefSieve { public SpeakerMatch() { diff --git a/src/edu/stanford/nlp/coref/hybrid/sieve/StrictHeadMatch1.java b/src/edu/stanford/nlp/coref/sieve/StrictHeadMatch1.java similarity index 85% rename from src/edu/stanford/nlp/coref/hybrid/sieve/StrictHeadMatch1.java rename to src/edu/stanford/nlp/coref/sieve/StrictHeadMatch1.java index 55e5bec326..223d1a77a6 100644 --- a/src/edu/stanford/nlp/coref/hybrid/sieve/StrictHeadMatch1.java +++ b/src/edu/stanford/nlp/coref/sieve/StrictHeadMatch1.java @@ -1,4 +1,4 @@ -package edu.stanford.nlp.coref.hybrid.sieve; +package edu.stanford.nlp.coref.sieve; public class StrictHeadMatch1 extends DeterministicCorefSieve { public StrictHeadMatch1() { diff --git a/src/edu/stanford/nlp/coref/hybrid/sieve/StrictHeadMatch2.java b/src/edu/stanford/nlp/coref/sieve/StrictHeadMatch2.java similarity index 83% rename from src/edu/stanford/nlp/coref/hybrid/sieve/StrictHeadMatch2.java rename to src/edu/stanford/nlp/coref/sieve/StrictHeadMatch2.java index 56f2e1fa5a..b6a809b4d6 100644 --- a/src/edu/stanford/nlp/coref/hybrid/sieve/StrictHeadMatch2.java +++ b/src/edu/stanford/nlp/coref/sieve/StrictHeadMatch2.java @@ -1,4 +1,4 @@ -package edu.stanford.nlp.coref.hybrid.sieve; +package edu.stanford.nlp.coref.sieve; public class StrictHeadMatch2 extends DeterministicCorefSieve { public StrictHeadMatch2() { diff --git 
a/src/edu/stanford/nlp/coref/hybrid/sieve/StrictHeadMatch3.java b/src/edu/stanford/nlp/coref/sieve/StrictHeadMatch3.java similarity index 83% rename from src/edu/stanford/nlp/coref/hybrid/sieve/StrictHeadMatch3.java rename to src/edu/stanford/nlp/coref/sieve/StrictHeadMatch3.java index 2ce4ba0fc4..e6caeb938d 100644 --- a/src/edu/stanford/nlp/coref/hybrid/sieve/StrictHeadMatch3.java +++ b/src/edu/stanford/nlp/coref/sieve/StrictHeadMatch3.java @@ -1,4 +1,4 @@ -package edu.stanford.nlp.coref.hybrid.sieve; +package edu.stanford.nlp.coref.sieve; public class StrictHeadMatch3 extends DeterministicCorefSieve { public StrictHeadMatch3() { diff --git a/src/edu/stanford/nlp/coref/hybrid/sieve/StrictHeadMatch4.java b/src/edu/stanford/nlp/coref/sieve/StrictHeadMatch4.java similarity index 88% rename from src/edu/stanford/nlp/coref/hybrid/sieve/StrictHeadMatch4.java rename to src/edu/stanford/nlp/coref/sieve/StrictHeadMatch4.java index 2f4d0633d6..7ecfb39e52 100644 --- a/src/edu/stanford/nlp/coref/hybrid/sieve/StrictHeadMatch4.java +++ b/src/edu/stanford/nlp/coref/sieve/StrictHeadMatch4.java @@ -1,4 +1,4 @@ -package edu.stanford.nlp.coref.hybrid.sieve; +package edu.stanford.nlp.coref.sieve; public class StrictHeadMatch4 extends DeterministicCorefSieve { public StrictHeadMatch4() { diff --git a/src/edu/stanford/nlp/coref/statistical/StatisticalCorefAlgorithm.java b/src/edu/stanford/nlp/coref/statistical/BestFirstCorefSystem.java similarity index 65% rename from src/edu/stanford/nlp/coref/statistical/StatisticalCorefAlgorithm.java rename to src/edu/stanford/nlp/coref/statistical/BestFirstCorefSystem.java index b5bdb142c5..804cea4333 100644 --- a/src/edu/stanford/nlp/coref/statistical/StatisticalCorefAlgorithm.java +++ b/src/edu/stanford/nlp/coref/statistical/BestFirstCorefSystem.java @@ -9,10 +9,6 @@ import java.util.Properties; import java.util.Set; -import edu.stanford.nlp.coref.CorefAlgorithm; -import edu.stanford.nlp.coref.CorefProperties; -import edu.stanford.nlp.coref.CorefUtils; -import edu.stanford.nlp.coref.data.Dictionaries; import edu.stanford.nlp.coref.data.Dictionaries.MentionType; import edu.stanford.nlp.coref.data.Document; import edu.stanford.nlp.coref.data.Mention; @@ -23,12 +19,7 @@ import edu.stanford.nlp.util.Pair; import edu.stanford.nlp.util.RuntimeInterruptedException; -/** - * Does best-first coreference resolution by linking each mention to its highest scoring candidate - * antecedent if that score is above a threshold. 
- * @author Kevin Clark - */ -public class StatisticalCorefAlgorithm implements CorefAlgorithm { +public class BestFirstCorefSystem extends StatisticalCorefSystem { private final Map, Double> thresholds; private final FeatureExtractor extractor; @@ -36,26 +27,15 @@ public class StatisticalCorefAlgorithm implements CorefAlgorithm { private final int maxMentionDistance; private final int maxMentionDistanceWithStringMatch; - public StatisticalCorefAlgorithm(Properties props, Dictionaries dictionaries) { - this(props, dictionaries, - StatisticalCorefProperties.wordCountsPath(props), - StatisticalCorefProperties.rankingModelPath(props), - CorefProperties.maxMentionDistance(props), - CorefProperties.maxMentionDistanceWithStringMatch(props), - StatisticalCorefProperties.pairwiseScoreThresholds(props)); + public BestFirstCorefSystem(Properties props, String wordCountsFile, String modelFile, + int maxMentionDistance, int maxMentionDistanceWithStringMatch, double threshold) { + this(props, wordCountsFile, modelFile, maxMentionDistance, maxMentionDistanceWithStringMatch, + new double[] {threshold, threshold, threshold, threshold}); } - public StatisticalCorefAlgorithm(Properties props, Dictionaries dictionaries, String wordCountsFile, - String modelFile, int maxMentionDistance, int maxMentionDistanceWithStringMatch, - double threshold) { - this(props, dictionaries, wordCountsFile, modelFile, maxMentionDistance, - maxMentionDistanceWithStringMatch, new double[] {threshold, threshold, threshold, - threshold}); - } - - public StatisticalCorefAlgorithm(Properties props, Dictionaries dictionaries, String wordCountsFile, - String modelPath, int maxMentionDistance, int maxMentionDistanceWithStringMatch, - double[] thresholds) { + public BestFirstCorefSystem(Properties props, String wordCountsFile, String modelPath, + int maxMentionDistance, int maxMentionDistanceWithStringMatch, double[] thresholds) { + super(props); extractor = new FeatureExtractor(props, dictionaries, null, wordCountsFile); classifier = PairwiseModel.newBuilder("classifier", MetaFeatureExtractor.newBuilder().build()).modelPath(modelPath).build(); @@ -76,16 +56,42 @@ private static Map, Double> makeThresholds(double[] thres @Override public void runCoref(Document document) { Compressor compressor = new Compressor<>(); + List sortedMentions = StatisticalCorefUtils.getSortedMentions(document); if (Thread.interrupted()) { // Allow interrupting throw new RuntimeInterruptedException(); } + for (int i = 0; i < sortedMentions.size(); i++) { + sortedMentions.get(i).mentionNum = i; + } - Map, Boolean> pairs = new HashMap<>(); - for (Map.Entry> e: CorefUtils.heuristicFilter( - CorefUtils.getSortedMentions(document), - maxMentionDistance, maxMentionDistanceWithStringMatch).entrySet()) { - for (int m1 : e.getValue()) { - pairs.put(new Pair<>(m1, e.getKey()), true); + Map, Boolean> pairs = + StatisticalCorefUtils.getUnlabeledMentionPairs(document, maxMentionDistance); + if (maxMentionDistance != Integer.MAX_VALUE) { + Map> wordToMentions = new HashMap<>(); + for (Mention m : document.predictedMentionsByID.values()) { + if (Thread.interrupted()) { // Allow interrupting + throw new RuntimeInterruptedException(); + } + for (String word : getContentWords(m)) { + wordToMentions.putIfAbsent(word, new ArrayList<>()); + wordToMentions.get(word).add(m); + } + } + for (Mention m1 : document.predictedMentionsByID.values()) { + if (Thread.interrupted()) { // Allow interrupting + throw new RuntimeInterruptedException(); + } + for (String word : getContentWords(m1)) 
{ + List<Mention> ms = wordToMentions.get(word); + if (ms != null) { + for (Mention m2 : ms) { + if (m1.mentionNum < m2.mentionNum + && m1.mentionNum >= m2.mentionNum - maxMentionDistanceWithStringMatch) { + pairs.put(new Pair<>(m1.mentionID, m2.mentionID), false); + } + } + } + } } } @@ -118,7 +124,7 @@ public void runCoref(Document document) { MentionType mt2 = document.predictedMentionsByID.get(pair.second).mentionType; if (pairwiseScores.getCount(pair) > thresholds.get(new Pair<>(mt1 == MentionType.PRONOMINAL, mt2 == MentionType.PRONOMINAL))) { - CorefUtils.mergeCoreferenceClusters(pair, document); + StatisticalCorefUtils.mergeCoreferenceClusters(pair, document); } } } @@ -134,4 +140,5 @@ private static List<String> getContentWords(Mention m) { } return words; } + } diff --git a/src/edu/stanford/nlp/coref/statistical/Clusterer.java b/src/edu/stanford/nlp/coref/statistical/Clusterer.java index 102e77f702..0e07aa2579 100644 --- a/src/edu/stanford/nlp/coref/statistical/Clusterer.java +++ b/src/edu/stanford/nlp/coref/statistical/Clusterer.java @@ -9,21 +9,13 @@ import java.util.Map; import java.util.Random; -import edu.stanford.nlp.coref.statistical.ClustererDataLoader.ClustererDoc; -import edu.stanford.nlp.coref.statistical.EvalUtils.B3Evaluator; -import edu.stanford.nlp.coref.statistical.EvalUtils.Evaluator; - import edu.stanford.nlp.io.IOUtils; +import edu.stanford.nlp.coref.statistical.ClustererDataLoader.ClustererDoc; import edu.stanford.nlp.stats.ClassicCounter; import edu.stanford.nlp.stats.Counter; import edu.stanford.nlp.util.Pair; import edu.stanford.nlp.util.logging.Redwood; -/** - * System for building up coreference clusters incrementally, merging a pair of clusters each step. - * Trained with a variant of the SEARN imitation learning algorithm. - * @author Kevin Clark - */ public class Clusterer { private static final boolean USE_CLASSIFICATION = true; private static final boolean USE_RANKING = true; @@ -91,7 +83,7 @@ public void doTraining(String modelName) { List<ClustererDoc> trainDocs; try { PrintWriter configWriter = new PrintWriter(outputPath + "config", "UTF-8"); - configWriter.print(StatisticalCorefTrainer.fieldValues(this)); + configWriter.print(StatisticalCorefUtils.fieldValues(this)); configWriter.close(); progressWriter = new PrintWriter(outputPath + "progress", "UTF-8"); @@ -189,7 +181,7 @@ private void trainPolicy(List<List<Pair<CandidateAction, CandidateAction>>> exam private double evaluatePolicy(List<ClustererDoc> docs, boolean training) { isTraining = 0; - B3Evaluator evaluator = new B3Evaluator(); + EvalUtils.B3Evaluator evaluator = new EvalUtils.B3Evaluator(); for (ClustererDoc doc : docs) { State currentState = new State(doc); while (!currentState.isComplete()) { @@ -425,7 +417,7 @@ public double getFinalCost(ClustererClassifier classifier) { return cost; } - public void updateEvaluator(Evaluator evaluator) { + public void updateEvaluator(EvalUtils.Evaluator evaluator) { evaluator.update(doc.goldClusters, clusters, doc.mentionToGold, mentionToCluster); } diff --git a/src/edu/stanford/nlp/coref/statistical/ClustererDataLoader.java b/src/edu/stanford/nlp/coref/statistical/ClustererDataLoader.java index 765fb775c7..fad453905a 100644 --- a/src/edu/stanford/nlp/coref/statistical/ClustererDataLoader.java +++ b/src/edu/stanford/nlp/coref/statistical/ClustererDataLoader.java @@ -14,10 +14,6 @@ import edu.stanford.nlp.stats.Counter; import edu.stanford.nlp.util.Pair; -/** - * Loads the data used to train {@link Clusterer}. 
- * @author Kevin Clark - */ public class ClustererDataLoader { public static class ClustererDoc { public final int id; diff --git a/src/edu/stanford/nlp/coref/statistical/ClusteringCorefAlgorithm.java b/src/edu/stanford/nlp/coref/statistical/ClusteringCorefSystem.java similarity index 66% rename from src/edu/stanford/nlp/coref/statistical/ClusteringCorefAlgorithm.java rename to src/edu/stanford/nlp/coref/statistical/ClusteringCorefSystem.java index 8f76c0bf27..f6414b083d 100644 --- a/src/edu/stanford/nlp/coref/statistical/ClusteringCorefAlgorithm.java +++ b/src/edu/stanford/nlp/coref/statistical/ClusteringCorefSystem.java @@ -4,38 +4,23 @@ import java.util.Properties; import java.util.stream.Collectors; -import edu.stanford.nlp.coref.CorefAlgorithm; -import edu.stanford.nlp.coref.CorefUtils; -import edu.stanford.nlp.coref.data.Dictionaries; import edu.stanford.nlp.coref.data.Document; -import edu.stanford.nlp.coref.statistical.ClustererDataLoader.ClustererDoc; import edu.stanford.nlp.stats.ClassicCounter; import edu.stanford.nlp.stats.Counter; import edu.stanford.nlp.util.Pair; +import edu.stanford.nlp.util.RuntimeInterruptedException; -/** - * Builds up coreference clusters incrementally with agglomerative clustering. - * @author Kevin Clark - */ -public class ClusteringCorefAlgorithm implements CorefAlgorithm { +// TODO: some serializable model class +public class ClusteringCorefSystem extends StatisticalCorefSystem { private final Clusterer clusterer; private final PairwiseModel classificationModel; private final PairwiseModel rankingModel; private final PairwiseModel anaphoricityModel; private final FeatureExtractor extractor; - public ClusteringCorefAlgorithm(Properties props, Dictionaries dictionaries) { - this(props, dictionaries, - StatisticalCorefProperties.clusteringModelPath(props), - StatisticalCorefProperties.classificationModelPath(props), - StatisticalCorefProperties.rankingModelPath(props), - StatisticalCorefProperties.anaphoricityModelPath(props), - StatisticalCorefProperties.wordCountsPath(props)); - } - - public ClusteringCorefAlgorithm(Properties props, Dictionaries dictionaries, String clusteringPath, - String classificationPath, String rankingPath, String anaphoricityPath, - String wordCountsPath) { + public ClusteringCorefSystem(Properties props, String clusteringPath, String classificationPath, + String rankingPath, String anaphoricityPath, String wordCountsPath) { + super(props); clusterer = new Clusterer(clusteringPath); classificationModel = PairwiseModel.newBuilder("classification", MetaFeatureExtractor.newBuilder().build()) @@ -52,8 +37,9 @@ public ClusteringCorefAlgorithm(Properties props, Dictionaries dictionaries, Str @Override public void runCoref(Document document) { Map, Boolean> mentionPairs = - CorefUtils.getUnlabeledMentionPairs(document); - if (mentionPairs.size() == 0) { + StatisticalCorefUtils.getUnlabeledMentionPairs(document); + // when the mention count is 0 or 1, just return since there is no coref work to be done + if (mentionPairs.keySet().size() == 0) { return; } Compressor compressor = new Compressor<>(); @@ -63,7 +49,9 @@ public void runCoref(Document document) { Counter> rankingScores = new ClassicCounter<>(); Counter anaphoricityScores = new ClassicCounter<>(); for (Example example : examples.examples) { - CorefUtils.checkForInterrupt(); + if (Thread.interrupted()) { // Allow interrupting + throw new RuntimeInterruptedException(); + } Pair mentionPair = new Pair<>(example.mentionId1, example.mentionId2); 
classificationScores.incrementCount(mentionPair, classificationModel @@ -76,11 +64,14 @@ public void runCoref(Document document) { } } - ClustererDoc doc = new ClustererDoc(0, classificationScores, rankingScores, anaphoricityScores, + ClustererDataLoader.ClustererDoc doc = new ClustererDataLoader.ClustererDoc(0, classificationScores, rankingScores, anaphoricityScores, mentionPairs, null, document.predictedMentionsByID.entrySet().stream().collect( Collectors.toMap(Map.Entry::getKey, e -> e.getValue().mentionType.toString()))); for (Pair mentionPair : clusterer.getClusterMerges(doc)) { - CorefUtils.mergeCoreferenceClusters(mentionPair, document); + if (Thread.interrupted()) { // Allow interrupting + throw new RuntimeInterruptedException(); + } + StatisticalCorefUtils.mergeCoreferenceClusters(mentionPair, document); } } } diff --git a/src/edu/stanford/nlp/coref/statistical/CompressedFeatureVector.java b/src/edu/stanford/nlp/coref/statistical/CompressedFeatureVector.java index 5dda512e87..69ed0ffaf9 100644 --- a/src/edu/stanford/nlp/coref/statistical/CompressedFeatureVector.java +++ b/src/edu/stanford/nlp/coref/statistical/CompressedFeatureVector.java @@ -3,10 +3,6 @@ import java.io.Serializable; import java.util.List; -/** - * A low-memory representation of a {@link Counter} created by a {@link Compressor}. - * @author Kevin Clark - */ public class CompressedFeatureVector implements Serializable { private static final long serialVersionUID = -8889507443653366753L; public final List keys; diff --git a/src/edu/stanford/nlp/coref/statistical/Compressor.java b/src/edu/stanford/nlp/coref/statistical/Compressor.java index 7499e9a82d..a42ec24d50 100644 --- a/src/edu/stanford/nlp/coref/statistical/Compressor.java +++ b/src/edu/stanford/nlp/coref/statistical/Compressor.java @@ -9,11 +9,6 @@ import edu.stanford.nlp.stats.ClassicCounter; import edu.stanford.nlp.stats.Counter; -/** - * Converts a Counter to a {@link CompressedFeatureVector} (i.e., parallel lists of integer - * keys and double values), which takes up much less memory. - * @author Kevin Clark - */ public class Compressor implements Serializable { private static final long serialVersionUID = 364548642855692442L; private final Map index; diff --git a/src/edu/stanford/nlp/coref/statistical/DatasetBuilder.java b/src/edu/stanford/nlp/coref/statistical/DatasetBuilder.java index 304fb69d61..2b1bdf166c 100644 --- a/src/edu/stanford/nlp/coref/statistical/DatasetBuilder.java +++ b/src/edu/stanford/nlp/coref/statistical/DatasetBuilder.java @@ -8,17 +8,13 @@ import java.util.Random; import java.util.stream.Collectors; -import edu.stanford.nlp.coref.CorefDocumentProcessor; -import edu.stanford.nlp.coref.CorefUtils; +import edu.stanford.nlp.coref.data.CorefCluster; import edu.stanford.nlp.coref.data.Document; +import edu.stanford.nlp.coref.data.Mention; import edu.stanford.nlp.io.IOUtils; import edu.stanford.nlp.util.Pair; -/** - * Produces train/dev/test sets for training coreference models with (optionally) sampling. 
- * @author Kevin Clark - */ -public class DatasetBuilder implements CorefDocumentProcessor { +public class DatasetBuilder implements DocumentProcessor { private final int maxExamplesPerDocument; private final double minClassImbalancedPerDocument; private final Map, Boolean>> mentionPairs; @@ -38,7 +34,19 @@ public DatasetBuilder(double minClassImbalancedPerDocument, int maxExamplesPerDo @Override public void process(int id, Document document) { Map, Boolean> labeledPairs = - CorefUtils.getLabeledMentionPairs(document); + StatisticalCorefUtils.getUnlabeledMentionPairs(document); + for (CorefCluster c : document.goldCorefClusters.values()) { + List clusterMentions = new ArrayList<>(c.getCorefMentions()); + for (int i = 0; i < clusterMentions.size(); i++) { + for (Mention clusterMention : clusterMentions) { + Pair mentionPair = new Pair<>( + clusterMentions.get(i).mentionID, clusterMention.mentionID); + if (labeledPairs.containsKey(mentionPair)) { + labeledPairs.put(mentionPair, true); + } + } + } + } long numP = labeledPairs.keySet().stream().filter(m -> labeledPairs.get(m)).count(); List> negative = labeledPairs.keySet().stream() diff --git a/src/edu/stanford/nlp/coref/statistical/DocumentExamples.java b/src/edu/stanford/nlp/coref/statistical/DocumentExamples.java index 30eee4928a..30ffd91e39 100644 --- a/src/edu/stanford/nlp/coref/statistical/DocumentExamples.java +++ b/src/edu/stanford/nlp/coref/statistical/DocumentExamples.java @@ -4,11 +4,6 @@ import java.util.List; import java.util.Map; -/** - * Represents all coreference examples for a particular document. Individual mention features are - * stored separately from pairwise features to save memory. - * @author Kevin Clark - */ public class DocumentExamples implements Serializable { private static final long serialVersionUID = -2474306699767791493L; diff --git a/src/edu/stanford/nlp/coref/statistical/DocumentProcessor.java b/src/edu/stanford/nlp/coref/statistical/DocumentProcessor.java new file mode 100644 index 0000000000..2ff9f6c8a1 --- /dev/null +++ b/src/edu/stanford/nlp/coref/statistical/DocumentProcessor.java @@ -0,0 +1,47 @@ +package edu.stanford.nlp.coref.statistical; + +import java.util.Properties; + +import edu.stanford.nlp.coref.CorefDocMaker; +import edu.stanford.nlp.coref.data.Dictionaries; +import edu.stanford.nlp.coref.data.Document; +import edu.stanford.nlp.pipeline.StanfordCoreNLP; +import edu.stanford.nlp.util.logging.Redwood; + +public interface DocumentProcessor { + public void process(int id, Document document); + + public void finish() throws Exception; + + public default String getName() { + return this.getClass().getSimpleName(); + } + + public default void run(Properties props, Dictionaries dictionaries) throws Exception { + run(new CorefDocMaker(props, dictionaries)); + } + + public default void runFromScratch(Properties props, Dictionaries dictionaries) + throws Exception { + StanfordCoreNLP.clearAnnotatorPool(); + run(new CorefDocMaker(props, dictionaries)); + } + + public default void run(CorefDocMaker docMaker) throws Exception { + Redwood.hideChannelsEverywhere("debug-mention", "debug-preprocessor", "debug-docreader", + "debug-md"); + int docId = 0; + Document document = docMaker.nextDoc(); + long time = System.currentTimeMillis(); + while (document != null) { + document.extractGoldCorefClusters(); + process(docId, document); + Redwood.log("scoref", "Processed document " + docId + " in " + + (System.currentTimeMillis() - time) / 1000.0 + "s with " + getName()); + time = System.currentTimeMillis(); + 
docId++; + document = docMaker.nextDoc(); + } + finish(); + } +} diff --git a/src/edu/stanford/nlp/coref/statistical/DocumentProcessorRunner.java b/src/edu/stanford/nlp/coref/statistical/DocumentProcessorRunner.java new file mode 100644 index 0000000000..8c601256db --- /dev/null +++ b/src/edu/stanford/nlp/coref/statistical/DocumentProcessorRunner.java @@ -0,0 +1,52 @@ +package edu.stanford.nlp.coref.statistical; + +import java.util.Properties; + +import edu.stanford.nlp.coref.CorefDocMaker; +import edu.stanford.nlp.coref.data.Dictionaries; +import edu.stanford.nlp.coref.data.Document; +import edu.stanford.nlp.util.logging.Redwood; + +public class DocumentProcessorRunner { + private final CorefDocMaker docMaker; + private final DocumentProcessor processor; + private final int maxDocs; + + public DocumentProcessorRunner(Properties props, Dictionaries dictionaries, + DocumentProcessor processor) { + this(props, dictionaries, processor, Integer.MAX_VALUE); + } + + public DocumentProcessorRunner(Properties props, Dictionaries dictionaries, + DocumentProcessor processor, int maxDocs) { + Redwood.hideChannelsEverywhere("debug-mention", "debug-preprocessor", "debug-docreader", + "debug-md"); + try { + docMaker = new CorefDocMaker(props, dictionaries); + } catch (Exception e) { + throw new RuntimeException("Error initializing coref system", e); + } + this.processor = processor; + this.maxDocs = maxDocs; + } + + public void run() throws Exception { + Document document = docMaker.nextDoc(); + int docId = 0; + long time = System.currentTimeMillis(); + while (document != null) { + if (docId >= maxDocs) { + break; + } + document.extractGoldCorefClusters(); + processor.process(docId, document); + Redwood.log("scoref", "Processed document " + docId + " in " + + (System.currentTimeMillis() - time) / 1000.0 + "s with " + + processor.getClass().getSimpleName()); + time = System.currentTimeMillis(); + document = docMaker.nextDoc(); + docId++; + } + processor.finish(); + } +} diff --git a/src/edu/stanford/nlp/coref/statistical/EvalUtils.java b/src/edu/stanford/nlp/coref/statistical/EvalUtils.java index d36a17f31e..134f9c860a 100644 --- a/src/edu/stanford/nlp/coref/statistical/EvalUtils.java +++ b/src/edu/stanford/nlp/coref/statistical/EvalUtils.java @@ -7,15 +7,10 @@ import java.util.stream.Collectors; import edu.stanford.nlp.coref.statistical.Clusterer.Cluster; - import edu.stanford.nlp.stats.ClassicCounter; import edu.stanford.nlp.stats.Counter; import edu.stanford.nlp.util.Pair; -/** - * Utility classes for computing the B^3 and MUC coreference metrics - * @author Kevin Clark - */ public class EvalUtils { public static double getCombinedF1(double mucWeight, List> gold, diff --git a/src/edu/stanford/nlp/coref/statistical/Example.java b/src/edu/stanford/nlp/coref/statistical/Example.java index 06829ec1bd..b19c0d2c80 100644 --- a/src/edu/stanford/nlp/coref/statistical/Example.java +++ b/src/edu/stanford/nlp/coref/statistical/Example.java @@ -2,17 +2,14 @@ import java.io.Serializable; -import edu.stanford.nlp.coref.data.Mention; import edu.stanford.nlp.coref.data.Dictionaries.MentionType; -/** - * A representation of a mention-pair for training coreference models. 
- * @author Kevin Clark - */ +import edu.stanford.nlp.coref.data.Mention; + public class Example implements Serializable { private static final long serialVersionUID = 1104263558466004590L; public final int docId; public final double label; - public final CompressedFeatureVector pairwiseFeatures; + public final CompressedFeatureVector features; public final int mentionId1; public final int mentionId2; @@ -20,10 +17,10 @@ public class Example implements Serializable { public final MentionType mentionType2; public Example(int docId, Mention m1, Mention m2, double label, - CompressedFeatureVector pairwiseFeatures) { + CompressedFeatureVector features) { this.docId = docId; this.label = label; - this.pairwiseFeatures = pairwiseFeatures; + this.features = features; this.mentionId1 = m1.mentionID; this.mentionId2 = m2.mentionID; @@ -34,7 +31,7 @@ public Example(int docId, Mention m1, Mention m2, double label, public Example(Example pair, boolean isPositive) { this.docId = pair.docId; this.label = isPositive ? 1 : 0; - this.pairwiseFeatures = null; + this.features = null; this.mentionId1 = -1; this.mentionId2 = pair.mentionId2; @@ -43,6 +40,6 @@ public Example(Example pair, boolean isPositive) { } public boolean isNewLink() { - return pairwiseFeatures == null; + return features == null; } } diff --git a/src/edu/stanford/nlp/coref/statistical/FeatureExtractor.java b/src/edu/stanford/nlp/coref/statistical/FeatureExtractor.java index 04247ef385..45d2342fa8 100644 --- a/src/edu/stanford/nlp/coref/statistical/FeatureExtractor.java +++ b/src/edu/stanford/nlp/coref/statistical/FeatureExtractor.java @@ -10,13 +10,11 @@ import java.util.Random; import java.util.Set; -import edu.stanford.nlp.coref.CorefProperties; import edu.stanford.nlp.coref.CorefRules; -import edu.stanford.nlp.coref.CorefUtils; import edu.stanford.nlp.coref.data.CorefCluster; import edu.stanford.nlp.coref.data.Dictionaries; import edu.stanford.nlp.coref.data.Dictionaries.MentionType; -import edu.stanford.nlp.coref.data.Dictionaries.Number; +import edu.stanford.nlp.coref.data.Dictionaries.Number; import edu.stanford.nlp.coref.data.Dictionaries.Person; import edu.stanford.nlp.coref.data.Document; import edu.stanford.nlp.coref.data.Document.DocType; @@ -34,10 +32,6 @@ import edu.stanford.nlp.util.Pair; import edu.stanford.nlp.util.StringUtils; -/** - * A class for featurizing mention pairs and individual mentions. 
- * @author Kevin Clark - */ public class FeatureExtractor { private static int MIN_WORD_COUNT = 20; private static int BIN_EXACT = 10; @@ -78,8 +72,8 @@ public FeatureExtractor(Properties props, Dictionaries dictionaries, this.dictionaries = dictionaries; this.compressor = compressor; this.vocabulary = vocabulary; - this.useDocSource = CorefProperties.conll(props); - this.useConstituencyParse = CorefProperties.useConstituencyParse(props); + this.useDocSource = StatisticalCorefProperties.conll(props); + this.useConstituencyParse = StatisticalCorefProperties.useConstituencyParse(props); } private static Set loadVocabulary(String wordCountsPath) { @@ -104,10 +98,11 @@ public DocumentExamples extract(int id, Document document, public DocumentExamples extract(int id, Document document, Map, Boolean> labeledPairs, Compressor compressor) { - List mentionsList = CorefUtils.getSortedMentions(document); + List mentionsList = StatisticalCorefUtils.getSortedMentions(document); Map> mentionsByHeadIndex = new HashMap<>(); for (int i = 0; i < mentionsList.size(); i++) { Mention m = mentionsList.get(i); + m.mentionNum = i; List withIndex = mentionsByHeadIndex.get(m.headIndex); if (withIndex == null) { withIndex = new ArrayList<>(); @@ -313,7 +308,7 @@ private Counter getFeatures(Document doc, Mention m1, Mention m2) { addFeature(features, "heads-agree", m2.headsAgree(m1)); addFeature(features, "exact-match", m1.toString().trim().toLowerCase().equals( m2.toString().trim().toLowerCase())); - addFeature(features, "partial-match", relaxedStringMatch(m1, m2)); + addFeature(features, "partial-match", partialMatch(m1, m2)); double editDistance = StringUtils.editDistance(m1.spanToString(), m2.spanToString()) / (double) (m1.spanToString().length() + m2.spanToString().length()); @@ -414,7 +409,7 @@ private Counter getFeatures(Document doc, Mention m1, Mention m2) { addFeature(features, "entity-attributes-agree", CorefRules.entityAttributesAgree(c2, c1)); addFeature(features, "entity-token-distance", CorefRules.entityTokenDistance(m2, m1)); addFeature(features, "i-within-i", CorefRules.entityIWithinI(m2, m1, dictionaries)); - addFeature(features, "exact-string-match", CorefRules.entityExactStringMatch(c2, c1,dictionaries, doc.roleSet)); + addFeature(features, "exact-string-match", CorefRules.entityExactStringMatch(c2, c1, dictionaries, doc.roleSet)); addFeature(features, "entity-relaxed-heads-agree", CorefRules.entityRelaxedHeadsAgreeBetweenMentions(c2, c1, m2, m1)); addFeature(features, "is-acronym", CorefRules.entityIsAcronym(doc, c2, c1)); @@ -470,7 +465,7 @@ private static void addNumeric(Counter features, String key, int value) features.incrementCount(key, value); } - public static boolean relaxedStringMatch(Mention m1, Mention m2) { + private static boolean partialMatch(Mention m1, Mention m2) { Set propers = getPropers(m1); propers.retainAll(getPropers(m2)); return !propers.isEmpty(); @@ -541,9 +536,25 @@ private static SemanticGraphEdge getDependencyParent(Mention m) { private static SemanticGraphEdge getDependencyParent(Mention m, IndexedWord w) { Iterator iterator = m.enhancedDependency.incomingEdgeIterator(w); - return iterator.hasNext() ? 
iterator.next() : null; + if (iterator.hasNext()) { + return iterator.next(); + } + return null; } + /*private static SemanticGraphEdge getDependencyGrandparent(Mention m) { + SemanticGraphEdge parentEdge = getDependencyParent(m); + if (parentEdge == null) { + return null; + } + IndexedWord parentIndexedWord = parentEdge.getSource(); + Iterator iterator = m.collapsedDependency.incomingEdgeIterator(parentIndexedWord); + if (iterator.hasNext()) { + return iterator.next(); + } + return null; + }*/ + private void addDependencyFeatures(Counter features, String prefix, SemanticGraphEdge e, boolean addWord) { if (e == null) { @@ -670,4 +681,10 @@ private static CoreLabel prevWord(Mention m) { private static CoreLabel prevprevWord(Mention m) { return m.startIndex > 1 ? m.sentenceWords.get(m.startIndex - 2) : null; } + + public static boolean relaxedStringMatch(Mention m1, Mention m2) { + Set propers = getPropers(m1); + propers.retainAll(getPropers(m2)); + return !propers.isEmpty(); + } } diff --git a/src/edu/stanford/nlp/coref/statistical/FeatureExtractorRunner.java b/src/edu/stanford/nlp/coref/statistical/FeatureExtractorRunner.java index 274e060888..ba22d2f285 100644 --- a/src/edu/stanford/nlp/coref/statistical/FeatureExtractorRunner.java +++ b/src/edu/stanford/nlp/coref/statistical/FeatureExtractorRunner.java @@ -5,17 +5,12 @@ import java.util.Map; import java.util.Properties; -import edu.stanford.nlp.coref.CorefDocumentProcessor; import edu.stanford.nlp.coref.data.Dictionaries; import edu.stanford.nlp.coref.data.Document; import edu.stanford.nlp.io.IOUtils; import edu.stanford.nlp.util.Pair; -/** - * Runs feature extraction over coreference documents. - * @author Kevin Clark - */ -public class FeatureExtractorRunner implements CorefDocumentProcessor { +public class FeatureExtractorRunner implements DocumentProcessor { private final FeatureExtractor extractor; private final Compressor compressor; diff --git a/src/edu/stanford/nlp/coref/statistical/MaxMarginMentionRanker.java b/src/edu/stanford/nlp/coref/statistical/MaxMarginMentionRanker.java index f9ae344757..0f0bff2514 100644 --- a/src/edu/stanford/nlp/coref/statistical/MaxMarginMentionRanker.java +++ b/src/edu/stanford/nlp/coref/statistical/MaxMarginMentionRanker.java @@ -3,13 +3,8 @@ import java.util.Map; import edu.stanford.nlp.coref.statistical.SimpleLinearClassifier.Loss; - import edu.stanford.nlp.stats.Counter; -/** - * A max-margin mention-ranking coreference model. - * @author Kevin Clark - */ public class MaxMarginMentionRanker extends PairwiseModel { public enum ErrorType { FN(0), FN_PRON(1), FL(2), WL(3); diff --git a/src/edu/stanford/nlp/coref/statistical/MetaFeatureExtractor.java b/src/edu/stanford/nlp/coref/statistical/MetaFeatureExtractor.java index 6e9f8f383f..180bc532ae 100644 --- a/src/edu/stanford/nlp/coref/statistical/MetaFeatureExtractor.java +++ b/src/edu/stanford/nlp/coref/statistical/MetaFeatureExtractor.java @@ -8,14 +8,10 @@ import java.util.Set; import edu.stanford.nlp.coref.data.Dictionaries.MentionType; - import edu.stanford.nlp.stats.ClassicCounter; import edu.stanford.nlp.stats.Counter; -/** - * Class for filtering out input features and producing feature conjunctions. 
- * @author Kevin Clark - */ + public class MetaFeatureExtractor { public enum PairConjunction {FIRST, LAST, BOTH} @@ -72,7 +68,7 @@ public MetaFeatureExtractor(Builder builder) { disallowedPrefixes = builder.disallowedPrefixes; neTypeConjuntion = builder.useNEType; - str = StatisticalCorefTrainer.fieldValues(builder); + str = StatisticalCorefUtils.fieldValues(builder); } public static MetaFeatureExtractor anaphoricityMFE() { @@ -107,7 +103,7 @@ public Counter getFeatures(Example example, if (!example.isNewLink()) { assert(!anaphoricityClassifier); - pairFeatures = compressor.uncompress(example.pairwiseFeatures); + pairFeatures = compressor.uncompress(example.features); features1 = compressor.uncompress(mentionFeatures.get(example.mentionId1)); } else { features2.incrementCount("bias"); diff --git a/src/edu/stanford/nlp/coref/statistical/MetadataWriter.java b/src/edu/stanford/nlp/coref/statistical/MetadataWriter.java index 10c016dbcc..d2c086bcf5 100644 --- a/src/edu/stanford/nlp/coref/statistical/MetadataWriter.java +++ b/src/edu/stanford/nlp/coref/statistical/MetadataWriter.java @@ -8,7 +8,6 @@ import java.util.Set; import java.util.stream.Collectors; -import edu.stanford.nlp.coref.CorefDocumentProcessor; import edu.stanford.nlp.coref.data.CorefCluster; import edu.stanford.nlp.coref.data.Document; import edu.stanford.nlp.coref.data.Mention; @@ -18,11 +17,7 @@ import edu.stanford.nlp.stats.Counter; import edu.stanford.nlp.util.Pair; -/** - * Writes various pieces of information about coreference documents to disk. - * @author Kevin Clark - */ -public class MetadataWriter implements CorefDocumentProcessor { +public class MetadataWriter implements DocumentProcessor { private final Map> mentionTypes; private final Map>> goldClusters; private final Counter wordCounts; @@ -43,7 +38,28 @@ public MetadataWriter(boolean countWords) { @Override public void process(int id, Document document) { - // Mention types + Map, Boolean> labeledPairs = + StatisticalCorefUtils.getUnlabeledMentionPairs(document); + for (CorefCluster c : document.goldCorefClusters.values()) { + List clusterMentions = new ArrayList<>(c.getCorefMentions()); + for (int i = 0; i < clusterMentions.size(); i++) { + for (Mention clusterMention : clusterMentions) { + Pair mentionPair = new Pair<>( + clusterMentions.get(i).mentionID, clusterMention.mentionID); + if (labeledPairs.containsKey(mentionPair)) { + labeledPairs.put(mentionPair, true); + } + } + } + } + Map, Boolean> savedPairs = mentionPairs.get(id); + for (Map.Entry, Boolean> e: savedPairs.entrySet()) { + Pair pair = e.getKey(); + boolean label = e.getValue(); + assert(pair.first >= 0 && pair.second >= 0); + assert(label == labeledPairs.get(pair)); + } + mentionTypes.put(id, document.predictedMentionsByID.entrySet().stream().collect( Collectors.toMap(Map.Entry::getKey, e -> e.getValue().mentionType.toString()))); diff --git a/src/edu/stanford/nlp/coref/statistical/PairwiseModel.java b/src/edu/stanford/nlp/coref/statistical/PairwiseModel.java index 226cf2b7f1..3c1d0b883f 100644 --- a/src/edu/stanford/nlp/coref/statistical/PairwiseModel.java +++ b/src/edu/stanford/nlp/coref/statistical/PairwiseModel.java @@ -4,15 +4,8 @@ import java.io.PrintWriter; import java.util.Map; -import edu.stanford.nlp.coref.statistical.SimpleLinearClassifier.LearningRateSchedule; -import edu.stanford.nlp.coref.statistical.SimpleLinearClassifier.Loss; - import edu.stanford.nlp.stats.Counter; -/** - * Pairwise mention-classification model. 
- * @author Kevin Clark - */ public class PairwiseModel { public final String name; private final int trainingExamples; @@ -28,10 +21,11 @@ public static class Builder { @SuppressWarnings("unused") // output in config file with reflection private final String source = StatisticalCorefTrainer.extractedFeaturesFile; + //private MetaFeatureExtractor metaAnaphor = null; private int trainingExamples = 100000000; private int epochs = 8; - private Loss loss = SimpleLinearClassifier.log(); - private LearningRateSchedule learningRateSchedule = + private SimpleLinearClassifier.Loss loss = SimpleLinearClassifier.log(); + private SimpleLinearClassifier.LearningRateSchedule learningRateSchedule = SimpleLinearClassifier.adaGrad(0.05, 30.0); private double regularizationStrength = 1e-7; private double singletonRatio = 0.3; @@ -48,11 +42,11 @@ public Builder epochs(int epochs) { this.epochs = epochs; return this; } public Builder singletonRatio(double singletonRatio) { this.singletonRatio = singletonRatio; return this; } - public Builder loss(Loss loss) + public Builder loss(SimpleLinearClassifier.Loss loss) { this.loss = loss; return this; } public Builder regularizationStrength(double regularizationStrength) { this.regularizationStrength = regularizationStrength; return this; } - public Builder learningRateSchedule(LearningRateSchedule learningRateSchedule) + public Builder learningRateSchedule(SimpleLinearClassifier.LearningRateSchedule learningRateSchedule) { this.learningRateSchedule = learningRateSchedule; return this; } public Builder modelPath(String modelFile) { this.modelFile = modelFile; return this; } @@ -69,6 +63,7 @@ public static Builder newBuilder(String name, MetaFeatureExtractor meta) { public PairwiseModel(Builder builder) { name = builder.name; meta = builder.meta; + //metaAnaphor = builder.metaAnaphor; trainingExamples = builder.trainingExamples; epochs = builder.epochs; singletonRatio = builder.singletonRatio; @@ -76,7 +71,7 @@ public PairwiseModel(Builder builder) { builder.regularizationStrength, builder.modelFile == null ? null : ((builder.modelFile.endsWith(".ser") || builder.modelFile.endsWith(".gz")) ? 
builder.modelFile : StatisticalCorefTrainer.pairwiseModelsPath + builder.modelFile + "/model.ser")); - str = StatisticalCorefTrainer.fieldValues(builder); + str = StatisticalCorefUtils.fieldValues(builder); } public String getDefaultOutputPath() { diff --git a/src/edu/stanford/nlp/coref/statistical/PairwiseModelTrainer.java b/src/edu/stanford/nlp/coref/statistical/PairwiseModelTrainer.java index 0fcdbbca85..d2416fe6ab 100644 --- a/src/edu/stanford/nlp/coref/statistical/PairwiseModelTrainer.java +++ b/src/edu/stanford/nlp/coref/statistical/PairwiseModelTrainer.java @@ -8,8 +8,6 @@ import java.util.Map; import java.util.Random; -import edu.stanford.nlp.coref.statistical.MaxMarginMentionRanker.ErrorType; - import edu.stanford.nlp.coref.data.Dictionaries.MentionType; import edu.stanford.nlp.io.IOUtils; import edu.stanford.nlp.stats.ClassicCounter; @@ -17,10 +15,6 @@ import edu.stanford.nlp.util.Pair; import edu.stanford.nlp.util.logging.Redwood; -/** - * Class for training coreference models - * @author Kevin Clark - */ public class PairwiseModelTrainer { public static void trainRanking(PairwiseModel model) throws Exception { Redwood.log("scoref-train", "Reading compression..."); @@ -79,19 +73,19 @@ public static void trainRanking(PairwiseModel model) throws Exception { double maxNegativeScore = -Double.MAX_VALUE; Example maxScoringNegative = null; - ErrorType maxScoringEt = null; + MaxMarginMentionRanker.ErrorType maxScoringEt = null; for (Example e : es) { double score = model.predict(e, doc.mentionFeatures, compressor); if (e.label != 1) { assert(!(noAntecedent && e.isNewLink())); - ErrorType et = ErrorType.WL; + MaxMarginMentionRanker.ErrorType et = MaxMarginMentionRanker.ErrorType.WL; if (noAntecedent && !e.isNewLink()) { - et = ErrorType.FL; + et = MaxMarginMentionRanker.ErrorType.FL; } else if (!noAntecedent && e.isNewLink()) { if (e.mentionType2 == MentionType.PRONOMINAL) { - et = ErrorType.FN_PRON; + et = MaxMarginMentionRanker.ErrorType.FN_PRON; } else { - et = ErrorType.FN; + et = MaxMarginMentionRanker.ErrorType.FN; } } diff --git a/src/edu/stanford/nlp/coref/statistical/SimpleLinearClassifier.java b/src/edu/stanford/nlp/coref/statistical/SimpleLinearClassifier.java index d025e3e0ba..f1e1bec232 100644 --- a/src/edu/stanford/nlp/coref/statistical/SimpleLinearClassifier.java +++ b/src/edu/stanford/nlp/coref/statistical/SimpleLinearClassifier.java @@ -12,12 +12,9 @@ import edu.stanford.nlp.util.Timing; import edu.stanford.nlp.util.logging.Redwood; -/** - * A simple linear classifier trained by SGD with support for several different loss functions - * and learning rate schedules. 
- * @author Kevin Clark - */ +/** @author Kevin Clark */ public class SimpleLinearClassifier { + /** A logger for this class */ private static Redwood.RedwoodChannels log = Redwood.channels(SimpleLinearClassifier.class); private final Loss defaultLoss; @@ -41,8 +38,7 @@ public SimpleLinearClassifier(Loss loss,LearningRateSchedule learningRateSchedul this.weights = Counters.deserializeStringCounter(modelFile); Timing.endDoing("Reading " + modelFile); } else { - this.weights = IOUtils.readObjectAnnouncingTimingFromURLOrClasspathOrFileSystem( - log, "Loading coref model", modelFile); + this.weights = IOUtils.readObjectAnnouncingTimingFromURLOrClasspathOrFileSystem(log, "Loading coref model", modelFile); } } catch (Exception e) { throw new RuntimeException("Error loading weights from " + modelFile, e); } } diff --git a/src/edu/stanford/nlp/coref/statistical/StatisticalCorefProperties.java b/src/edu/stanford/nlp/coref/statistical/StatisticalCorefProperties.java index 208d8ea380..789fa22833 100644 --- a/src/edu/stanford/nlp/coref/statistical/StatisticalCorefProperties.java +++ b/src/edu/stanford/nlp/coref/statistical/StatisticalCorefProperties.java @@ -6,64 +6,124 @@ import edu.stanford.nlp.coref.CorefProperties; import edu.stanford.nlp.util.PropertiesUtils; -/** - * Manages the properties for training and running statistical coreference systems. - * @author Kevin Clark - */ public class StatisticalCorefProperties { + private static final String DEFAULT_MODELS_PATH = "edu/stanford/nlp/models/dcoref/"; + + public static Properties addHcorefProps(Properties props) { + Properties newProps = (Properties) props.clone(); + newProps.setProperty(CorefProperties.USE_SEMANTICS_PROP, "false"); + newProps.setProperty(CorefProperties.GENDER_NUMBER_PROP, + "edu/stanford/nlp/models/dcoref/gender.data.gz"); + newProps.setProperty(CorefProperties.INPUT_TYPE_PROP, "conll"); + if (props.containsKey("coref.scorer")) { + newProps.setProperty(CorefProperties.PATH_SCORER_PROP, props.getProperty("coref.scorer")); + } + + if (conll(props)) { + newProps.setProperty(CorefProperties.PARSER_PROP,props.getProperty(CorefProperties.PARSER_PROP, "true")); + newProps.setProperty(CorefProperties.MD_TYPE_PROP, props.getProperty(CorefProperties.MD_TYPE_PROP, "rule")); + newProps.setProperty("coref.useMarkedDiscourse", "true"); + } else { + String mdPath = PropertiesUtils.getString(newProps, "coref.mentionDetectionModel", + "edu/stanford/nlp/models/coref/md-model.ser"); + //String mdDir = mdPath.substring(0, mdPath.lastIndexOf('/') + 1); + //String mdModelName = mdPath.substring(mdPath.lastIndexOf('/') + 1); + //newProps.setProperty("coref.md.model", mdModelName); + //newProps.setProperty(CorefProperties.PATH_SERIALIZED_PROP, mdDir); + newProps.setProperty(CorefProperties.MD_TYPE_PROP, "dependency"); + newProps.setProperty(CorefProperties.USE_GOLD_POS_PROP, "false"); + newProps.setProperty(CorefProperties.USE_GOLD_NE_PROP, "false"); + newProps.setProperty(CorefProperties.USE_GOLD_PARSES_PROP, "false"); + } + if (props.containsKey("coref.test")) { + newProps.setProperty(CorefProperties.PATH_INPUT_PROP, props.getProperty("coref.test")); + } + + return newProps; + } + + public enum Dataset {TRAIN, DEV, TEST}; + public static void setInput(Properties props, Dataset d) { + props.setProperty(CorefProperties.PATH_INPUT_PROP, d == Dataset.TRAIN + ? props.getProperty("coref.train") : (d == Dataset.DEV ? 
props.getProperty("coref.dev") + : props.getProperty("coref.test"))); + } + + public static boolean conll(Properties props) { + return PropertiesUtils.getBool(props, "coref.conll", false); + } + public static String trainingPath(Properties props) { - return props.getProperty("coref.statistical.trainingPath"); + return props.getProperty("coref.trainingPath"); } - private static String getDefaultModelPath(Properties props, String modelName) { - return "edu/stanford/nlp/models/coref/statistical/" + modelName + - (CorefProperties.conll(props) ? "_conll" : "") + ".ser.gz"; + public static String conllOutputPath(Properties props) { + return props.getProperty("coref.conllOutputPath"); } public static String classificationModelPath(Properties props) { - return PropertiesUtils.getString(props, "coref.statistical.classificationModel", - getDefaultModelPath(props, "classification_model")); + return PropertiesUtils.getString(props, "coref.classificationModel", + "edu/stanford/nlp/models/scoref/classification_model.ser.gz"); } public static String rankingModelPath(Properties props) { - return PropertiesUtils.getString(props, "coref.statistical.rankingModel", - getDefaultModelPath(props, "ranking_model")); + return PropertiesUtils.getString(props, "coref.rankingModel", + "edu/stanford/nlp/models/scoref/ranking_model.ser.gz"); } public static String anaphoricityModelPath(Properties props) { - return PropertiesUtils.getString(props, "coref.statistical.anaphoricityModel", - getDefaultModelPath(props, "anaphoricity_model")); + return PropertiesUtils.getString(props, "coref.anaphoricityModel", + "edu/stanford/nlp/models/scoref/anaphoricity_model.ser.gz"); } public static String clusteringModelPath(Properties props) { - return PropertiesUtils.getString(props, "coref.statistical.clusteringModel", - getDefaultModelPath(props, "clustering_model")); + return PropertiesUtils.getString(props, "coref.clusteringModel", + "edu/stanford/nlp/models/scoref/clustering_model.ser"); } public static String wordCountsPath(Properties props) { - return PropertiesUtils.getString(props, "coref.statistical.wordCounts", - "edu/stanford/nlp/models/coref/statistical/word_counts.ser.gz"); + return PropertiesUtils.getString(props, "coref.wordCounts", + "edu/stanford/nlp/models/scoref/word_counts.ser.gz"); + } + + private static String defaultModelPath(Properties props, String modelName) { + return DEFAULT_MODELS_PATH + modelName + (conll(props) ? 
"_conll" : "" + ".ser"); + } + + public static boolean cluster(Properties props) { + return PropertiesUtils.getBool(props, "coref.doClustering", true); + } + + public static int maxMentionDistance(Properties props) { + return PropertiesUtils.getInt(props, "coref.maxMentionDistance", 50); + } + + public static int maxMentionDistanceWithStringMatch(Properties props) { + return PropertiesUtils.getInt(props, "coref.maxMentionDistanceWithStringMatch", 5000); } public static double[] pairwiseScoreThresholds(Properties props) { - String thresholdsProp = (String) props.get("coref.statistical.pairwiseScoreThresholds"); + String thresholdsProp = (String) props.get("coref.pairwiseScoreThresholds"); if (thresholdsProp != null) { String[] split = thresholdsProp.split(","); if (split.length == 4) { return Arrays.stream(split).mapToDouble(Double::parseDouble).toArray(); } } - double threshold = PropertiesUtils.getDouble( - props, "coref.statistical.pairwiseScoreThresholds", 0.35); + double threshold = PropertiesUtils.getDouble(props, "coref.pairwiseScoreThresholds", 0.35); return new double[] {threshold, threshold, threshold, threshold}; } + public static boolean useConstituencyParse(Properties props) { + boolean defaultValue = conll(props); + return PropertiesUtils.getBool(props, CorefProperties.PARSER_PROP, defaultValue); + } + public static double minClassImbalance(Properties props) { - return PropertiesUtils.getDouble(props, "coref.statistical.minClassImbalance", 0); + return PropertiesUtils.getDouble(props, "coref.minClassImbalance", 0); } public static int minTrainExamplesPerDocument(Properties props) { - return PropertiesUtils.getInt(props, "coref.statistical.minTrainExamplesPerDocument", - Integer.MAX_VALUE); + return PropertiesUtils.getInt(props, "coref.minTrainExamplesPerDocument", Integer.MAX_VALUE); } } diff --git a/src/edu/stanford/nlp/coref/statistical/StatisticalCorefSystem.java b/src/edu/stanford/nlp/coref/statistical/StatisticalCorefSystem.java new file mode 100644 index 0000000000..862db6a229 --- /dev/null +++ b/src/edu/stanford/nlp/coref/statistical/StatisticalCorefSystem.java @@ -0,0 +1,141 @@ +package edu.stanford.nlp.coref.statistical; + +import java.io.FileOutputStream; +import java.io.PrintWriter; +import java.util.Calendar; +import java.util.Map; +import java.util.Properties; +import java.util.logging.Logger; + +import edu.stanford.nlp.coref.CorefCoreAnnotations; +import edu.stanford.nlp.coref.CorefDocMaker; +import edu.stanford.nlp.coref.CorefPrinter; +import edu.stanford.nlp.coref.CorefProperties; +import edu.stanford.nlp.coref.CorefSystem; +import edu.stanford.nlp.coref.Scorer; +import edu.stanford.nlp.coref.data.CorefChain; +import edu.stanford.nlp.coref.data.CorefCluster; +import edu.stanford.nlp.coref.data.Dictionaries; +import edu.stanford.nlp.coref.data.Document; +import edu.stanford.nlp.coref.OldCorefPrinter; +import edu.stanford.nlp.pipeline.Annotation; +import edu.stanford.nlp.util.Generics; +import edu.stanford.nlp.util.RuntimeInterruptedException; +import edu.stanford.nlp.util.StringUtils; + +public abstract class StatisticalCorefSystem { + public final Dictionaries dictionaries; + private final Properties props; + private final CorefDocMaker docMaker; + + public StatisticalCorefSystem(Properties props) { + this.props = StatisticalCorefProperties.addHcorefProps(props); + try { + dictionaries = new Dictionaries(this.props); + docMaker = new CorefDocMaker(this.props, dictionaries); + } catch (Exception e) { + throw new RuntimeException("Error initializing coref 
system", e); + } + } + + public static StatisticalCorefSystem fromProps(Properties props) { + try { + if (StatisticalCorefProperties.cluster(props)) { + return new ClusteringCorefSystem(props, + StatisticalCorefProperties.clusteringModelPath(props), + StatisticalCorefProperties.classificationModelPath(props), + StatisticalCorefProperties.rankingModelPath(props), + StatisticalCorefProperties.anaphoricityModelPath(props), + StatisticalCorefProperties.wordCountsPath(props)); + } else { + return new BestFirstCorefSystem(props, + StatisticalCorefProperties.wordCountsPath(props), + StatisticalCorefProperties.rankingModelPath(props), + StatisticalCorefProperties.maxMentionDistance(props), + StatisticalCorefProperties.maxMentionDistanceWithStringMatch(props), + StatisticalCorefProperties.pairwiseScoreThresholds(props)); + } + } catch (Exception e) { + throw new RuntimeException("Error creating coreference system", e); + } + } + + public void annotate(Annotation ann) { + annotate(ann, true); + } + + public void annotate(Annotation ann, boolean removeSingletonClusters) { + try { + Document document = docMaker.makeDocument(ann); + if (Thread.interrupted()) { // Allow interrupting + throw new RuntimeInterruptedException(); + } + runCoref(document); + if (removeSingletonClusters) { + StatisticalCorefUtils.removeSingletonClusters(document); + } + if (Thread.interrupted()) { // Allow interrupting + throw new RuntimeInterruptedException(); + } + + Map result = Generics.newHashMap(); + for(CorefCluster c : document.corefClusters.values()) { + result.put(c.clusterID, new CorefChain(c, document.positions)); + } + ann.set(CorefCoreAnnotations.CorefChainAnnotation.class, result); + } catch (Exception e) { + throw new RuntimeException("Error annotating document with coref", e); + } + } + + public void runOnConll() throws Exception { + String baseName = StatisticalCorefProperties.conllOutputPath(props) + + Calendar.getInstance().getTime().toString().replaceAll("\\s", "-").replaceAll(":", "-"); + String goldOutput = baseName + ".gold.txt"; + String beforeCorefOutput = baseName + ".predicted.txt"; + String afterCorefOutput = baseName + ".coref.predicted.txt"; + PrintWriter writerGold = new PrintWriter(new FileOutputStream(goldOutput)); + PrintWriter writerBeforeCoref = new PrintWriter(new FileOutputStream(beforeCorefOutput)); + PrintWriter writerAfterCoref = new PrintWriter(new FileOutputStream(afterCorefOutput)); + + (new DocumentProcessor() { + @Override + public void process(int id, Document document) { + writerGold.print(CorefPrinter.printConllOutput(document, true)); + writerBeforeCoref.print(CorefPrinter.printConllOutput(document, false)); + runCoref(document); + StatisticalCorefUtils.removeSingletonClusters(document); + writerAfterCoref.print(CorefPrinter.printConllOutput(document, false, true)); + } + + @Override + public void finish() throws Exception {} + + @Override + public String getName() { + return StatisticalCorefSystem.this.getClass().getSimpleName(); + } + }).run(docMaker); + + Logger logger = Logger.getLogger(CorefSystem.class.getName()); + String summary = Scorer.getEvalSummary(CorefProperties.getPathScorer(props), + goldOutput, beforeCorefOutput); + OldCorefPrinter.printScoreSummary(summary, logger, false); + summary = Scorer.getEvalSummary(CorefProperties.getPathScorer(props), goldOutput, + afterCorefOutput); + OldCorefPrinter.printScoreSummary(summary, logger, true); + OldCorefPrinter.printFinalConllScore(summary); + + writerGold.close(); + writerBeforeCoref.close(); + 
writerAfterCoref.close(); + } + + public abstract void runCoref(Document document); + + public static void main(String[] args) throws Exception { + Properties props = StringUtils.argsToProperties(new String[] {"-props", args[0]}); + StatisticalCorefSystem coref = StatisticalCorefSystem.fromProps(props); + coref.runOnConll(); + } +} diff --git a/src/edu/stanford/nlp/coref/statistical/StatisticalCorefTrainer.java b/src/edu/stanford/nlp/coref/statistical/StatisticalCorefTrainer.java index 36af09eb7a..5bafe4ecf2 100644 --- a/src/edu/stanford/nlp/coref/statistical/StatisticalCorefTrainer.java +++ b/src/edu/stanford/nlp/coref/statistical/StatisticalCorefTrainer.java @@ -1,18 +1,11 @@ package edu.stanford.nlp.coref.statistical; import java.io.File; -import java.lang.reflect.Field; import java.util.Properties; -import edu.stanford.nlp.coref.CorefProperties; -import edu.stanford.nlp.coref.CorefProperties.Dataset; import edu.stanford.nlp.coref.data.Dictionaries; import edu.stanford.nlp.util.StringUtils; -/** - * Main class for training new statistical coreference systems. - * @author Kevin Clark - */ public class StatisticalCorefTrainer { public static final String CLASSIFICATION_MODEL = "classification"; public static final String RANKING_MODEL = "ranking"; @@ -61,20 +54,6 @@ public static void setDataPath(String name) { extractedFeaturesFile = extractedFeaturesPath + "compressed_features.ser"; } - public static String fieldValues(Object o) { - String s = ""; - Field[] fields = o.getClass().getDeclaredFields(); - for (Field field : fields) { - try { - field.setAccessible(true); - s += field.getName() + " = " + field.get(o) + "\n"; - } catch (Exception e) { - throw new RuntimeException("Error getting field value for " + field.getName(), e); - } - } - return s; - } - private static void preprocess(Properties props, Dictionaries dictionaries, boolean isTrainSet) throws Exception { (isTrainSet ? 
new DatasetBuilder(StatisticalCorefProperties.minClassImbalance(props), @@ -85,16 +64,17 @@ private static void preprocess(Properties props, Dictionaries dictionaries, bool } public static void doTraining(Properties props) throws Exception { + props = StatisticalCorefProperties.addHcorefProps(props); setTrainingPath(props); Dictionaries dictionaries = new Dictionaries(props); setDataPath("train"); wordCountsFile = "train/word_counts.ser"; - CorefProperties.setInput(props, Dataset.TRAIN); + StatisticalCorefProperties.setInput(props, StatisticalCorefProperties.Dataset.TRAIN); preprocess(props, dictionaries, true); setDataPath("dev"); - CorefProperties.setInput(props, Dataset.DEV); + StatisticalCorefProperties.setInput(props, StatisticalCorefProperties.Dataset.DEV); preprocess(props, dictionaries, false); setDataPath("train"); diff --git a/src/edu/stanford/nlp/coref/statistical/StatisticalCorefUtils.java b/src/edu/stanford/nlp/coref/statistical/StatisticalCorefUtils.java new file mode 100644 index 0000000000..8ef11e6093 --- /dev/null +++ b/src/edu/stanford/nlp/coref/statistical/StatisticalCorefUtils.java @@ -0,0 +1,75 @@ +package edu.stanford.nlp.coref.statistical; + +import java.lang.reflect.Field; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import edu.stanford.nlp.coref.data.CorefCluster; +import edu.stanford.nlp.coref.data.Document; +import edu.stanford.nlp.coref.data.Mention; +import edu.stanford.nlp.util.Pair; + +public class StatisticalCorefUtils { + public static List<Mention> getSortedMentions(Document document) { + List<Mention> mentions = new ArrayList<>(document.predictedMentionsByID.values()); + Collections.sort(mentions, (m1, m2) -> m1.appearEarlierThan(m2) ? -1 : 1); + return mentions; + } + + public static Map<Pair<Integer, Integer>, Boolean> getUnlabeledMentionPairs(Document document) { + return getUnlabeledMentionPairs(document, Integer.MAX_VALUE); + } + + public static Map<Pair<Integer, Integer>, Boolean> getUnlabeledMentionPairs(Document document, + int maxMentionDistance) { + Map<Pair<Integer, Integer>, Boolean> pairs = new HashMap<>(); + List<Mention> mentions = getSortedMentions(document); + for (int i = 0; i < mentions.size(); i++) { + for (int j = Math.max(0, i - maxMentionDistance); j < i; j++) { + pairs.put(new Pair<>(mentions.get(j).mentionID, mentions.get(i).mentionID), false); + } + } + return pairs; + } + + public static void mergeCoreferenceClusters(Pair<Integer, Integer> mentionPair, + Document document) { + Mention m1 = document.predictedMentionsByID.get(mentionPair.first); + Mention m2 = document.predictedMentionsByID.get(mentionPair.second); + if (m1.corefClusterID == m2.corefClusterID) { + return; + } + + int removeId = m1.corefClusterID; + CorefCluster c1 = document.corefClusters.get(m1.corefClusterID); + CorefCluster c2 = document.corefClusters.get(m2.corefClusterID); + CorefCluster.mergeClusters(c2, c1); + document.corefClusters.remove(removeId); + } + + public static void removeSingletonClusters(Document document) { + for (CorefCluster c : new ArrayList<>(document.corefClusters.values())) { + if (c.getCorefMentions().size() == 1) { + document.corefClusters.remove(c.clusterID); + } + } + } + + public static String fieldValues(Object o) { + String s = ""; + Field[] fields = o.getClass().getDeclaredFields(); + for (Field field : fields) { + try { + field.setAccessible(true); + s += field.getName() + " = " + field.get(o) + "\n"; + } catch (Exception e) { + throw new RuntimeException("Error getting field value for " + field.getName(), e); + } + } + + return s; + } +} diff --git 
a/src/edu/stanford/nlp/coref/statistical/properties/english-conll-training.properties b/src/edu/stanford/nlp/coref/statistical/properties/english-conll-training.properties deleted file mode 100644 index c3e61a800e..0000000000 --- a/src/edu/stanford/nlp/coref/statistical/properties/english-conll-training.properties +++ /dev/null @@ -1,3 +0,0 @@ -coref.conll = true -coref.data = /scr/nlp/data/conll-2012/ -coref.statistica.trainingPath = /scr/nlp/coref/training/ diff --git a/src/edu/stanford/nlp/coref/statistical/properties/english-conll.properties b/src/edu/stanford/nlp/coref/statistical/properties/english-conll.properties deleted file mode 100644 index 7618b59507..0000000000 --- a/src/edu/stanford/nlp/coref/statistical/properties/english-conll.properties +++ /dev/null @@ -1,6 +0,0 @@ -coref.algorithm = clustering -coref.conll = true - -coref.data = /scr/nlp/data/conll-2012/ -coref.conllOutputPath = /scr/nlp/coref/logs/ -coref.scorer = /scr/nlp/data/conll-2012/scorer/v8.01/scorer.pl diff --git a/src/edu/stanford/nlp/coref/statistical/properties/english-default-training.properties b/src/edu/stanford/nlp/coref/statistical/properties/english-default-training.properties deleted file mode 100644 index d30210b53d..0000000000 --- a/src/edu/stanford/nlp/coref/statistical/properties/english-default-training.properties +++ /dev/null @@ -1,3 +0,0 @@ -coref.conll = false -coref.data = /scr/nlp/data/conll-2012/ -coref.statistical.trainingPath = /scr/nlp/coref/training/ diff --git a/src/edu/stanford/nlp/coref/statistical/properties/english-default.properties b/src/edu/stanford/nlp/coref/statistical/properties/english-default.properties deleted file mode 100644 index 7c0b8fbb9a..0000000000 --- a/src/edu/stanford/nlp/coref/statistical/properties/english-default.properties +++ /dev/null @@ -1,6 +0,0 @@ -coref.algorithm = statistical -coref.conll = false - -coref.data = /scr/nlp/data/conll-2012/ -coref.conllOutputPath = /scr/nlp/coref/logs/ -coref.scorer = /scr/nlp/data/conll-2012/scorer/v8.01/scorer.pl diff --git a/src/edu/stanford/nlp/coref/statistical/properties/scoref-conll.properties b/src/edu/stanford/nlp/coref/statistical/properties/scoref-conll.properties new file mode 100644 index 0000000000..7354f29370 --- /dev/null +++ b/src/edu/stanford/nlp/coref/statistical/properties/scoref-conll.properties @@ -0,0 +1,12 @@ +coref.conll = true +coref.conllOutputPath = /scr/nlp/coref/logs/ +coref.test = /scr/nlp/data/conll-2012/v9/data/test/data/english/annotations +coref.scorer = /scr/nlp/data/conll-2012/scorer/v8.01/scorer.pl +coref.input.type = conll +coref.addMissingAnnotations = true + +coref.wordCounts = edu/stanford/nlp/models/scoref/word_counts.ser.gz +coref.clusteringModel = edu/stanford/nlp/models/scoref/clustering_model.ser +coref.classificationModel = edu/stanford/nlp/models/scoref/classification_model.ser.gz +coref.rankingModel = edu/stanford/nlp/models/scoref/ranking_model.ser.gz +coref.anaphoricityModel = edu/stanford/nlp/models/scoref/anaphoricity_model.ser.gz diff --git a/src/edu/stanford/nlp/coref/statistical/properties/scoref-default-fast.properties b/src/edu/stanford/nlp/coref/statistical/properties/scoref-default-fast.properties new file mode 100644 index 0000000000..7b537f762b --- /dev/null +++ b/src/edu/stanford/nlp/coref/statistical/properties/scoref-default-fast.properties @@ -0,0 +1,12 @@ +coref.conll = false +coref.doClustering = false +coref.conllOutputPath = /scr/nlp/coref/logs/ +coref.test = /scr/nlp/data/conll-2012/v9/data/test/data/english/annotations +coref.scorer = 
/scr/nlp/data/conll-2012/scorer/v8.01/scorer.pl + +coref.wordCounts = /scr/nlp/data/coref/models/scoref/word_counts.ser +coref.clusteringModel = /scr/nlp/data/coref/models/scoref/clustering_model.ser +coref.classificationModel = /scr/nlp/data/coref/models/scoref/classification_model.ser +coref.rankingModel = /scr/nlp/data/coref/models/scoref/ranking_model.ser +coref.anaphoricityModel = /scr/nlp/data/coref/models/scoref/anaphoricity_model.ser +coref.mentionDetectionModel = /scr/nlp/data/coref/models/hybrid-conll-dep/md-model.ser diff --git a/src/edu/stanford/nlp/coref/statistical/properties/scoref-default.properties b/src/edu/stanford/nlp/coref/statistical/properties/scoref-default.properties new file mode 100644 index 0000000000..bd52043eda --- /dev/null +++ b/src/edu/stanford/nlp/coref/statistical/properties/scoref-default.properties @@ -0,0 +1,11 @@ +coref.conll = false +coref.conllOutputPath = /scr/nlp/coref/logs/ +coref.test = /scr/nlp/data/conll-2012/v9/data/test/data/english/annotations +coref.scorer = /scr/nlp/data/conll-2012/scorer/v8.01/scorer.pl + +coref.wordCounts = /scr/nlp/data/coref/models/scoref/word_counts.ser +coref.clusteringModel = /scr/nlp/data/coref/models/scoref/clustering_model.ser +coref.classificationModel = /scr/nlp/data/coref/models/scoref/classification_model.ser +coref.rankingModel = /scr/nlp/data/coref/models/scoref/ranking_model.ser +coref.anaphoricityModel = /scr/nlp/data/coref/models/scoref/anaphoricity_model.ser +coref.mentionDetectionModel = /scr/nlp/data/coref/models/hybrid-conll-dep/md-model.ser diff --git a/src/edu/stanford/nlp/coref/statistical/properties/scoref-train-conll.properties b/src/edu/stanford/nlp/coref/statistical/properties/scoref-train-conll.properties new file mode 100644 index 0000000000..c76add21aa --- /dev/null +++ b/src/edu/stanford/nlp/coref/statistical/properties/scoref-train-conll.properties @@ -0,0 +1,4 @@ +coref.conll = true +coref.trainingPath = /scr/nlp/coref/training/ +coref.train = /scr/nlp/data/conll-2012/v4/data/train/data/english/annotations/ +coref.dev = /scr/nlp/data/conll-2012/v4/data/dev/data/english/annotations/ diff --git a/src/edu/stanford/nlp/coref/statistical/properties/scoref-train-default.properties b/src/edu/stanford/nlp/coref/statistical/properties/scoref-train-default.properties new file mode 100644 index 0000000000..d918d02491 --- /dev/null +++ b/src/edu/stanford/nlp/coref/statistical/properties/scoref-train-default.properties @@ -0,0 +1,5 @@ +coref.conll = false +coref.trainingPath = /scr/nlp/coref/training/ +coref.train = /scr/nlp/data/conll-2012/v4/data/train/data/english/annotations/ +coref.dev = /scr/nlp/data/conll-2012/v4/data/dev/data/english/annotations/ +coref.mentionDetectionModel = /scr/nlp/data/coref/models/hybrid-conll-dep/md-model.ser
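
Note on the pair pruning in BestFirstCorefSystem.runCoref: candidate pairs are every ordered mention pair at most maxMentionDistance mentions apart, plus any pair up to maxMentionDistanceWithStringMatch apart whose two mentions share a content word. A minimal sketch of that logic, using a hypothetical stand-in Mention record with only the fields the heuristic reads (the real edu.stanford.nlp.coref.data.Mention is much richer):

import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

class PairPruningSketch {
  // Hypothetical stand-ins; only what this heuristic reads is modeled.
  record Mention(int mentionID, int mentionNum, List<String> contentWords) {}
  record IdPair(int antecedent, int anaphor) {}

  static Set<IdPair> candidatePairs(List<Mention> mentions, int maxDist, int maxDistWithMatch) {
    Set<IdPair> pairs = new LinkedHashSet<>();
    // All ordered pairs at most maxDist mentions apart.
    for (int i = 0; i < mentions.size(); i++) {
      for (int j = Math.max(0, i - maxDist); j < i; j++) {
        pairs.add(new IdPair(mentions.get(j).mentionID(), mentions.get(i).mentionID()));
      }
    }
    // Index mentions by content word, then admit pairs within the larger window
    // that share a word; the set deduplicates pairs reached via several words.
    Map<String, List<Mention>> wordToMentions = new HashMap<>();
    for (Mention m : mentions) {
      for (String w : m.contentWords()) {
        wordToMentions.computeIfAbsent(w, k -> new ArrayList<>()).add(m);
      }
    }
    for (Mention m1 : mentions) {
      for (String w : m1.contentWords()) {
        for (Mention m2 : wordToMentions.get(w)) {
          if (m1.mentionNum() < m2.mentionNum()
              && m1.mentionNum() >= m2.mentionNum() - maxDistWithMatch) {
            pairs.add(new IdPair(m1.mentionID(), m2.mentionID()));
          }
        }
      }
    }
    return pairs;
  }
}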
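
The merge thresholds applied at the end of runCoref are keyed on whether each mention in a pair is pronominal, so coref.pairwiseScoreThresholds accepts either a single value (copied to all four combinations, default 0.35) or exactly four comma-separated values; the parsing above splits on "," and falls back to the single-value form otherwise. Which position maps to which pronominal/non-pronominal combination is decided by makeThresholds, whose body the hunk above elides, so the four-value line below is illustrative only:

# one threshold shared by every mention-type combination
coref.pairwiseScoreThresholds = 0.35
# or one threshold per (pronominal?, pronominal?) combination
coref.pairwiseScoreThresholds = 0.5,0.4,0.4,0.3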
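
DatasetBuilder.process and MetadataWriter.process now share the same labeling pattern: start from StatisticalCorefUtils.getUnlabeledMentionPairs, which maps every ordered mention pair to false, then flip a pair to true when both of its mentions occur in the same gold cluster. A toy illustration with hypothetical integer mention ids:

import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

class PairLabelingSketch {
  public static void main(String[] args) {
    // Hypothetical mention ids in document order, and toy gold clusters.
    List<Integer> mentionIds = List.of(1, 2, 3, 4);
    List<Set<Integer>> goldClusters = List.of(Set.of(1, 3), Set.of(2), Set.of(4));

    // Step 1: every ordered pair starts out labeled false
    // (mirrors getUnlabeledMentionPairs with an unbounded distance).
    Map<List<Integer>, Boolean> labeledPairs = new LinkedHashMap<>();
    for (int i = 0; i < mentionIds.size(); i++) {
      for (int j = 0; j < i; j++) {
        labeledPairs.put(List.of(mentionIds.get(j), mentionIds.get(i)), false);
      }
    }
    // Step 2: pairs whose mentions share a gold cluster become positive.
    for (Set<Integer> cluster : goldClusters) {
      for (int m1 : cluster) {
        for (int m2 : cluster) {
          labeledPairs.computeIfPresent(List.of(m1, m2), (pair, old) -> true);
        }
      }
    }
    // {[1, 2]=false, [1, 3]=true, [2, 3]=false, [1, 4]=false, [2, 4]=false, [3, 4]=false}
    System.out.println(labeledPairs);
  }
}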
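
The new DocumentProcessor interface is the extension point for corpus passes: its default run(Properties, Dictionaries) builds a CorefDocMaker, hands each Document to process, and calls finish at the end. A minimal sketch of a custom processor, assuming the properties already point the document reader at a corpus (for example via the coref.test path used in the property files above):

import java.util.Properties;

import edu.stanford.nlp.coref.data.Dictionaries;
import edu.stanford.nlp.coref.data.Document;
import edu.stanford.nlp.coref.statistical.DocumentProcessor;

public class MentionCounter implements DocumentProcessor {
  private int mentions = 0;

  @Override
  public void process(int id, Document document) {
    mentions += document.predictedMentionsByID.size();
  }

  @Override
  public void finish() {
    System.out.println("total predicted mentions: " + mentions);
  }

  public static void main(String[] args) throws Exception {
    Properties props = new Properties();
    props.setProperty("coref.conll", "true"); // assumes a CoNLL-style corpus is configured
    new MentionCounter().run(props, new Dictionaries(props));
  }
}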
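
StatisticalCorefUtils.fieldValues, which replaces the helper removed from StatisticalCorefTrainer, is what the training code writes into its config files: one name = value line per declared field, read reflectively. A self-contained demonstration of the same dump on a toy config object:

import java.lang.reflect.Field;

class FieldValuesDemo {
  // Same reflective dump as StatisticalCorefUtils.fieldValues, on a toy object.
  static String fieldValues(Object o) {
    StringBuilder s = new StringBuilder();
    for (Field field : o.getClass().getDeclaredFields()) {
      try {
        field.setAccessible(true);
        s.append(field.getName()).append(" = ").append(field.get(o)).append('\n');
      } catch (Exception e) {
        throw new RuntimeException("Error getting field value for " + field.getName(), e);
      }
    }
    return s.toString();
  }

  public static void main(String[] args) {
    class ToyConfig {
      int epochs = 8;
      double regularizationStrength = 1e-7;
    }
    // Prints (field order is typically declaration order, though not guaranteed):
    // epochs = 8
    // regularizationStrength = 1.0E-7
    System.out.print(fieldValues(new ToyConfig()));
  }
}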