
Commit bab1bfd

AngledLuffa authored and Stanford NLP committed
Simplify german ner props
1 parent 86ddf3e commit bab1bfd


2 files changed: +34, -78 lines changed


scripts/ner/german-2018.hgc_175m_600.prop
0 additions & 65 deletions. This file was deleted.

Second changed file: 34 additions & 13 deletions
@@ -1,18 +1,31 @@
-trainFile = /u/nlp/data/german/ner/deu.train
-testFile = /u/nlp/data/german/ner/deu.testa
-serializeTo = german.hgc_175m_600.crf.ser.gz
+trainFile = /u/nlp/data/german/ner/2016/deu.io.f15.utf8.train
+testFile = /u/nlp/data/german/ner/2016/deu.io.f15.utf8.testa
+serializeTo = german.distsim.crf.ser.gz
 
-map = word=0,answer=4
+type=crf
 
+# distSimLexicon = /u/nlp/data/german/ner/hgc_175m_600
+distSimLexicon = /u/nlp/data/german/ner/2016/hgc-175M-600
+# right options for new hgc_175m_600
+distSimFileFormat = alexClark
+unknownWordDistSimClass = 599
 useDistSim = true
-distSimLexicon = /u/nlp/data/german/ner/hgc_175m_600
-distSimEncoding = ISO-8859-1
+numberEquivalenceDistSim = false
+casedDistSim = true
+
+# Now using stripped 2 column files so can add extra datasets!
+map = word=0,answer=1
+
+encoding = utf-8
+# saveFeatureIndexToDisk = true # now buggy but unnecessary
 mergeTags = false
 useTitle = false
+
 useClassFeature=true
 useWord=true
 useNGrams=true
 noMidNGrams=true
+# Having no maxNGramLeng seemed to work marginally better, but omitted for efficiency
 maxNGramLeng=6
 usePrev=true
 useNext=true
@@ -22,23 +35,31 @@ usePrevSequences=true
 useTypeSeqs=true
 useTypeSeqs2=true
 useTypeySequences=true
+# Including useOccurrencePatterns increased scores really marginally (could even disappear now we have weaker regularization)
 useOccurrencePatterns=true
 useLastRealWord=true
 useNextRealWord=true
 normalize=true
+# using chris4 instead hurts in most recent experiment. Earlier, an experiment had seemed to show the opposite.
 wordShape=chris2useLC
 useDisjunctive=true
+# Width 5 works a little better than 4
 disjunctionWidth=5
-type=crf
-useQN = true
-
-# For making faster
 
-QNsize = 10
-saveFeatureIndexToDisk = true
 maxLeft=1
+readerAndWriter=edu.stanford.nlp.sequences.ColumnDocumentReaderAndWriter
 useObservedSequencesOnly=true
+useQN = true
+QNsize = 15
+# sigma 20 works better than sigma 5, which is MUCH better than sigma 1; that was the limit of hyperparameter optimization
+# On the basic CoNLL dataset (no distsim, no extra data), sigma=50 is a bit better still (by 0.13 F1)
+sigma = 20
+
+# For making faster (less features); changing this to 0.025 doesn't improve performance
 featureDiffThresh=0.05
 
+# evaluateIOB=true
 
-evaluateIOB=true
+# other notes
+# even though useTaggySequences will use distsim rather than POS sequences, turning it on didn't help
+# adding useWordPairs doesn't seem to help. (Getting them anyway in an edge feature.)
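
The switch from map = word=0,answer=4 to map = word=0,answer=1, together with encoding = utf-8 and ColumnDocumentReaderAndWriter, means the training data is now plain two-column, UTF-8, IO-encoded text: one token per line with the word and label in separate columns, and a blank line between sentences. A purely illustrative fragment (not taken from the actual deu.io.f15.utf8.train file) would look like:

Angela	I-PER
Merkel	I-PER
besuchte	O
Berlin	I-LOC
.	O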

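As a usage sketch (not part of this commit): prop files in scripts/ner are normally passed to the CRFClassifier command line to train the model named by serializeTo, and the resulting german.distsim.crf.ser.gz can then be loaded from Java. The jar path and the sample sentence below are assumptions; the class and method names come from the standard CoreNLP CRFClassifier API.

import edu.stanford.nlp.ie.crf.CRFClassifier;
import edu.stanford.nlp.ling.CoreLabel;

public class GermanNerSketch {
  public static void main(String[] args) throws Exception {
    // Training is typically run from the command line, roughly:
    //   java -cp stanford-corenlp.jar edu.stanford.nlp.ie.crf.CRFClassifier -prop <this prop file>
    // which writes the model file named by serializeTo (german.distsim.crf.ser.gz).

    // Load the serialized CRF and tag an illustrative sentence.
    CRFClassifier<CoreLabel> ner = CRFClassifier.getClassifier("german.distsim.crf.ser.gz");
    System.out.println(ner.classifyToString("Angela Merkel besuchte Berlin."));
  }
}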