
Commit bab1bfd

AngledLuffa authored and Stanford NLP committed
Simplify german ner props
1 parent 86ddf3e commit bab1bfd


2 files changed: +34, -78 lines changed


scripts/ner/german-2018.hgc_175m_600.prop
0 additions & 65 deletions. This file was deleted.

Second changed file: 34 additions & 13 deletions
@@ -1,18 +1,31 @@
-trainFile = /u/nlp/data/german/ner/deu.train
-testFile = /u/nlp/data/german/ner/deu.testa
-serializeTo = german.hgc_175m_600.crf.ser.gz
+trainFile = /u/nlp/data/german/ner/2016/deu.io.f15.utf8.train
+testFile = /u/nlp/data/german/ner/2016/deu.io.f15.utf8.testa
+serializeTo = german.distsim.crf.ser.gz
 
-map = word=0,answer=4
+type=crf
 
+# distSimLexicon = /u/nlp/data/german/ner/hgc_175m_600
+distSimLexicon = /u/nlp/data/german/ner/2016/hgc-175M-600
+# right options for new hgc_175m_600
+distSimFileFormat = alexClark
+unknownWordDistSimClass = 599
 useDistSim = true
-distSimLexicon = /u/nlp/data/german/ner/hgc_175m_600
-distSimEncoding = ISO-8859-1
+numberEquivalenceDistSim = false
+casedDistSim = true
+
+# Now using stripped 2 column files so can add extra datasets!
+map = word=0,answer=1
+
+encoding = utf-8
+# saveFeatureIndexToDisk = true # now buggy but unnecessary
 mergeTags = false
 useTitle = false
+
 useClassFeature=true
 useWord=true
 useNGrams=true
 noMidNGrams=true
+# Having no maxNGramLeng seemed to work marginally better, but omitted for efficiency
 maxNGramLeng=6
 usePrev=true
 useNext=true
@@ -22,23 +35,31 @@ usePrevSequences=true
 useTypeSeqs=true
 useTypeSeqs2=true
 useTypeySequences=true
+# Including useOccurrencePatterns increased scores really marginally (could even disappear now we have weaker regularization)
 useOccurrencePatterns=true
 useLastRealWord=true
 useNextRealWord=true
 normalize=true
+# using chris4 instead hurts in most recent experiment. Earlier, an experiment had seemed to show the opposite.
 wordShape=chris2useLC
 useDisjunctive=true
+# Width 5 works a little better than 4
 disjunctionWidth=5
-type=crf
-useQN = true
-
-# For making faster
 
-QNsize = 10
-saveFeatureIndexToDisk = true
 maxLeft=1
+readerAndWriter=edu.stanford.nlp.sequences.ColumnDocumentReaderAndWriter
 useObservedSequencesOnly=true
+useQN = true
+QNsize = 15
+# sigma 20 works better than sigma 5, which is MUCH better than sigma 1; that was the limit of hyperparameter optimization
+# On the basic CoNLL dataset (no distsim, no extra data), sigma=50 is a bit better still (by 0.13 F1)
+sigma = 20
+
+# For making faster (less features); changing this to 0.025 doesn't improve performance
 featureDiffThresh=0.05
 
+# evaluateIOB=true
 
-evaluateIOB=true
+# other notes
+# even though useTaggySequences will use distsim rather than POS sequences, turning it on didn't help
+# adding useWordPairs doesn't seem to help. (Getting them anyway in an edge feature.)
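
The switch from map = word=0,answer=4 to map = word=0,answer=1, together with encoding = utf-8 and ColumnDocumentReaderAndWriter, means the training data is now plain two-column, UTF-8, IO-encoded text: one token per line with the word and label in separate columns, and a blank line between sentences. A purely illustrative fragment (not taken from the actual deu.io.f15.utf8.train file) would look like:

Angela	I-PER
Merkel	I-PER
besuchte	O
Berlin	I-LOC
.	O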

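As a usage sketch (not part of this commit): prop files in scripts/ner are normally passed to the CRFClassifier command line to train the model named by serializeTo, and the resulting german.distsim.crf.ser.gz can then be loaded from Java. The jar path and the sample sentence below are assumptions; the class and method names come from the standard CoreNLP CRFClassifier API.

import edu.stanford.nlp.ie.crf.CRFClassifier;
import edu.stanford.nlp.ling.CoreLabel;

public class GermanNerSketch {
  public static void main(String[] args) throws Exception {
    // Training is typically run from the command line, roughly:
    //   java -cp stanford-corenlp.jar edu.stanford.nlp.ie.crf.CRFClassifier -prop <this prop file>
    // which writes the model file named by serializeTo (german.distsim.crf.ser.gz).

    // Load the serialized CRF and tag an illustrative sentence.
    CRFClassifier<CoreLabel> ner = CRFClassifier.getClassifier("german.distsim.crf.ser.gz");
    System.out.println(ner.classifyToString("Angela Merkel besuchte Berlin."));
  }
}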