Commit a3f9d7f

manning authored and Stanford NLP committed
Merge branch 'master' of jamie.stanford.edu:/u/nlp/git/javanlp
1 parent 0fbf6d0 commit a3f9d7f

4 files changed, +94 -11 lines changed

itest/src/edu/stanford/nlp/ie/crf/ThreadedCRFClassifierITest.java

2 additions, 3 deletions

```diff
@@ -15,12 +15,12 @@ public class ThreadedCRFClassifierITest extends TestCase {
   Properties props;
 
   private static final String german1 =
-      "edu/stanford/nlp/models/ner/german.conll.hgc_175m_600.crf.ser.gz";
+      "edu/stanford/nlp/models/ner/german.conll.germeval2014.hgc_175m_600.crf.ser.gz";
   /** -- We're no longer supporting this one
   private String german2 =
       "/u/nlp/data/ner/goodClassifiers/german.dewac_175m_600.crf.ser.gz";
   */
-  private static final String germanTestFile = "/u/nlp/data/german/ner/2016/deu.utf8.testa";
+  private static final String germanTestFile = "/u/nlp/data/german/ner/2016/deu.io.f15.utf8.testa";
 
   private static final String english1 =
       "/u/nlp/data/ner/goodClassifiers/english.all.3class.nodistsim.crf.ser.gz";
@@ -59,4 +59,3 @@ public void testTwoEnglishCRFs() {
   }
 
 }
-
```

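The test now points `german1` at the GermEval 2014 model and `germanTestFile` at the filtered `.io.f15` data. For readers who want to exercise the updated model outside the threaded test, here is a minimal sketch, assuming the German models jar is on the classpath; the class name and example sentence are invented:

```java
import edu.stanford.nlp.ie.crf.CRFClassifier;
import edu.stanford.nlp.ling.CoreLabel;

public class GermanNerSmokeTest {
  public static void main(String[] args) throws Exception {
    // Load the serialized CRF model that the test now references.
    CRFClassifier<CoreLabel> crf = CRFClassifier.getClassifier(
        "edu/stanford/nlp/models/ner/german.conll.germeval2014.hgc_175m_600.crf.ser.gz");
    // Invented example sentence; prints the tokens with their NER tags.
    System.out.println(crf.classifyToString("Angela Merkel besuchte die Universität Stuttgart."));
  }
}
```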
scripts/ner/Makefile

26 additions, 7 deletions

```diff
@@ -14,13 +14,32 @@ genia: genia-nlpba-2004.crf.gz
 genia-nlpba-2004.crf.gz:
 	java -mx10g edu.stanford.nlp.ie.crf.CRFClassifier -prop genia-nlpba-2004.prop > genia-nlpba-2004.out 2>&1
 
-german: german.hgc_175m_600.crf.ser.gz german.dewac_175m_600.crf.ser.gz
-
-german.hgc_175m_600.crf.ser.gz:
-	java -mx10g edu.stanford.nlp.ie.crf.CRFClassifier -prop german.hgc_175m_600.prop > german.hgc_175m_600.out 2>&1
-
-german.dewac_175m_600.crf.ser.gz:
-	java -mx10g edu.stanford.nlp.ie.crf.CRFClassifier -prop german.dewac_175m_600.prop > german.dewac_175m_600.out 2>&1
+# We are no longer building/distributing the deWAC model. The data for its distributional similarity classes wasn't clean and can't be recovered.
+german: german.conll.crf.ser.gz german.conll.hgc_175m_600.crf.ser.gz german.conll.germeval2014.hgc_175m_600.crf.ser.gz german.conll.germeval2014.europeana.hgc_175m_600.crf.ser.gz
+
+german.conll.crf.ser.gz:
+	java -mx5g edu.stanford.nlp.ie.crf.CRFClassifier -prop german-2018.hgc_175m_600.prop -useDistSim false -serializeTo $@ \
+	  > $(addsuffix .out, $(basename $(basename $(basename $@)))) 2>&1
+
+german.conll.hgc_175m_600.crf.ser.gz:
+	java -mx10g edu.stanford.nlp.ie.crf.CRFClassifier -prop german-2018.hgc_175m_600.prop -serializeTo $@ \
+	  > $(addsuffix .out, $(basename $(basename $(basename $@)))) 2>&1
+
+german.conll.germeval2014.hgc_175m_600.crf.ser.gz:
+	java -mx15g edu.stanford.nlp.ie.crf.CRFClassifier -prop german-2018.hgc_175m_600.prop -serializeTo $@ \
+	  -trainFileList /u/nlp/data/german/ner/2016/deu.io.f15.utf8.train,/u/nlp/data/german/ner/2016/GermEval2014_complete_data/NER-de-train-io.tsv \
+	  > $(addsuffix .out, $(basename $(basename $(basename $@)))) 2>&1
+
+# Currently we exclude enp_DE.sbb.io, as the data has too many issues, but we could work to include it:
+# ,/u/nlp/data/german/ner/2016/Europeana-Newspapers-data/ner-corpora/enp_DE.sbb.bio/enp_DE.sbb.io
+german.conll.germeval2014.europeana.hgc_175m_600.crf.ser.gz:
+	java -mx20g edu.stanford.nlp.ie.crf.CRFClassifier -prop german-2018.hgc_175m_600.prop -serializeTo $@ \
+	  -trainFileList /u/nlp/data/german/ner/2016/deu.io.f15.utf8.train,/u/nlp/data/german/ner/2016/GermEval2014_complete_data/NER-de-train-io.tsv,/u/nlp/data/german/ner/2016/Europeana-Newspapers-data/ner-corpora/enp_DE.lft.bio/enp_DE.lft.io,/u/nlp/data/german/ner/2016/Europeana-Newspapers-data/ner-corpora/enp_DE.onb.bio/enp_DE.onb.io \
+	  > $(addsuffix .out, $(basename $(basename $(basename $@)))) 2>&1
+
+# We are no longer building/distributing the deWAC model. The data for its distributional similarity classes wasn't clean and can't be recovered.
+# german.dewac_175m_600.crf.ser.gz:
+#	java -mx10g edu.stanford.nlp.ie.crf.CRFClassifier -prop german.dewac_175m_600.prop > german.dewac_175m_600.out 2>&1
 
 
 all.3class: english.all.3class.nodistsim.crf.ser.gz english.all.3class.caseless.distsim.crf.ser.gz english.all.3class.distsim.crf.ser.gz
```

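Each new rule serializes to the target name via `-serializeTo $@` and derives its log file by stripping the `.gz`, `.ser`, and `.crf` suffixes with the three nested `$(basename)` calls, so `german.conll.crf.ser.gz` logs to `german.conll.out`. Since `CRFClassifier.main` parses exactly the flags used above, one rule can also be reproduced from Java; a minimal sketch, assuming the prop file is in the working directory (the wrapper class name is invented):

```java
import edu.stanford.nlp.ie.crf.CRFClassifier;

public class MakeGermanConllModel {
  public static void main(String[] args) throws Exception {
    // Same arguments as the german.conll.crf.ser.gz rule in the Makefile.
    CRFClassifier.main(new String[] {
        "-prop", "german-2018.hgc_175m_600.prop",
        "-useDistSim", "false",
        "-serializeTo", "german.conll.crf.ser.gz"
    });
  }
}
```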
scripts/ner/german-2018.hgc_175m_600.prop (new file)

65 additions, 0 deletions

```properties
trainFile = /u/nlp/data/german/ner/2016/deu.io.f15.utf8.train
testFile = /u/nlp/data/german/ner/2016/deu.io.f15.utf8.testa
serializeTo = german.hgc_175m_600.crf.ser.gz

type=crf

# distSimLexicon = /u/nlp/data/german/ner/hgc_175m_600
distSimLexicon = /u/nlp/data/german/ner/2016/hgc-175M-600
# the right options for the new hgc_175m_600
distSimFileFormat = alexClark
unknownWordDistSimClass = 599
useDistSim = true
numberEquivalenceDistSim = false
casedDistSim = true

# Now using stripped 2-column files so we can add extra datasets!
map = word=0,answer=1

encoding = utf-8
# saveFeatureIndexToDisk = true  # now buggy but unnecessary
mergeTags = false
useTitle = false

useClassFeature=true
useWord=true
useNGrams=true
noMidNGrams=true
# Having no maxNGramLeng seemed to work marginally better, but it is capped for efficiency
maxNGramLeng=6
usePrev=true
useNext=true
useLongSequences=true
useSequences=true
usePrevSequences=true
useTypeSeqs=true
useTypeSeqs2=true
useTypeySequences=true
# Including useOccurrencePatterns increased scores only marginally (the gain could even disappear now that we have weaker regularization)
useOccurrencePatterns=true
useLastRealWord=true
useNextRealWord=true
normalize=true
# Using chris4 instead hurts in the most recent experiment. Earlier, an experiment had seemed to show the opposite.
wordShape=chris2useLC
useDisjunctive=true
# Width 5 works a little better than 4
disjunctionWidth=5

maxLeft=1
readerAndWriter=edu.stanford.nlp.sequences.ColumnDocumentReaderAndWriter
useObservedSequencesOnly=true
useQN = true
QNsize = 15
# sigma 20 works better than sigma 5, which is MUCH better than sigma 1; that was the limit of hyperparameter optimization
# On the basic CoNLL dataset (no distsim, no extra data), sigma=50 is a bit better still (by 0.13 F1)
sigma = 20

# For making the model faster (fewer features); lowering this to 0.025 doesn't improve performance
featureDiffThresh=0.05

# evaluateIOB=true

# other notes
# Even though useTaggySequences uses distsim rather than POS sequences, turning it on didn't help.
# Adding useWordPairs doesn't seem to help. (We get them anyway in an edge feature.)
```
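Taken together, the file sets the data paths and distsim lexicon, the feature templates (n-grams, word shape, sequence features), and the optimizer (quasi-Newton with history size 15 and a Gaussian prior with sigma 20). A minimal sketch of driving training and evaluation from this file programmatically, assuming it sits in the working directory (the class name is invented; `classifyAndWriteAnswers` scores the configured `testFile` with the configured column reader):

```java
import java.util.Properties;
import edu.stanford.nlp.ie.crf.CRFClassifier;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.util.StringUtils;

public class TrainFromProp {
  public static void main(String[] args) throws Exception {
    // Read the .prop file into a Properties object (trainFile, sigma, features, ...).
    Properties props = StringUtils.propFileToProperties("german-2018.hgc_175m_600.prop");
    CRFClassifier<CoreLabel> crf = new CRFClassifier<>(props);
    crf.train();  // trains on trainFile with the configured feature set
    crf.serializeClassifier(crf.flags.serializeTo);
    // Evaluate on the held-out testFile and print scores.
    crf.classifyAndWriteAnswers(crf.flags.testFile, crf.makeReaderAndWriter(), true);
  }
}
```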

src/edu/stanford/nlp/pipeline/StanfordCoreNLP-german.properties

1 addition, 1 deletion

```diff
@@ -4,7 +4,7 @@ tokenize.language = de
 
 pos.model = edu/stanford/nlp/models/pos-tagger/german/german-hgc.tagger
 
-ner.model = edu/stanford/nlp/models/ner/german.conll.hgc_175m_600.crf.ser.gz
+ner.model = edu/stanford/nlp/models/ner/german.conll.germeval2014.hgc_175m_600.crf.ser.gz
 ner.applyNumericClassifiers = false
 ner.useSUTime = false
```
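After this change, any pipeline built from `StanfordCoreNLP-german.properties` loads the GermEval 2014 NER model. A minimal sketch of the equivalent pipeline configured inline, assuming the German models jar is on the classpath (the class name and example sentence are invented):

```java
import java.util.Properties;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;

public class GermanPipelineDemo {
  public static void main(String[] args) {
    // Mirror the settings from StanfordCoreNLP-german.properties.
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize, ssplit, pos, ner");
    props.setProperty("tokenize.language", "de");
    props.setProperty("pos.model", "edu/stanford/nlp/models/pos-tagger/german/german-hgc.tagger");
    props.setProperty("ner.model", "edu/stanford/nlp/models/ner/german.conll.germeval2014.hgc_175m_600.crf.ser.gz");
    props.setProperty("ner.applyNumericClassifiers", "false");
    props.setProperty("ner.useSUTime", "false");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

    Annotation doc = new Annotation("Angela Merkel traf Emmanuel Macron in Berlin.");
    pipeline.annotate(doc);
    // Print each token with its predicted NER tag.
    for (CoreLabel token : doc.get(CoreAnnotations.TokensAnnotation.class)) {
      System.out.println(token.word() + "\t" + token.get(CoreAnnotations.NamedEntityTagAnnotation.class));
    }
  }
}
```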
