Permalink
Browse files

Some cleanup of ConceptTrie building + added one more testing function

to FixupTools (wrong place but quick hack for now!).
  • Loading branch information...
1 parent 2c6a64f commit add4339fc530fd796145634a959e10984510b0ad @subbuss committed May 20, 2012
@@ -574,6 +574,42 @@ else if (action.equals("test-trie")) {
pw.close();
r.close();
}
+ else if (action.equals("all-topics-trie")) {
+ // Build a fresh trie and compile every validated issue into it
+ ConceptTrie trie = new ConceptTrie();
+
+ long t1 = System.currentTimeMillis();
+ List<Issue> issues = _db.getAllValidatedIssues();
+ int n = 0;
+ for (Issue i: issues) {
+ i.compileIntoTrie(trie);
+ n++;
+ }
+ System.out.println("trie build time for " + n + " topics: " + (System.currentTimeMillis() - t1));
+
+ // Test it: run the three sample files through the trie 1000 times, writing output to /tmp/test.tokens
+ PrintWriter pw = IOUtils.getUTF8Writer("/tmp/test.tokens");
+ for (int j = 0; j < 1000; j++) {
+ pw.println("***** Iteration " + j + " *****"); // label with the loop index j, not the topic count n
+ Reader r;
+
+ pw.println("--- file 1 ---");
+ r = IOUtils.getUTF8Reader("/tmp/test_item");
+ trie.processArticle(r, pw);
+ r.close();
+
+ pw.println("--- file 2 ---");
+ r = IOUtils.getUTF8Reader("/tmp/test_item.2");
+ trie.processArticle(r, pw);
+ r.close();
+
+ pw.println("--- file 3 ---");
+ r = IOUtils.getUTF8Reader("/tmp/test_item.3");
+ trie.processArticle(r, pw);
+ r.close();
+ }
+ pw.close();
+ }
else {
System.out.println("Unknown action: " + action);
}
@@ -8,8 +8,6 @@
import java.util.List;
import java.util.Map;
-import newsrack.util.Tuple;
-
public class ConceptTrie {
static public class Node {
Character _c;
@@ -72,8 +70,8 @@ public Node matchString(Object startState, String str) {
}
private void processMatchedConcepts(List<Concept> matchedConcepts, String matchedText, int tokenPosn, Map<Concept, Score> tokTable, PrintWriter pw) {
- // Increment match score of the matched concept and record information
- // about where in the article it was found
+ // Increment match score of the matched concept and record information
+ // about where in the article it was found
for (Concept c: matchedConcepts) {
Score cnt = (Score)tokTable.get(c);
if (cnt == null) {
@@ -180,7 +178,7 @@ private char swallowWhiteSpace(char c, PushbackReader pbr) throws java.io.IOExce
buf.delete(0, buf.length());
// Match token from the root
- System.out.println(tokenPosn + ". TOKEN: " + token + "; separator: <" + separator + ">");
+ //System.out.println(tokenPosn + ". TOKEN: " + token + "; separator: <" + separator + ">");
Node match = matchString(null, token);
if (match != null) {
if (match._matchedConcepts != null) processMatchedConcepts(match._matchedConcepts, match._matchedString, tokenPosn, tokenMap, pw);
@@ -673,31 +673,6 @@ public void compileIntoTrie(ConceptTrie trie) {
HashMap<String, Concept> tokenMap = new HashMap<String, Concept>();
for (Iterator<Concept> e = getUsedConcepts(); e.hasNext(); ) {
Concept c = e.next();
- Concept x = tokenMap.get(c.getName());
- // No conflict!
- if (x == null) {
- tokenMap.put(c.getName(), c);
-
- // IMPORTANT: If the lexer token is already set, don't reset it!
- // Some other issue might have already set it to be a qualified name!
- if (c.getLexerToken() == null) {
- c.setLexerToken(new ConceptToken(c.getName()));
- _db.updateConceptLexerToken(c);
- }
- }
- // Conflict!! Qualify with collection name ... conflicts are expected to be rare
- else {
- String xToken = x.getCollection().getName() + ":" + x.getName();
- x.setLexerToken(new ConceptToken(xToken));
- _db.updateConceptLexerToken(x);
- tokenMap.put(xToken, x);
-
- String cToken = c.getCollection().getName() + ":" + c.getName();
- c.setLexerToken(new ConceptToken(cToken));
- _db.updateConceptLexerToken(c);
- tokenMap.put(cToken, c);
- }
-
// Add all keywords to the trie
Iterator<String> kws = c.getKeywords();
while (kws.hasNext()) {
@@ -925,7 +900,7 @@ public void compileScanners(String workDir)
* @param numTokens number of tokens encountered
* @param tokTable the table of recognized tokens/concepts
*/
- public void classifyArticle(NewsItem ni, int numTokens, HashMap tokTable)
+ public void classifyArticle(NewsItem ni, int numTokens, HashMap<String,Score> tokTable)
{
int matchScore = 0;
ArrayList<Category> matchedCats = new ArrayList<Category>();
@@ -956,7 +931,7 @@ public void classifyArticle(NewsItem ni, int numTokens, HashMap tokTable)
* Fetch the news item that is stored in 'newsItemFileName', examine the tokens
* in 'tokTable', and classify the news item accordingly.
*/
- private void classifyArticle(String newsItemFileName, HashMap tokTable, HashMap newsTable, List allArts, List unclassifiedArts)
+ private void classifyArticle(String newsItemFileName, HashMap<String,Score> tokTable, HashMap newsTable, List allArts, List unclassifiedArts)
{
NewsItem ni = (NewsItem)newsTable.get(newsItemFileName);
if (ni == null) {

0 comments on commit add4339

Please sign in to comment.