/
RavenMoreLikeThis.cs
108 lines (98 loc) · 2.31 KB
/
RavenMoreLikeThis.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
using System.Collections.Generic;
using System.IO;
using System.Text;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Tokenattributes;
using Lucene.Net.Index;
using Lucene.Net.Util;
namespace Raven.Bundles.MoreLikeThis
{
/// <summary>
/// MoreLikeThis variant that, for fields without a stored term vector, re-tokenizes the
/// stored field values using the analyzer registered for that field in <see cref="Analyzers"/>.
/// </summary>
class RavenMoreLikeThis : Similarity.Net.MoreLikeThis
{
    private readonly IndexReader _ir;

    /// <summary>
    /// Analyzers keyed by field name. An entry is required for every field whose terms
    /// have to be rebuilt from stored values (i.e. fields without a term vector).
    /// </summary>
    public Dictionary<string, Analyzer> Analyzers { get; set; }

    /// <summary>
    /// Creates an instance that reads documents and term vectors from <paramref name="ir"/>.
    /// </summary>
    /// <param name="ir">Index reader; also handed to the base class constructor.</param>
    public RavenMoreLikeThis(IndexReader ir)
        : base(ir)
    {
        _ir = ir;
    }

    /// <summary>
    /// Collects term frequencies for every configured field of the given document and
    /// turns them into a priority queue via <c>CreateQueue</c>.
    /// </summary>
    /// <param name="docNum">Document number passed to the index reader.</param>
    /// <returns>The queue produced by <c>CreateQueue</c> for the collected frequencies.</returns>
    protected override PriorityQueue RetrieveTerms(int docNum)
    {
        var fieldNames = GetFieldNames();
        var termFreqMap = new System.Collections.Hashtable();
        var d = _ir.Document(docNum);

        foreach (var fieldName in fieldNames)
        {
            var vector = _ir.GetTermFreqVector(docNum, fieldName);
            if (vector != null)
            {
                AddTermFrequencies(termFreqMap, vector);
                continue;
            }

            // field does not store term vector info: analyze the stored values instead
            var text = d.GetValues(fieldName);
            if (text == null)
            {
                continue;
            }

            foreach (var t in text)
            {
                // Call the TextReader-based helper directly. A StringReader is not a
                // StreamReader, so routing this through the StreamReader overload below
                // could never bind to it and the per-field analyzer would be skipped.
                using (var reader = new StringReader(t))
                {
                    AddAnalyzedTermFrequencies(reader, termFreqMap, fieldName);
                }
            }
        }

        return CreateQueue(termFreqMap);
    }

    /// <summary>
    /// Tokenizes the text from <paramref name="r"/> with the analyzer registered for
    /// <paramref name="fieldName"/> and accumulates per-word counts in <paramref name="termFreqMap"/>.
    /// Kept with its original signature; the work is done by the TextReader-based helper.
    /// </summary>
    /// <param name="r">Source of the text to tokenize.</param>
    /// <param name="termFreqMap">Map from word to its counter object; updated in place.</param>
    /// <param name="fieldName">Field whose analyzer is looked up in <see cref="Analyzers"/>.</param>
    protected new void AddTermFrequencies(System.IO.StreamReader r, System.Collections.IDictionary termFreqMap, System.String fieldName)
    {
        AddAnalyzedTermFrequencies(r, termFreqMap, fieldName);
    }

    /// <summary>
    /// Shared implementation: tokenizes any <see cref="TextReader"/> with the analyzer
    /// registered for the field and counts the non-noise words.
    /// </summary>
    /// <param name="reader">Source of the text to tokenize.</param>
    /// <param name="termFreqMap">Map from word to its counter object; updated in place.</param>
    /// <param name="fieldName">Field whose analyzer is looked up in <see cref="Analyzers"/>.</param>
    /// <exception cref="System.InvalidOperationException"><see cref="Analyzers"/> has not been set.</exception>
    /// <exception cref="KeyNotFoundException">No analyzer is registered for the field.</exception>
    private void AddAnalyzedTermFrequencies(TextReader reader, System.Collections.IDictionary termFreqMap, string fieldName)
    {
        if (Analyzers == null)
        {
            throw new System.InvalidOperationException(
                "Analyzers must be set before terms can be retrieved for field '" + fieldName + "'.");
        }

        Analyzer analyzer;
        if (!Analyzers.TryGetValue(fieldName, out analyzer))
        {
            // Same exception type the dictionary indexer would raise, but naming the field.
            throw new KeyNotFoundException("No analyzer is registered for field '" + fieldName + "'.");
        }

        TokenStream ts = analyzer.TokenStream(fieldName, reader);
        TermAttribute termAtt = (TermAttribute)ts.AddAttribute(typeof(TermAttribute));
        int maxTokens = GetMaxNumTokensParsed();
        int tokenCount = 0;

        while (ts.IncrementToken())
        {
            // for every token
            System.String word = termAtt.Term();
            tokenCount++;
            // Noise words count towards the limit too, because the counter is bumped
            // before the noise check.
            if (tokenCount > maxTokens)
            {
                break;
            }
            if (IsNoiseWord(word))
            {
                continue;
            }

            // increment frequency
            var cnt = (Int)termFreqMap[word];
            if (cnt == null)
            {
                // Int is declared outside this file; a fresh instance presumably starts
                // at a count of one - confirm against the base class.
                termFreqMap[word] = new Int();
            }
            else
            {
                cnt.x++;
            }
        }
    }

    /// <summary>
    /// Decides whether a term should be ignored: it is shorter than the minimum word
    /// length, longer than the maximum word length (limits of zero or less are not
    /// applied), or its lower-cased form is contained in the stop word set.
    /// </summary>
    /// <param name="term">The term to test.</param>
    /// <returns><c>true</c> when the term should be skipped; otherwise <c>false</c>.</returns>
    protected new bool IsNoiseWord(System.String term)
    {
        int len = term.Length;
        var minWordLen = GetMinWordLen();
        var maxWordLen = GetMaxWordLen();
        var stopWords = GetStopWords();

        if (minWordLen > 0 && len < minWordLen)
        {
            return true;
        }
        if (maxWordLen > 0 && len > maxWordLen)
        {
            return true;
        }

        // Invariant lower-casing keeps the lookup independent of the current culture
        // (e.g. "I" must not become a dotless i under a Turkish culture).
        return stopWords != null && stopWords.Contains(term.ToLowerInvariant());
    }
}
}