Skip to content

Commit

Permalink
regex word filtering
Browse files (browse the repository at this point in the history)
  • Loading branch information
sbos committed May 23, 2015
1 parent 5f08e94 commit 7c41295
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 3 deletions.
5 changes: 3 additions & 2 deletions src/util.jl
Original file line number | Diff line number | Diff line change
@@ -1,12 +1,13 @@
function read_from_file(vocab_path::String, min_freq::Int64=0, stopwords::Set{String}=Set{String}())
function read_from_file(vocab_path::String, min_freq::Int64=0, stopwords::Set{String}=Set{String}();
regex::Regex=r"")
fin = open(vocab_path)
freqs = Array(Int64, 0)
id2word = Array(String, 0)
while !eof(fin)
try
word, freq = split(readline(fin))
freq_num = int64(freq)
if freq_num < min_freq || word in stopwords continue end
if freq_num < min_freq || word in stopwords || !ismatch(regex, word) continue end
push!(id2word, word)
push!(freqs, freq_num)
catch e
Expand Down
6 changes: 5 additions & 1 deletion train.jl
Original file line number | Diff line number | Diff line change
Expand Up @@ -71,6 +71,10 @@ s = ArgParseSettings()
help = "minimal probability of a meaning to contribute into gradients"
arg_type = Float64
default = 1e-10
"--regex"
help = "ignore words not matching provided regex"
arg_type = String
default = ""
end

args = parse_args(ARGS, s)
Expand All @@ -88,7 +92,7 @@ end

print("Building dictionary... ")
vm, dict = read_from_file(args["dict"], args["dim"], args["prototypes"],
args["min-freq"], args["remove-top-k"], stopwords)
args["min-freq"], args["remove-top-k"], stopwords; regex=Regex(args["regex"]))
println("Done!")

vm.alpha = args["alpha"]
Expand Down

0 comments on commit 7c41295

Please sign in to comment.