In [1]:
using Pkg
pkg"activate ."

In [None]:
open

In [35]:
replace("abc123", r"[^a-z]" => "")

"abc"

In [41]:
"""
    parse_heading(line)

Parse a heading line of the form `word|count` and return the tuple
`(word, num_entries)`.

`word` is the text before the `|` separator, returned exactly as it appears in
`line`; `num_entries` is the text after the separator parsed as an `Int`.

# Throws
- `ArgumentError` if `line` does not consist of exactly two `|`-separated
  fields, or if the second field is not a valid integer.
"""
function parse_heading(line)
    parts = split(line, '|')
    # Fail loudly on a malformed heading instead of printing debug output and
    # then indexing or parsing fields that may not exist.
    if length(parts) != 2
        throw(ArgumentError(
            "malformed heading $(repr(line)): expected 2 fields, got $(length(parts))"))
    end
    word, count_field = parts
    num_entries = tryparse(Int, count_field)
    if num_entries === nothing
        throw(ArgumentError(
            "malformed heading $(repr(line)): $(repr(count_field)) is not an integer"))
    end
    return word, num_entries
end

"""
    parse_entry(line)

Split `line` on `|`, discard the first field, and return the remaining fields
with `normalize` applied to each one.
"""
function parse_entry(line)
    fields = split(line, '|')
    # Everything after the first field is kept; the first field is dropped.
    return map(normalize, fields[2:end])
end

"""
    add_synonyms!(synonyms, word, list)

Insert every element of `list` into the set stored under `word` in `synonyms`,
creating an empty `Set{String}` for `word` first if it has no set yet.

Returns `nothing`.
"""
function add_synonyms!(synonyms, word, list)
    bucket = get!(synonyms, word) do
        Set{String}()
    end
    foreach(list) do item
        push!(bucket, item)
    end
end

"""
    normalize(word)

Return `word` lowercased with every character other than `a`-`z` and `0`-`9`
removed.
"""
function normalize(word)
    lowered = lowercase(word)
    # Strip everything that is not a lowercase ASCII letter or a digit.
    return replace(lowered, r"[^a-z0-9]" => "")
end

"""
    parse_synonyms(fname)

Read the thesaurus file `fname` and return a `Dict{String, Set{String}}` that
maps each heading word to the set of entries listed under it.

The first line of the file must be exactly `ISO8859-1`. After that, each record
is a heading line (parsed by `parse_heading`) followed by as many entry lines
as the heading announces (each parsed by `parse_entry`). Reading stops at the
first empty line, which is also what `readline` returns at end of file.

# Throws
- `ArgumentError` if the first line of the file is not `ISO8859-1`.
"""
function parse_synonyms(fname)
    synonyms = Dict{String, Set{String}}()
    open(fname) do file
        # Validate the input with an explicit check rather than `@assert`, which
        # is intended for internal invariants and may be disabled.
        header = readline(file)
        if header != "ISO8859-1"
            throw(ArgumentError(
                "unexpected first line $(repr(header)) in $(repr(fname))"))
        end
        # NOTE(review): no transcoding is performed here even though the header
        # names ISO8859-1 — confirm non-ASCII input is handled as intended.
        while true
            line = readline(file)
            isempty(line) && break
            word, num_entries = parse_heading(line)
            for _ in 1:num_entries
                add_synonyms!(synonyms, word, parse_entry(readline(file)))
            end
        end
    end
    return synonyms
end

"""
    make_symmetric!(synonyms)

Make the synonym relation symmetric in place: for every `word` whose set
contains `entry`, ensure that the set stored under `entry` contains `word`,
creating an empty `Set{String}` for `entry` first if it has no set yet.

Returns `nothing`.
"""
function make_symmetric!(synonyms)
    # Iterate over a snapshot of the pairs: `get!` below may insert new keys, and
    # inserting into a `Dict` while iterating over it directly can skip or
    # revisit entries. The snapshot shares the underlying sets, so the updates
    # still land in `synonyms`.
    for (word, entries) in collect(synonyms)
        for entry in entries
            push!(get!(() -> Set{String}(), synonyms, entry), word)
        end
    end
    return nothing
end
    
"""
    remove_loops!(synonyms)

Remove self-references in place: for every key in `synonyms`, delete that key
from its own set of entries if it is present.

Returns `nothing`.
"""
function remove_loops!(synonyms)
    for (headword, related) in synonyms
        # `delete!` leaves the set untouched when the element is absent, so no
        # membership check is needed first.
        delete!(related, headword)
    end
end

remove_loops! (generic function with 1 method)

In [42]:
# Load the thesaurus file into a word => set-of-entries dictionary.
synonyms = parse_synonyms("corpora/OpenOffice/MyThes-1.0/th_en_US_new.dat");
# Add reverse links so every listed entry also maps back to its heading word.
make_symmetric!(synonyms)
# Drop any word that appears in its own set.
remove_loops!(synonyms)

In [43]:
synonyms["zyrian"]

Set(["permic", "komi"])

In [45]:
"english" in synonyms["spin"]

true

In [48]:
synonyms["coast"]

Set(["atlantic coast", "barbary coast", "pacific coast", "movement", "view", "vista", "prospect", "panorama", "littoral zone", "intertidal"  …  "seaside", "slip", "gulf coast", "littoral", "seacoast", "incline", "side", "seashore", "litoral", "aeolia", "seaboard"])

In [49]:
using Statistics

In [51]:
mean(length, values(synonyms))

6.390917271159772

In [54]:
median(length.(values(synonyms)))

3.0

In [55]:
maximum(length, values(synonyms))

1024

In [56]:
[key for (key, entries) in synonyms if length(entries) > 1000]

1-element Array{String,1}:
 "change"

In [57]:
synonyms["change"]

Set(["action", "acetylise", "redact", "laicise", "unsex", "meliorate", "raise", "clothing", "nascence", "keratinise"  …  "validate", "centralize", "unify", "part", "shorten", "synchronize", "dizzy", "minify", "concretize", "draw", "liquidise"])