In [None]:
Port of the simhash algo (from a python port) in julia lang.

In [25]:
# run to reload the workspace/types and avoid the method overwrite warning
workspace()

In [26]:
using Nettle
using Formatting
using Iterators



In [27]:
_regex = r"\w+"
# default in py : r"[\w\u4e00-\u9fcc]+"
# going with the more straightforward get the words

r"\w+"

In [28]:
type Simhash
    # this, unfortunately, does not hash on init
    value
    f::Int
    
    function Simhash(value::BigInt, f::Int)
        new(value, f)
    end
    function Simhash(value::Simhash, f::Int)
        new(value.value, f)
    end
    function Simhash(value::AbstractString, f::Int)
        v = build_by_text(value, f)
        new(v, f)
    end
    function Simhash(value::Dict, f::Int)
        v = build_by_features(value, f)
        new(v, f)
    end
    function Simhash(value::AbstractArray, f::Int)
        v = build_by_features(value, f)
        new(v, f)
    end
    function Simhash(s::Simhash)
        new(s.value, s.f)
    end
end

function _hashfunc(x)
    h = Hasher("md5")
    update!(h, x)
    return parse(BigInt, hexdigest!(h), 16)
end

function distance(one::Simhash, another::Simhash)
    # this is ridiculous, julia
    x =  BigInt((one.value $ another.value) & ((BigInt(1) << BigInt(one.f)) - BigInt(1)))
    ans = 0
    while x > 0
        ans += 1
        x &= x - 1
    end
    return ans
end

function _tokenize(content::String)
    content = lowercase(content)
    content = join(matchall(_regex, content), "")
    return _slide(content)
end

function _slide(content::String, width=4)
    return [content[i:min(i + width-1, length(content))] for i in 1:max(length(content) - width, 1) + 1] 
end

function build_by_text(content::String, f)
    features = _tokenize(content)
    fdict = Dict{String, Integer}(g[1] => length(g) for g = groupby(x -> x, sort(features)))
    return build_by_features(fdict, f)
end

function build_by_features(features::Any, f)
    v = zeros(f)
    masks = [BigInt(1) << BigInt(i-1) for i in 1:f]
    
    if isa(features, Dict)
        # to list of tuples
        features = collect(features)
    end
    
    for feat in features
        # strings are by default unicode
        if isa(feat, String)
            h = _hashfunc(feat)
            w = 1
        else
            # without assertion for the iterables
            h = _hashfunc(feat[1])
            w = feat[2] 
        end
            
        for i in 1:f
            v[i] += h & masks[i] != 0 ? w : -w
        end
    end
    
    ans = 0
    for i in 1:f
        if v[i] >= 0
            ans |= masks[i] 
        end
    end
    
    # numeric rep of the string/iterable
    return ans
end

build_by_features (generic function with 1 method)

In [21]:
s1 = Simhash(_hashfunc("23456"), 64)
s2 = Simhash(_hashfunc("23455"), 64)

distance(s1, s2)

39

In [51]:
Simhash("ab", 64)

Simhash(4600387986561040303,64)

In [21]:
s = Simhash(["aaa", "bbb"], 64)

Simhash(8637903533912358349,64)

In [52]:
using Base.Test

# basic check
@test Simhash(["aaa", "bbb"], 64).value == 8637903533912358349

# distance tests
sh = Simhash("How are you? I AM fine. Thanks. And you?", 64)
sh2 = Simhash("How old are you ? :-) i am fine. Thanks. And you?", 64)

@test distance(sh, sh2) > 0

sh3 = Simhash(sh2, 64)

@test distance(sh2, sh3) == 0

@test distance(Simhash("1", 64), Simhash("2", 64)) != 0

[1m[32mTest Passed
[0m  Expression: distance(Simhash("1",64),Simhash("2",64)) != 0
   Evaluated: 14 != 0

In [60]:
# short tests

shs = [Simhash(s, 64).value for s in ("aa", "aaa", "aaaa", "aaaab", "aaaaabb", "aaaaabbb")]

@testset begin
for (i, sh1) in enumerate(shs)
    for (j, sh2) in enumerate(shs)
        if i != j
            @test sh1 != sh2
        end
    end
end
    
end

[1m[37mTest Summary: | [0m[1m[32mPass  [0m[1m[34mTotal[0m
  test set    | [1m[32m  30  [0m[1m[34m   30[0m


Base.Test.DefaultTestSet("test set",Any[[1m[32mTest Passed
[0m  Expression: sh1 != sh2
   Evaluated: 4174764262805172083 != 8070439531712346394,[1m[32mTest Passed
[0m  Expression: sh1 != sh2
   Evaluated: 4174764262805172083 != 17870237137601691117,[1m[32mTest Passed
[0m  Expression: sh1 != sh2
   Evaluated: 4174764262805172083 != 16140833772797945853,[1m[32mTest Passed
[0m  Expression: sh1 != sh2
   Evaluated: 4174764262805172083 != 15564370821471127029,[1m[32mTest Passed
[0m  Expression: sh1 != sh2
   Evaluated: 4174764262805172083 != 15555363622182306293,[1m[32mTest Passed
[0m  Expression: sh1 != sh2
   Evaluated: 8070439531712346394 != 4174764262805172083,[1m[32mTest Passed
[0m  Expression: sh1 != sh2
   Evaluated: 8070439531712346394 != 17870237137601691117,[1m[32mTest Passed
[0m  Expression: sh1 != sh2
   Evaluated: 8070439531712346394 != 16140833772797945853,[1m[32mTest Passed
[0m  Expression: sh1 != sh2
   Evaluated: 8070439531712346394 != 15564370821

In [29]:
# port of the SimHashIndex class

_fe = FormatExpr("{1:x},{2}")

type SimhashIndex 
    # from py : list of tuples
    objs::Array{Tuple{String,Simhash}}
    f::Int
    k::Int
    count::Int
    
    # from py : default dict of sets
    bucket::Dict{String, Set} 
    
    # generate the offsets from the f, k
    offsets::Array{Int}
    
    function SimhashIndex(objs::Array{Tuple{String,Simhash}}, f::Int, k::Int)
        bucket = Dict{String, Set}(Set())
        offsets = _offsets(f, k)
        for obj in objs
            add_to_index(bucket, obj, offsets)
        end
        new(objs, f, k, length(objs), bucket, offsets)
    end
end

function add_to_index(bucket, obj, offsets)
    # obj = Tuple(string, simhash)
    for key in _get_keys(obj[2], offsets)
        v = format(_fe, obj[2].value, obj[1]) 
        if !haskey(bucket, key)
            bucket[key] = Set()
        end
        push!(bucket[key], v)
    end
end

function _offsets(f, k)
    # per the orig.: You may optimize this method according to 
    #     <http://www.wwwconference.org/www2007/papers/paper215.pdf>
    return [fld(f, (k + 1)) * (i - 1) for i in 1:(k + 1)] 
end

function _get_keys(simhash, offsets)
    keys = []
    fe = FormatExpr("{1:x}:{2:x}")
    for (i, offset) in enumerate(offsets)
        if i == length(offsets)
            m = 2 ^ (simhash.f - offset) - 1
        else
            m = 2 ^ (offsets[i+1] - offset) - 1
        end
        c = (simhash.value >> offset) & m
        push!(keys, format(fe, c, i))
    end
    return keys
end

function get_near_dups(index, simhash,)
    # simhash = simhash obj to check against index
    ans = Set()
    
    for key in _get_keys(simhash, index.offsets)
        dups = index.bucket[key]
        
        if length(dups) > 200
            println("Warning: big bucket found: ", key) 
        end
        
        for dup in dups
            simhash2, obj_id = split(dup, ',', limit=2)
            simhash2 = Simhash(parse(BigInt, simhash2, 16), simhash.f)
            d = distance(simhash, simhash2)
            if d <= index.k
                push!(ans, obj_id)
            end
        end
    end
    
    return collect(ans)
end

function delete_from_index(obj_id, simhash)
    # string, simhash
    for key in _get_keys(simhash)
        v = format(_fe, simhash.value, obj_id)
        delete!(bucket[key], v)
    end
end


delete_from_index (generic function with 1 method)

In [30]:
using Base.Test

data = Dict{Int, String}(1=>"How are you? I Am fine. blar blar blar blar blar Thanks.",
    2=>"How are you i am fine. blar blar blar blar blar than",
    3=>"This is simhash test.",
    4=>"How are you i am fine. blar blar blar blar blar thank1"
)

objs = [("$(k)", Simhash(v, 64)) for (k,v) in collect(data)]

simhash_index = SimhashIndex(objs, 64, 10)

SimhashIndex(Tuple{String,Simhash}[("4",Simhash(17663612459742043242,64)),("2",Simhash(8440240356449459322,64)),("3",Simhash(9984379969213434071,64)),("1",Simhash(8440240427182201978,64))],64,10,4,Dict{String,Set}(Pair{String,Set}("7:3",Set(Any["f521c1f241169c6a,4","7521c1e2c9161c7a,2","7521c1f341161c7a,1"])),Pair{String,Set}("2:3",Set(Any["8a8fa4aeb77e08d7,3"])),Pair{String,Set}("a:1",Set(Any["f521c1f241169c6a,4"])),Pair{String,Set}("1:9",Set(Any["f521c1f241169c6a,4","7521c1e2c9161c7a,2","7521c1f341161c7a,1"])),Pair{String,Set}("4:9",Set(Any["8a8fa4aeb77e08d7,3"])),Pair{String,Set}("22a3:b",Set(Any["8a8fa4aeb77e08d7,3"])),Pair{String,Set}("9:7",Set(Any["f521c1f241169c6a,4"])),Pair{String,Set}("3d48:b",Set(Any["f521c1f241169c6a,4"])),Pair{String,Set}("1e:8",Set(Any["f521c1f241169c6a,4","7521c1f341161c7a,1"])),Pair{String,Set}("1d:a",Set(Any["8a8fa4aeb77e08d7,3"]))…),[0,5,10,15,20,25,30,35,40,45,50])

In [31]:
@testset "Get Near Dupes" begin
    s1 = Simhash("How are you i am fine.ablar ablar xyz blar blar blar blar blar blar blar thank", 64)
    dups = get_near_dups(simhash_index, s1)
    @test length(dups) == 3
end

[1m[37mGet Near Dupes: [0m[1m[31mError During Test
[0m  Got an exception of type KeyError outside of a @test
  KeyError: key "5:6" not found
   in getindex at ./dict.jl:688 [inlined]
   in get_near_dups(::SimhashIndex, ::Simhash) at ./In[29]:65
   in macro expansion; at ./In[31]:4 [inlined]
   in macro expansion; at ./test.jl:672 [inlined]
   in anonymous at ./<missing>:?
   in include_string(::String, ::String) at ./loading.jl:441
   in execute_request(::LastMain.LastMain.LastMain.LastMain.ZMQ.Socket, ::LastMain.LastMain.LastMain.LastMain.IJulia.Msg) at /Users/sparky/.julia/v0.5/IJulia/src/execute_request.jl:157
   in eventloop(::LastMain.LastMain.LastMain.LastMain.ZMQ.Socket) at /Users/sparky/.julia/v0.5/IJulia/src/eventloop.jl:8
   in (::LastMain.LastMain.LastMain.LastMain.IJulia.##13#19)() at ./task.jl:360
[1m[37mTest Summary:  | [0m[1m[31mError  [0m[1m[34mTotal[0m
  Get Near Dupes | [1m[31m    1  [0m[1m[34m    1[0m


LoadError: Some tests did not pass: 0 passed, 0 failed, 1 errored, 0 broken.