Port of the simhash algo (from a python port) in julia lang.

In [6]:
# run to reload the workspace/types and avoid the method overwrite warning
workspace()

In [7]:
using Nettle
using Formatting
using Iterators



In [8]:
_regex = r"\w+"
# default in py : r"[\w\u4e00-\u9fcc]+"
# going with the more straightforward get the words

r"\w+"

In [9]:
type Simhash
    # this, unfortunately, does not hash on init
    value
    f::Int
    
    function Simhash(value::BigInt, f::Int)
        # assuming this is the hashed simhash value
        # that was generated previously
        new(value, f)
    end
    function Simhash(value::AbstractString, f::Int)
        v = build_by_text(value, f)
        new(v, f)
    end
    function Simhash(value::Dict, f::Int)
        v = build_by_features(value, f)
        new(v, f)
    end
    function Simhash(value::AbstractArray, f::Int)
        v = build_by_features(value, f)
        new(v, f)
    end
    function Simhash(s::Simhash)
        new(s.value, s.f)
    end
end

function _hashfunc(x)
    h = Hasher("md5")
    update!(h, x)
    return parse(BigInt, hexdigest!(h), 16)
end

function distance(one::Simhash, another::Simhash)
    # this is ridiculous, julia
    x =  BigInt((one.value $ another.value) & ((BigInt(1) << BigInt(one.f)) - BigInt(1)))
    ans = 0
    while x > 0
        ans += 1
        x &= x - 1
    end
    return ans
end

function _tokenize(content::String)
    content = lowercase(content)
    content = join(matchall(_regex, content), "")
    return _slide(content)
end

function _slide(content::String, width=4)
    return [content[i:i + width - 1] for i in 1:max(length(content) - width, 1) + 1] 
end

function build_by_text(content::String, f)
    features = _tokenize(content)
    fdict = Dict{String, Integer}(g[1] => length(g) for g = groupby(x -> x, sort(features)))
    return build_by_features(fdict, f)
end

function build_by_features(features::Any, f)
    v = zeros(f)
    masks = [BigInt(1) << BigInt(i-1) for i in 1:f]
    
    if isa(features, Dict)
        # to list of tuples
        features = collect(features)
    end
    
    for feat in features
        # strings are by default unicode
        if isa(feat, String)
            h = _hashfunc(feat)
            w = 1
        else
            # without assertion for the iterables
            h = _hashfunc(feat[1])
            w = feat[2] 
        end
            
        for i in 1:f
            v[i] += h & masks[i] != 0 ? w : -w
        end
    end
    
    ans = 0
    for i in 1:f
        if v[i] >= 0
            ans |= masks[i] 
        end
    end
    
    # numeric rep of the string/iterable
    return ans
end

build_by_features (generic function with 1 method)

In [10]:
s1 = Simhash(_hashfunc("23456"), 64)
s2 = Simhash(_hashfunc("23455"), 64)

# println(distance(s1, s2))

distance(s1, s2)



39

In [11]:
s = Simhash(["aaa", "bbb"], 64)
# this says there's something off with the bitwise ops still
# regardless of the py long -> jl int128 problems

Simhash(8637903533912358349,64)

In [40]:
typeof(s.value)

UInt128

In [12]:
using Base.Test

@test Simhash(["aaa", "bbb"], 64).value == 8637903533912358349


[1m[32mTest Passed
[0m  Expression: Simhash(["aaa","bbb"],64).value == 8637903533912358349
   Evaluated: 8637903533912358349 == 8637903533912358349

In [None]:
# port of the SimHashIndex class

_fe = FormatExpr("{1:x},{2}")

type SimhashIndex 
    # from py : list of tuples
    objs
    f::Int64
    k::Float16
    
    count::Int16
    
    # from py : default dict of sets
    bucket 
    
    # init the bucket with the objs in the list
    function _init
        bucket = Dict{String, Set}
        
        for i, q in enumerate(objs)
            # ignoring the logging here
            (bucket).add(*q) # SUPER DUPER UNPORTED
            
            # *[list]
        end
    end
end

function _get_keys(simhash)
    # UNPORTED
    
end

# ADD ALL THE INDEX REFS BITS
function get_near_dups(simhash)
    # simhash = simhash obj to check against index
    ans = Set()
    
    for key in _get_keys(simhash)
        dups = bucket[key]
        
        if length(dups) > 200
            println("Warning: big bucket found: ", key) 
        end
        
        for dup in dups
           # UNPORTED
            simhash2, obj_id = split(dup, ',') # dup.split(',', 1)
            simhash2 = Simhash(simhash2, f) # Simhash(long(sim2, 16), self.f)
            
            d = simhash.distance(simhash2)
            if d <= k
                ans.add(obj_id) 
            end
        end
    end
    
    return ans # CONVERT TO LIST from a set
end

function add_to_index(obj_id, simhash)
    # string, simhash
    for key in _get_keys(simhash)
        v = printfmt(fe, simhash.value, obj_id) 
        push!(bucket[key], v)
    end
end

function delete_from_index(obj_id, simhash)
    # string, simhash
    for key in _get_keys(simhash)
        v = printfmt(fe, simhash.value, obj_id)
        delete!(bucket[key], v)
    end
end

# @property equivalent
function offsets
    # per the orig.: You may optimize this method according to 
    #     <http://www.wwwconference.org/www2007/papers/paper215.pdf>
    
    # [self.f // (self.k + 1) * i for i in range(self.k + 1)]
    return [f // (k+1) * i for i in 1:(k+1)]
end


In [30]:
content = "1798877888 6168394363 bob"
# _regex = r"[\w\u4e00-\u9fcc]+"
# _regex = r"[\w\x{4E00}-\x{9FCC}]+"
_regex = r"\w+"


join(matchall(_regex, content), "")

"17988778886168394363bob"

In [1]:
using Iterators
features = ["1383","0552","3835","8350","3505","5058","0580","5805","8055","0552","5528","5282","2821","8216","2163","1637","6371","3712"]

println(typeof(features))
#fdict = {k:sum(1 for _ in g) for k, g in groupby(sort(features))}
# fd = {k[1]:length(k) for k in groupby(sort(features))}

# sorted = sort(features)

fd = Dict{String, Integer}(g[1] => length(g) for g = groupby(x->x, sort(features)))
#(String, Integer) => length(g) for g = groupby(sort(features))

fd

Array{String,1}


Dict{String,Integer} with 17 entries:
  "6371" => 1
  "5528" => 1
  "5058" => 1
  "3505" => 1
  "5805" => 1
  "0552" => 2
  "8216" => 1
  "2163" => 1
  "3835" => 1
  "2821" => 1
  "5282" => 1
  "1637" => 1
  "8350" => 1
  "0580" => 1
  "1383" => 1
  "3712" => 1
  "8055" => 1

In [4]:
collect(fd)

17-element Array{Pair{String,Integer},1}:
 Pair{String,Integer}("6371",1)
 Pair{String,Integer}("5528",1)
 Pair{String,Integer}("5058",1)
 Pair{String,Integer}("3505",1)
 Pair{String,Integer}("5805",1)
 Pair{String,Integer}("0552",2)
 Pair{String,Integer}("8216",1)
 Pair{String,Integer}("2163",1)
 Pair{String,Integer}("3835",1)
 Pair{String,Integer}("2821",1)
 Pair{String,Integer}("5282",1)
 Pair{String,Integer}("1637",1)
 Pair{String,Integer}("8350",1)
 Pair{String,Integer}("0580",1)
 Pair{String,Integer}("1383",1)
 Pair{String,Integer}("3712",1)
 Pair{String,Integer}("8055",1)

In [32]:
workspace()
using Nettle
function bp(features, f)
    v = zeros(f)
    # check for the end here
    masks = [UInt128(1) << UInt128(i) for i in 1:f]
    
    if isa(features, Dict)
        # to list of tuples
        features = collect(features)
    end
    
    for f in features
        # strings are by default unicode
        if isa(f, String)
#             h = _hashfunc(f)
            h = Hasher("md5")
            update!(h, f)
            h = parse(UInt128, hexdigest!(h), 16)
            w = 1
        else
                # without assertion for the iterables
#             h = _hashfunc(f[1])
            h = Hasher("md5")
            update!(h, f[1])
            h = parse(UInt128, hexdigest!(h), 16)
            w = f[2] 
        end
        
        for i in 1:length(f)
            # so UNPORTED here
            v[i] += h & masks[i] != nothing ? w : -w
        end
    end
    
    ans = 0
    for i in 1:length(f)
        if v[i] >= 0
            ans |= masks[i] 
        end
    end
    
    # numeric rep of the string/iterable
    return ans
end

bp(["aaa", "bbb"], 64)



0x0000000000000000000000000000000e