In [0]:
# Importing the 3 libraries we need
import base64
import hashlib
import binascii

In [0]:
# Functions for computing sequence digests in the ga4gh identifier space and
# in the now-retired TRUNC512 scheme, and for translating between the two.

def trunc512_digest(seq, offset=24):
    """Return the TRUNC512 digest of ``seq`` as a lowercase hex string.

    The sequence is UTF-8 encoded and hashed with SHA-512. Only the first
    ``offset`` bytes of the hash are kept, so for offsets up to the hash
    length the result has two hex characters per kept byte (48 characters
    for the default of 24 bytes).
    """
    sha512_bytes = hashlib.sha512(seq.encode('utf-8')).digest()
    return sha512_bytes[:offset].hex()

def ga4gh_digest(seq, digest_size=24):
    """Return the ga4gh sequence identifier (``ga4gh:SQ.<base64url>``) for ``seq``.

    The sequence is UTF-8 encoded and hashed with SHA-512, the hash is
    truncated to its first ``digest_size`` bytes, and those bytes are
    base64url-encoded behind the ``ga4gh:SQ.`` prefix.

    Args:
        seq: Sequence to identify, as a ``str``.
        digest_size: Number of leading hash bytes to keep. Must be a
            multiple of 3 and lie between 0 and the length of the SHA-512
            digest (64 bytes).

    Returns:
        The identifier string, e.g.
        ``ga4gh:SQ.aKF498dAxcJAqme6QYQ7EZ07-fiw8Kw2`` for ``'ACGT'`` with the
        default size.

    Raises:
        AssertionError: if ``digest_size`` is not a multiple of 3, is
            negative, or exceeds the digest length.
    """
    # b64 encoding results in 4/3 size expansion of data and padded if
    # not multiple of 3, which doesn't make sense for this use
    assert digest_size % 3 == 0, "digest size must be multiple of 3"
    digest = hashlib.sha512(seq.encode('utf-8')).digest()
    # Slicing clamps silently: a size above the digest length keeps all 64
    # bytes and a negative size drops bytes from the end. Both give a byte
    # count that is not a multiple of 3 (i.e. padded output) even though the
    # check above passed, so reject them explicitly.
    assert 0 <= digest_size <= len(digest), (
        "digest size must be between 0 and {}".format(len(digest))
    )
    return _ga4gh_format(digest, digest_size)

def _ga4gh_format(digest, digest_size=24):
    """Format the first ``digest_size`` bytes of ``digest`` as a ga4gh identifier.

    The truncated bytes are encoded with the URL-safe base64 alphabet and
    appended to the ``ga4gh:SQ.`` prefix. No validation is done here; a
    ``digest_size`` larger than ``len(digest)`` simply keeps every byte.
    """
    truncated = digest[:digest_size]
    encoded = base64.urlsafe_b64encode(truncated).decode("utf-8")
    return "ga4gh:SQ.{}".format(encoded)

def ga4gh_to_trunc512(vmc):
    """Translate a ga4gh sequence identifier into its TRUNC512 hex digest.

    Every occurrence of the ``ga4gh:SQ.`` prefix text is removed from
    ``vmc``, the remaining URL-safe base64 text is decoded back to raw
    digest bytes, and those bytes are returned as a lowercase hex string.
    """
    encoded_digest = vmc.replace("ga4gh:SQ.", "")
    return base64.urlsafe_b64decode(encoded_digest).hex()

def trunc512_to_ga4gh(trunc512):
    """Translate a TRUNC512 hex digest into a ga4gh sequence identifier.

    Args:
        trunc512: Hex string of the truncated SHA-512 digest (two hex
            characters per byte).

    Returns:
        The ``ga4gh:SQ.``-prefixed identifier that ``_ga4gh_format`` builds
        from all of the decoded digest bytes.
    """
    digest = binascii.unhexlify(trunc512)
    # Two hex characters encode one byte, so the byte count is taken from
    # the decoded digest itself; the whole digest is kept when formatting.
    return _ga4gh_format(digest, len(digest))

The cell below shows the output of these functions: it creates a TRUNC512 digest and a ga4gh identifier for the same sequence, and shows how you can move between the two schemes with minimal effort.

In [21]:
# Demonstrate both schemes on the short sequence 'ACGT'. Everything is
# emitted by a single print call, one item per line; the empty strings
# produce the blank separator lines.
print(
    # Default TRUNC512 digest: the first 24 bytes of the SHA-512 hash.
    "TRUNC512 digest: {}".format(trunc512_digest('ACGT')),
    # NOTE(review): the label says "bits", but `offset` counts bytes, so
    # 26 keeps two more bytes (four more hex characters) than the default.
    "TRUNC512 digest +2 bits: {}".format(trunc512_digest('ACGT', 26)),
    "",
    # The same sequence expressed as a ga4gh identifier.
    "ga4gh identifier: {}".format(ga4gh_digest('ACGT')),
    # Converting the hex digest yields the identifier computed directly above.
    "TRUNC512 {} can be translated to {}".format(
        trunc512_digest('ACGT'),
        trunc512_to_ga4gh(trunc512_digest('ACGT')),
    ),
    "",
    # Identifier of the empty sequence.
    "Empty digest: {}".format(ga4gh_digest("")),
    sep="\n",
)

TRUNC512 digest: 68a178f7c740c5c240aa67ba41843b119d3bf9f8b0f0ac36
TRUNC512 digest +2 bits: 68a178f7c740c5c240aa67ba41843b119d3bf9f8b0f0ac36cf70

ga4gh identifier: ga4gh:SQ.aKF498dAxcJAqme6QYQ7EZ07-fiw8Kw2
TRUNC512 68a178f7c740c5c240aa67ba41843b119d3bf9f8b0f0ac36 can be translated to ga4gh:SQ.aKF498dAxcJAqme6QYQ7EZ07-fiw8Kw2

Empty digest: ga4gh:SQ.z4PhNX7vuL3xVChQ1m2AB9Yg5AULVxXc
