# refget-py tutorial

In [1]:
import refget
from refget import trunc512_digest

In [2]:
# Build the RefDB handle that every cell below uses, passing it a RedisDict
# instance as its store.
# NOTE(review): RedisDict presumably needs a reachable Redis server — confirm
# before running this notebook on a fresh machine.
rgdb = refget.RefDB(refget.RedisDict())

## Comparing sequence collections

We may want to know whether two collections contain the same sequences in a different order, or under different names. The `compare` function can provide this information. Let's load an example that adds another sequence:

In [3]:
# Load the baseline FASTA file. load_fasta returns a pair; the first element
# (checksum) is the handle later cells pass to refget() and compare().
fa_file = "demo_fasta/demo.fa"
checksum, content = rgdb.load_fasta(fa_file)

In [4]:
# Load a second FASTA file; its checksum is kept under a separate name so the
# two collections can be compared below.
fa_file = "demo_fasta/demo2.fa"
checksum2, content2 = rgdb.load_fasta(fa_file)

In [5]:
# Look the demo2.fa collection up by checksum to see which sequences it holds.
rgdb.refget(checksum2)

OrderedDict([('chr1', 'ACGT'), ('chr2', 'TCGA'), ('chrX', 'TTCCGGAA')])

In [6]:
# Same lookup for the demo.fa collection, for side-by-side comparison with the
# previous cell's output.
rgdb.refget(checksum)

OrderedDict([('chr1', 'ACGT'), ('chr2', 'TCGA')])

In [7]:
# Compare demo.fa with demo2.fa. In the result message, "A" is the first
# argument and "B" is the second.
rgdb.compare(checksum, checksum2)

'A is a sequence-level subset of B'

In [8]:
# Load demo3.fa and immediately display its contents; the bare refget() call
# on the last line is what produces the cell output.
fa_file = "demo_fasta/demo3.fa"
checksum3, content3 = rgdb.load_fasta(fa_file)
rgdb.refget(checksum3)

OrderedDict([('chr1', 'ACGT'), ('chrX', 'TTCCGGAA')])

In [9]:
# demo.fa vs demo3.fa: per the outputs above they have chr1 in common, while
# chr2 and chrX each appear in only one of the two.
rgdb.compare(checksum, checksum3)

'A and B share some sequences'

In [10]:
# Load demo4.fa and display its contents (last expression is the cell output).
fa_file = "demo_fasta/demo4.fa"
checksum4, content4 = rgdb.load_fasta(fa_file)
rgdb.refget(checksum4)

OrderedDict([('chrX', 'TTCCGGAA')])

In [11]:
# demo.fa vs demo4.fa: per the outputs above, demo4.fa's only sequence (chrX)
# does not occur in demo.fa.
rgdb.compare(checksum, checksum4)

'No sequences shared'

In [12]:
# demo2.fa vs demo4.fa: here the smaller collection is the second argument, so
# the subset relation is reported from B's side.
rgdb.compare(checksum2, checksum4)

'B is a sequence-level subset of A'

In [13]:
# Load demo5.fa and display its contents (last expression is the cell output).
fa_file = "demo_fasta/demo5.fa"
checksum5, content5 = rgdb.load_fasta(fa_file)
rgdb.refget(checksum5)

OrderedDict([('chrX', 'TTCCGGAA'), ('chr1', 'ACGT')])

In [14]:
# demo.fa vs demo5.fa: per the outputs above they have chr1 in common, while
# chr2 and chrX each appear in only one of the two.
rgdb.compare(checksum, checksum5)

'A and B share some sequences'

In [15]:
# demo3.fa vs demo5.fa: per the outputs above they hold the same two named
# sequences, listed in opposite order.
rgdb.compare(checksum3, checksum5)

'Sequence-level identical, order mismatch'

In [16]:
# Load demo6.fa and display its contents (last expression is the cell output).
fa_file = "demo_fasta/demo6.fa"
checksum6, content6 = rgdb.load_fasta(fa_file)
rgdb.refget(checksum6)

OrderedDict([('1', 'ACGT'), ('2', 'TCGA'), ('X', 'TTCCGGAA')])

In [17]:
# demo2.fa vs demo6.fa: per the outputs above the sequences match one-for-one,
# but demo6.fa names them 1/2/X instead of chr1/chr2/chrX.
rgdb.compare(checksum2, checksum6)

'Sequence-level identical, names mismatch'

# 3-layer refget

Yes, you can also continue this recursion indefinitely. To demonstrate, let's make a sequence that consists of the checksums of these 2 FASTA files, and load that into the database.

In [18]:
# Build the third-layer "sequence": name:checksum pairs joined by ';', using
# the checksums of the demo.fa and demo2.fa collections loaded earlier.
layer3seq = "demo1:{};demo2:{}".format(checksum, checksum2)
# Bare expression so the assembled string is shown as the cell output.
layer3seq

'demo1:fa7f432a0ca2a8bb224562f1c2f091e37937c4b0a5f5acf7;demo2:6f2e5266e233ef82ae4fa15fb621ed2843b19cbb5f099391'

In [19]:
# Load the assembled string as a sequence; the return value (displayed below)
# is the digest used to retrieve it in the following cells.
# NOTE(review): the digest is only displayed here, not stored in a variable,
# so the cells below repeat it as a string literal.
rgdb.load_seq(layer3seq)

'96b06cb2ef1fd4d8efb6c1d3a2be2415330c8fa77ebafc9d'

Can we retrieve it? You bet! At any recursion level:

In [20]:
# The literal is the digest returned by load_seq above. With no reclimit
# argument, the lookup in this example expands all the way down to the raw
# sequence strings.
rgdb.refget("96b06cb2ef1fd4d8efb6c1d3a2be2415330c8fa77ebafc9d")

OrderedDict([('demo1', OrderedDict([('chr1', 'ACGT'), ('chr2', 'TCGA')])),
             ('demo2',
              OrderedDict([('chr1', 'ACGT'),
                           ('chr2', 'TCGA'),
                           ('chrX', 'TTCCGGAA')]))])

In [21]:
# reclimit=1 stops after one level: each name maps to the checksum of its
# FASTA collection rather than to the collection's contents.
rgdb.refget("96b06cb2ef1fd4d8efb6c1d3a2be2415330c8fa77ebafc9d", reclimit=1)

OrderedDict([('demo1', 'fa7f432a0ca2a8bb224562f1c2f091e37937c4b0a5f5acf7'),
             ('demo2', '6f2e5266e233ef82ae4fa15fb621ed2843b19cbb5f099391')])

In [22]:
# reclimit=2 expands one level further: each collection is opened, but its
# entries are shown as per-sequence checksums instead of sequence strings.
rgdb.refget("96b06cb2ef1fd4d8efb6c1d3a2be2415330c8fa77ebafc9d", reclimit=2)

OrderedDict([('demo1',
              OrderedDict([('chr1',
                            '68a178f7c740c5c240aa67ba41843b119d3bf9f8b0f0ac36'),
                           ('chr2',
                            '3912dddce432f3085c6b4f72a644c4c4c73f07215a9679ce')])),
             ('demo2',
              OrderedDict([('chr1',
                            '68a178f7c740c5c240aa67ba41843b119d3bf9f8b0f0ac36'),
                           ('chr2',
                            '3912dddce432f3085c6b4f72a644c4c4c73f07215a9679ce'),
                           ('chrX',
                            'ef7dd6215b2dabc81d0a5eb0f0d84a739b0674de92679f88')]))])