# refget-py tutorial

In [1]:
# Standard library first, then the package this tutorial demonstrates.
from platform import python_version

import refget
from refget import trunc512_digest

# Show the interpreter version the notebook was executed with.
python_version()

'3.7.6'

In [2]:
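# Alternative backend (Redis-backed dict instead of the MongoDB one used below), kept for reference:
# rgdb = refget.RefDB(refget.RedisDict())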

In [3]:
# Build the reference database on top of a MongoDB-backed dictionary
# (server on localhost:27017, database 'my_dict', collection 'store').
rgdb = refget.RefDB(
    refget.MongoDict(
        host='localhost',
        port=27017,
        database='my_dict',
        collection='store',
    )
)

In [4]:
# Display the database object to confirm it was constructed.
rgdb

<refget.refget.RefDB at 0x7f51ac3091d0>

## Comparing sequence collections

We may want to know whether two collections contain the same sequences in a different order, or under different names. The `compare` method provides this information. Let's load a base example and a second one that adds another sequence:

In [5]:
# Load the base FASTA file; load_fasta returns a pair that is unpacked into
# the collection checksum and a per-sequence content mapping.
fa_file = "demo_fasta/demo.fa"
checksum, content = rgdb.load_fasta(fa_file)

In [6]:
# Load the second FASTA file the same way (note: fa_file is reused for each load).
fa_file = "demo_fasta/demo2.fa"
checksum2, content2 = rgdb.load_fasta(fa_file)

In [7]:
# Inspect the content mapping returned by load_fasta for demo2.fa:
# one entry per sequence name, holding its length and a checksum under 'seq'.
content2

{'chr1': {'length': 4,
  'seq': '68a178f7c740c5c240aa67ba41843b119d3bf9f8b0f0ac36'},
 'chr2': {'length': 4,
  'seq': '3912dddce432f3085c6b4f72a644c4c4c73f07215a9679ce'},
 'chrX': {'length': 8,
  'seq': 'ef7dd6215b2dabc81d0a5eb0f0d84a739b0674de92679f88'}}

In [8]:
# Resolve the collection checksum back into names, lengths and the sequences themselves.
rgdb.refget(checksum2)

OrderedDict([('chr1', {'length': '4', 'seq': 'ACGT'}),
             ('chr2', {'length': '4', 'seq': 'TCGA'}),
             ('chrX', {'length': '8', 'seq': 'TTCCGGAA'})])

In [9]:
# Delimiter characters (used further down to build a multi-record string).
DELIM_LVL1 = u"\u00B7"
DELIM_LVL2 = u"\u2016"
DELIM_LVL3 = u"\u2980"
# refget() returns an already-parsed OrderedDict, not a delimited string, so
# there is nothing to split (calling .split on it raises AttributeError).
# List the sequence names of the collection instead.
x = rgdb.refget(checksum2)
list(x)

['chr1', 'chr2', 'chrX']

In [10]:
# Resolve the base collection (demo.fa) for comparison with the one shown earlier.
rgdb.refget(checksum)

OrderedDict([('chr1', {'length': '4', 'seq': 'ACGT'}),
             ('chr2', {'length': '4', 'seq': 'TCGA'})])

In [11]:
# A = demo.fa (chr1, chr2); B = demo2.fa (chr1, chr2, chrX).
rgdb.compare(checksum, checksum2)

'A is a sequence-level subset of B'

In [12]:
# Load a third collection and resolve it immediately to see its contents.
fa_file = "demo_fasta/demo3.fa"
checksum3, content3 = rgdb.load_fasta(fa_file)
rgdb.refget(checksum3)

OrderedDict([('chr1', {'length': '4', 'seq': 'ACGT'}),
             ('chrX', {'length': '8', 'seq': 'TTCCGGAA'})])

In [13]:
# A = demo.fa (chr1, chr2); B = demo3.fa (chr1, chrX) — only chr1 is common.
rgdb.compare(checksum, checksum3)

'A and B share some sequences'

In [14]:
# Load a collection and resolve it; the recorded result holds a single sequence.
fa_file = "demo_fasta/demo4.fa"
checksum4, content4 = rgdb.load_fasta(fa_file)
rgdb.refget(checksum4)

OrderedDict([('chrX', {'length': '8', 'seq': 'TTCCGGAA'})])

In [15]:
# A = demo.fa (chr1, chr2); B = demo4.fa (chrX only).
rgdb.compare(checksum, checksum4)

'No sequences shared'

In [16]:
# A = demo2.fa (chr1, chr2, chrX); B = demo4.fa (chrX only).
rgdb.compare(checksum2, checksum4)

'B is a sequence-level subset of A'

In [17]:
# Load a collection whose recorded contents match demo3.fa but in the opposite order.
fa_file = "demo_fasta/demo5.fa"
checksum5, content5 = rgdb.load_fasta(fa_file)
rgdb.refget(checksum5)

OrderedDict([('chrX', {'length': '8', 'seq': 'TTCCGGAA'}),
             ('chr1', {'length': '4', 'seq': 'ACGT'})])

In [18]:
# A = demo.fa (chr1, chr2); B = demo5.fa (chrX, chr1).
rgdb.compare(checksum, checksum5)

'A and B share some sequences'

In [19]:
# A = demo3.fa (chr1, chrX); B = demo5.fa (chrX, chr1) — same sequences, different order.
rgdb.compare(checksum3, checksum5)

'Sequence-level identical, order mismatch'

In [20]:
# Load a collection with the same sequences as demo2.fa but named 1, 2, X.
fa_file = "demo_fasta/demo6.fa"
checksum6, content6 = rgdb.load_fasta(fa_file)
rgdb.refget(checksum6)

OrderedDict([('1', {'length': '4', 'seq': 'ACGT'}),
             ('2', {'length': '4', 'seq': 'TCGA'}),
             ('X', {'length': '8', 'seq': 'TTCCGGAA'})])

In [21]:
# A = demo2.fa (chr1, chr2, chrX); B = demo6.fa (1, 2, X) — same sequences, different names.
rgdb.compare(checksum2, checksum6)

'Sequence-level identical, names mismatch'

In [22]:
# Load a collection that reuses the names of demo6.fa (1, 2, X) with different sequences.
fa_file = "demo_fasta/demo7.fa"
checksum7, content7 = rgdb.load_fasta(fa_file)
rgdb.refget(checksum7)

OrderedDict([('1', {'length': '4', 'seq': 'AGGG'}),
             ('2', {'length': '4', 'seq': 'TGGG'}),
             ('X', {'length': '8', 'seq': 'TTCCGGGG'})])

In [23]:
# A = demo6.fa, B = demo7.fa: identical names, but every sequence differs.
rgdb.compare(checksum6, checksum7)

'No sequences shared'

In [24]:
# Lengths stored in the load_fasta content mapping for demo6.fa
# (integers here, whereas refget() output shows lengths as strings).
[record['length'] for record in content6.values()]


[4, 4, 8]

In [25]:
# Pair each sequence name with its stored length.
[[seq_name, record['length']] for seq_name, record in content6.items()]

[['1', 4], ['2', 4], ['X', 8]]

## 3-layer refget

The recursion can be continued indefinitely. To demonstrate, let's build a sequence that consists of the checksums of the first two FASTA collections (`checksum` and `checksum2`) and load it into the database.

In [52]:
# Delimiter characters: level 2 sits between a record's name and its
# checksum, level 3 sits between records (level 1 is not used in this cell).
DELIM_LVL1 = "\u00B7"
DELIM_LVL2 = "\u2016"
DELIM_LVL3 = "\u2980"
# Two named records, each pointing at the checksum of a FASTA collection
# loaded earlier. The first name deliberately contains quote characters.
layer3seq = (
    "dem\"o:'1"
    f"{DELIM_LVL2}{checksum}{DELIM_LVL3}"
    f"demo2{DELIM_LVL2}{checksum2}"
)
layer3seq

'dem"o:\'1‖72e94ad00e97f3b46937c5dd02610006dc96cb782ed29cd2⦀demo2‖9f5e7027f4ef9d59a116d3db8446e5a9685213bc475c3abf'

In [53]:
# Store the encoded string; the returned digest is what refget() is given in the following cells.
rgdb.load_seq(layer3seq)

'3c7e8defcd28a5245c477a59da5a2a9f2bc96e7d55f71601'

Can we retrieve it? You bet! At any recursion level:

In [54]:
# Round trip with no recursion limit: every level is expanded down to the raw sequences.
rgdb.refget(rgdb.load_seq(layer3seq))

OrderedDict([('dem"o:\'1',
              {'length': None,
               'seq': OrderedDict([('chr1', {'length': '4', 'seq': 'ACGT'}),
                            ('chr2', {'length': '4', 'seq': 'TCGA'})])}),
             ('demo2',
              {'length': None,
               'seq': OrderedDict([('chr1', {'length': '4', 'seq': 'ACGT'}),
                            ('chr2', {'length': '4', 'seq': 'TCGA'}),
                            ('chrX', {'length': '8', 'seq': 'TTCCGGAA'})])})])

In [36]:
# Limit recursion to one level: each record's 'seq' stays a collection checksum.
# NOTE(review): the recorded output shows the key 'demo1', but layer3seq as built
# earlier in this notebook names its first record dem"o:'1, and this cell's
# execution count (36) precedes that cell's (52). The output looks stale;
# restart the kernel and re-run all cells to refresh it.
rgdb.refget(rgdb.load_seq(layer3seq), reclimit=1)

OrderedDict([('demo1',
              {'length': None,
               'seq': '72e94ad00e97f3b46937c5dd02610006dc96cb782ed29cd2'}),
             ('demo2',
              {'length': None,
               'seq': '9f5e7027f4ef9d59a116d3db8446e5a9685213bc475c3abf'})])

In [37]:
# Limit recursion to two levels: collections are expanded into per-sequence
# entries, whose 'seq' values remain checksums.
# NOTE(review): the recorded output shows the key 'demo1', but layer3seq as built
# earlier in this notebook names its first record dem"o:'1, and this cell's
# execution count (37) precedes that cell's (52). The output looks stale;
# restart the kernel and re-run all cells to refresh it.
rgdb.refget(rgdb.load_seq(layer3seq), reclimit=2)

OrderedDict([('demo1',
              {'length': None,
               'seq': OrderedDict([('chr1',
                             {'length': '4',
                              'seq': '68a178f7c740c5c240aa67ba41843b119d3bf9f8b0f0ac36'}),
                            ('chr2',
                             {'length': '4',
                              'seq': '3912dddce432f3085c6b4f72a644c4c4c73f07215a9679ce'})])}),
             ('demo2',
              {'length': None,
               'seq': OrderedDict([('chr1',
                             {'length': '4',
                              'seq': '68a178f7c740c5c240aa67ba41843b119d3bf9f8b0f0ac36'}),
                            ('chr2',
                             {'length': '4',
                              'seq': '3912dddce432f3085c6b4f72a644c4c4c73f07215a9679ce'}),
                            ('chrX',
                             {'length': '8',
                              'seq': 'ef7dd6215b2dabc81d0a5eb0f0d84a739b0674de92679f88'})])})])

In [38]:
        contents1 = rgdb.refget(checksum, reclimit=1)
        contents2 = rgdb.refget(checksum2, reclimit=1)

In [13]:
# One-level view of demo.fa: names and lengths, with checksums under 'seq'.
contents1

OrderedDict([('chr1',
              {'length': '4',
               'seq': '68a178f7c740c5c240aa67ba41843b119d3bf9f8b0f0ac36'}),
             ('chr2',
              {'length': '4',
               'seq': '3912dddce432f3085c6b4f72a644c4c4c73f07215a9679ce'})])

In [14]:
# One-level view of demo2.fa: same as above plus the additional chrX entry.
contents2

OrderedDict([('chr1',
              {'length': '4',
               'seq': '68a178f7c740c5c240aa67ba41843b119d3bf9f8b0f0ac36'}),
             ('chr2',
              {'length': '4',
               'seq': '3912dddce432f3085c6b4f72a644c4c4c73f07215a9679ce'}),
             ('chrX',
              {'length': '8',
               'seq': 'ef7dd6215b2dabc81d0a5eb0f0d84a739b0674de92679f88'})])