# seqcol tutorial

In [1]:
import seqcol
from seqcol import trunc512_digest

Show some results for sequence digests:

In [2]:
trunc512_digest('ACGT')

'68a178f7c740c5c240aa67ba41843b119d3bf9f8b0f0ac36'

In [3]:
trunc512_digest('TCGA')

'3912dddce432f3085c6b4f72a644c4c4c73f07215a9679ce'

In [4]:
# The optional second argument changes the digest length: with 26 the result
# below has 52 hex characters, versus 48 for the one-argument calls above
# (presumably the number of bytes kept — confirm against the seqcol docs).
trunc512_digest('ACGT', 26)

'68a178f7c740c5c240aa67ba41843b119d3bf9f8b0f0ac36cf70'

## Use a database

Now, instantiate a client object (`seqcol.SeqColClient` in the cell below). You have to provide a database in which the lookup values will be stored. For a demo, you can use a plain dictionary as the lookup database, but its contents will obviously not persist.

Seed our database with a few pre-existing entries:

In [5]:
# Seed the in-memory lookup table with two known sequences, keyed by digest.
local_lookup_dict = {trunc512_digest(seq): seq for seq in ("ACGT", "TCGA")}

# Hand the plain dict to the client as its (non-persistent) database.
scdb_local = seqcol.SeqColClient(local_lookup_dict)


In [9]:
# NOTE(review): as the recorded output shows, this call fails with a TypeError
# because insert() also requires an `item_type` argument — TODO pass the
# appropriate item type for a raw sequence.
scdb_local.insert("ACGT")

TypeError: insert() missing 1 required positional argument: 'item_type'

Retrieve sequences using the checksum

In [8]:
from henge import ITEM_TYPE
ITEM_TYPE

'_item_type'

In [6]:
scdb_local.retrieve(trunc512_digest('TCGA'))

NotFoundException: 3912dddce432f3085c6b4f72a644c4c4c73f07215a9679ce not found in database

In [23]:
scdb_local.show()

68a178f7c740c5c240aa67ba41843b119d3bf9f8b0f0ac36 ACGT
3912dddce432f3085c6b4f72a644c4c4c73f07215a9679ce TCGA


We can also add new sequences into the database:

In [7]:
scdb_local.seqcol(trunc512_digest('TCGATCGA'))  # This sequence is not found in our database yet

'Not found'

In [8]:
checksum = scdb_local.load_seq("TCGATCGA")  # So, let's add it into database

In [9]:
scdb_local.seqcol(checksum)  # This time it returns

'TCGATCGA'

## Switching to a Redis back-end
A dict used as a database will not persist, so let's use a Redis back-end instead. If you are already running a local Redis server, you can use that as the back-end. Otherwise, first start up a server like this:

```
docker run --rm --network='host' --workdir="`pwd`" redis:5.0.5 redis-server
```

Then you can instantiate a new `RefDB` object that uses the Redis server like this:

In [10]:
rgdb = seqcol.RefDB(seqcol.RedisDict())

## Database insertion

Insert a sequence into the database, then retrieve it via checksum

In [11]:
checksum = rgdb.load_seq("GGAA")
rgdb.seqcol(checksum)

ConnectionError: Error 111 connecting to localhost:6379. Connection refused.

## Insert and retrieve a sequence collection (fasta file)

In [12]:
fa_file = "demo_fasta/demo.fa"
checksum, content = rgdb.load_fasta(fa_file)

Here we retrieve all the sequences in the fasta file:

In [13]:
rgdb.seqcol(checksum)

OrderedDict([('chr2', 'TCGA'), ('chr1', 'ACGT')])

If you want it in fasta format there's a helper function for that:

In [14]:
print(rgdb.fasta_fmt(rgdb.seqcol(checksum)))

>chr2
TCGA
>chr1
ACGT


You can limit recursion to get just the checksums for individual sequences, rather than the sequences themselves:

In [15]:
rgdb.seqcol(checksum, reclimit=1)

OrderedDict([('chr2', '3912dddce432f3085c6b4f72a644c4c4c73f07215a9679ce'),
             ('chr1', '68a178f7c740c5c240aa67ba41843b119d3bf9f8b0f0ac36')])

The individual sequences are also retrievable independently because each sequence from the fasta file is stored as a primary unit. Test some single-sequence lookups from the database:

In [16]:
rgdb.seqcol(content["chr1"])

'ACGT'

In [17]:
rgdb.seqcol(trunc512_digest('ACGT'))

'ACGT'

Now, if we discard that object and create a new one using the same Redis back-end, the data is still available because it is stored in Redis rather than in the Python object:

In [20]:
rgdb = None
rgdb = seqcol.RefDB(seqcol.RedisDict())
rgdb.seqcol(checksum)

OrderedDict([('chr2', 'TCGA'), ('chr1', 'ACGT')])

In [21]:
checksum

'ed0418293aef247863a4716d7ea806eb5e6b4126e963787d'

In [22]:
rgdb.seqcol("3912dddce432f3085c6b4f72a644c4c4c73f07215a9679ce")

'TCGA'

## Using a MongoDB back-end


In [1]:
import seqcol
from seqcol import trunc512_digest
from mongodict import MongoDict
import pymongo
# Compatibility shim: define pymongo.Connection as a factory that builds a
# pymongo.MongoClient from host and port. Extra keyword arguments are accepted
# but not forwarded to MongoClient.
# NOTE(review): presumably needed because mongodict still calls
# pymongo.Connection, which the installed pymongo no longer provides — confirm.
pymongo.Connection = lambda host, port, **kwargs: pymongo.MongoClient(host=host, port=port)
from platform import python_version 
# Display the interpreter version this section was run with.
python_version()

'3.7.5'

In [2]:
# Open the MongoDB-backed dict that the cells below pass to SeqColClient.
# Previously only the first line of this statement was commented out, which
# left a dangling continuation line (a syntax error) and `my_dict` undefined.
# Requires a MongoDB server listening on localhost:27017.
my_dict = MongoDict(host='localhost', port=27017, database='my_dict',
                    collection='store')

In [3]:
#my_dict

<mongodict.MongoDict at 0x7f1d66a0cf90>

In [4]:
rgdb = seqcol.SeqColClient(my_dict)

In [5]:
fa_file = "demo_fasta/demo.fa"
checksum, content = rgdb.load_fasta(fa_file)

In [7]:
rgdb.seqcol(checksum)

OrderedDict([('chr1', {'length': '4', 'seq': 'ACGT'}),
             ('chr2', {'length': '4', 'seq': 'TCGA'})])

In [8]:
rgdb.seqcol(checksum, reclimit=1)

OrderedDict([('chr1',
              {'length': '4',
               'seq': '68a178f7c740c5c240aa67ba41843b119d3bf9f8b0f0ac36'}),
             ('chr2',
              {'length': '4',
               'seq': '3912dddce432f3085c6b4f72a644c4c4c73f07215a9679ce'})])

In [12]:
fa_object = seqcol.parse_fasta(fa_file)

In [20]:
        content_checksums = {}
        for k in fa_object.keys():
            seq = str(fa_object[k])
            content_checksums[k] = {'length': len(seq), 'seq': rgdb.load_seq(seq)}

In [21]:
content_checksums

{'chr1': {'length': 4,
  'seq': '68a178f7c740c5c240aa67ba41843b119d3bf9f8b0f0ac36'},
 'chr2': {'length': 4,
  'seq': '3912dddce432f3085c6b4f72a644c4c4c73f07215a9679ce'}}

In [23]:
# Serialize the collection as "name:length/checksum" entries joined by ';'.
collection_string = ";".join(
    "{}:{}/{}".format(name, value["length"], value["seq"])
    for name, value in content_checksums.items()
)

In [24]:
collection_string

'chr1:4/68a178f7c740c5c240aa67ba41843b119d3bf9f8b0f0ac36;chr2:4/3912dddce432f3085c6b4f72a644c4c4c73f07215a9679ce'

Next, I want to use a remote API as if it were a dict: the class below wraps a refget HTTP endpoint in a dict-like interface.

In [33]:
import abc
from collections.abc import Mapping, MutableMapping
import requests
class APIDict(MutableMapping):
    """Dict-like, read-only view of a remote refget sequence server.

    ``api[digest]`` sends an HTTP GET to ``BASE_URL + digest`` and returns the
    response body decoded as UTF-8 text. Nothing is stored locally, so item
    assignment and deletion are no-ops and iteration yields no keys.

    The former ``__getattr__`` hook was removed: it fired an HTTP request for
    every missing attribute (including the probing done by ``hasattr`` and by
    IPython's display machinery) and made ``__repr__`` fail.

    NOTE(review): the HTTP status is not checked, so an unknown digest yields
    the server's error text (e.g. 'Not Found') instead of raising KeyError —
    confirm whether callers rely on this before tightening it.
    """

    # Class-level (not instance) attributes, so they never appear in
    # ``self.__dict__``, which ``__iter__`` exposes as the key set.
    BASE_URL = "https://refget.herokuapp.com/sequence/"
    TIMEOUT = 30  # seconds; without a timeout requests.get can block forever

    def _fetch(self, url):
        """Return the body of ``url`` decoded as UTF-8 text."""
        response = requests.get(url, timeout=self.TIMEOUT)
        return response.content.decode('utf-8')

    def __getitem__(self, item):
        """Fetch the plain-text sequence stored under digest ``item``.

        The requested URL is printed after the request completes, as before.
        """
        url = "{base}{digest}?accept=text/plain".format(
            base=self.BASE_URL,
            digest=item)
        result = self._fetch(url)
        print(url)
        return result

    def __setitem__(self, key, value):
        """Ignore writes: the remote API is treated as read-only."""
        pass

    def __delitem__(self, item):
        """Ignore deletes: the remote API is treated as read-only."""
        pass

    def __iter__(self):
        """Iterate over instance attribute names (none are set by this class)."""
        return iter(list(self.__dict__))

    def __len__(self):
        """Return the number of keys ``__iter__`` would yield."""
        return len(self.__dict__)

    def __repr__(self):
        return "{}({!r})".format(type(self).__name__, self.BASE_URL)


In [34]:
a = APIDict()
a["68a178f7c740c5c240aa67ba41843b119d3bf9f8b0f0ac36"]


https://refget.herokuapp.com/sequence/68a178f7c740c5c240aa67ba41843b119d3bf9f8b0f0ac36?accept=text/plain


'Not Found'

In [35]:
x = a["6681ac2f62509cfc220d78751b8dc524"]

https://refget.herokuapp.com/sequence/6681ac2f62509cfc220d78751b8dc524?accept=text/plain


In [36]:
print(x)

CCACACCACACCCACACACCCACACACCACACCACACACCACACCACACCCACACACACACATCCTAACACTACCCTAACACAGCCCTAATCTAACCCTGGCCAACCTGTCTCTCAACTTACCCTCCATTACCCTGCCTCCACTCGTTACCCTGTCCCATTCAACCATACCACTCCGAACCACCATCCATCCCTCTACTTACTACCACTCACCCACCGTTACCCTCCAATTACCCATATCCAACCCACTGCCACTTACCCTACCATTACCCTACCATCCACCATGACCTACTCACCATACTGTTCTTCTACCCACCATATTGAAACGCTAACAAATGATCGTAAATAACACACACGTGCTTACCCTACCACTTTATACCACCACCACATGCCATACTCACCCTCACTTGTATACTGATTTTACGTACGCACACGGATGCTACAGTATATACCATCTCAAACTTACCCTACTCTCAGATTCCACTTCACTCCATGGCCCATCTCTCACTGAATCAGTACCAAATGCACTCACATCATTATGCACGGCACTTGCCTCAGCGGTCTATACCCTGTGCCATTTACCCATAACGCCCATCATTATCCACATTTTGATATCTATATCTCATTCGGCGGTCCCAAATATTGTATAACTGCCCTTAATACATACGTTATACCACTTTTGCACCATATACTTACCACTCCATTTATATACACTTATGTCAATATTACAGAAAAATCCCCACAAAAATCACCTAAACATAAAAATATTCTACTTTTCAACAATAATACATAAACATATTGGCTTGTGGTAGCAACACTATCATGGTATCACTAACGTAAAAGTTCCTCAATATTGCAATTTGCTTGAACGGATGCTATTTCAGAATATTTCGTACTTACACAGGCCATACATTAGAATAATATGTCACATCACTGTCGTAACACTCTTTATTCACCGAGCAATAATACGGTAGTGGCTCAAACTCATGCGGGTGCTATGA

In [54]:
import refget
from sqlitedict import SqliteDict
# Open an SQLite-file-backed dict to use as the database for the client below.
# NOTE(review): this absolute path points into the original author's home
# directory — change it to a location that exists on your machine.
mydict = SqliteDict('/home/nsheff/code/refget/docs_jupyter/my_db.sqlite', autocommit=True)

In [55]:
import henge 
# Build a client on top of the SQLite-backed dict, passing one schema file.
# NOTE(review): the schema path is an absolute path on the original author's
# machine — point it at your own copy of RawSeqCol.yaml before running.
sc = seqcol.SeqColClient(database=mydict, schemas=["/home/nsheff/code/seqcol/seqcol/schemas/RawSeqCol.yaml"])

In [56]:
sc.show()

6681ac2f62509cfc220d78751b8dc524 CCACACCACACCCACACACCCACACACCACACCACACACCACACCACACCCACACACACACATCCTAACACTACCCTAACACAGCCCTAATCTAACCCTGGCCAACCTGTCTCTCAACTTACCCTCCATTACCCTGCCTCCACTCGTTACCCTGTCCCATTCAACCATACCACTCCGAACCACCATCCATCCCTCTACTTACTACCACTCACCCACCGTTACCCTCCAATTACCCATATCCAACCCACTGCCACTTACCCTACCATTACCCTACCATCCACCATGACCTACTCACCATACTGTTCTTCTACCCACCATATTGAAACGCTAACAAATGATCGTAAATAACACACACGTGCTTACCCTACCACTTTATACCACCACCACATGCCATACTCACCCTCACTTGTATACTGATTTTACGTACGCACACGGATGCTACAGTATATACCATCTCAAACTTACCCTACTCTCAGATTCCACTTCACTCCATGGCCCATCTCTCACTGAATCAGTACCAAATGCACTCACATCATTATGCACGGCACTTGCCTCAGCGGTCTATACCCTGTGCCATTTACCCATAACGCCCATCATTATCCACATTTTGATATCTATATCTCATTCGGCGGTCCCAAATATTGTATAACTGCCCTTAATACATACGTTATACCACTTTTGCACCATATACTTACCACTCCATTTATATACACTTATGTCAATATTACAGAAAAATCCCCACAAAAATCACCTAAACATAAAAATATTCTACTTTTCAACAATAATACATAAACATATTGGCTTGTGGTAGCAACACTATCATGGTATCACTAACGTAAAAGTTCCTCAATATTGCAATTTGCTTGAACGGATGCTATTTCAGAATATTTCGTACTTACACAGGCCATACATTAGAATAATATGTCACATCACTGTCGTAACACTCTTTATTCACCGAGCAATAATA

In [57]:
# Gzipped demo FASTA, given relative to the working directory like the
# "demo_fasta/demo.fa" path used in the earlier cells, instead of a
# machine-specific absolute path.
# NOTE(review): assumes the notebook is run from the directory that contains
# demo_fasta/ — confirm.
fa1 = "demo_fasta/demo.fa.gz"

In [58]:
sc.load_fasta2(fa1)

('0e6a942e25005983bf54622997ec90cbf34b1c7dce597636',
 [{'name': 'chr1', 'length': 4, 'topology': 'linear', 'sequence': 'ACGT'},
  {'name': 'chr2', 'length': 4, 'topology': 'linear', 'sequence': 'TCGA'}])

In [59]:
sc.show()

6681ac2f62509cfc220d78751b8dc524 CCACACCACACCCACACACCCACACACCACACCACACACCACACCACACCCACACACACACATCCTAACACTACCCTAACACAGCCCTAATCTAACCCTGGCCAACCTGTCTCTCAACTTACCCTCCATTACCCTGCCTCCACTCGTTACCCTGTCCCATTCAACCATACCACTCCGAACCACCATCCATCCCTCTACTTACTACCACTCACCCACCGTTACCCTCCAATTACCCATATCCAACCCACTGCCACTTACCCTACCATTACCCTACCATCCACCATGACCTACTCACCATACTGTTCTTCTACCCACCATATTGAAACGCTAACAAATGATCGTAAATAACACACACGTGCTTACCCTACCACTTTATACCACCACCACATGCCATACTCACCCTCACTTGTATACTGATTTTACGTACGCACACGGATGCTACAGTATATACCATCTCAAACTTACCCTACTCTCAGATTCCACTTCACTCCATGGCCCATCTCTCACTGAATCAGTACCAAATGCACTCACATCATTATGCACGGCACTTGCCTCAGCGGTCTATACCCTGTGCCATTTACCCATAACGCCCATCATTATCCACATTTTGATATCTATATCTCATTCGGCGGTCCCAAATATTGTATAACTGCCCTTAATACATACGTTATACCACTTTTGCACCATATACTTACCACTCCATTTATATACACTTATGTCAATATTACAGAAAAATCCCCACAAAAATCACCTAAACATAAAAATATTCTACTTTTCAACAATAATACATAAACATATTGGCTTGTGGTAGCAACACTATCATGGTATCACTAACGTAAAAGTTCCTCAATATTGCAATTTGCTTGAACGGATGCTATTTCAGAATATTTCGTACTTACACAGGCCATACATTAGAATAATATGTCACATCACTGTCGTAACACTCTTTATTCACCGAGCAATAATA