In [1]:
"""
    Show a simple visualization of variants relative to a reference.
"""
from biograph import seqset, readmap, reference, find_variants, visualize, sequence

In [2]:
# Import the seqset.
# Opens the seqset file for sample NA12878_S1.
# NOTE(review): the path is absolute — adjust it to wherever the datasets are installed.
na12878 = seqset("/datasets/biograph/NA12878_S1.seqset")

# The readmap is optional, but provides more precise coverage.
# It is passed to find_reads and find_variants in later cells.
na12878_rm = readmap("/datasets/biograph/NA12878_S1.readmap")

In [3]:
# Import a reference
# NOTE(review): the directory name (human_g1k_v37) suggests the 1000 Genomes build-37
# human reference — confirm. The path is absolute; adjust it to your installation.
grch37 = reference("/reference/human_g1k_v37/")

In [16]:
# Pull the reference bases on chromosome '1' from 25 positions before 832318 to
# 25 positions after it (the recorded output is a 50-base sequence).
# NOTE(review): the meaning of the trailing True argument is not visible here —
# confirm against the make_range documentation.
r = grch37.make_range('1', 832318-25, 832318+25, True)
# Last expression of the cell, so the notebook displays the range's sequence.
r.sequence

libspiral.sequence('ATGTCTGCTCCTGTATCTACCAAACCTTTAAAGTTCTTTCCCTGAATAGT')

In [22]:
# Count the reads found for a 50-base query sequence; the readmap is passed as the
# second argument to find_reads.
# The query differs from the reference window displayed earlier in this notebook at
# two bases (position 25: C -> A, position 33: G -> T, counting from 1).
# NOTE(review): the 100 passed to find_reads is presumably a cap on the number of
# reads returned — confirm against the find_reads documentation.
len(na12878.find('ATGTCTGCTCCTGTATCTACCAAAACTTTAAATTTCTTTCCCTGAATAGT').find_reads(100, na12878_rm))

14

In [19]:
# Search the seqset for entries near the 50-base reference window and display the
# sequence of the first hit. In the recorded output that hit differs from the query
# at one base (position 33: G -> T, counting from 1).
# NOTE(review): the arguments 1 and 100 are presumably the allowed number of
# mismatches and the maximum number of results — confirm against the find_near
# documentation.
na12878.find_near(sequence('ATGTCTGCTCCTGTATCTACCAAACCTTTAAAGTTCTTTCCCTGAATAGT'), 1, 100)[0].sequence


libspiral.sequence('ATGTCTGCTCCTGTATCTACCAAACCTTTAAATTTCTTTCCCTGAATAGT')

In [28]:
# Find variants
# Calls find_variants with the NA12878 seqset and the grch37 reference on chromosome
# "1", over the span from 500 positions before 860461 to 500 positions after it
# (859961 to 860961), and passes the readmap as well.
# NOTE(review): min_overlap=70 is presumably the minimum read overlap used while
# assembling — confirm against the find_variants documentation.
na12878_vs_grch37 = find_variants(na12878, grch37, "1", 860461-500, 860461+500, na12878_rm, min_overlap=70)

In [29]:
# Let's see them
# The find_variants result is iterated as a collection of assemblies; each one is
# drawn in turn by visualize().
for assembly in na12878_vs_grch37:
    visualize(assembly)

                                .
                                .
                                .
           1:860413         28 C│ 
           1:860414         28 T│ 
           1:860415         28 C│ 
                             0  ├───╮  28
           1:860416          0 G│   │A 29
                             0  ├───╯  29
           1:860417         29 G│ 
           1:860418         29 C│ 
           1:860419         29 C│ 
                                .
                                .
                                .
           1:860458         29 C│ 
           1:860459         29 C│ 
           1:860460         28 C│ 
                             0  ├───╮  28
           1:860461          0 G│   │A 28
                             0  ├───╯  28
           1:860462         28 C│ 
           1:860463         28 C│ 
           1:860464         28 G│ 
                                .
                                .
                                .
           1:860518   

In [6]:
# List all of the variants in the first assembly
# NOTE(review): the recorded output below lists positions around 1:2458xxxxx, which
# lie outside the 1:859961-860961 span passed to find_variants above, and this
# cell's execution count (6) is lower than that cell's (28). The output looks
# stale — restart the kernel and run all cells to confirm.
na12878_vs_grch37[0].variants

[<libspiral.variant 1:245824428_1:1>,
 <libspiral.variant 1:245822863_1:1>,
 <libspiral.variant 1:245823631-_3865_1:245823568->,
 <libspiral.variant 1:245824141_1:1>]

In [7]:
# The large deletion is a structural variant with both breakends in the reverse direction.
# It is taken as the third entry (index 2) of the first assembly's variant list.
sv = na12878_vs_grch37[0].variants[2]
# A single-argument print(...) produces the same text under Python 2's print
# statement and Python 3's print function. A comma-separated print(a, b, c) would
# display a tuple on Python 2, so the three flags are formatted into one
# space-separated string first.
print("%s %s %s" % (sv.is_structural, sv.left_forward, sv.right_forward))

True False False


In [8]:
# Since both breakends are in the reverse direction, show the reverse complement of the sequence
# print(...) with one argument behaves identically on Python 2 and Python 3,
# whereas the bare print statement is a syntax error on Python 3.
print(sv.flip().sequence)

CTGAGGATCTAAGTACAGGGGAGTTCTGGGCATGCATTAGTGCTCTAAGGAGAGGCCTTCGCTGGTTAATGTCCCCAGGTTTGGGTTGGTAGCCTACCATTTGGTATCATACTGAGCTCCCTAATTTTGAGCTACATCTAGAAGACCACGTGCCTGGTGCCAAGAAGGACACTTAGACTATGTGTGGATGGAGCTTCATCAGCTGTTCTAATGAACGTTTTGCAAAAGACCATTGTGAGTGAGATGGGAGTTTCTCTGTTTTCAAGCCCAGGGCCCTGCCTGCTGGTGTGTCCCTCTTCTGAGTCATTCAGCCTTGACCCACTCTAGGGAACCGGAGGAATCCACAGCCCAGAGCTCTCCCATGGTGGCCTGTTTGACAGTCACATTCTGAAAAAGCAAACCTCTTGGCTTGATTTCTTGCTGGAATAATGAAACCCAGCTCGGCCCCTGCTTGATAATTGCCTTTGGGAAAAAGGTAATAGCCACCTTGAAACACCATTCAGTTGGACCTTTCCAAAAGCCACGGGTTTTTCTTCACCAAAGTCCTTCATTTGAGCCTATAATCTGTGAATGGAAAGAAGCGATGATGCCTCTAAAAGCCTTTTGGTTGGAGTCAGAGTCACCCTTTGAGCACTGAGTTCTCTTTCCAGGGACCTCTCCCATGTGGTCTTGGAGCCATGGGTGTTGTACAAAAGCAACAGGCCTAGCAGAGCTACCAGTGTGTGAGCGTTTGCTGTGTGAGCCCAGGCTTTCCACAGGACGAAGATTCTCTCACCAGAGTGTGAGCATTTGCAGTGTGAGCCCAGGCTCTCTACAGGACGAAGATTCTCTTACCAGCATGTGAACATTTGCAGTGTGAGCCCAGGCTCTCTACAGGACGAAGATTCTCTCACCAGAGTGTGAGCATTTGCTAGTGTGAGCCCAGGCTCTCTACAGGACGAAGATTCTCTTACCAGCATGTGAGCATTTGCAGTGTGAGCCCAGGCTCTCCACAGGACAA

What is this sequence? We could <a href="http://blast.ncbi.nlm.nih.gov/Blast.cgi">BLAST</a> it to find out <a href="http://www.ncbi.nlm.nih.gov/nucleotide/568815500?report=genbank&log$=nuclalign&blast_rank=1&RID=K493Z58P014">what it really is</a>.

...or see the <a href='Multiple%20references.ipynb'>Multiple references</a> demonstration.
