In [1]:
"""
    Show an interactive read pileup over an assembly
"""

# IPython.display is the public home of display() and HTML(); importing them
# from IPython.core.display is deprecated.
from IPython.display import display, HTML

# Inject the stylesheet used by the pileup view: monospaced .read rows,
# per-direction backgrounds (.fwd / .rev / .duplicate), the hover .info pane
# revealed by ".read:hover + .info", and one colour per nucleotide class.
display(HTML("""
<style>
.container { width:100% !important; }
.widget-hslider { width:600px; }
.read {
    font-family: Courier;
    clear: both;
    white-space: pre;
    color: grey;
}
.fwd {
    background: #f0f0ff;
}
.rev {
    background: #f0fff0;
}
.duplicate {
    background: #ffe0e0;
}

div.output_scroll {
    height: 100%;
}
.info {
    visibility: hidden;
    cursor: pointer;
    z-index: 3;
    right: 50%;
    position: absolute;
    background: white;
    padding: 5px;
    border: 3px solid orange;
    border-radius: 3px;
}
.read:hover + .info {
    visibility: visible;
}
.reference {
    font-weight: bold;
}
.nucleotide {
    font-weight: bold;
}
.a, .A{
    color: red;
}
.c, .C {
    color: green;
}
.g, .G {
    color: orange;
}
.t, .T {
    color: blue;
}
</style>
"""))

In [2]:
from __future__ import print_function
from biograph import BioGraph, Reference
from ipywidgets import interact, interact_manual, interactive, FloatSlider, IntSlider
from random import randint

In [3]:
# Import seqset and readmap
# Opens the BioGraph stored at "Rep1.bg" and its readmap. Both handles are
# module-level state: get_pileup() reads my_bg.seqset and rm.get_prefix_reads().
my_bg = BioGraph("Rep1.bg")
rm = my_bg.open_readmap()

In [4]:
# Import a reference
# The path is machine-specific; the commented-out line is an alternative location.
# hs37d5 is used by get_pileup() (hs37d5.find) and show() (hs37d5.find_ranges).
# hs37d5 = Reference("/reference/hs37d5/")
hs37d5 = Reference('/dev/shm/hs37d5/')

In [5]:
# 70% of the longest read length, truncated to an int. get_pileup() uses it as
# the k-mer length for its mappability lookups, and show() uses it as the amount
# of reference fetched to the left of the first assembly.
min_overlap = int(rm.max_read_len() * 0.7)
print(min_overlap)

105


In [6]:
import vcf
# variants: every record of Rep1.bg/fp.vcf, in file order (indexed by random_variant()).
# fp_aids:  the set of integer assembly ids listed in the records' INFO['AID'] fields.
variants = list()
fp_aids = set()

# The context manager closes the VCF handle once all records are read, or
# if parsing fails part-way; the original left the file open.
with open('Rep1.bg/fp.vcf', 'r') as vcf_file:
    vcf_reader = vcf.Reader(vcf_file)
    for v in vcf_reader:
        for i in v.INFO['AID']:
            fp_aids.add(int(i))
        variants.append(v)

In [7]:
# extract relevant assemblies from a much larger csv

# import csv

# assemblies = dict()

# csvfile = open('Rep1.bg/assemblies.csv')
# acsv = csv.DictReader(csvfile)
# with open('Rep1.bg/fp.assemblies.csv', 'w') as out:
#     writer = csv.DictWriter(out, dialect=acsv.dialect, fieldnames=acsv.fieldnames)
#     writer.writeheader()
#     for a in acsv:
#         if int(a['aid']) in fp_aids:
#             writer.writerow(a)
# csvfile.close()

In [8]:
# import all assemblies, converting numeric fields to int

import csv

# assemblies: integer assembly id -> dict of that CSV row's columns.
assemblies = dict()

# The context manager guarantees the file is closed even when a row fails the
# int() conversion below; the original only closed it on the success path.
with open('Rep1.bg/fp.assemblies.csv') as csvfile:
    acsv = csv.DictReader(csvfile)
    for a in acsv:
        aid = int(a['aid'])
        assemblies[aid] = dict()
        for k in a.keys():
            # These three columns hold text; every other column is numeric.
            if k in ('scaffold_name', 'seq', 'variants(refrange:varseq:refseq)'):
                assemblies[aid][k] = a[k]
            else:
                assemblies[aid][k] = int(a[k])

In [9]:
def colorize(s, offsets=None):
    """Wrap selected characters of ``s`` in nucleotide-coloured ``<span>`` tags.

    Args:
        s: Sequence text to mark up.
        offsets: 0-based positions of ``s`` to colour. ``None`` (the default)
            colours every character; a single integer colours that position;
            any iterable of integers (list, tuple, set, ``range`` ...) colours
            exactly those positions.

    Returns:
        str: ``s`` with each selected character ``X`` replaced by
        ``<span class="nucleotide X">X</span>``; all other characters are
        copied through unchanged.
    """
    if offsets is None:
        selected = None  # sentinel: colour everything
    else:
        try:
            # A set gives O(1) membership tests and accepts any iterable.
            # The previous ``type(offsets) != list`` check wrapped a range or
            # tuple into a one-element list, so nothing was ever coloured.
            selected = set(offsets)
        except TypeError:
            # Not iterable: a single bare offset such as an int.
            selected = {offsets}

    pieces = []
    for i, base in enumerate(s):
        if selected is None or i in selected:
            pieces.append('<span class="nucleotide {b}">{b}</span>'.format(b=base))
        else:
            pieces.append(base)
    return ''.join(pieces)

# Visual check of colorize(): render one sample row with every base coloured.
s = 'ACGTacgtNnACgT'
lines = ["<div class='read'>{}</div>".format(colorize(s))]
# Further rows that can be enabled to exercise explicit offsets:
# for i in range(len(s)):
#     lines.append("<div class='read'>{}</div>".format(colorize(s, i)))
# lines.append("<div class='read'>{}</div>".format(colorize(s, range(0, len(s), 2))))
# lines.append("<div class='read'>{}</div>".format(colorize(s, range(1, len(s), 2))))
display(HTML(''.join(lines)))

In [10]:
# Sanity check: how many assemblies were loaded from the CSV.
len(assemblies)

6861

In [11]:
# tie together assemblies and the variant at a given (or random) index

def random_variant(vid=None):
    """Bundle one VCF record with the assemblies it references.

    Args:
        vid: Index into the module-level ``variants`` list. When ``None``
            (the default) a random valid index is chosen.

    Returns:
        dict with keys:
            'vcf'        -- the record ``variants[vid]``
            'vid'        -- the index that was used
            'assemblies' -- the ``assemblies`` entries for each id in the
                            record's INFO['AID'] that is present in ``assemblies``
    """
    if vid is None:
        # randint() includes BOTH endpoints, so the upper bound must be the
        # last valid index; len(variants) itself would raise IndexError below.
        vid = randint(0, len(variants) - 1)

    record = variants[vid]
    var = {'vcf': record, 'vid': vid, 'assemblies': list()}

    # include all assemblies
    for i in record.INFO['AID']:
        # assemblies is keyed by int; normalise the id the same way the
        # VCF-loading cell does so string-typed ids still match.
        key = int(i)
        if key in assemblies:
            var['assemblies'].append(assemblies[key])
    return var

In [12]:
# get the read pileup for a given variant

def _kmer_mappability(read_seq):
    """Average reference / seqset hit counts over every min_overlap-mer of a read.

    Returns:
        tuple (rms, sms): the mean of ``hs37d5.find(kmer).matches`` and the mean
        width of the entry-id range returned by ``my_bg.seqset.find(kmer)``.
        Both are 0 when the read is shorter than ``min_overlap``.
    """
    ref_map = []
    seq_map = []
    # "+ 1" so the last k-mer (the one ending on the read's final base) is
    # included; without it a read exactly min_overlap long had no k-mers at all.
    for i in range(len(read_seq) - min_overlap + 1):
        kmer = read_seq[i:i + min_overlap]
        # Where else might this map to reference?
        ref_map.append(hs37d5.find(kmer).matches)
        # How many reads also contain this kmer?
        s = my_bg.seqset.find(kmer)
        seq_map.append(s.get_end_entry_id() - s.get_begin_entry_id())

    rms = 0
    sms = 0
    if ref_map:
        rms = float(sum(ref_map)) / len(ref_map)
    if seq_map:
        sms = float(sum(seq_map)) / len(seq_map)
    return rms, sms


def get_pileup(variant):
    """Collect, per assembly, the reads found along the assembly sequence.

    Args:
        variant: dict as returned by random_variant(); only
            ``variant['assemblies']`` is read, and each assembly must provide
            'aid', 'seq' and 'left_offset'.

    Returns:
        dict: assembly id -> list of ``(read_start, read, rms, sms)`` tuples,
        where ``read_start`` is ``left_offset`` plus the offset in the assembly
        at which the read was found, and ``rms`` / ``sms`` are the scores from
        ``_kmer_mappability``.
    """
    pileup = dict()
    # Scores depend only on the read's sequence, so identical reads (for
    # example the same read seen in two overlapping assemblies) are scored once.
    scores = dict()

    for a in variant['assemblies']:
        pileup[a['aid']] = list()

        # Start from the right to find all sequences present
        entry = my_bg.seqset.empty_entry()
        offset = len(a['seq'])

        for base in reversed(a['seq']):
            offset = offset - 1
            entry = entry.push_front_drop(base)
            for r in rm.get_prefix_reads(entry):
                read_start = a['left_offset'] + offset
                read_seq = str(r.get_seqset_entry().sequence())

                if read_seq not in scores:
                    scores[read_seq] = _kmer_mappability(read_seq)
                rms, sms = scores[read_seq]

                pileup[a['aid']].append((read_start, r, rms, sms))
    return pileup

In [13]:
# NOTE(review): show() assigns its own local read_ids on every call, so this
# module-level set is never read by show(); no other use is visible in this notebook.
read_ids = set()

# Show the pileup. This is called every time the pos slider is moved, so keep it light.
def show(variant, pileup, pos, aid, screen_width, variant_reads_only=True):
    """Render the read pileup around a variant as one HTML fragment.

    Args:
        variant: dict as built by random_variant(): 'vcf' (the VCF record),
            'vid' (its index) and 'assemblies' (list of assembly dicts with
            'aid', 'seq', 'left_offset' and 'scaffold_name').
        pileup: dict as built by get_pileup(): assembly id -> list of
            (read_start, read, ref_mappability, seq_mappability).
        pos: reference position of the left edge of the view.
        aid: 'ALL' to draw every assembly of the variant, or a single assembly id.
        screen_width: number of character columns to draw.
        variant_reads_only: when True, keep only reads that span ``v.POS - 1``.

    Returns:
        None. The markup is emitted with ``display(HTML(...))``.
    """
    out = []
    v = variant['vcf']
    ad = v.samples[0]['AD']

    reads = []
    read_ids = set()

    # Show some context
    out.append('assembly id: {}<br />'.format(aid))
    out.append('variant #: {}<br />'.format(variant['vid']))
    out.append(str(v) + ' {}<br />'.format(ad))

    # Show all assemblies, or just one?
    if aid == 'ALL':
        display_ids = [a['aid'] for a in variant['assemblies']]
    else:
        display_ids = [aid]

    # Extract and mark up relevant pileup reads for the selected assemblies
    for a in variant['assemblies']:
        # A separate name: the original rebound the ``aid`` parameter here.
        assembly_id = a['aid']
        if assembly_id not in display_ids:
            continue

        for read_start, r, ref_map, seq_map in pileup[assembly_id]:
            # Is part of the read on screen?
            read_end = read_start + len(r.get_seqset_entry().sequence())
            # Off screen to the left? Ignore.
            if read_end <= pos:
                continue
            # Off screen to the right? Ignore.
            if read_start > pos + screen_width:
                continue
            # Does it cover the variant?
            if variant_reads_only and (read_start > v.POS - 1 or read_end <= v.POS - 1):
                continue

            # Mark duplicate reads between assemblies: every copy of a read ends
            # up sharing one set holding all assembly ids it was seen in.
            this_read_id = r.get_read_id()
            seen_in = {assembly_id}
            if this_read_id in read_ids:
                # not efficient, but this list is small
                for i, prev in enumerate(reads):
                    if prev[1].get_read_id() == this_read_id:
                        seen_in.update(prev[2])
                        reads[i] = (prev[0], r, seen_in, ref_map, seq_map)
            else:
                read_ids.add(this_read_id)

            reads.append((read_start, r, seen_in, ref_map, seq_map))

    # read count
    if variant_reads_only:
        out.append("{unique} unique reads ({count} total) include this variant<br />".format(unique=len(read_ids), count=len(reads)))
    else:
        out.append("{unique} unique reads ({count} total) in this region<br />".format(unique=len(read_ids), count=len(reads)))

    # variant position indicator
    spaces = (v.POS - 1 - pos)
    if spaces > screen_width - 1:
        line = ' ' * screen_width + ">"
    elif spaces > 0:
        line = '{sp}&#9662;{var_start}'.format(sp=(' ' * spaces), var_start=v.POS - 1)
    else:
        line = '<'
    out.append("<div class='read reference'>{}</div>".format(line))

    # reference ruler
    ruler = [[],[]]
    for i in range(pos + 1, pos + screen_width):
        if not i % 10:
            ruler[0].append(str(i)[-4:])
            ruler[0].append(' ' * 6)
            ruler[1].append('|' + ' ' * 9)

    out.append("<div class='read reference'>{}&hellip;</div>".format(str(pos)[:-4]))
    rstr = []
    for l in ruler:
        rstr.append("<div class='read reference'>{}</div>".format(' ' * (10 - (pos % 10)) + ''.join(l)))
    out.append(''.join(rstr))


    # position relative to the start of the first assembly
    a = variant['assemblies'][0]
    rel_pos = pos - a['left_offset']

    # colorized reference string
    rs = hs37d5.find_ranges(a['scaffold_name'], a['left_offset'] - min_overlap, v.POS - 1)
    ref_left = str(rs[0].sequence)

    rs = hs37d5.find_ranges(a['scaffold_name'], v.POS - 1, v.POS - 1 + screen_width)
    ref_right = str(rs[0].sequence)

    ref_seq = ref_left + ref_right
    # The displayed reference starts one base right of pos; the leading
    # &hellip; emitted below occupies the pos column.
    ref_seq = ref_seq[rel_pos + min_overlap + 1:rel_pos + min_overlap + screen_width]

    out.append("<div class='read reference'>&hellip;{}&hellip;</div>".format(colorize(ref_seq)))

    # the assemblies
    for a in variant['assemblies']:
        if a['aid'] not in display_ids:
            continue
        rel_pos = pos - a['left_offset']
        if rel_pos >= 0:
            disp = a['seq'][rel_pos : rel_pos + screen_width]
        else:
            indent = abs(rel_pos)
            disp = '{}{}'.format(' ' * indent, a['seq'][:screen_width - indent])
        out.append("<div class='read'>{}</div>".format(disp))

#     # Only works for SNPs now: is the next base a reference base?
#     ref_entry = entry.push_front_drop(str(hs37d5.find_ranges(a['scaffold_name'], a['left_offset'] + offset, a['left_offset'] + offset + 1)[0].sequence))
#     for r in rm.get_prefix_reads(ref_entry):
#         reads.extend(list(rm.get_prefix_reads(ref_entry)))

    # the read pileup
    # Sort on the start position only. Sorting the bare tuples falls through to
    # comparing read objects (and then sets) whenever two reads start at the
    # same position, which raises TypeError on Python 3 and is arbitrary on
    # Python 2. sorted() is stable, so ties keep their pileup order.
    for read_start, r, seen_in, ref_map, seq_map in sorted(reads, key=lambda item: item[0]):
        trim = 0
        indent = 0
        pre = ''
        post = ''
        c = []

        read_seq = str(r.get_seqset_entry().sequence())
        # left trim it?
        if (pos >= read_start):
            trim = pos - read_start
            pre = '&hellip;'
            line = read_seq[trim + 1:trim + screen_width]
            for i, base in enumerate(line):
                if i >= len(ref_seq):
                    break
                if ref_seq[i] != base:
                    # colorize it
                    c.append(i)
        # otherwise, indent it 0 or more spaces
        else:
            indent = abs(pos - read_start)
            s = '{}{}'.format(' ' * indent, read_seq)
            line = s[:screen_width]
            for i, base in enumerate(line):
                if i >= len(ref_seq):
                    break
                # line has no leading &hellip; here, so column i lines up
                # with ref_seq[i - 1].
                if base != ' ' and base != ref_seq[i - 1]:
                    c.append(i)

        # Use the same sequence length as the rest of this function rather
        # than len(r), which depends on the read object defining __len__.
        if len(read_seq) - trim + indent > screen_width:
            post = '&hellip;'
        # Reverse is lowercase.
        if r.is_original_orientation():
            direction = ' fwd'
        else:
            direction = ' rev'
            line = line.lower()
        read_id = str(r.get_read_id())

        if len(seen_in) > 1:
            dup = ' duplicate'
        else:
            dup = ''

        # the indented or trimmed read
        out.append('<div id="' + read_id + '" class="read' + direction + dup + '">' + pre + colorize(line, c) + post + '</div>')

        # hover info pane
        read_summary = """
        read id: {read_id}<br />
        assembly id: {aid}<br />
        position: {read_start}<br />
        direction: {direction}<br />
        length: {read_len}<br />
        paired: {paired}<br />
        reference mappability: {ref_map:.2f}<br />
        sequence mappability: {seq_map:.2f}<br />
        """.format(read_id=read_id, paired=r.has_mate(), direction=direction, ref_map=ref_map, seq_map=seq_map, read_len=len(r.get_seqset_entry().sequence()), read_start=read_start, aid=', '.join([str(x) for x in seen_in]))

        out.append('<div id="' + read_id + '_info" class="info">' + read_summary + '</div>')

    display(HTML(''.join(out)))

In [14]:
# Build the interactive viewer: pick one variant, compute its pileup once, then
# let the widgets re-run show() with a new pos / aid / screen_width.
# NOTE(review): screen_width = 240 is only used to centre the initial slider
# value below; the "Screen width" widget itself starts at 230 — confirm intent.
screen_width = 240
# variant = random_variant()

# variant = random_variant(980) # [43, 40] with two assemblies, only finds 10 or 11 variant reads, overlapping assemblies agree
# variant = random_variant(4496) # [0, 20] with two assemblies, only finds 15 or 16 variant reads, overlapping assemblies agree
variant = random_variant(1112) # [36, 31] with two assemblies, only finds 25 or 27 variant reads, overlapping assemblies disagree
# variant = random_variant(4433) # [23, 11] with one assembly, only finds 10 variant reads

pileup = get_pileup(variant)

# The pos slider runs from min_overlap bases left of the first assembly to the
# last base of the last assembly (assumes the assemblies list is ordered by
# left_offset — TODO confirm).
# NOTE(review): "screen_width / 2" is true division on Python 3 and yields a
# float for the slider value; "//" would keep it an int on both versions.
from ipywidgets import IntSlider, BoundedIntText, fixed
i = interactive(show,
                variant=fixed(variant),
                pileup=fixed(pileup),
                pos=IntSlider(
                    min=variant['assemblies'][0]['left_offset'] - min_overlap, 
                    max=variant['assemblies'][-1]['left_offset'] + len(variant['assemblies'][-1]['seq']) - 1, 
                    step=1, 
                    value=variant['vcf'].POS - 1 - (screen_width / 2), 
#                     value=13001466,
                    continuous_update=True
                ), 
                aid=['ALL'] + [a['aid'] for a in variant['assemblies']],
                screen_width=BoundedIntText(
                    value=230,
                    min=100,
                    max=1000,
                    step=1,
                    description='Screen width:',
                    disabled=False,
                    continuous_update=False
                )
               )
# Presumably the last child of the interactive container is its output area;
# give it a fixed, tall height — TODO confirm.
o = i.children[-1]
o.layout.height='2048px'
i

aW50ZXJhY3RpdmUoY2hpbGRyZW49KEludFNsaWRlcih2YWx1ZT0xMzMwNjA4MDQsIGRlc2NyaXB0aW9uPXUncG9zJywgbWF4PTEzMzA2MTEzMywgbWluPTEzMzA2MDUwNCksIERyb3Bkb3duKGTigKY=
