# Practica 1 _Semana2
## Implementing Boyer-Moore

In [1]:
import string

def z_array(s):
    """ Use Z algorithm (Gusfield theorem 1.4.1) to preprocess s """
    assert len(s) > 1
    z = [len(s)] + [0] * (len(s)-1)
    # Initial comparison of s[1:] with prefix
    for i in range(1, len(s)):
        if s[i] == s[i-1]:
            z[1] += 1
        else:
            break
    r, l = 0, 0
    if z[1] > 0:
        r, l = z[1], 1
    for k in range(2, len(s)):
        assert z[k] == 0
        if k > r:
            # Case 1
            for i in range(k, len(s)):
                if s[i] == s[i-k]:
                    z[k] += 1
                else:
                    break
            r, l = k + z[k] - 1, k
        else:
            # Case 2
            # Calculate length of beta
            nbeta = r - k + 1
            zkp = z[k - l]
            if nbeta > zkp:
                # Case 2a: Zkp wins
                z[k] = zkp
            else:
                # Case 2b: Compare characters just past r
                nmatch = 0
                for i in range(r+1, len(s)):
                    if s[i] == s[i - k]:
                        nmatch += 1
                    else:
                        break
                l, r = k, r + nmatch
                z[k] = r - k + 1
    return z


def n_array(s):
    """ Compile the N array (Gusfield theorem 2.2.2) from the Z array """
    return z_array(s[::-1])[::-1]


def big_l_prime_array(p, n):
    """ Compile L' array (Gusfield theorem 2.2.2) using p and N array.
        L'[i] = largest index j less than n such that N[j] = |P[i:]| """
    lp = [0] * len(p)
    for j in range(len(p)-1):
        i = len(p) - n[j]
        if i < len(p):
            lp[i] = j + 1
    return lp


def big_l_array(p, lp):
    """ Compile L array (Gusfield theorem 2.2.2) using p and L' array.
        L[i] = largest index j less than n such that N[j] >= |P[i:]| """
    l = [0] * len(p)
    l[1] = lp[1]
    for i in range(2, len(p)):
        l[i] = max(l[i-1], lp[i])
    return l


def small_l_prime_array(n):
    """ Compile lp' array (Gusfield theorem 2.2.4) using N array. """
    small_lp = [0] * len(n)
    for i in range(len(n)):
        if n[i] == i+1:  # prefix matching a suffix
            small_lp[len(n)-i-1] = i+1
    for i in range(len(n)-2, -1, -1):  # "smear" them out to the left
        if small_lp[i] == 0:
            small_lp[i] = small_lp[i+1]
    return small_lp


def good_suffix_table(p):
    """ Return tables needed to apply good suffix rule. """
    n = n_array(p)
    lp = big_l_prime_array(p, n)
    return lp, big_l_array(p, lp), small_l_prime_array(n)


def good_suffix_mismatch(i, big_l_prime, small_l_prime):
    """ Given a mismatch at offset i, and given L/L' and l' arrays,
        return amount to shift as determined by good suffix rule. """
    length = len(big_l_prime)
    assert i < length
    if i == length - 1:
        return 0
    i += 1  # i points to leftmost matching position of P
    if big_l_prime[i] > 0:
        return length - big_l_prime[i]
    return length - small_l_prime[i]


def good_suffix_match(small_l_prime):
    """ Given a full match of P to T, return amount to shift as
        determined by good suffix rule. """
    return len(small_l_prime) - small_l_prime[1]


def dense_bad_char_tab(p, amap):
    """ Given pattern string and list with ordered alphabet characters, create
        and return a dense bad character table.  Table is indexed by offset
        then by character. """
    tab = []
    nxt = [0] * len(amap)
    for i in range(0, len(p)):
        c = p[i]
        assert c in amap
        tab.append(nxt[:])
        nxt[amap[c]] = i+1
    return tab


class BoyerMoore(object):
    """ Encapsulates pattern and associated Boyer-Moore preprocessing. """
    
    def __init__(self, p, alphabet='ACGT'):
        self.p = p
        self.alphabet = alphabet
        # Create map from alphabet characters to integers
        self.amap = {}
        for i in range(len(self.alphabet)):
            self.amap[self.alphabet[i]] = i
        # Make bad character rule table
        self.bad_char = dense_bad_char_tab(p, self.amap)
        # Create good suffix rule table
        _, self.big_l, self.small_l_prime = good_suffix_table(p)
    
    def bad_character_rule(self, i, c):
        """ Return # skips given by bad character rule at offset i """
        assert c in self.amap
        ci = self.amap[c]
        assert i > (self.bad_char[i][ci]-1)
        return i - (self.bad_char[i][ci]-1)
    
    def good_suffix_rule(self, i):
        """ Given a mismatch at offset i, return amount to shift
            as determined by (weak) good suffix rule. """
        length = len(self.big_l)
        assert i < length
        if i == length - 1:
            return 0
        i += 1  # i points to leftmost matching position of P
        if self.big_l[i] > 0:
            return length - self.big_l[i]
        return length - self.small_l_prime[i]
    
    def match_skip(self):
        """ Return amount to shift in case where P matches T """
        return len(self.small_l_prime) - self.small_l_prime[1]

In [2]:
# GCTAGCTCTACGAGTCTA
# TCAA
p = 'TCAA'
p_bm = BoyerMoore(p)  #BoyerMoore(p, alphabet='ACGT')
p_bm.bad_character_rule(2, 'T') # mismatch en offset 2 (T-A)

2

In [19]:
# GCTAGCTCTACGAGTCTA
# ACTA
p = 'ACTA'
p_bm = BoyerMoore(p, alphabet='ACGT')
p_bm.good_suffix_rule(0) #mismatch en offset 0 (G-A)

3

In [12]:
# ACACGCTCTACGAGTCTA
# ACAC
p = 'ACAC'
p_bm = BoyerMoore(p, alphabet='ACGT')
p_bm.match_skip()

2

In [24]:
def boyer_moore(p, p_bm, t):
    """ Do Boyer-Moore matching """
    i = 0
    occurrences = []
    while i < len(t) - len(p) + 1:
        shift = 1
        mismatched = False
        for j in range(len(p)-1, -1, -1): #descender desde offset p-1 hasta cero
            if p[j] != t[i+j]:
                skip_bc = p_bm.bad_character_rule(j, t[i+j])
                skip_gs = p_bm.good_suffix_rule(j)
                shift = max(shift, skip_bc, skip_gs)
                mismatched = True
                break
        if not mismatched:
            occurrences.append(i)
            skip_gs = p_bm.match_skip()
            shift = max(shift, skip_gs)
        i += shift
    return occurrences

In [23]:
range(10,-1, -1)

[10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]

In [29]:
t = 'GCTAGCTCTACGAGTCTA'
p = 'TCTA'
p_bm = BoyerMoore(p, alphabet='ACGT')

In [30]:
boyer_moore(p, p_bm, t)

[6, 14]

# Practica 2 _Semana2
## Implementing a k-mer index

In [32]:
# naive is online
# BoyerMoore is online
# Web-query is offline

In [6]:
import bisect
import sys

In [7]:


class Index(object):
    def __init__(self, t, k):
        ''' Create index from all substrings of size 'length' '''
        self.k = k  # k-mer length (k)
        self.index = []
        for i in range(len(t) - k + 1):  # for each k-mer
            self.index.append((t[i:i+k], i))  # add (k-mer, offset) pair
        self.index.sort()  # alphabetize by k-mer
    
    def query(self, p):
        ''' Return index hits for first k-mer of P '''
        kmer = p[:self.k]  # query with first k-mer
        i = bisect.bisect_left(self.index, (kmer, -1))  # binary search
        hits = []
        while i < len(self.index):  # collect matching index entries
            if self.index[i][0] != kmer:
                break
            hits.append(self.index[i][1])
            i += 1
        return hits



In [8]:
def queryIndex(p, t, index):
    k = index.k
    offsets = []
    for i in index.query(p):
        if p[k:] == t[i+k:i+len(p)]:  # verify that rest of P matches
            offsets.append(i)
    return offsets


In [9]:
t = 'ACTTGGAGATCTTTGAGGCTAGGTATTCGGGATCGAAGCTCATTTCGGGGATCGATTACGATATGGTGGGTATTCGGGA'
p = 'GGTATTCGGGA'

In [10]:
index = Index(t, 4)
print(index)

<__main__.Index object at 0x7f82d16709e8>


In [11]:
print(queryIndex(p, t, index))

[21, 68]


In [12]:
t[68:79]

'GGTATTCGGGA'

In [None]:
k-mer index
subsequence index
suffix index (array, tree, FM)


# Approximate Matching

# Practica 3 _Semana2
## Implementing the pigeonhole principle

In [16]:
import string

def z_array(s):
    """ Use Z algorithm (Gusfield theorem 1.4.1) to preprocess s """
    assert len(s) > 1
    z = [len(s)] + [0] * (len(s)-1)
    # Initial comparison of s[1:] with prefix
    for i in range(1, len(s)):
        if s[i] == s[i-1]:
            z[1] += 1
        else:
            break
    r, l = 0, 0
    if z[1] > 0:
        r, l = z[1], 1
    for k in range(2, len(s)):
        assert z[k] == 0
        if k > r:
            # Case 1
            for i in range(k, len(s)):
                if s[i] == s[i-k]:
                    z[k] += 1
                else:
                    break
            r, l = k + z[k] - 1, k
        else:
            # Case 2
            # Calculate length of beta
            nbeta = r - k + 1
            zkp = z[k - l]
            if nbeta > zkp:
                # Case 2a: Zkp wins
                z[k] = zkp
            else:
                # Case 2b: Compare characters just past r
                nmatch = 0
                for i in range(r+1, len(s)):
                    if s[i] == s[i - k]:
                        nmatch += 1
                    else:
                        break
                l, r = k, r + nmatch
                z[k] = r - k + 1
    return z


def n_array(s):
    """ Compile the N array (Gusfield theorem 2.2.2) from the Z array """
    return z_array(s[::-1])[::-1]


def big_l_prime_array(p, n):
    """ Compile L' array (Gusfield theorem 2.2.2) using p and N array.
        L'[i] = largest index j less than n such that N[j] = |P[i:]| """
    lp = [0] * len(p)
    for j in range(len(p)-1):
        i = len(p) - n[j]
        if i < len(p):
            lp[i] = j + 1
    return lp


def big_l_array(p, lp):
    """ Compile L array (Gusfield theorem 2.2.2) using p and L' array.
        L[i] = largest index j less than n such that N[j] >= |P[i:]| """
    l = [0] * len(p)
    l[1] = lp[1]
    for i in range(2, len(p)):
        l[i] = max(l[i-1], lp[i])
    return l


def small_l_prime_array(n):
    """ Compile lp' array (Gusfield theorem 2.2.4) using N array. """
    small_lp = [0] * len(n)
    for i in range(len(n)):
        if n[i] == i+1:  # prefix matching a suffix
            small_lp[len(n)-i-1] = i+1
    for i in range(len(n)-2, -1, -1):  # "smear" them out to the left
        if small_lp[i] == 0:
            small_lp[i] = small_lp[i+1]
    return small_lp


def good_suffix_table(p):
    """ Return tables needed to apply good suffix rule. """
    n = n_array(p)
    lp = big_l_prime_array(p, n)
    return lp, big_l_array(p, lp), small_l_prime_array(n)


def good_suffix_mismatch(i, big_l_prime, small_l_prime):
    """ Given a mismatch at offset i, and given L/L' and l' arrays,
        return amount to shift as determined by good suffix rule. """
    length = len(big_l_prime)
    assert i < length
    if i == length - 1:
        return 0
    i += 1  # i points to leftmost matching position of P
    if big_l_prime[i] > 0:
        return length - big_l_prime[i]
    return length - small_l_prime[i]


def good_suffix_match(small_l_prime):
    """ Given a full match of P to T, return amount to shift as
        determined by good suffix rule. """
    return len(small_l_prime) - small_l_prime[1]


def dense_bad_char_tab(p, amap):
    """ Given pattern string and list with ordered alphabet characters, create
        and return a dense bad character table.  Table is indexed by offset
        then by character. """
    tab = []
    nxt = [0] * len(amap)
    for i in range(0, len(p)):
        c = p[i]
        assert c in amap
        tab.append(nxt[:])
        nxt[amap[c]] = i+1
    return tab


class BoyerMoore(object):
    """ Encapsulates pattern and associated Boyer-Moore preprocessing. """
    
    def __init__(self, p, alphabet='ACGT'):
        self.p = p
        self.alphabet = alphabet
        # Create map from alphabet characters to integers
        self.amap = {}
        for i in range(len(self.alphabet)):
            self.amap[self.alphabet[i]] = i
        # Make bad character rule table
        self.bad_char = dense_bad_char_tab(p, self.amap)
        # Create good suffix rule table
        _, self.big_l, self.small_l_prime = good_suffix_table(p)
    
    def bad_character_rule(self, i, c):
        """ Return # skips given by bad character rule at offset i """
        assert c in self.amap
        ci = self.amap[c]
        assert i > (self.bad_char[i][ci]-1)
        return i - (self.bad_char[i][ci]-1)
    
    def good_suffix_rule(self, i):
        """ Given a mismatch at offset i, return amount to shift
            as determined by (weak) good suffix rule. """
        length = len(self.big_l)
        assert i < length
        if i == length - 1:
            return 0
        i += 1  # i points to leftmost matching position of P
        if self.big_l[i] > 0:
            return length - self.big_l[i]
        return length - self.small_l_prime[i]
    
    def match_skip(self):
        """ Return amount to shift in case where P matches T """
        return len(self.small_l_prime) - self.small_l_prime[1]

In [14]:
def boyer_moore(p, p_bm, t):
    """ Do Boyer-Moore matching """
    i = 0
    occurrences = []
    while i < len(t) - len(p) + 1:
        shift = 1
        mismatched = False
        for j in range(len(p)-1, -1, -1):
            if p[j] != t[i+j]:
                skip_bc = p_bm.bad_character_rule(j, t[i+j])
                skip_gs = p_bm.good_suffix_rule(j)
                shift = max(shift, skip_bc, skip_gs)
                mismatched = True
                break
        if not mismatched:
            occurrences.append(i)
            skip_gs = p_bm.match_skip()
            shift = max(shift, skip_gs)
        i += shift
    return occurrences

In [15]:
def approximate_match(p, t, n):
    segment_length = int(round(len(p) / (n+1)))
    all_matches = set()
    for i in range(n+1):
        start = i*segment_length
        end = min((i+1)*segment_length, len(p))
        p_bm = BoyerMoore(p[start:end], alphabet='ACGT')
        matches = boyer_moore(p[start:end], p_bm, t)
        # Extend matching segments to see if whole p matches
        for m in matches:
            if m < start or m-start+len(p) > len(t):
                continue
            mismatches = 0
            for j in range(0, start):
                if not p[j] == t[m-start+j]:
                    mismatches += 1
                    if mismatches > n:
                        break
            for j in range(end, len(p)):
                if not p[j] == t[m-start+j]:
                    mismatches += 1
                    if mismatches > n:
                        break
            if mismatches <= n:
                all_matches.add(m - start)
    return list(all_matches)

In [47]:
p = 'AACTTG'
t = 'CACTTAATTTG'
print(approximate_match(p, t, 2))

[0, 5]


In [48]:
print(t[5:])

AATTTG


# QUIZ 2.1

In [21]:
# t: 'GGCTATAATGCGTA'
# p:   TAATAAA
t = 'GGCTATAATGCGTA'
p = 'TAATAAA'
p_bm = BoyerMoore(p)

In [23]:
p_bm.bad_character_rule(4, 'T')

1

In [24]:
p = 'TAATTAA'
p_bm = BoyerMoore(p)
p_bm.good_suffix_rule(3)

4

In [54]:
boyer_moore(p, p_bm, t)

[]

# Homework 2

In a practical, we saw Python code implementing the Boyer-Moore algorithm. Some of the code is for preprocessing the pattern P into the tables needed to execute the bad character and good suffix rules — we did not discuss that code. But we did discuss the code that performs the algorithm given those tables:

In [1]:
def boyer_moore(p, p_bm, t):
    """ Do Boyer-Moore matching """
    i = 0
    occurrences = []
    while i < len(t) - len(p) + 1:
        shift = 1
        mismatched = False
        for j in range(len(p)-1, -1, -1):
            if p[j] != t[i+j]:
                skip_bc = p_bm.bad_character_rule(j, t[i+j])
                skip_gs = p_bm.good_suffix_rule(j)
                shift = max(shift, skip_bc, skip_gs)
                mismatched = True
                break
        if not mismatched:
            occurrences.append(i)
            skip_gs = p_bm.match_skip()
            shift = max(shift, skip_gs)
        i += shift
    return occurrences

#### Measuring Boyer-Moore's benefit. 
First, download the Python module for Boyer-Moore preprocessing:

http://d28rh4a8wq0iu5.cloudfront.net/ads1/code/bm_preproc.py

This module provides the BoyerMoore class, which encapsulates the preprocessing info used by the boyer_moore function above.

In [161]:
!wget http://d28rh4a8wq0iu5.cloudfront.net/ads1/code/bm_preproc.py

--2017-07-05 23:09:32--  http://d28rh4a8wq0iu5.cloudfront.net/ads1/code/bm_preproc.py
Resolving d28rh4a8wq0iu5.cloudfront.net (d28rh4a8wq0iu5.cloudfront.net)... 54.192.55.61, 54.192.55.20, 54.192.55.177, ...
Connecting to d28rh4a8wq0iu5.cloudfront.net (d28rh4a8wq0iu5.cloudfront.net)|54.192.55.61|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9400 (9,2K) [application/octet-stream]
Saving to: ‘bm_preproc.py’


2017-07-05 23:09:33 (1,75 MB/s) - ‘bm_preproc.py’ saved [9400/9400]



Second, download the provided excerpt of human chromosome 1:

http://d28rh4a8wq0iu5.cloudfront.net/ads1/data/chr1.GRCh38.excerpt.fasta

In [2]:
!wget http://d28rh4a8wq0iu5.cloudfront.net/ads1/data/chr1.GRCh38.excerpt.fasta

--2018-11-01 18:43:49--  http://d28rh4a8wq0iu5.cloudfront.net/ads1/data/chr1.GRCh38.excerpt.fasta
Resolving d28rh4a8wq0iu5.cloudfront.net (d28rh4a8wq0iu5.cloudfront.net)... 13.32.81.77, 13.32.81.9, 13.32.81.145, ...
Connecting to d28rh4a8wq0iu5.cloudfront.net (d28rh4a8wq0iu5.cloudfront.net)|13.32.81.77|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 810105 (791K) [application/octet-stream]
Saving to: ‘chr1.GRCh38.excerpt.fasta’


2018-11-01 18:43:50 (748 KB/s) - ‘chr1.GRCh38.excerpt.fasta’ saved [810105/810105]



Third, implement versions of the naive exact matching and Boyer-Moore algorithms that additionally count and return (a) the number of character comparisons performed and (b) the number of alignments tried. Roughly speaking, these measure how much work the two different algorithms are doing.

In [17]:
#de algun lado lo saquè, era para entender como funcionaban los cambios
def naiveHamming(p, t, maxDistance):
    occurrences = []
    counts= []
    for i in range(len(t) - len(p) + 1):
        nmm = 0
        match = True
        
        for j in range(len(p)):
            if t[i+j] != p[j]:
                nmm += 1
                counts.append(j)
                if nmm > maxDistance:
                    break
        if nmm <= maxDistance:
            occurrences.append(i)
    return occurrences, len(counts)
#counts NO SIRVE, es el registro de las posicinoes donde no hace match durante todas las comparaciones
#nmm es el treshold de unmatchs que se registran cuando != y se agrega si a ocurrencias si su valor es igual o menor al especificado en maxDIstance

In [18]:
t = 'TCGCGCAAATTATAGACTCTAGCATCAGCGATAGGCTAGCTGAGCAGATACCATAGTCAGTAGATGACGATAGACAGTA'
p = 'TCGAGC'
naiveHamming(p, t, 1)
#se econtro match en las posiciones 0 y 17 

([0, 17], 146, 74)

In [None]:
# Ahora si, esta es la resolucion de la tarea. Parte 1

In [8]:
def naive_with_counts(p, t):
    occurrences = []
    aligs = (len(t)-len(p)+1)
    counts = 0
    for i in range(len(t) - len(p) + 1):
        match = True
        for j in range(len(p)):
            if t[i+j] == p[j]:
                counts += 1
            if t[i+j] != p[j]:
                counts += 1
                match = False
                break
        if match:
            occurrences.append(i)
    return occurrences, aligs, counts

In [9]:
t = 'TCGATCG'
p = 'TCG'
naive_with_counts(p, t)

([0, 4], 5, 9)

For a few examples to help you test if your enhanced versions of the naive exact matching and Boyer-Moore algorithms are working properly, see these notebooks:

#### hw2_naive_with_counts.ipynb

In [165]:
# Implement naive_with_counts by extending naive function
#from naive_with_counts import naive_with_counts

### Example 1

In [10]:
p = 'word'
t = 'there would have been a time for such a word'
occurrences, num_alignments, num_character_comparisons = naive_with_counts(p, t)
print(occurrences, num_alignments, num_character_comparisons)

[40] 41 46


### Example 2

In [11]:
p = 'needle'
t = 'needle need noodle needle'
occurrences, num_alignments, num_character_comparisons = naive_with_counts(p, t)
print(occurrences, num_alignments, num_character_comparisons)

[0, 19] 20 35


In [None]:
# Ahora si, esta es la resolucion de la tarea. Parte 2

In [19]:
def boyer_moore_with_counts(p, p_bm, t):
    """ Do Boyer-Moore matching """
    i = 0
    occurrences = []
    counts = 0
    align = []
    while i < len(t) - len(p) + 1:
        shift = 1
        mismatched = False
        for j in range(len(p)-1, -1, -1):
            if p[j] == t[i+j]:
                counts += 1
            if p[j] != t[i+j]:
                skip_bc = p_bm.bad_character_rule(j, t[i+j])
                skip_gs = p_bm.good_suffix_rule(j)
                shift = max(shift, skip_bc, skip_gs)
                mismatched = True
                counts += 1
                break
        if not mismatched:
            occurrences.append(i)
            skip_gs = p_bm.match_skip()
            shift = max(shift, skip_gs)
        i += shift
        align.append(i)
    return occurrences, len(align), counts

For a few examples to help you test if your enhanced versions of the naive exact matching and Boyer-Moore algorithms are working properly, see these notebooks:

#### hw2_bm_with_counts.ipynb

In [21]:
# Implement boyer_moore_with_counts by extending boyer_moore function
# from bm_with_counts import boyer_moore_with_counts
from bm_preproc import BoyerMoore

### Example 1

In [22]:
p = 'word'
t = 'there would have been a time for such a word'
lowercase_alphabet = 'abcdefghijklmnopqrstuvwxyz '
p_bm = BoyerMoore(p, lowercase_alphabet)
occurrences, num_alignments, num_character_comparisons = boyer_moore_with_counts(p, p_bm, t)
print(occurrences, num_alignments, num_character_comparisons)

[40] 12 15


### Example 2

In [39]:
p = 'needle'
t = 'needle need noodle needle'
p_bm = BoyerMoore(p, lowercase_alphabet)
occurrences, num_alignments, num_character_comparisons = boyer_moore_with_counts(p, p_bm, t)
print(occurrences, num_alignments, num_character_comparisons)

[0, 19] 5 18


#### Question 1
How many alignments does the naive exact matching algorithm try when matching the string GGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGG (derived from human Alu sequences) to the excerpt of human chromosome 1? (Don't consider reverse complements.)

#### Question 2
How many character comparisons does the naive exact matching algorithm try when matching the string GGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGG (derived from human Alu sequences) to the excerpt of human chromosome 1? (Don't consider reverse complements.)

In [23]:
def readGenome(filename):
    genome = ''
    with open(filename, 'r') as f:
        for line in f:
            # ignore header line with genome information
            if not line[0] == '>':
                genome += line.rstrip()
    return genome
genome = readGenome('chr1.GRCh38.excerpt.fasta')
genome[:100]

'TTGAATGCTGAAATCAGCAGGTAATATATGATAATAGAGAAAGCTATCCCGAAGGTGCATAGGTCAACAATACTTGAGCCTAACTCAGTAGATCCTAAAA'

In [24]:
#1 y 2
t = genome
p = 'GGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGG'
naive_with_counts(p,t)
#occurrences, num_alignments, num_character_comparisons

([56922], 799954, 984143)

#### Question 3
How many alignments does Boyer-Moore try when matching the string GGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGG (derived from human Alu sequences) to the excerpt of human chromosome 1? (Don't consider reverse complements.)

In [25]:
#3
p_bm = BoyerMoore(p)
occurrences, num_alignments, num_character_comparisons = boyer_moore_with_counts(p, p_bm, t)
print(occurrences, num_alignments, num_character_comparisons)

[56922] 127974 165191


In [26]:
boyer_moore_with_counts(p, p_bm, t)

([56922], 127974, 165191)

In [174]:
boyer_moore(p, p_bm, t)

[56922]

### Index-assisted approximate matching.
In practicals, we built a Python class called Index implementing an ordered-list version of the k-mer index. The Index class is copied below.

In [35]:
import bisect
import sys

In [36]:
# el mismo class Index visto en clase
import bisect
import sys

class Index(object):
    def __init__(self, t, k):
        ''' Create index from all substrings of size 'length' '''
        self.k = k  # k-mer length (k)
        self.index = []
        for i in range(len(t) - k + 1):  # for each k-mer
            self.index.append((t[i:i+k], i))  # add (k-mer, offset) pair
        self.index.sort()  # alphabetize by k-mer
    
    def query(self, p):
        ''' Return index hits for first k-mer of P '''
        kmer = p[:self.k]  # query with first k-mer
        i = bisect.bisect_left(self.index, (kmer, -1))  # binary search
        hits = []
        while i < len(self.index):  # collect matching index entries
            if self.index[i][0] != kmer:
                break
            hits.append(self.index[i][1])
            i += 1
        return hits

In [37]:
# el mismo queryIndex visto en clase
def queryIndex(p, t, index):
    k = index.k
    offsets = []
    for i in index.query(p):
        if p[k:] == t[i+k:i+len(p)]:  # verify that rest of P matches
            offsets.append(i)
    return offsets

In [38]:
index = Index(t, 4)

In [43]:
p = 'GGCGCGGTGGCTCACGCCTGTAAT'
print(queryIndex(p, t, index))

[56922, 262042, 364263, 657496, 717706]


In [40]:
p_bm = BoyerMoore(p)
boyer_moore_with_counts(p, p_bm, t)

([56922, 262042, 364263, 657496, 717706], 126203, 196873)

In [46]:
approximate_match(p, t, 0)

[364263, 717706, 657496, 56922, 262042]

We also implemented the pigeonhole principle using Boyer-Moore as our exact matching algorithm

In [41]:
# el mismo pigeonhole principle visto en clase

def approximate_match(p, t, n):
    segment_length = int(round(len(p) / (n+1)))
    all_matches = set()
    for i in range(n+1):
        start = i*segment_length
        end = min((i+1)*segment_length, len(p))
        p_bm = BoyerMoore(p[start:end], alphabet='ACGT')
        matches = boyer_moore(p[start:end], p_bm, t)
        # Extend matching segments to see if whole p matches
        for m in matches:
            if m < start or m-start+len(p) > len(t):
                continue
            mismatches = 0
            for j in range(0, start):
                if not p[j] == t[m-start+j]:
                    mismatches += 1
                    if mismatches > n:
                        break
            for j in range(end, len(p)):
                if not p[j] == t[m-start+j]:
                    mismatches += 1
                    if mismatches > n:
                        break
            if mismatches <= n:
                all_matches.add(m - start)
    return list(all_matches)

Implement the pigeonhole principle using Index to find exact matches for the partitions. Assume P always has length 24, and that we are looking for approximate matches with up to 2 mismatches (substitutions). We will use an 8-mer index.

Download the Python module for building a k-mer index. Consiste en el mismo class Index visto en clase, no lo llamare

https://d28rh4a8wq0iu5.cloudfront.net/ads1/code/kmer_index.py

In [29]:
!wget https://d28rh4a8wq0iu5.cloudfront.net/ads1/code/kmer_index.py

--2018-10-16 17:24:44--  https://d28rh4a8wq0iu5.cloudfront.net/ads1/code/kmer_index.py
Resolviendo d28rh4a8wq0iu5.cloudfront.net (d28rh4a8wq0iu5.cloudfront.net)... 52.85.39.22, 52.85.39.72, 52.85.39.109, ...
Conectando con d28rh4a8wq0iu5.cloudfront.net (d28rh4a8wq0iu5.cloudfront.net)[52.85.39.22]:443... conectado.
Petición HTTP enviada, esperando respuesta... 200 OK
Longitud: 971 [application/octet-stream]
Grabando a: “kmer_index.py”


2018-10-16 17:24:45 (32,3 MB/s) - “kmer_index.py” guardado [971/971]



Write a function that, given a length-24 pattern P and given an Index object built on 8-mers, finds all approximate occurrences of P within T with up to 2 mismatches. Insertions and deletions are not allowed. Don't consider any reverse complements.

In [None]:
### Ahora si, la resolucion de la tarea
# Implement the pigeonhole principle using Index to find exact matches for the partitions.
# Assume P always has length 24, and that we are looking for approximate matches 
# with up to 2 mismatches (substitutions). We will use an 8-mer index

In [47]:
#creado para homework
def IndexApproximate_match(p, t, n): # n es numero de mismatches
    segment_length = int(round(len(p) / (n+1))) #para p=24 y n=2 es 3
    all_matches = set()
    p_idx = Index(t, segment_length)    #adicionado
    idx_hits = 0        #adicionado
    for i in range(n+1):  #loop over partitions #para n=2 es 0,1,2
        start = i*segment_length                 # 0*8, 1*8, 2*8
        end = min((i+1)*segment_length, len(p))  # 1*8, 2*8, 3*8
        matches = p_idx.query(p[start:end])   #usando Index y ya no boyermoore
        
        # Extend matching segments to see if whole p matches
        for m in matches:
            idx_hits += 1
            if m < start or m-start+len(p) > len(t):
                continue
            
            mismatches = 0
            
            for j in range(0, start):     #verificar hacia la izquierda
                if not p[j] == t[m-start+j]:
                    mismatches += 1
                    if mismatches > n:
                        break
            for j in range(end, len(p)):   #verificar hacia la derecha
                if not p[j] == t[m-start+j]:
                    mismatches += 1
                    if mismatches > n:
                        break
            
            if mismatches <= n:
                all_matches.add(m - start)
    return list(all_matches), idx_hits

#### Question 4
How many times does the string GGCGCGGTGGCTCACGCCTGTAAT, which is derived from a human Alu sequence, occur with up to 2 substitutions in the excerpt of human chromosome 1? (Don't consider reverse complements here.)

Hint 1: Multiple index hits might direct you to the same match multiple times, but be careful not to count a match more than once.

Hint 2: You can check your work by comparing the output of your new function to that of the naive_2mm function implemented in the previous module.

In [50]:
#revisando matches con Index_boyermoore y naive_2mm
p = 'GGCGCGGTGGCTCACGCCTGTAAT'

print(approximate_match(p, t, 2))

[84641, 160162, 724927, 273669, 147558, 364263, 421221, 681737, 717706, 465647, 429299, 747359, 657496, 160729, 56922, 635931, 191452, 551134, 262042]


In [51]:
print(IndexApproximate_match(p, t, 2))

([84641, 160162, 724927, 273669, 147558, 364263, 421221, 681737, 717706, 465647, 429299, 747359, 657496, 160729, 56922, 635931, 191452, 551134, 262042], 90)


In [48]:
# el mismo naive_2mm creado en examen
def naive_2mm(p, t):
    occurrences = []
    
    for i in range(len(t) - len(p) + 1):
        cross =0
        for j in range(len(p)):
            if t[i+j] != p[j]:
                cross += 1
                match = False
                
        if cross <= 2:
          occurrences.append(i)
    return occurrences

In [49]:
p = 'GGCGCGGTGGCTCACGCCTGTAAT'
print(naive_2mm(p, t))

[56922, 84641, 147558, 160162, 160729, 191452, 262042, 273669, 364263, 421221, 429299, 465647, 551134, 635931, 657496, 681737, 717706, 724927, 747359]


#### Question 5
Using the instructions given in Question 4, how many total index hits are there when searching for occurrences of GGCGCGGTGGCTCACGCCTGTAAT with up to 2 substitutions in the excerpt of human chromosome 1?

(Don't consider reverse complements.)

Hint: You should be able to use the boyer_moore function (or the slower naive function) to double-check your answer.

In [193]:
#Q4 y Q5
p = 'GGCGCGGTGGCTCACGCCTGTAAT'
print(IndexApproximate_match(p, t, 2))

([84641, 160162, 635931, 747359, 273669, 147558, 364263, 681737, 717706, 465647, 429299, 657496, 160729, 56922, 724927, 191452, 262042, 551134, 421221], 90)


Let's examine whether there is a benefit to using an index built using subsequences of T rather than substrings, as we discussed in the "Variations on k-mer indexes" video. We'll consider subsequences involving every N characters. For example, if we split ATATAT into two substring partitions, we would get partitions ATA (the first half) and TAT (second half). But if we split ATATAT into two subsequences by taking every other character, we would get AAA (first, third and fifth characters) and TTT (second, fourth and sixth).

Another way to visualize this is using numbers to show how each character of P is allocated to a partition. Splitting a length-6 pattern into two substrings could be represented as 111222, and splitting into two subsequences of every other character could be represented as 121212

The following class SubseqIndex is a more general implementation of Index that additionally handles subsequences. It only considers subsequences that take every Nth character:

In [87]:
#tomado del profe como dato para la tarea
import bisect
   
class SubseqIndex(object):
    """ Holds a subsequence index for a text T """
    
    def __init__(self, t, k, ival):
        """ Create index from all subsequences consisting of k characters
            spaced ival positions apart.  E.g., SubseqIndex("ATAT", 2, 2)
            extracts ("AA", 0) and ("TT", 1). """
        self.k = k  # num characters per subsequence extracted
        self.ival = ival  # space between them; 1=adjacent, 2=every other, etc
        self.index = []
        self.span = 1 + ival * (k - 1)
        for i in range(len(t) - self.span + 1):  # for each subseq
            self.index.append((t[i:i+self.span:ival], i))  # add (subseq, offset)
        self.index.sort()  # alphabetize by subseq
    
    def query(self, p):
        """ Return index hits for first subseq of p """
        subseq = p[:self.span:self.ival]  # query with first subseq
        i = bisect.bisect_left(self.index, (subseq, -1))  # binary search
        hits = []
        while i < len(self.index):  # collect matching index entries
            if self.index[i][0] != subseq:
                break
            hits.append(self.index[i][1])
            i += 1
        return hits

For example, if we do:

In [59]:
ind = SubseqIndex('ATATAT', 3, 2)  #t  k#3letras , tomadas cada ival#2 espacios
print(ind.index)

[('AAA', 0), ('TTT', 1)]


And if we query this index:

In [61]:
p = 'TTATAT'
print(ind.query(p[0:]))

[]


because the subsequence TAA is not in the index. But if we query with the second subsequence:

In [62]:
print(ind.query(p[1:]))

[1]


because the second subsequence TTT is in the index.

#### Question 6
Write a function that, given a length-24 pattern P and given a SubseqIndex object built with k = 8 and ival = 3, finds all approximate occurrences of P within T with up to 2 mismatches.

When using this function, how many total index hits are there when searching for GGCGCGGTGGCTCACGCCTGTAAT with up to 2 substitutions in the excerpt of human chromosome 1? (Again, don't consider reverse complements.)


In [144]:
query_subseq
#def approximate_match_subseq(p, t, n, ival):
def query_subseq(p, t, subseq_ind):
    all_matches = set()
    idx_hits = 0
    match=[]
    for i in (0,1,2):
        start = i
        matches = subseq_ind.query(p[start:])
        match.append(matches)
        # Extend matching segments to see if whole p matches
        for m in matches:
            idx_hits += 1
            if m < start or m-start+len(p) > len(t):
                continue
            
            mismatches = 0
            
            for j in range(0, len(p)):  #verificar hacia la izquierda, desde inicio de p
                if not p[j] == t[m-start+j]:
                    mismatches += 1
                    if mismatches > 2:
                        break
            
            if mismatches <= 2:
                all_matches.add(m - start)
    return list(all_matches), idx_hits, match

In [73]:
p = 'GGCGCGGTGGCTCACGCCTGTAAT'
occurrences, hits = approximate_match_subseq(p, t, 2, 3)
print(occurrences, hits)

[84641, 160162, 724927, 273669, 147558, 364263, 421221, 681737, 717706, 465647, 429299, 747359, 657496, 160729, 56922, 635931, 191452, 551134, 262042] 79


In [79]:
#respuesta opcion 2
from functools import reduce
def query_subseq(p, t, subseq_ind):
    """Write a function that, given a length-24 pattern P and given a SubseqIndex object built with k = 8 and ival = 3, 
    finds all approximate occurrences of P within T with up to 2 mismatches."""
    #number of mistmatches = 2, so we need to split into 3 (2+1)
    mistmatches_allowed = 2
    num_segments_required = mistmatches_allowed + 1
    
    pattern_size = 24

    reads, qualities = readFastq('chr1.GRCh38.excerpt.fasta')
    consolidated_read = ''.join([read for read in reads])

    #p_segments = [ p[i:] for i in range(0,  pattern_size / subseq_ind.ival) ]
    p_segments = []
    for i in range(0,  pattern_size):
        if len(p[i::subseq_ind.ival]) == subseq_ind.k:
            p_segments.append(p[i:])
        else:
            break
    #print p_segments

    #print [subseq_ind.get_subseq(segment) for segment in p_segments]
    checksize = reduce(lambda x,y : x and y, [map(lambda x: len(x)==8, subseq_ind.get_subseq(segment)) for segment in p_segments])[0]
    assert checksize
    
    hits_lists = []
    hits_per_segment = {}
    i = 0
    index_hits = 0
    segment_number = 0
    for segment in p_segments:
        
        #hits = subseq_ind.query(segment)
        hits = subseq_ind.queryDictIndex(segment)
        i += 1
        #print "hits: ", hits
        index_hits += 1
        if len(hits) > 0:
            #print segment, hits
            hits_lists.append(hits)
            hits_per_segment[segment_number] = set([hit-segment_number for hit in hits])
        segment_number += 1
    #print hits_per_segment

    #reduce:
    reduced_hits = set()
    for i in range(len(hits_per_segment)):
        intersect = hits_per_segment[i].intersection(hits_per_segment[(i+1)%len(hits_per_segment)])
        if len(intersect) > 0 : 
            for item in intersect:
                reduced_hits.add(item)
    
    return sorted(reduced_hits), index_hits

In [None]:
#Hint: See this notebook for a few examples you can use to test your function.

### Example 1

In [145]:
t = 'to-morrow and to-morrow and to-morrow creeps in this petty pace'
p = 'to-morrow and to-morrow '
subseq_ind = SubseqIndex(t, 8, 3)

In [146]:
occurrences, num_index_hits, matches = query_subseq(p, t, subseq_ind)

In [147]:
print(occurrences)

[0, 14]


In [148]:
print(num_index_hits)

6


In [149]:
print(matches)

[[0, 14], [1, 15], [2, 16]]


### Example 2

In [132]:


# King John by William Shakespeare
!wget http://www.gutenberg.org/ebooks/1110.txt.utf-8



--2018-11-01 21:07:34--  http://www.gutenberg.org/ebooks/1110.txt.utf-8
Resolving www.gutenberg.org (www.gutenberg.org)... 152.19.134.47, 2610:28:3090:3000:0:bad:cafe:47
Connecting to www.gutenberg.org (www.gutenberg.org)|152.19.134.47|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: http://www.gutenberg.org/cache/epub/1110/pg1110.txt [following]
--2018-11-01 21:07:34--  http://www.gutenberg.org/cache/epub/1110/pg1110.txt
Reusing existing connection to www.gutenberg.org:80.
HTTP request sent, awaiting response... 200 OK
Length: 145418 (142K) [text/plain]
Saving to: ‘1110.txt.utf-8’


2018-11-01 21:07:35 (187 KB/s) - ‘1110.txt.utf-8’ saved [145418/145418]



In [150]:
t = open('1110.txt.utf-8').read()
p = 'English measure backward'
subseq_ind = SubseqIndex(t, 8, 3)

In [151]:
occurrences, num_index_hits, match = query_subseq(p, t, subseq_ind)

In [152]:
print(occurrences)

[132186]


In [153]:
print(num_index_hits)

3


In [154]:
print(match)

[[132186], [132187], [132188]]


In [None]:
#6   GGCGCGGTGGCTCACGCCTGTAAT
p = 'GGCGCGGTGGCTCACGCCTGTAAT'

#number of mistmatches = 2, so we need to split into 3 (2+1)
    mistmatches_allowed = 2
    num_segments_required = mistmatches_allowed + 1
    k_mer_size = 8
    pattern_size = 24
    ival = 3

    reads, qualities = readFastq('chr1.GRCh38.excerpt.fasta')
    t = ''.join([read for read in reads])

    subseq_ind = SubseqIndex(t, k_mer_size, ival)
    return query_subseq(p, t, subseq_ind)

In [156]:
t = genome
p = 'GGCGCGGTGGCTCACGCCTGTAAT'
subseq_ind = SubseqIndex(t, 8, 3)
occurrences, num_index_hits, match = query_subseq(p, t, subseq_ind)

In [157]:
print(occurrences)

[84641, 160162, 724927, 273669, 147558, 364263, 421221, 681737, 717706, 465647, 429299, 747359, 657496, 160729, 56922, 635931, 191452, 551134, 262042]


In [158]:
len(occurrences)

19

In [159]:
print(num_index_hits)

79


In [221]:
def question6():
    p = 'GGCGCGGTGGCTCACGCCTGTAAT'

    #number of mistmatches = 2, so we need to split into 3 (2+1)
    mistmatches_allowed = 2
    num_segments_required = mistmatches_allowed + 1
    k_mer_size = 8
    pattern_size = 24
    ival = 3

    reads, qualities = readFastq('chr1.GRCh38.excerpt.fasta')
    t = ''.join([read for read in reads])

    subseq_ind = SubseqIndex(t, k_mer_size, ival)
    return query_subseq(p, t, subseq_ind)

In [222]:
occurrences, num_index_hits = question6()
print occurrences, num_index_hits

[36918, 116447] 3


In [239]:
p = 'GGCGCGGTGGCTCACGCCTGTAAT'

In [240]:
subseq_ind = SubseqIndex(t, 8, 3)

In [242]:
print(occurrences)

[56922, 84641, 147558, 160729, 191452, 262042, 273669, 364263, 429299, 465647, 635931, 657496, 681737, 717706, 724927, 747359]
