## Deadline: 5 June 2023

### 1. FM-index

Implement the FM-index data structure and use it for string search. Provide tests.

https://www.youtube.com/watch?v=4n7NPk5lwbI

3 points

In [None]:
class FMIndex:
    """FM-index over a text, supporting exact substring search.

    The text is terminated with a ``'$'`` sentinel, its rotations are sorted,
    and the Burrows-Wheeler transform (last column of the sorted rotations)
    is stored together with the suffix array.

    Attributes:
        text: The indexed text with the ``'$'`` sentinel appended.
        bwt: Burrows-Wheeler transform of ``text``.
        sa: Suffix array; ``sa[r]`` is the start offset in ``text`` of the
            rotation at sorted row ``r``.
        counts: Number of occurrences of each character in ``bwt``.
        first_pos: For each character, the first sorted row that starts
            with it.
    """

    def __init__(self, text):
        """Build the index for ``text``.

        Raises:
            ValueError: If ``text`` already contains ``'$'``; the sentinel
                must be unique for the suffix array to be well defined.
        """
        if '$' in text:
            raise ValueError("text must not contain the sentinel character '$'")
        self.text = text + '$'
        self.bwt, self.sa = self.construct()
        self.counts, self.first_pos = self.preprocess_bwt()

    def construct(self):
        """Return ``(bwt, sa)`` for ``self.text``."""
        text = self.text
        # Because '$' occurs exactly once, two suffixes always differ before
        # either one ends, so sorting suffixes yields the same order as
        # sorting full rotations.
        sa = sorted(range(len(text)), key=lambda start: text[start:])
        # The last character of the rotation starting at i is text[i - 1];
        # for i == 0 the negative index wraps around to the sentinel.
        bwt = ''.join(text[start - 1] for start in sa)
        return bwt, sa

    def preprocess_bwt(self):
        """Return ``(counts, first_pos)`` computed from ``self.bwt``."""
        counts = {}
        for char in self.bwt:
            counts[char] = counts.get(char, 0) + 1

        first_pos = {}
        total = 0
        for char in sorted(counts):
            first_pos[char] = total
            total += counts[char]
        return counts, first_pos

    def get_poses(self, char):
        """Return the half-open row range ``(start, end)`` of ``char``.

        The range covers the sorted rows whose first character is ``char``.
        A character that is absent from the text yields ``(0, 0)``.
        """
        start = self.first_pos.get(char, 0)
        return start, start + self.counts.get(char, 0)

    def backward_search(self, pattern):
        """Return the start offsets of ``pattern`` in the text.

        Offsets are returned in suffix-array order, not sorted by position.
        An empty pattern narrows nothing, so every row of the index
        (including the sentinel's) is returned.
        """
        bwt = self.bwt
        # Half-open interval [top, bottom) of matching rows. Keeping the
        # upper bound exclusive on every iteration is what makes the rank
        # query below include the last matching row.
        top, bottom = 0, len(bwt)
        for char in reversed(pattern):
            if char not in self.first_pos:
                return []
            start = self.first_pos[char]
            # LF-mapping: rank of char in bwt[:row] plus the first row of char.
            top = start + bwt.count(char, 0, top)
            bottom = start + bwt.count(char, 0, bottom)
            if top >= bottom:
                return []
        return self.sa[top:bottom]

    def search(self, pattern):
        """Return the start offsets of ``pattern``; see ``backward_search``."""
        return self.backward_search(pattern)


In [None]:
def motif_tests():
    """Print FM-index search results for a present and an absent motif."""
    dna_sequence = "ATGCTAGCTAGCTGATCGATCGATCGATCG"
    fm_index = FMIndex(dna_sequence)

    # The first pattern occurs in the DNA sequence; the second does not.
    for pattern in ("ATCG", "GATTACA"):
        poses = fm_index.search(pattern)
        print(f"Pattern '{pattern}' occurrences: {poses}")

# Run the FM-index demo; it prints the search results for both patterns.
motif_tests()

Pattern 'ATCG' occurrences: [26, 22]
Pattern 'GATTACA' occurrences: []


### 2. Needleman–Wunsch

Implement the Needleman–Wunsch algorithm. Provide tests.

2 points

In [None]:
class NeedlemanWunsch:
    """Global pairwise sequence aligner (Needleman-Wunsch).

    Scoring uses a linear gap penalty, a score for matching symbols and a
    penalty for mismatching symbols; all three are added to the running
    score, so penalties are normally negative.
    """

    def __init__(self, gap_penalty, match_score, mismatch_penalty):
        """Store the scoring parameters used by ``align``."""
        self.gap_penalty = gap_penalty
        self.match_score = match_score
        self.mismatch_penalty = mismatch_penalty

    def align(self, sequence1, sequence2):
        """Return the two aligned sequences as a pair of strings.

        Gaps are written as ``'-'``. When several moves give the same best
        score, the diagonal move is preferred, then a gap in ``sequence2``
        ('U'), then a gap in ``sequence1`` ('L').
        """
        n_rows = len(sequence1) + 1
        n_cols = len(sequence2) + 1
        gap = self.gap_penalty

        scores = [[0] * n_cols for _ in range(n_rows)]
        moves = [[0] * n_cols for _ in range(n_rows)]

        # Borders: aligning a prefix against the empty string is all gaps.
        for r in range(1, n_rows):
            scores[r][0] = scores[r - 1][0] + gap
            moves[r][0] = 'U'
        for c in range(1, n_cols):
            scores[0][c] = scores[0][c - 1] + gap
            moves[0][c] = 'L'

        # Fill the interior row by row.
        for r, sym1 in enumerate(sequence1, start=1):
            for c, sym2 in enumerate(sequence2, start=1):
                pair_score = (
                    self.match_score if sym1 == sym2 else self.mismatch_penalty
                )
                candidates = (
                    (scores[r - 1][c - 1] + pair_score, 'D'),
                    (scores[r - 1][c] + gap, 'U'),
                    (scores[r][c - 1] + gap, 'L'),
                )
                # max() returns the first maximal item, which keeps the
                # D > U > L preference on ties.
                scores[r][c], moves[r][c] = max(
                    candidates, key=lambda cand: cand[0]
                )

        # Walk back from the bottom-right corner, collecting columns in
        # reverse order.
        out1 = []
        out2 = []
        r, c = n_rows - 1, n_cols - 1
        while r > 0 or c > 0:
            move = moves[r][c]
            if move == 'D':
                out1.append(sequence1[r - 1])
                out2.append(sequence2[c - 1])
                r -= 1
                c -= 1
            elif move == 'U':
                out1.append(sequence1[r - 1])
                out2.append('-')
                r -= 1
            else:
                out1.append('-')
                out2.append(sequence2[c - 1])
                c -= 1

        return ''.join(reversed(out1)), ''.join(reversed(out2))


In [None]:
def run_needleman_wunsch_tests():
    """Align two sample DNA strings and print both aligned rows."""
    aligner = NeedlemanWunsch(gap_penalty=-1, match_score=1, mismatch_penalty=-1)
    aligned_pair = aligner.align("ACTGAGCTAG", "GCTTAGCTA")

    for label, aligned_row in zip(("Sequence 1:", "Sequence 2:"), aligned_pair):
        print(label, aligned_row)

# Run the alignment demo; it prints the two aligned sequences.
run_needleman_wunsch_tests()


Sequence 1: ACTGAGCTAG
Sequence 2: GCTTAGCTA-
