### BA1A : Compute the Number of Times a Pattern Appears in a Text

In [5]:
def count_pattern(text,pattern):
    count = 0
    
    for i in range (len(text) - len(pattern)+1):
        substring = text[i:i+len(pattern)]
        if substring == pattern:
            count+=1
    
    return count

text = "ACAACTATGCATACTATCGGGAACTATCCT"
pattern = "ACTAT"
result = count_pattern(text,pattern)
print(f"Count of '{pattern}' in '{text}' : '{result}'")

Count of 'ACTAT' in 'ACAACTATGCATACTATCGGGAACTATCCT' : '3'


### BA1B : Find the Most Frequent Words in a String

In [13]:
def most_frequent_kmers(text,k):
    kmers = {}
    frequent_kmers = []
    max_count = 0

    for i in range(len(text)-k +1):
        kmer = text[i:i+k]
        if kmer not in kmers:
            kmers[kmer]=0
        kmers[kmer] +=1
        max_count = max(max_count,kmers[kmer])
    for kmer, count in kmers.items():
        if count == max_count:
            frequent_kmers.append(kmer)
    
    return frequent_kmers

text = "ACGTTGCATGTCGCATGATGCATGAGAGCT";
k = 4;
result = most_frequent_kmers(text,k);
print(result);

['GCAT', 'CATG']


### BA1C : Find the Reverse Complement of a String

In [16]:
def reverse_complement(dna_string):
    complement_dictionary = {
        'A':'T',
        "T":"A",
        "C":"G",
        "G":"C"
    }

    reversed_complement = ""

    for neucleotide in reversed(dna_string):
        complement = complement_dictionary[neucleotide]
        reversed_complement += complement
    
    return reversed_complement


dna_string = "AAAACCCGGT"
reverse_comp = reverse_complement(dna_string)
print(reverse_comp)

ACCGGGTTTT


### BA1D : Find All Occurrences of a Pattern in a String

In [21]:
def findPattern(pattern,genome):
    positions = []

    for i in range(len(genome)- len(pattern) + 1):
        substring = genome[i:i+len(pattern)]

        if substring == pattern:
            positions.append(i)
    
    return positions

pattern = "ATAT"
genome = "GATATATGCATATACTT"

matches = findPattern(pattern,genome)
print(matches)

[1, 3, 9]


### BA1E : Find Patterns Forming Clumps in a String

In [26]:
def findClumps(genome,k,L,t):
    clumps = set()

    for i in range(len(genome) - L +1):
        window = genome[i: i+L]

        kmers = {}

        for j in range(len(window)-k+1):
            kmer = window[j:j+k]
            if kmer not in kmers:
                kmers[kmer] = 0
            kmers[kmer] += 1

        for kmer, count in kmers.items():
            if count >= t:
                clumps.add(kmer)

    return list(clumps) 

genome = "CGGACTCGACAGATGTGAAGAAATGTGAAGACTGAGTGAAGAGAAGAGGAAACACGACACGACATTGCGACATAATGTACGAATGTAATGTGCCTATGGC"

k = 5
L = 75
t = 4

result = findClumps(genome,k,L,t)
print(result)

['GAAGA', 'CGACA', 'AATGT']


### BA1F : Find a Position in a Genome Minimizing the Skew

In [31]:
def minSkew(genome):
    min_skew = float('inf')
    min_skew_positions = []

    current_skew = 0

    for i,nucleotide in enumerate(genome):
        if nucleotide == 'G':
            current_skew += 1
        elif nucleotide == 'C':
            current_skew -= 1

        if current_skew < min_skew:
            min_skew = current_skew
            min_skew_positions = [i+1]
        elif current_skew == min_skew:
            min_skew_positions.append(i+1)

    return min_skew_positions



genome = "CCTATCGGTGGATTAGCATGTCCCTGTACGTTTCGCCGCGAACTAGTTCACACGGCTTGATGGCAAATGGTTTTTCCGGCGACCGTAATCGTCCACCGAG"
positions = minSkew(genome)
print(positions)

[53, 97]


### BA1G : Compute the Hamming Distance Between Two Strings

In [33]:
def hammingDistance(genome1,genome2):
    count = 0

    for i, (nucleotide1, nucleotide2) in enumerate(zip(genome1,genome2)):
        if nucleotide1 != nucleotide2:
            count += 1

    
    return count

genome1 = "GGGCCGTTGGT"
genome2 = "GGACCGTTGAC"

result = hammingDistance(genome1,genome2)
print(result)

3


### BA1H : Find All Approximate Occurrences of a Pattern in a String

In [35]:
def calculateHammingDistance(genome,pattern):
    count = 0
    for i in range(len(genome)):
        if(genome[i]!=pattern[i]):
            count += 1
    return count

def findAllPositions(genome,pattern,d):
    result = []
    patternLen = len(pattern)
    genomeLen = len(genome)

    for i in range(genomeLen - patternLen + 1):
        subgenome = genome[i:i+patternLen]
        hammingDistance = calculateHammingDistance(subgenome,pattern)
        if hammingDistance <= d :
            result.append(i)
    
    return result

pattern = "ATTCTGGA"
genome = "CGCCCGAATCCAGAACGCATTCCCATATTTCGGGACCACTGGCCTCCACGGTACGGACGTCAATCAAATGCCTAGCGGCTTGTGGTTTCTCCTACGCTCC"
d = 3

result = findAllPositions(genome,pattern,d)
print(result)

[6, 7, 26, 27, 78]


### BA1I : Find the Most Frequent Words with Mismatches in a String

In [36]:
def calculateHammingDistance(genome1,genome2):
    count = 0
    for i in range(len(genome1)):
        if genome1[i] != genome2[i]:
            count += 1
    
    return count

def mostFrequentWords(genome,k,d):
    max_count = 0
    result = []

    for i in range(len(genome) - k +1):
        pattern = genome[i:i+k]

        # serach pattern again in the total sentence
        count = 0
        for j in range(len(genome) -k + 1):
            hammingDistance = calculateHammingDistance(pattern,genome[j:j+k])
            if(hammingDistance <= d):
                count += 1
        if count > max_count:
            max_count = count
            result = [pattern]
        elif count == max_count:
            result.append(pattern)
    return result


genome = "ACGTTGCATGTCGCATGATGCATGAGAGCT"
k = 4
d = 1

result = mostFrequentWords(genome,k,d)
print(result)

['ATGT', 'GATG', 'ATGC']


### BA1J : Find Frequent Words with Mismatches and Reverse Complements

In [38]:
def calculateReverseComplement(pattern):
    complement_dict = {'A':'T', 'T':'A', 'G':'C','C':'G'}
    complement = ""

    for base in pattern[::-1]:
        complement += complement_dict[base]
    return complement

def hammingDistance(pattern1, pattern2):
    count = 0
    for p1,p2 in zip(pattern1,pattern2):
        if(p1 != p2):
            count += 1
    
    return count

def findAllWords(genome, k, d):
    result = []
    max_count = 0

    for i in range(len(genome) - k + 1):
        pattern = genome[i: i+k]
        reverse_complement = calculateReverseComplement(pattern)
        count = 0
        
        for j in range(len(genome) - k + 1):
            if hammingDistance(pattern, genome[j:j+k]) <=d or hammingDistance(reverse_complement, genome[j:j+k])<=d:
                count += 1
        if count > max_count :
            result = [pattern]
            max_count = count
        elif count == max_count:
            result.append(pattern)
    
    return result

def checkDuplicate(results):
    mySet = set()
    for result in results:
        if result not in mySet and calculateReverseComplement(result) not in mySet:
            mySet.add(result)
    
    return list(mySet)


genome = "ACGTTGCATGTCGCATGATGCATGAGAGCT"
k = 4
d = 1

result = findAllWords(genome,k,d)

finalResult = checkDuplicate(result)

print(finalResult)

['ATGA', 'ATGT', 'GCAT']


### BA1L : Implement PatternToNumbe

In [40]:
def patternToNumber(genome):
    number = 0
    genome_dict = {'A':0,'C':1,'G':2,'T':3}
    k = len(genome)

    for i, base in enumerate(genome[::-1]):
        number += genome_dict[base] * (4**i)
    
    return number

genome = "AGT"
result = patternToNumber(genome)
print(result)

11


### BA1M : Implement Implement NumberToPattern

In [41]:
def numberToPattern(number,k):
    base_dict = {0:'A',1:'C',2:'G',3:'T'}
    pattern = ""

    for i in range(k):
        remainder = number % 4
        base = base_dict[remainder]
        pattern = base + pattern
        number //= 4
    
    return pattern

number = 45
k = 4

result = numberToPattern(number,k)

print(result)

AGTC
