In [3]:
import itertools

def overlap(a, b, min_length=3):
    """ Return length of longest suffix of 'a' matching
        a prefix of 'b' that is at least 'min_length'
        characters long.  If no such overlap exists,
        return 0. """
    start = 0  # start all the way at the left
    while True:
        start = a.find(b[:min_length], start)  # look for b's suffx in a
        if start == -1:  # no more occurrences to right
            return 0
        # found occurrence; check for full suffix/prefix match
        if b.startswith(a[start:]):
            return len(a)-start
        start += 1  # move just past previous match

def scs(ss):
    """ Returns shortest common superstring of given
        strings, which must be the same length """
    shortest_sup = None
    for ssperm in itertools.permutations(ss):
        sup = ssperm[0]  # superstring starts as first string
        for i in range(len(ss)-1):
            # overlap adjacent strings A and B in the permutation
            olen = overlap(ssperm[i], ssperm[i+1], min_length=1)
            # add non-overlapping portion of B to superstring
            sup += ssperm[i+1][olen:]
        if shortest_sup is None or len(sup) < len(shortest_sup):
            shortest_sup = sup  # found shorter superstring
    return shortest_sup  # return shortest

ss = ["CCT", "CTT", "TGC", "TGG", "GAT", "ATT"]
scs(ss)

'CCTTGGATTGC'

In [4]:
# working solution, gives wrong answer for scs_list
def scs_list(ss):
    """ Returns shortest common superstring of given
        strings, which must be the same length """
    shortest_sup = None
    shortest_sup_list = []
    for ssperm in itertools.permutations(ss):
        sup = ssperm[0]  # superstring starts as first string
        for i in range(len(ss)-1):
            # overlap adjacent strings A and B in the permutation
            olen = overlap(ssperm[i], ssperm[i+1], min_length=1)
            # add non-overlapping portion of B to superstring
            sup += ssperm[i+1][olen:]
        if shortest_sup is None or len(sup) < len(shortest_sup):
            shortest_sup = sup  # found shorter superstring
        if shortest_sup not in shortest_sup_list:
            shortest_sup_list.append(shortest_sup)
    return len(shortest_sup_list)  # return number of different shortest superstrings

print(f"scs_list: {scs_list(['ABC', 'BCA', 'CAB'])}")

scs_list: 1


In [27]:
# solution in progress, using set

def scs_list(ss):
    """ Returns shortest common superstring of given
        strings, which must be the same length """
    shortest_sup = None
    shortest_sup_set = set()
    for ssperm in itertools.permutations(ss):
        sup = ssperm[0]  # superstring starts as first string
        for i in range(len(ss)-1):
            # overlap adjacent strings A and B in the permutation
            olen = overlap(ssperm[i], ssperm[i+1], min_length=1)
            # add non-overlapping portion of B to superstring
            sup += ssperm[i+1][olen:]
        if shortest_sup is None or len(sup) < len(shortest_sup):
            shortest_sup = sup  # found shorter superstring
            print(f"shortest_sup: {shortest_sup}, sup: {sup}")
        if len(sup) == len(shortest_sup):
            print(f"superstring is tied for shortest superstring: {sup}")
            shortest_sup_set.add(sup)
    return [shortest_sup, len(shortest_sup_set)]  # return number of different shortest superstrings

print(f"scs_list: {scs_list(['ABC', 'BCA', 'CAB'])}")

shortest_sup: ABCAB, sup: ABCAB
superstring is tied for shortest superstring: ABCAB
superstring is tied for shortest superstring: BCABC
superstring is tied for shortest superstring: CABCA
scs_list: ['ABCAB', 3]


In [39]:
# correct solution

def scs_list(ss):
    """ Returns shortest common superstring of given
        strings, which must be the same length """
    shortest_sup = None
    shortest_sup_list = []
    for ssperm in itertools.permutations(ss):
        sup = ssperm[0]  # superstring starts as first string
        for i in range(len(ss) - 1):
            # overlap adjacent strings A and B in the permutation
            olen = overlap(ssperm[i], ssperm[i+1], min_length=1)
            # add non-overlapping portion of B to superstring
            sup += ssperm[i+1][olen:]
        if shortest_sup is None or len(sup) < len(shortest_sup):
            shortest_sup = sup  # found shorter superstring
            print(f"shortest_sup: {shortest_sup}, sup: {sup}")
        
        # add superstring to shortest superstrings if not there already
        if sup not in shortest_sup_list:
            shortest_sup_list.append(sup)
        # checking if the superstring if equal in length to shortest superstring
        for string in shortest_sup_list:
            if len(string) > len(shortest_sup):
                shortest_sup_list.remove(string)
        
        # if len(sup) == len(shortest_sup):
        #     print(f"superstring is tied for shortest superstring: {sup}")
        #     shortest_sup_set.add(sup)
            
    print(f"shortest_sup_list: {shortest_sup_list}")
    return [shortest_sup, len(shortest_sup_list)]  # return number of different shortest superstrings

strings = ['GAT', 'TAG', 'TCG', 'TGC', 'AAT', 'ATA']
scs_list(strings)

shortest_sup: GATAGTCGTGCAATA, sup: GATAGTCGTGCAATA
shortest_sup: GATCGTGCAATAG, sup: GATCGTGCAATAG
shortest_sup: TCGATGCAATAG, sup: TCGATGCAATAG
shortest_sup_list: ['TCGATGCAATAG', 'TCGATAGAATGC', 'TCGAATAGATGC', 'TGCAATCGATAG', 'TGCAATAGATCG', 'AATCGATAGTGC', 'AATGCTCGATAG', 'AATAGATCGTGC', 'AATAGATGCTCG', 'AATAGTCGATGC']


['TCGATGCAATAG', 10]

In [42]:
# finding list of common superstrings equal in length to the shortest common superstring

def scs_list(ss):
    """ Returns shortest common superstring of given
        strings, which must be the same length """
    shortest_sup = None
    shortest_sup_list = []
    for ssperm in itertools.permutations(ss):
        sup = ssperm[0]  # superstring starts as first string
        for i in range(len(ss) - 1):
            # overlap adjacent strings A and B in the permutation
            olen = overlap(ssperm[i], ssperm[i+1], min_length=1)
            # add non-overlapping portion of B to superstring
            sup += ssperm[i+1][olen:]
        if shortest_sup is None or len(sup) < len(shortest_sup):
            shortest_sup = sup  # found shorter superstring
        
        # add superstring to shortest superstrings if not there already
        if sup not in shortest_sup_list:
            shortest_sup_list.append(sup)
            
        # checking if the superstring if equal in length to shortest superstring
        for string in shortest_sup_list:
            if len(string) > len(shortest_sup):
                shortest_sup_list.remove(string)
            
    return [shortest_sup, len(shortest_sup_list)]  # return number of different shortest superstrings

# strings = ['GAT', 'TAG', 'TCG', 'TGC', 'AAT', 'ATA']
strings = ['ABC', 'BCA', 'CAB']
scs_list(strings)

shortest_sup: ABCAB, sup: ABCAB
shortest_sup_list: ['ABCAB', 'BCABC', 'CABCA']


['ABCAB', 3]

In [44]:
# finding list of common superstrings equal in length to the shortest common superstring
ss = ["CCT", "CTT", "TGC", "TGG", "GAT", "ATT"]
scs_list(ss)

shortest_sup: CCTTGCTGGATT, sup: CCTTGCTGGATT
shortest_sup: CCTTGGATTGC, sup: CCTTGGATTGC
shortest_sup_list: ['CCTTGGATTGC', 'TGCCTTGGATT', 'TGGATTGCCTT', 'GATTGCCTTGG']


['CCTTGGATTGC', 4]

In [6]:
!wget http://d28rh4a8wq0iu5.cloudfront.net/ads1/data/ads1_week4_reads.fq

SYSTEM_WGETRC = c:/progra~1/wget/etc/wgetrc
syswgetrc = C:\Program Files (x86)\GnuWin32/etc/wgetrc
--2021-03-19 12:48:32--  http://d28rh4a8wq0iu5.cloudfront.net/ads1/data/ads1_week4_reads.fq
Resolving d28rh4a8wq0iu5.cloudfront.net... 13.225.141.183, 13.225.141.205, 13.225.141.46, ...
Connecting to d28rh4a8wq0iu5.cloudfront.net|13.225.141.183|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 395781 (387K) [video/m2ts]
Saving to: `ads1_week4_reads.fq'

     0K .......... .......... .......... .......... .......... 12%  374K 1s
    50K .......... .......... .......... .......... .......... 25%  813K 1s
   100K .......... .......... .......... .......... .......... 38% 2.41M 0s
   150K .......... .......... .......... .......... .......... 51% 1.39M 0s
   200K .......... .......... .......... .......... .......... 64% 3.18M 0s
   250K .......... .......... .......... .......... .......... 77% 5.82M 0s
   300K .......... .......... .......... .......... .......... 90%

In [21]:
def readFastq(filename):
    sequences = []
    qualities = []
    with open(filename) as fh:
        while True:
            fh.readline()  # skip name line
            seq = fh.readline().rstrip()  # read base sequence
            fh.readline()  # skip placeholder line
            qual = fh.readline().rstrip() # base quality line
            if len(seq) == 0:
                break
            sequences.append(seq)
            qualities.append(qual)
    return sequences

viral_genome = readFastq("ads1_week4_reads.fq")
print(f"first five reads in viral_genome: {viral_genome[:5]}")
print(f"length of viral genome: {len(viral_genome)}")

first five reads in viral_genome: ['GTCCAGCAGAGCAAGTGATGCGAGAGCTGCCCATCCTCCAACCAGCATGCCCCTAGACATTGACACTGCATCGGAGTCAGGCCAAGATCCGCAGGACAGT', 'GGAGTACGACTTCAGAGATCTCACTTGGTGTATCAACCCGCCAGAGAGAATCAAATTGGATTATGATCAATACTGTGCAGATGTGGCTGCTGAAGAACTC', 'GCAAATTTTGATCTCTCTTGGCTTCACAATCAATTCAACCATGACCCGAGATGTAGTCATACCCCTCCTCACAAACAACGATCTCTTAATAAGGATGGCA', 'GAGTTAATTGAAGCCCTAGATTACATTTTCATAACTGATGACATACATCTGACAGGGGAGATTTTCTCATTTTTCAGAAGTTTCGGCCACCCCAGACTTG', 'AATGACAGAGACCGCTATGACCATTGATGCTAGGTATGCAGAACTTCTAGGAAGAGTCAGATACATGTGGAAACTGATAGATGGTTTCTTCCCTGCACTC']
length of viral genome: 1881


In [23]:
%%time
import itertools

def overlap(a, b, min_length=3):
    """ Return length of longest suffix of 'a' matching
        a prefix of 'b' that is at least 'min_length'
        characters long.  If no such overlap exists,
        return 0. """
    start = 0  # start all the way at the left
    while True:
        start = a.find(b[:min_length], start)  # look for b's prefix in a
        if start == -1:  # no more occurrences to right
            return 0
        # found occurrence; check for full suffix/prefix match
        if b.startswith(a[start:]):
            return len(a)-start
        start += 1  # move just past previous match

def pick_maximal_overlap(reads, k):
    """Returns two reads that have maximal ovelap."""
    reada, readb = None, None
    best_olen = 0
    for a, b in itertools.permutations(reads, 2):
        olen = overlap(a, b, min_length=k)
        if olen > best_olen:
            reada, readb = a, b
            best_olen = olen
    return reada, readb, best_olen

def greedy_scs(reads, k):
    read_a, read_b, olen = pick_maximal_overlap(reads, k)
    while olen > 0:
        reads.remove(read_a)
        reads.remove(read_b)
        reads.append(read_a + read_b[olen:])  # just the suffix of read_b since prefix overlaps with read_a
        read_a, read_b, olen = pick_maximal_overlap(reads, k)
    return "".join(reads)

reads = viral_genome
k = 30
genome = greedy_scs(reads, k)
print(genome[:500])

ACCAAACAAAGTTGGGTAAGGATAGATCAATCAATGATCATATTCTAGTACACTTAGGATTCAAGATCCTATTATCAGGGACAAGAGCAGGATTAGGGATATCCGAGATGGCCACACTTTTGAGGAGCTTAGCATTGTTCAAAAGAAACAAGGACAAACCACCCATTACATCAGGATCCGGTGGAGCCATCAGAGGAATCAAACACATTATTATAGTACCAATTCCTGGAGATTCCTCAATTACCACTCGATCCAGACTACTGGACCGGTTGGTCAGGTTAATTGGAAACCCGGATGTGAGCGGGCCCAAACTAACAGGGGCACTAATAGGTATATTATCCTTATTTGTGGAGTCTCCAGGTCAATTGATTCAGAGGATCACCGATGACCCTGACGTTAGCATCAGGCTGTTAGAGGTTGTTCAGAGTGACCAGTCACAATCTGGCCTTACCTTCGCATCAAGAGGTACCAACATGGAGGATGAGGCGGACCAATACTTT
Wall time: 2 ms


In [5]:
count_A = genome.count("A")
count_T = genome.count("T")
print(f"number of As: {count_A}")
print(f"number of Ts: {count_T}")

number of As: 4633
number of Ts: 3723


In [6]:
with open("unknown_genome.txt", "x") as genome_file:
    genome_file.write(genome)

In [8]:
s = "abcabc" \
    "defgihklm"
print(s)
print([f"{x:3s}" for x in s])

abcabcdefgihklm
['a  ', 'b  ', 'c  ', 'a  ', 'b  ', 'c  ', 'd  ', 'e  ', 'f  ', 'g  ', 'i  ', 'h  ', 'k  ', 'l  ', 'm  ']


In [13]:
#    0123456789 123456789 123456
s = "abcdefabcghiklmghijftnoppok"
s_new = ""
for i in range(len(s)):
    if i == 10:
        s_new += s[:i] + "\n"
print(f"s_new using for loop: {s_new}")

j = 0
s_while = ""
while j < len(s):
    if j == 10:
        s_while += s[:j] + "\n"
    j += 1
print(f"s_while: {s_while}")

s_new using for loop: abcdefabcg

s_while: abcdefabcg



In [18]:
n = 10
chunks = [s[i:i+n] for i in range(0, len(s), n)]
print(chunks)

['abcdefabcg', 'hiklmghijf', 'tnoppok']


In [19]:
s_final = ""
for i in range(len(chunks)):
    s_final += chunks[i] + "\n"
print(s_final)

abcdefabcg
hiklmghijf
tnoppok

