In [3]:
from Bio import SeqIO
import time
import psutil
import os

In [4]:
""" Paths to FASTA data files """

# Locations of the FASTA inputs, relative to the notebook's working directory.
pathToFile1 = "./data/13443_ref_Cara_1.0_chr1c.fa"
pathToFile2 = "./data/10093_ref_PAHARI_EIJ_v1.1_chrX.fa"
pathToFile3 = "./data/144034_ref_Pbar_UMD_V03_chrUn.fa"

# All input files, in the order they are loaded further down.
files = [pathToFile1, pathToFile2, pathToFile3]

In [5]:
""" Search patterns for data files """

# Query patterns to look for in each data file (one list per file).
patternsForFile1 = ["ATGCATG", "TCTCTCTA", "TTCACTACTCTCA"]
patternsForFile2 = ["ATGATG", "CTCTCTA", "TCACTACTCTCA"]
# No patterns have been chosen for the third file yet.
patternsForFile3 = []

In [6]:
""" Return list of rotations of input string t """
def Rotations(t):
    """Return every cyclic rotation of t, ordered by starting offset.

    The rotation at index k begins at t[k] and wraps around to end at
    t[k - 1]; index 0 is t itself. An empty t yields an empty list.
    """
    return [t[k:] + t[:k] for k in range(len(t))]

In [7]:
""" Return lexicographically sorted list of t's rotations """
def BWM(t):
    """Return the Burrows-Wheeler matrix of t: its rotations in sorted order."""
    matrix = Rotations(t)
    # Rotations() hands back a fresh list, so sorting it in place is safe.
    matrix.sort()
    return matrix

In [8]:
""" Given T, returns BWT(T) (last column) by creating BWM """
def BWTViaBWM(t):
    """Return BWT(t) by reading the last column of the Burrows-Wheeler matrix.

    Each row of BWM(t) is a rotation of t; the transform is the string made
    of the final character of every row, taken top to bottom.
    """
    return ''.join(row[-1] for row in BWM(t))

In [9]:

# Sample texts, each terminated by '$', used to exercise the BWT helpers.
test1 = "Tomorrow_and_tomorrow_and_tomorrow$"
test2 = "It_was_the_best_of_times_it_was_the_worst_of_times$"
test3 = "in_the_jingle_jangle_morning_Ill_come_following_you$"
# Longer DNA-alphabet sample, split over adjacent literals for readability.
test4 = (
    "GATTTGGGGTTCAAAGCAGTATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGTTTGATTTGG"
    "GGTTCAAAGCAGTAATTTGGGGTTCAAAGCAGTATCGACAAATAGTAAATCCATTTGTTCATTCAAAGCAGTAATT"
    "TGGGGTTATTTGGGGTTCAAAGCAGTATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGTTT$"
)

In [10]:
""" Save Burrows-Wheeler transforms for later test of reversal function """

# Transform each sample text once and keep the results, so the reversal
# function can later be checked against the original strings.
bwt1, bwt2, bwt3, bwt4 = map(BWTViaBWM, (test1, test2, test3, test4))

In [11]:
""" Given T return suffix array SA(T) """
def SuffixArray(s):
    """Return the suffix array of s as a list of offsets.

    Element k of the result is the start offset of the k-th smallest suffix
    of s. The suffixes of one string all differ in length and are therefore
    distinct, so the ordering is unambiguous.

    A real list is returned rather than a lazy `map` object (which is what
    `map` produces on Python 3), so the result supports len(), indexing and
    repeated iteration.

    Note: every suffix is materialised as a sort key, so time and memory
    grow quadratically with len(s).
    """
    return sorted(range(len(s)), key=lambda offset: s[offset:])

In [12]:
""" Given T, returns BWT(T) (last column) by way of the suffix array """
def BWTViaSA(t):
    """Return BWT(t) (the last column) computed from the suffix array of t.

    For each suffix offset in sorted order, the output character is the one
    just before that suffix in t. Offset 0 has no predecessor, so it
    contributes the '$' terminator instead.
    """
    return ''.join(
        '$' if offset == 0 else t[offset - 1]
        for offset in SuffixArray(t)
    )

In [13]:
""" Given BWT string bw, return parallel list of B-ranks. Also
    return tots: map from character to # times it appears. """
def RankBWT(bw):
    """Compute per-position ranks and per-character totals for a BWT string.

    Returns a pair (ranks, tots):
      ranks -- list parallel to bw; ranks[i] is how many times bw[i] has
               already occurred in bw[:i]
      tots  -- dict mapping each character to its total number of
               occurrences in bw
    """
    occurrences = {}
    ranks = []
    for ch in bw:
        seen_so_far = occurrences.get(ch, 0)
        ranks.append(seen_so_far)
        occurrences[ch] = seen_so_far + 1
    return ranks, occurrences

In [14]:
""" Return map from character to the range of rows prefixed by 
    the character. """
def FirstColumn(tots):
    """Map each character to the half-open row range it occupies in column F.

    tots maps a character to its occurrence count. Characters are laid out
    in sorted order, so the returned dict maps each character c to
    (start, end), where rows start..end-1 of the sorted matrix begin with c.
    """
    row_ranges = {}
    start = 0
    for ch in sorted(tots):
        end = start + tots[ch]
        row_ranges[ch] = (start, end)
        start = end
    return row_ranges

In [15]:
""" Make T from BWT(T) """
def ReverseBWT(bw):
    """Reconstruct T from BWT(T) by walking the LF mapping.

    Starting at row 0, each step reads the character in the last column,
    records it, and jumps to the row of the first column that starts with
    the same character at the same rank. The walk ends when the '$'
    terminator is reached, so the characters are collected right to left.

    Parameters:
        bw -- the Burrows-Wheeler transform of a '$'-terminated text

    Returns:
        the original text, including its trailing '$'

    Raises:
        ValueError -- if bw contains no '$'. Without a terminator the walk
                      below could never finish (or, for empty input, would
                      index out of range).

    Note: the walk starts at row 0, which assumes '$' sorts before every
    other character in the text.
    """
    if '$' not in bw:
        raise ValueError("ReverseBWT: input contains no '$' terminator")
    ranks, tots = RankBWT(bw)
    first = FirstColumn(tots)
    rowi = 0               # first row
    collected = []         # characters of T, gathered right to left
    while bw[rowi] != '$':
        c = bw[rowi]
        # Appending and reversing once at the end avoids the quadratic cost
        # of repeatedly prepending to a string on genome-sized inputs.
        collected.append(c)
        # jump to row that starts with c of same rank
        rowi = first[c][0] + ranks[rowi]
    collected.reverse()
    return ''.join(collected) + '$'    # '$' is the rightmost character

In [21]:
""" FASTA sequences are often broken in smaller chunks. This function
    appends all sequences for given file. """
def GetWholeSequenceFromFile(file):
    """Return all sequences of a FASTA file concatenated into one string.

    FASTA files are often split into several records; this joins the
    sequence of every record, in file order, and strips any whitespace
    from the result.

    Parameters:
        file -- path or handle accepted by SeqIO.parse

    Returns:
        a plain str holding the whole sequence ("" if the file has no
        records)
    """
    # Stream the records and join once, rather than keeping every record in
    # a list and growing the sequence with repeated '+=' (which re-copies
    # the accumulated sequence for each record).
    whole = "".join(str(record.seq) for record in SeqIO.parse(file, "fasta"))
    # Drop any whitespace that may have ended up in the sequence text.
    return "".join(whole.split())

In [22]:
# Load the whole sequence of every file listed in `files`, keeping the same
# order. A '$' is appended to each one: the BWT helpers in this notebook use
# '$' as the end-of-text marker.
sequences = [] # list containing all of the sequences we need to work with

for file in files:
    # One string per input file: all of its FASTA records joined, plus '$'.
    sequences.append(GetWholeSequenceFromFile(file) + '$')

In [23]:
#for i in range(1, len(sequences) + 1):
#    with open("./data/" + "sequence" + str(i) + ".txt", "w") as f:
#        f.write(sequences[i - 1])
#        f.close()