In [12]:
from Bio import SeqIO
import time
import os
from ctypes import *

In [14]:
# Native SAIS library; the path is relative to the current working directory.
SAIS = cdll.LoadLibrary("sais/bin/sais64.dll")
# MSVCRT is the Windows C runtime library (this makes the notebook Windows-only).
MSVCRT = cdll.msvcrt
# malloc takes a size_t. Declaring it as c_int would silently wrap for
# requests of 2 GiB or more, because ctypes does no overflow checking.
MSVCRT.malloc.argtypes = [c_size_t] # Malloc argument type(s)
MSVCRT.malloc.restype = POINTER(c_int) # Malloc result type
# Declare free as well so malloc'ed buffers can be released with a
# pointer-sized argument and no bogus integer return value.
MSVCRT.free.argtypes = [c_void_p]
MSVCRT.free.restype = None

def SuffixArraySAIS(sequence):
    """Allocate a suffix-array buffer for `sequence` and fill it via `SAIS.sais`.

    Args:
        sequence: The text to index, either `bytes` or an ASCII-only `str`.

    Returns:
        The pointer returned by `MSVCRT.malloc`, sized for `len(sequence)`
        C ints, after it has been passed to `SAIS.sais`. The buffer is not
        freed here; the caller owns it and should release it with
        `MSVCRT.free`.

    Raises:
        MemoryError: If the buffer could not be allocated.
        UnicodeEncodeError: If a `str` argument contains non-ASCII characters.
    """
    # c_char_p only accepts bytes on Python 3, so encode text input first.
    if isinstance(sequence, str):
        sequence = sequence.encode("ascii")
    length = len(sequence)
    # Create empty suffix array (request at least one element, since
    # malloc(0) is allowed to return NULL even on success).
    suffixArray = MSVCRT.malloc(max(length, 1) * sizeof(c_int))
    if not suffixArray:
        # A NULL pointer must never reach the native code.
        raise MemoryError(f"could not allocate a suffix array of {length} entries")
    # Populate suffix array
    SAIS.sais(c_char_p(sequence), suffixArray, length)
    # Return suffix array
    return suffixArray

def CheckpointArray(sequence, suffixArray):
    """Allocate a checkpoints buffer and fill it via `SAIS.compute_lcp`.

    Args:
        sequence: The indexed text, either `bytes` or an ASCII-only `str`.
        suffixArray: Suffix array of `sequence`, passed through unchanged to
            the native call (e.g. the value returned by `SuffixArraySAIS`).

    Returns:
        The pointer returned by `MSVCRT.malloc`, sized for `len(sequence)`
        C ints, after it has been passed to `SAIS.compute_lcp`. The buffer is
        not freed here; the caller owns it and should release it with
        `MSVCRT.free`.

    Raises:
        MemoryError: If the buffer could not be allocated.
        UnicodeEncodeError: If a `str` argument contains non-ASCII characters.
    """
    # c_char_p only accepts bytes on Python 3, so encode text input first.
    if isinstance(sequence, str):
        sequence = sequence.encode("ascii")
    length = len(sequence)
    # Create empty checkpoints array (request at least one element, since
    # malloc(0) is allowed to return NULL even on success).
    checkpoints = MSVCRT.malloc(max(length, 1) * sizeof(c_int))
    if not checkpoints:
        # A NULL pointer must never reach the native code.
        raise MemoryError(f"could not allocate a checkpoints array of {length} entries")
    # Compute all checkpoints and populate checkpoints array
    SAIS.compute_lcp(c_char_p(sequence), suffixArray, checkpoints, length)
    # Return checkpoints array
    return checkpoints

def FindBestSubsequenceMatches(sequence1, sequence2):
    """Run the native `find_best_subsequence_matches` on two sequences.

    Suffix arrays for both sequences are built with `SuffixArraySAIS`, handed
    to the native routine together with two output buffers of
    `len(sequence2)` C ints, and released again before returning.

    Args:
        sequence1: First sequence, either `bytes` or an ASCII-only `str`.
        sequence2: Second sequence, either `bytes` or an ASCII-only `str`.

    Returns:
        A list with one `(match, checkpoint)` tuple of ints per position of
        `sequence2`, read from the two output buffers.

    Raises:
        AttributeError: If the loaded SAIS library does not export
            `find_best_subsequence_matches`.
        UnicodeEncodeError: If a `str` argument contains non-ASCII characters.
    """
    # Resolve the native entry point before doing any expensive work, so a
    # library built without this export fails immediately.
    nativeFind = SAIS.find_best_subsequence_matches

    # ctypes would marshal a Python 3 str as wchar_t*, so pass bytes instead.
    # NOTE(review): assumes the native routine takes char* text like
    # SAIS.sais does; confirm against the library header.
    text1 = sequence1.encode("ascii") if isinstance(sequence1, str) else sequence1
    text2 = sequence2.encode("ascii") if isinstance(sequence2, str) else sequence2
    length1, length2 = len(text1), len(text2)

    # ctypes-owned output buffers are zero-initialised and reclaimed by the
    # garbage collector, so they cannot leak.
    matches = (c_int * length2)()
    checkpoints = (c_int * length2)()

    suffixArray1 = suffixArray2 = None
    try:
        suffixArray1 = SuffixArraySAIS(text1)
        suffixArray2 = SuffixArraySAIS(text2)
        nativeFind(c_char_p(text1), suffixArray1, length1,
                   c_char_p(text2), suffixArray2, length2,
                   matches, checkpoints)
        return list(zip(matches, checkpoints))
    finally:
        # The suffix arrays are malloc'ed by SuffixArraySAIS; without this
        # every call would leak sizeof(int) * length bytes per sequence.
        for nativeBuffer in (suffixArray1, suffixArray2):
            if nativeBuffer:
                MSVCRT.free(nativeBuffer)

In [6]:
# Benchmark inputs: each entry pairs a FASTA file with the patterns that are
# searched for in the genome loaded from that file.
dataSet = [
    dict(
        file="./data/13443_ref_Cara_1.0_chr1c.fa",
        patterns=["ATGCATG", "TCTCTCTA", "TTCACTACTCTCA"],
    ),
    dict(
        file="./data/10093_ref_PAHARI_EIJ_v1.1_chrX.fa",
        patterns=["ATGATG", "CTCTCTA", "TCACTACTCTCA"],
    ),
    dict(
        file="./data/144034_ref_Pbar_UMD_V03_chrUn.fa",
        patterns=["CGCGAG", "GTCGAAT", "GGGCGTCATCGCGCG"],
    ),
]

In [10]:
def GetWholeGenomeFromFile(file):
    """Read a FASTA file and return all of its records as one string.

    Args:
        file: Path of the FASTA file to read.

    Returns:
        The sequences of every record in the file, concatenated in file
        order, with a trailing "$" appended.
    """
    # The context manager closes the handle even if parsing fails. The
    # parser is consumed inside the block, while the handle is still open.
    with open(file) as handle:
        fasta_sequences = SeqIO.parse(handle, 'fasta')
        # join builds the genome in one pass instead of repeated string +=.
        genome = "".join(str(sequence.seq) for sequence in fasta_sequences)
    return genome + "$"

In [15]:
# Time FindBestSubsequenceMatches for every (genome, pattern) pair in dataSet.
for data in dataSet:
    # Index directly: a missing key should fail here with a KeyError rather
    # than surface later as a confusing error about None.
    file = data["file"]
    genome = GetWholeGenomeFromFile(file)
    patterns = data["patterns"]

    for pattern in patterns:
        # perf_counter is monotonic and uses the highest-resolution clock
        # available; time.time() can jump and is coarse on Windows.
        startTime = time.perf_counter()
        FindBestSubsequenceMatches(genome, pattern)
        endTime = time.perf_counter()
        duration = endTime - startTime
        print(f"{file} : {pattern} executed in {duration}")

AttributeError: function 'find_best_subsequence_matches' not found