### EECS 730 Project - 2

#### Steps followed
1. 
8. Write the reconstructed sequence into an output file in the format below.

">reconstructed genomic sequence <br>
"output assembled sequence

#### Import relevant packages

In [136]:
# import packages
import os
import Bio
from Bio import SeqIO
from Bio.Seq import Seq
import pandas as pd
import dask.dataframe as dd
from dask.multiprocessing import get
from pydna.common_sub_strings import terminal_overlap
from pydna.assembly import Assembly
from pydna.dseqrecord import Dseqrecord

# Report the Biopython version this notebook was executed with
biopython_version = Bio.__version__
print('The Biopython version is {}..'.format(biopython_version))

The Biopython version is 1.78..


#### Create the paths for reference files

In [137]:
# Set the local paths for data.
# The data directory can be overridden through the EECS730_DATA_DIR
# environment variable so the notebook can run on another machine without
# editing this cell; when the variable is unset the original location is used.
path = os.environ.get(
    'EECS730_DATA_DIR',
    r'C:\Users\pmspr\Documents\HS\MS\Sem 4\EECS 730\Bioinformatics\Project 2\Docs',
)
# Input reads (FASTA) and the file the assembled sequence is written to
reads = os.path.join(path, 'HW2_reads.fasta')
output = os.path.join(path,'sequence_assembly.txt')

#### Important methods

In [138]:
# This method derives the shortest path for the given list of sequences
def getcontig(seqlist, seq, max_reads=9, limit=49):
    """Assemble a group of sequences into a single linear contig.

    Parameters
    ----------
    seqlist : iterable of int
        Indexes into ``seq`` selecting the sequences to assemble.
    seq : sequence
        Collection the indexes in ``seqlist`` refer to; each selected item
        is wrapped in a ``Dseqrecord``.
    max_reads : int or None, optional
        Only the first ``max_reads`` entries of ``seqlist`` are handed to
        the assembler; any further entries are ignored. Defaults to 9, the
        value that used to be hard-coded. ``None`` disables the cap.
    limit : int, optional
        Forwarded unchanged as the ``limit`` argument of
        ``pydna.assembly.Assembly``. Defaults to 49, the value that used to
        be hard-coded (see the pydna documentation for its exact meaning).

    Returns
    -------
    object or None
        The ``seq.watson`` attribute of the first product returned by
        ``assemble_linear()``, or ``None`` when no linear product exists.
    """
    # Keep only the leading entries of the group (all of them when uncapped)
    selected = [
        read_index
        for position, read_index in enumerate(seqlist)
        if max_reads is None or position < max_reads
    ]
    fragments = tuple(Dseqrecord(seq[read_index]) for read_index in selected)

    assembly = Assembly(fragments, limit=limit)
    products = assembly.assemble_linear()
    if len(products) > 0:
        return products[0].seq.watson
    # Made explicit: callers receive None when nothing could be assembled
    return None

In [139]:
# This method verifies if there is an overlap of certain threshold between sequences.
# Threshold = 50bp according to project instructions
def compare_overlap(s1, s2):
    """Tell whether ``terminal_overlap`` reports any overlap between two sequences.

    Returns the string 'y' when ``terminal_overlap(s1, s2, limit=49)`` yields
    at least one match and 'n' otherwise.
    """
    matches = terminal_overlap(s1, s2, limit=49)
    return 'n' if len(matches) == 0 else 'y'
    

In [140]:
# This method derives the contigs for sequences with overlap
def contigs(seq):
    """Group overlapping sequences and assemble each group into a contig.

    Starting from the lowest index not yet grouped (the "seed"), every
    remaining sequence for which ``compare_overlap(seed, other)`` returns
    'y' is collected into one group, the group is passed to ``getcontig``
    and its members are removed from the pool. This repeats until the pool
    is empty.

    Parameters
    ----------
    seq : sequence
        The sequences to group; items are compared pairwise with
        ``compare_overlap``.

    Returns
    -------
    list
        One entry per group, in the order the groups were formed. An entry
        is whatever ``getcontig`` returned for that group, which may be
        ``None`` when the group could not be assembled.

    Raises
    ------
    ValueError
        If a seed matches no remaining sequence (not even itself). The pool
        could then never shrink and the loop would not terminate.
    """
    # Indexes still waiting to be grouped; kept in ascending order throughout,
    # so the first element is always the smallest one.
    remaining = list(range(len(seq)))
    contigList = []
    while remaining:
        seed = remaining[0]
        # Built from an ascending list, so the group is ascending as well
        group = [i for i in remaining if compare_overlap(seq[seed], seq[i]) == 'y']
        if not group:
            raise ValueError(
                'Sequence at index {} overlaps no remaining sequence; '
                'cannot make progress'.format(seed)
            )
        contigList.append(getcontig(group, seq))
        grouped = set(group)
        remaining = [i for i in remaining if i not in grouped]
    return contigList

#### Main logic

In [157]:
# Gather all the sequences from the input FASTA file.
# SeqIO handles records whose sequence spans several lines and does not turn
# blank lines into empty reads, which the previous line-by-line scan did.
seq = [str(record.seq) for record in SeqIO.parse(reads, 'fasta')]
print('Total number of reads in file is {}..'.format(len(seq)))
if not seq:
    raise ValueError('No reads found in {}'.format(reads))
# Keep a handle on the original reads; `seq` is rebound to contigs below
seqlist = seq

# Derive assembled DNA sequence from the individual contigs.
# Each round merges overlapping sequences; repeat until one sequence is left.
while len(seq) > 1:
    contigList = contigs(seq)
    if any(contig is None for contig in contigList):
        # getcontig returns None when a group could not be assembled
        raise RuntimeError('At least one group of sequences could not be assembled')
    if len(contigList) >= len(seq):
        # Nothing was merged in this round, so another round cannot help
        raise RuntimeError(
            'Assembly stalled with {} unmerged sequences'.format(len(seq))
        )
    seq = contigList

print()
print('Assembled DNA sequence..')
print(seq[0])

# Write the reconstructed genomic sequence to the output file in FASTA format.
# Mode "w" truncates any existing file, and the context manager closes the
# handle even if a write fails.
with open(output, "w") as outputfile:
    outputfile.write(str('>' + 'reconstructed genomic sequence' + '\n'))
    outputfile.write(str(seq[0] + '\n'))

Total number of reads in file is 127..

Assembled DNA sequence..
CCCTGTCTACCACCCAGACTATCGTGTAGTTCTGCCTGTTCCGTAAGTCGTAGATTGCTATCCTGGAAATCATCGTGCTCAGGATGTTAATATCTAGCGTCCTACGTTACGAGTTGGCAGATGACAGATCGTAGTCGTGGTAAGGGGCATTGCCGCTTGTGACCCAGTTCGCGTGCCTAGCAGCACTCCAAAATAAAGTTTACAGTACCGTCCGGACGGCAGAACTGTCCTCTAGATCGTCCTAACGCCTTAGTCGAATCCCTTGCCGTCGGTAACCACTGAATAAACTACGCGTTAGGACTTTGTCAGACGCGAGGAGCTAGTAGGAGGACAAATCAGCAAACGACCCTGAATTGAACAATGTGAGTAGGTATAACTGTGCTTGTATGACGTCCCGTTCGGTCGTTCTTGAGCAACTTCGGCCAGTGCATGCTATGGGGGAAGCTATGAATTCTATGTTGGAACTTGGGCCCGGCATAGTAGTTTATGCCTGTGGACCGGTGTTGAGTGTATCTGCTGGACCCCGGCGCGTTCACCTGTCCACATCTAATCCAAACATATACTATTGGTATTTGAGCGTCTCACAACGACATCGACTGGTATTAGACACCTACCAGGAACAACCAATCGGTTTAGATGACGCACAGCCACGGACAGCCTCTGTTGCTTGAGCAGTCCCAAAGTGCGTACCTGAAGCCTGCCAAAACGTAGCCTAGGCAAATGCCCGTCGTCTTGCTCATAACTCCTTGGGACTGGCGTATCCATAAATAATCCATTCGATTCCTTGAGAGTTCCACATTAGAGACTTATCCATCGAGGATCAGGCCAAATCCGCGAGACCCGACCGAGATCAAGTATAACTCATTACGCGTGGTGTGGTTGCGGCCCACCCTTATCGTGAGCCAGTTGTTGGATATACCCCTGGGCGGGCCTAA

#### Testing

In [156]:
# Sanity-check the assembled sequence against the original reads
from pydna.common_sub_strings import common_sub_strings

# For every read, count the common substrings it shares with the assembled
# sequence (limit=99). Computed once and reused by both checks below.
# NOTE(review): treating a match as "the read is contained" presumably relies
# on the read length relative to limit=99 - confirm against the input data.
match_counts = [
    len(common_sub_strings(read, seq[0], limit=99))
    for read in seqlist
]

# Reads with no match in the assembled sequence
outliers = [idx for idx, count in enumerate(match_counts) if count == 0]
print('Number of reads that are NOT part of assembled sequence {}..'.format(len(outliers)))

# Reads with more than one match in the assembled sequence
outliers = [idx for idx, count in enumerate(match_counts) if count > 1]
print('Number of reads that are repeated in assembled sequence {}..'.format(len(outliers)))

Number of reads that are NOT part of assembled sequence 0..
Number of reads that are repeated in assembled sequence 0..
