In [25]:
from Bio import SeqIO
from Bio import Align
import Levenshtein

In [5]:
# Pairwise aligner reused by the comparison cells in this notebook.
# Only the mode and the match/mismatch scores are set explicitly here; every
# other scoring parameter (including gap scores) keeps the library default.
aligner = Align.PairwiseAligner()
aligner.mode = 'global'
aligner.match_score = 2
aligner.mismatch_score = -1

In [21]:
def _first_fasta_seq(path):
    """Return the sequence of the first record in the FASTA file at ``path``.

    Raises:
        IndexError: if the file contains no FASTA records (same exception type
            the previous ``list(...)[0]`` lookup produced).
    """
    # Only the first record is needed, so pull a single item from the parser
    # instead of materialising every record; the explicit ``with`` guarantees
    # the handle is closed even though the parser is not exhausted.
    with open(path) as handle:
        record = next(SeqIO.parse(handle, "fasta"), None)
    if record is None:
        raise IndexError(f"no FASTA records found in {path!r}")
    return record.seq


def get_data(i, result_file):
    """Load the bait, library and consensus sequences for oligo ``i``.

    Args:
        i: Oligo number; selects ``../oligo/data/fastas/oligo_{i}.fasta``.
        result_file: Path to the consensus FASTA file to evaluate.

    Returns:
        A ``(true_seq, lib_seq, cons_seq)`` tuple. Each element is the sequence
        of the first record of its file; ``cons_seq`` is the complement of the
        sequence read from ``result_file``.

    Raises:
        IndexError: if any of the three files contains no FASTA records.
    """
    true_seq = _first_fasta_seq(f'../oligo/data/fastas/oligo_{i}.fasta')
    lib_seq = _first_fasta_seq('../oligo/data/fastas/prey.fasta')
    cons_seq = _first_fasta_seq(result_file).complement()

    # Attach human-readable ids; the printed alignments in this notebook show
    # (truncated) versions of these strings as their row labels.
    true_seq.id = 'bait sequence'
    cons_seq.id = 'consensus sequence'
    lib_seq.id = 'library sequence'
    return true_seq, lib_seq, cons_seq

In [16]:
# Input paths for the single-oligo check: the oligo_3 FASTA, the prey FASTA,
# and the thresholded (fdr1e-10, 4log2fc) consensus produced for oligo_3.
true_file, lib_file = (
    '../oligo/data/fastas/oligo_3.fasta',
    '../oligo/data/fastas/prey.fasta',
)
consensus_file = (
    '../oligo/results_pruned/thresholded/'
    'oligo_3_10mer_graph_fdr1e-10_4log2fc.consensus.fasta'
)

In [17]:
# Take the sequence of the first record in each FASTA file; the consensus is
# complemented before it is compared against the other two.
true_seq = list(SeqIO.parse(true_file, "fasta"))[0].seq
lib_seq = list(SeqIO.parse(lib_file, "fasta"))[0].seq
cons_seq = list(SeqIO.parse(consensus_file, "fasta"))[0].seq.complement()

# Label the sequences in this cell. The alignment printout recorded for the
# next cell shows these ids as row labels, so they must be set here rather
# than relying on state left behind by a cell that no longer exists.
true_seq.id = 'bait sequence'
cons_seq.id = 'consensus sequence'
lib_seq.id = 'library sequence'

In [19]:
# Align the consensus against the bait first and then against the library,
# printing the first alignment returned for each pairing.
for reference in (true_seq, lib_seq):
    alignment = aligner.align(reference, cons_seq)
    print(alignment[0])

bait sequ         0 AG--AACTTACAT-CAACTAAAC--AACAAATGAACAAAAAAAAAA 41
                  0 |---||||-|-|--||||-|||---|||||||||||||||||||-- 46
consensus         0 A-TCAACT-A-A-ACAAC-AAA-TGAACAAATGAACAAAAAAAA-- 38

library s         0 TATTGCGATAGCTGAGAGAGAAGACGCGAGGGNNNNNNNNNNNNNNNGCGAAAACAAA--
                  0 -||--|-|-|-||-|-|-|-----|---|---------------------|---||||--
consensus         0 -AT--C-A-A-CT-A-A-A-----C---A---------------------A---CAAATG

library s        58 AAACAAAAATAAGAATCCAAGCAGCAGCAACA- 90
                 60 ||-||||--|--|||-|-||--|--|--||-|- 93
consensus        19 AA-CAAA--T--GAA-C-AA--A--A--AA-AA 38



In [30]:
# For oligos 1-3, compare the 4log2fc consensus against the bait sequence and
# against the library sequence.
for i in range(1, 4):
    print(f'----- OLIGO #{i} RESULTS  | log2 FC = 4 -----')
    cons_file = f'../oligo/results_pruned/thresholded/oligo_{i}_10mer_graph_fdr1e-10_4log2fc.consensus.fasta'
    true_seq, lib_seq, cons_seq = get_data(i, cons_file)

    # Bait vs consensus: first alignment returned plus a string-similarity score.
    alignment = aligner.align(true_seq, cons_seq)
    print(alignment[0])
    # Levenshtein.ratio is a normalised similarity (higher means more alike),
    # not an edit distance, so the printed label says "ratio".
    score = Levenshtein.ratio(str(true_seq), str(cons_seq))
    print(f'Levenshtein ratio = {score}\n')

    # Library vs consensus: alignment only.
    alignment = aligner.align(lib_seq, cons_seq)
    print(alignment[0])
    print()

----- OLIGO #1 RESULTS  | log2 FC = 4 -----
bait sequ         0 AAGACCTACCTCA-C-ATGGCCAA-CAC-TC--GGACAAAAAAAAAA- 41
                  0 ------|---|||-|-||--|||--|-|-||--|-|||---------- 48
consensus         0 ------T---TCATCCAT--CCA-GC-CATCCAG-ACA---------C 25

Levenshtein distance = 0.5454545454545454

library s         0 TATTGCGATAGCTG-AGAGAGAAGA-CGCGAGGGNNNNNNNNNNNNNNNGCGAAAACAAA
                  0 |-|--|-||--|---|----------|-|-||------------------|-----||--
consensus         0 T-T--C-AT--C--CA---------TC-C-AG------------------C-----CA--

library s        58 AAACAAAAATAAGAATCCAAGCAGCAGCAACA 90
                 60 ---------|------|||-|-|-||-|---- 92
consensus        16 ---------T------CCA-G-A-CA-C---- 25


----- OLIGO #2 RESULTS  | log2 FC = 4 -----
bait sequ         0 GACCACCAGC-AGCAGCCAGCCGACGCAGGGACAAAAAAAAAA 42
                  0 --|-----|--|-|--|-|-||-|------------------- 43
consensus         0 --C-----G-TA-C--C-A-CC-A------------------- 10

Levenshtein distance = 0.34615384615

In [54]:
def _report_pair(target, query, label):
    """Print the first alignment of ``target`` vs ``query`` and their Levenshtein ratio.

    Args:
        target: First sequence passed to the aligner.
        query: Second sequence passed to the aligner.
        label: Text shown in parentheses next to the score, naming the pair.
    """
    print(aligner.align(target, query)[0])
    # Levenshtein.ratio is a normalised similarity (higher means more alike),
    # not an edit distance, so the printed label says "ratio".
    score = Levenshtein.ratio(str(target), str(query))
    print(f'\t➡ Levenshtein ratio ({label}) = {score}')
    print('--------------------------------------------------------------------------------')


# For oligos 1-3, compare the 3log2fc consensus, the bait, the library and the
# complemented library against one another.
for i in range(1, 4):
    print('===============================================================================')
    print(f'OLIGO #{i} RESULTS  | log2 FC = 3')
    print('===============================================================================')
    cons_file = f'../oligo/results_pruned/thresholded/oligo_{i}_10mer_graph_fdr1e-10_3log2fc.consensus.fasta'
    true_seq, lib_seq, cons_seq = get_data(i, cons_file)

    _report_pair(true_seq, cons_seq, 'truth | consensus')
    _report_pair(lib_seq, cons_seq, 'library | consensus')
    _report_pair(lib_seq, true_seq, 'library | truth')

    # Also compare the bait against the complement of the library sequence.
    lib_seq_c = lib_seq.complement()
    lib_seq_c.id = 'lib_comp'
    _report_pair(lib_seq_c, true_seq, 'library_comp | truth')
    print()

OLIGO #1 RESULTS  | log2 FC = 3
bait sequ         0 AAGACC-T--A-CC-TC-A-C-ATGGCCA-ACAC----TC--GGACAAAAAAAAAA---
                  0 --|----|--|-||-||-|-|-||--|||-||||----||--|--|-------------
consensus         0 --G---GTTCATCCATCCAGCCAT--CCAGACACCCCATCCCG--C----------TTT

bait sequ        41
                 59
consensus        40

	➡ Levenshtein distance (truth | consensus) = 0.5432098765432098
--------------------------------------------------------------------------------
library s         0 --TATTGCGATAGCTG-AGAGAGAAGA-CGCGAGGGNNNNNNNNNNNNNNNGCGAAAACA
                  0 --|-|--|-||--|---|----------|-|-||------------------|-----||
consensus         0 GGT-T--C-AT--C--CA---------TC-C-AG------------------C-----CA

library s        56 AAAAA-C-AAAAATAAGA-ATCCAAGCAGCAG-CAACA------  90
                 60 ------|-|-------||-|-||---|--||--|--|------- 104
consensus        18 -----TCCA-------GACA-CC---C--CA-TC--C-CGCTTT  40

	➡ Levenshtein distance (library | consensus) = 0.4
----------------

In [36]:
# For oligos 1-3, compare the FDR-only consensus (no log2FC suffix in the file
# name) against the bait sequence and against the library sequence.
for i in range(1, 4):
    print(f'----- OLIGO #{i} RESULTS  | FDR ONLY -----')
    cons_file = f'../oligo/results_pruned/thresholded/oligo_{i}_10mer_graph_fdr1e-10.consensus.fasta'
    true_seq, lib_seq, cons_seq = get_data(i, cons_file)

    # Bait vs consensus: first alignment returned plus a string-similarity score.
    alignment = aligner.align(true_seq, cons_seq)
    print(alignment[0])
    # Levenshtein.ratio is a normalised similarity (higher means more alike),
    # not an edit distance, so the printed label says "ratio".
    score = Levenshtein.ratio(str(true_seq), str(cons_seq))
    print(f'Levenshtein ratio = {score}\n')

    # Library vs consensus: alignment only.
    alignment = aligner.align(lib_seq, cons_seq)
    print(alignment[0])
    print()

----- OLIGO #1 RESULTS  | FDR ONLY -----
bait sequ         0 -------AAGACC--T--A-CC-TC-A-C-ATGGCCA-ACAC----TC--GGACAAAAAA
                  0 -------|---||--|--|-||-||-|-|-||--|||-||||----||--|--|------
consensus         0 TCTCCCCA---CCGGTTCATCCATCCAGCCAT--CCAGACACCCCATCCCG--C------

bait sequ        37 AAAA----- 41
                 60 --------- 69
consensus        47 ----TTTTG 52

Levenshtein distance = 0.5161290322580645

library s         0 TATTGCGATAGCTG---AGAGAGAAGACGCGAGGGNNNNNNNNNNNNNNNG--CGAAAA-
                  0 |----|--|--|-----|---------|-||-|--------------------|-|----
consensus         0 T----C--T--C--CCCA---------C-CG-G------------------TTC-A---T

library s        54 CAAAAAACAAAAAT--AAG--AATCCAAGCAGCAGCAAC--A-----------  90
                 60 |------||----|--|-|--|-||||-|-|-||-|--|--|----------- 113
consensus        17 C------CA----TCCA-GCCA-TCCA-G-A-CA-C--CCCATCCCGCTTTTG  52


----- OLIGO #2 RESULTS  | FDR ONLY -----
bait sequ         0 GACCACCAGC-AGCAGCCAGCCGACGCAGGG

In [26]:
# Exploratory introspection: list the attribute names available on the object
# currently bound to true_seq (whichever cell assigned it last).
dir(true_seq)

['__abstractmethods__',
 '__add__',
 '__array_ufunc__',
 '__bytes__',
 '__class__',
 '__contains__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__imul__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__mul__',
 '__ne__',
 '__new__',
 '__radd__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__rmul__',
 '__setattr__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_data',
 'back_transcribe',
 'complement',
 'complement_rna',
 'count',
 'count_overlap',
 'defined',
 'defined_ranges',
 'endswith',
 'find',
 'id',
 'index',
 'islower',
 'isupper',
 'join',
 'lower',
 'lstrip',
 'replace',
 'reverse_complement',
 'reverse_complement_rna',
 'rfind',
 'rindex',
 'rsplit',
 'rstrip',
 'split',
 'startswith',
 'strip',
 'transcribe',
 'translate',
 'ungap',
 'upper']

In [28]:
# Show the current true_seq as a plain string; this is the form passed to
# Levenshtein.ratio in the comparison cells.
str(true_seq)

'AGAACTTACATCAACTAAACAACAAATGAACAAAAAAAAAA'