In [2]:
import cffi
import os
import numpy as np
import pandas as pd
from io import StringIO
from enum import Enum
from Bio import SeqIO
from pathlib import Path
from auxiliary import DATA_SEQ_DIR
from pytrsomix import TRScalculator, TRSanalyzer, AlignmentAnalyzer

In [3]:
# Thresholds passed to TRScalculator as tmin/tmax for both genomes.
# NOTE(review): presumably the min/max length of the reported interiors — confirm against pytrsomix.
TMIN = 2000
TMAX = 3000


def _ecoli_path(filename):
    """Return the absolute POSIX path of DATA_SEQ_DIR/ecoli/<filename>, encoded as bytes.

    TRScalculator is given byte strings rather than Path objects here
    (presumably because the paths are handed to native code — TODO confirm).
    """
    return (DATA_SEQ_DIR / "ecoli" / filename).absolute().as_posix().encode()


# The same TRS motif file is used for both genomes.
trs_file = _ecoli_path("trs.txt")

# NOTE(review): the run log reports the same output file name ("interiors.txt")
# for both calculations — confirm the first run's results are not lost when the
# second calculation writes its output.
trs1 = TRScalculator(sequence=_ecoli_path("sequence_EColi.fasta"), trs=trs_file, tmin=TMIN, tmax=TMAX)
trs1.calculate()

trs2 = TRScalculator(sequence=_ecoli_path("sequence_UTI89.fasta"), trs=trs_file, tmin=TMIN, tmax=TMAX)
trs2.calculate()


name of genome file: /home/rafalb/molecules/TRS-omix/TRS-omix/data/ecoli/sequence_EColi.fasta
name of input file: /home/rafalb/molecules/TRS-omix/TRS-omix/data/ecoli/trs.txt
name of output file: interiors.txt
tmin: 2000
tmax: 3000
mode: 0

START

size of genome: 5498578
size of input: 9
status after LC_TRSPositionsFindAndSaveToVLt: 1
status after LC_InteriorsFindAndSaveToFile: 1
END
name of genome file: /home/rafalb/molecules/TRS-omix/TRS-omix/data/ecoli/sequence_UTI89.fasta
name of input file: /home/rafalb/molecules/TRS-omix/TRS-omix/data/ecoli/trs.txt
name of output file: interiors.txt
tmin: 2000
tmax: 3000
mode: 0

START

size of genome: 5065741
size of input: 9
status after LC_TRSPositionsFindAndSaveToVLt: 1
status after LC_InteriorsFindAndSaveToFile: 1
END

In [3]:
# Feed the results of both TRS calculations into a single sequence analyzer
# and display its combined table.
per_genome_results = [trs1.Result, trs2.Result]
sa = TRSanalyzer.SeqAnalyzer(per_genome_results)
sa.Combined

Unnamed: 0,L-NoClass,L-No,LFS,Len(LFS),L-POS(LFS),R-POS(LFS),R-NoClass,R-No,RFS,Len(RFS),L-POS(RFS),R-POS(RFS),>SEQ,Len(SEQ),GENOME
0,17,49,ATCATCATC,9,10524,10532,4,10,GGTGGTGGT,9,12765,12773,>GAATGAATGGTGAAATAATTTCCCTGAATAACTGTAGTGTTTTCA...,2232,BA000007.3
1,20,60,TTATTATTA,9,24682,24690,18,54,TGATGATGA,9,26901,26909,>CATCATCTACTATATTTGATTTATAAAGACAATGTGGCGAAGTTG...,2210,BA000007.3
2,18,53,ATGATGATG,9,33584,33592,2,6,GCGGCGGCG,9,35721,35729,>CAAAAGTAACATATTTAATTTATTAATTATAAAGGGCTTTAATTT...,2128,BA000007.3
3,17,50,TCATCATCA,9,53094,53102,9,27,CAGCAGCAG,9,55453,55461,>AAATCGCCATGCTGTGGCTGATTGCCCGACCGTTGCAGGTGCCAA...,2350,BA000007.3
4,4,12,TGGTGGTGG,9,100322,100330,14,41,TTCTTCTTC,9,103243,103251,>GCGATATGGCGGAACTGGGCGCTGAAAGCGAAGCCTGCCATGTAC...,2912,BA000007.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
726,7,19,ACGACGACG,9,5009926,5009934,9,26,GCAGCAGCA,9,5012147,5012155,>GGGGCATCCCGGGTACCGGCTGGCGAGGTTCTGATGGCAATTCTT...,2212,NC_007946.1
727,9,26,GCAGCAGCA,9,5012147,5012155,11,31,AACAACAAC,9,5014697,5014705,>CCCGTGGTGGATGAAAACAGTATCCGTGCGCAGGTACTGGCAGAG...,2541,NC_007946.1
728,11,31,AACAACAAC,9,5014697,5014705,9,27,CAGCAGCAG,9,5017069,5017077,>TGAGGTTATGGCTGACTCGATGAACCGACATACACAAATCCGCCA...,2363,NC_007946.1
729,2,5,GGCGGCGGC,9,5027095,5027103,15,45,CTACTACTA,9,5029400,5029408,>TTCTGCTACTGCATCAGCCAACAGTCAAAAAGCTGCAAAAACCAG...,2296,NC_007946.1


In [4]:
# Compute the alignments from the analyzer; the result is consumed by
# AlignmentAnalyzer in the following cell.
# NOTE(review): the meaning of the argument 0 is not visible from this notebook —
# confirm against the pytrsomix API and consider giving it a named constant.
algns = sa.calculate_all_alignments(0)

In [5]:
# Rank the alignment scores from highest to lowest and keep the first ten rows.
aa = AlignmentAnalyzer(algns)
ranked_scores = aa.get_sorted_scores().sort_values(by="score", ascending=False)
most_similar = ranked_scores.head(10)
most_similar

Unnamed: 0_level_0,score
index,Unnamed: 1_level_1
0,13257
384,12895
544,8856
445,8845
594,8829
320,8799
579,8793
365,8785
364,8781
405,8752


In [6]:
# Show the combined-table rows whose labels match the ten best-scoring entries.
top_rows = most_similar.index
sa.Combined.loc[top_rows]

Unnamed: 0_level_0,L-NoClass,L-No,LFS,Len(LFS),L-POS(LFS),R-POS(LFS),R-NoClass,R-No,RFS,Len(RFS),L-POS(RFS),R-POS(RFS),>SEQ,Len(SEQ),GENOME
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,17,49,ATCATCATC,9,10524,10532,4,10,GGTGGTGGT,9,12765,12773,>GAATGAATGGTGAAATAATTTCCCTGAATAACTGTAGTGTTTTCA...,2232,BA000007.3
384,17,49,ATCATCATC,9,10563,10571,4,10,GGTGGTGGT,9,12799,12807,>GAATGAATGGTGAAATAATTTCCCTGAATAACTGTAGTGTTTTCA...,2227,NC_007946.1
544,14,41,TTCTTCTTC,9,2473099,2473107,2,5,GGCGGCGGC,9,2476065,2476073,>TGGCTCGTCGTCAACGTCCACTTCCGGAGCGATTTCATCGTCCCC...,2957,NC_007946.1
445,7,20,CGACGACGA,9,821364,821372,1,3,GCCGCCGCC,9,824349,824357,>CGCCGGGCTGAGGGGATTACAGCATGTGTTCGGTACGGGCGATGA...,2976,NC_007946.1
594,11,33,CAACAACAA,9,3194790,3194798,17,51,CATCATCAT,9,3197790,3197798,>CGCTCAATCTTTGGTATTCCCGAGCAACAGTTTTACTCACCCGTA...,2991,NC_007946.1
320,1,3,GCCGCCGCC,9,4707803,4707811,1,1,CCGCCGCCG,9,4710796,4710804,>AGATTCGCGTCTTCCACGACGGTATATGGGCTAATTTCGCAATCA...,2984,BA000007.3
579,9,27,CAGCAGCAG,9,3007840,3007848,1,3,GCCGCCGCC,9,3010781,3010789,>AGTGCCAAGACGCCCCAGCGCAAACCACAGTTTGCCCTCTTTGCT...,2932,NC_007946.1
365,3,8,CCACCACCA,9,5173167,5173175,4,12,TGGTGGTGG,9,5176082,5176090,>GGCCGCGCAGCAGGTCGAGCAGGCGGGCCTGCACCGACACATCCA...,2906,BA000007.3
364,14,40,CTTCTTCTT,9,5170207,5170215,3,8,CCACCACCA,9,5173167,5173175,>CTGCCCACGCCAGTAACTTACTGCCGACGTTCAGACCGCGCGCCT...,2951,BA000007.3
405,12,36,TGTTGTTGT,9,256876,256884,13,37,AAGAAGAAG,9,259792,259800,>TGCAGACTCAGCAATCCCTTAATGGCGGCCTGCAACGCCCGTCCG...,2907,NC_007946.1
