In [None]:
import cffi
import os
import numpy as np
import pandas as pd
from io import StringIO
from enum import Enum
from Bio import SeqIO
from pathlib import Path
from auxiliary import DATA_SEQ_DIR
from pytrsomix import TRScalculator, TRSanalyzer, AlignmentAnalyzer

# Reading in the genomes and calculating the interiors accordinf the trs.txt file

In [12]:
trs_file = (DATA_SEQ_DIR/"klebsiella"/"trs.txt").absolute().as_posix().encode()
trs1 = TRScalculator(sequence=(DATA_SEQ_DIR/"klebsiella"/"Kp1.fasta").absolute().as_posix().encode(), trs=trs_file, tmin=2000, tmax=3000)
trs1.calculate()

trs2 = TRScalculator(sequence=(DATA_SEQ_DIR/"klebsiella"/"Kp2.fasta").absolute().as_posix().encode(), trs=trs_file, tmin=2000, tmax=3000)
trs2.calculate()

trs3 = TRScalculator(sequence=(DATA_SEQ_DIR/"klebsiella"/"Kp3.fasta").absolute().as_posix().encode(), trs=trs_file, tmin=2000, tmax=3000)
trs3.calculate()

trs4 = TRScalculator(sequence=(DATA_SEQ_DIR/"klebsiella"/"Kp4.fasta").absolute().as_posix().encode(), trs=trs_file, tmin=2000, tmax=3000)
trs4.calculate()

trs5 = TRScalculator(sequence=(DATA_SEQ_DIR/"klebsiella"/"Kp5.fasta").absolute().as_posix().encode(), trs=trs_file, tmin=2000, tmax=3000)
trs5.calculate()

trs6 = TRScalculator(sequence=(DATA_SEQ_DIR/"klebsiella"/"Kp6.fasta").absolute().as_posix().encode(), trs=trs_file, tmin=2000, tmax=3000)
trs6.calculate()

trs7 = TRScalculator(sequence=(DATA_SEQ_DIR/"klebsiella"/"Kp7.fasta").absolute().as_posix().encode(), trs=trs_file, tmin=2000, tmax=3000)
trs7.calculate()

# Instantiating the SequenceAnalyzer object

In [3]:
sa = TRSanalyzer.SeqAnalyzer([trs1.Result, trs2.Result, trs3.Result, trs4.Result, trs5.Result, trs6.Result, trs7.Result])
sa.Combined

Unnamed: 0,L-NoClass,L-No,LFS,Len(LFS),L-POS(LFS),R-POS(LFS),R-NoClass,R-No,RFS,Len(RFS),L-POS(RFS),R-POS(RFS),>SEQ,Len(SEQ),GENOME
0,9,26,GCAGCAGCA,9,8016,8024,10,30,TGCTGCTGCTGC,12,10085,10096,>CCAGGGAAATACCATTGAAATTCGTTACACCAGCCATGAGCAGTT...,2060,NC_016845.1
1,2,6,GCGGCGGCG,9,12122,12130,17,51,CATCATCAT,9,14641,14649,>AAAATCGCCCATCACCATCACACCACCGTGGTGCTGAACCCCGCG...,2510,NC_016845.1
2,10,28,GCTGCTGCT,9,71705,71713,1,2,CGCCGCCGC,9,74200,74208,>CCACCAGCGCGCGCAGCTGATAAAGCGTGTTAAACGGCAAAAACT...,2486,NC_016845.1
3,13,38,AGAAGAAGA,9,74454,74462,18,54,TGATGATGA,9,76810,76818,>ACTGGAGATACCACATCAGACCGCCGAGGGCGGAGAGCAGAAGGT...,2347,NC_016845.1
4,10,29,CTGCTGCTG,9,89561,89569,1,3,GCCGCCGCC,9,92179,92187,>CTTCCGGGAGCGTTTCAGACCGTTAAGGTCGACAACCTGCCCCTG...,2609,NC_016845.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3103,1,2,CGCCGCCGC,9,5183266,5183274,10,30,TGCTGCTGC,9,5185855,5185863,>AAAATCTTCATCGCTGATTTCGCCGCGCTCGTGCTGGTGAAACGC...,2580,NZ_CP077823.1
3104,4,11,GTGGTGGTG,9,5193575,5193583,8,22,CGTCGTCGT,9,5195591,5195599,>CTTTAGCGGGGTGAAAGAGCGCTATATTGAAATCGTACCGGATCA...,2007,NZ_CP077823.1
3105,18,52,GATGATGAT,9,5219428,5219436,1,2,CGCCGCCGC,9,5221948,5221956,>GCCGGTGGCGAGGCTGGCGCTGCTGCGCGCCTATCCCCGCAGCGA...,2511,NZ_CP077823.1
3106,9,27,CAGCAGCAG,9,5242506,5242514,18,53,ATGATGATG,9,5245178,5245186,>CTGGGCGGCGGTATCGTACGTACCATCGCCATGGGTTCTTCTGAT...,2663,NZ_CP077823.1


## Unique gnomes in the table

In [5]:
sa.Combined["GENOME"].unique()

array(['NC_016845.1', 'NZ_CP084876.1', 'NZ_CP084765.1', 'NZ_CP084787.1',
       'NZ_CP113789.1', 'NZ_CP084768.1', 'NZ_CP077823.1'], dtype=object)

## Calculating the Needleman-Wunsch alignment scores with respect to chosen sequence

In [6]:
algns = sa.calculate_all_alignments(0)

## 10 most similar scores
* the first one in the similarity to itself (the highest possible score here...)

In [7]:
aa = AlignmentAnalyzer(algns)
most_similar = aa.get_sorted_scores().sort_values("score", ascending=False)[:10]
least_similar = aa.get_sorted_scores().sort_values("score", ascending=False)[-10:]
most_similar

Unnamed: 0_level_0,score
index,Unnamed: 1_level_1
0,12691
2939,9010
1547,9003
2019,8990
682,8984
1131,8960
205,8959
1605,8919
707,8916
2999,8906


In [12]:
aa.get_sorted_scores().max()

score    12691
dtype: int64

In [9]:

sa.Combined.loc[most_similar.index, :]

Unnamed: 0_level_0,L-NoClass,L-No,LFS,Len(LFS),L-POS(LFS),R-POS(LFS),R-NoClass,R-No,RFS,Len(RFS),L-POS(RFS),R-POS(RFS),>SEQ,Len(SEQ),GENOME
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,9,26,GCAGCAGCA,9,8016,8024,10,30,TGCTGCTGCTGC,12,10085,10096,>CCAGGGAAATACCATTGAAATTCGTTACACCAGCCATGAGCAGTT...,2060,NC_016845.1
2939,1,1,CCGCCGCCG,9,3258622,3258630,10,29,CTGCTGCTG,9,3261618,3261626,>ACGCCCAGGCCTATATGGTCTCTTATCAGCAGGCGATCCATGCCA...,2987,NZ_CP077823.1
1547,1,1,CCGCCGCCG,9,3122019,3122027,10,29,CTGCTGCTG,9,3125015,3125023,>ACGCCCAGGCCTATATGGTCTCTTATCAGCAGGCGATCCACGCCA...,2987,NZ_CP084787.1
2019,1,1,CCGCCGCCG,9,3567402,3567410,10,29,CTGCTGCTG,9,3570398,3570406,>ATGCCCAGGCCTATATGGTCTCTTATCAGCAGGCGATCCACGCCA...,2987,NZ_CP113789.1
682,1,1,CCGCCGCCG,9,3330278,3330286,10,29,CTGCTGCTG,9,3333274,3333282,>ACGCCCAGGCCTATATGGTCTCTTATCAGCAGGCGATCCATGCCA...,2987,NZ_CP084876.1
1131,1,1,CCGCCGCCG,9,3533860,3533868,10,29,CTGCTGCTG,9,3536856,3536864,>ACGCTCAGGCCTATATGGTCTCTTATCAGCAGGCGATCCACGCCA...,2987,NZ_CP084765.1
205,10,28,GCTGCTGCT,9,2479515,2479523,1,3,GCCGCCGCC,9,2482394,2482402,>GGCCGGCCTGCCGGAGAGCGTGCCCGGGAGCACCATCAACCGCCT...,2870,NC_016845.1
1605,10,28,GCTGCTGCT,9,3856031,3856039,9,26,GCAGCAGCA,9,3859021,3859029,>GCGCTACGCCCGCGACCGGGTGAAATATCCTCGCGGCACCCGCCT...,2981,NZ_CP084787.1
707,1,3,GCCGCCGCC,9,3643372,3643380,2,5,GGCGGCGGC,9,3646350,3646358,>CAGTACCAGCGTGCGTTTGCCGGCGCGCTGGCAGGCGGTCACCGC...,2969,NZ_CP084876.1
2999,10,28,GCTGCTGCT,9,4014590,4014598,9,25,AGCAGCAGCAGC,12,4017579,4017590,>GCGCTATGCCCGCGACCGGGTGAAGTATCCTCGCGGTACCCGTCT...,2980,NZ_CP077823.1


In [14]:
most_similar.apply(lambda x: x/aa.get_sorted_scores().max().values[0])

Unnamed: 0_level_0,score
index,Unnamed: 1_level_1
0,1.0
2939,0.709952
1547,0.7094
2019,0.708376
682,0.707903
1131,0.706012
205,0.705933
1605,0.702781
707,0.702545
2999,0.701757


In [10]:
sa.Combined.loc[least_similar.index, :]

Unnamed: 0_level_0,L-NoClass,L-No,LFS,Len(LFS),L-POS(LFS),R-POS(LFS),R-NoClass,R-No,RFS,Len(RFS),L-POS(RFS),R-POS(RFS),>SEQ,Len(SEQ),GENOME
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1934,11,33,CAACAACAA,9,2683747,2683755,12,35,TTGTTGTTG,9,2685936,2685944,>ATTCTGCCACTTCAACTATTGTTGCAGCATCGAGTACTAGCAACA...,2180,NZ_CP113789.1
1051,11,33,CAACAACAA,9,2614889,2614897,12,35,TTGTTGTTG,9,2617078,2617086,>ATTCTGCCACTTCAACTATTGCAGCAACATCGAGTACTAGCAACA...,2180,NZ_CP084765.1
2666,14,41,TTCTTCTTC,9,118346,118354,9,26,GCAGCAGCA,9,120413,120421,>AGAATGTTATAGGTTTCTTTAACGTGCGCTTCGAGGCGGTCCAGT...,2058,NZ_CP077823.1
1473,1,2,CGCCGCCGC,9,2141311,2141319,9,25,AGCAGCAGC,9,2143529,2143537,>CCCCTGTACCCCCGGGCTCCGGCCAGCAACATCGCCACCGCGTGG...,2209,NZ_CP084787.1
2776,13,37,AAGAAGAAG,9,1432941,1432949,20,58,TATTATTAT,9,1435183,1435191,>AGAAAGTGGAATATATGAACAGCCCCTGCTTTTGAAAGCGGGGGA...,2233,NZ_CP077823.1
2930,2,5,GGCGGCGGC,9,3107366,3107374,15,43,TACTACTAC,9,3109522,3109530,>AAAGACGGCGTGCAAATCCTCCCCTTCCAGCAGATCACTGCGAAA...,2147,NZ_CP077823.1
1737,16,46,GTAGTAGTA,9,226939,226947,14,41,TTCTTCTTC,9,229046,229054,>ACAACAGGAATCTCACATGATAAGGCTTCCGCAACAACTATCCCA...,2098,NZ_CP113789.1
1428,20,59,ATTATTATT,9,1661882,1661890,18,54,TGATGATGA,9,1664156,1664164,>TAACGGCATCTAGGCTTGTGCCTTATAAAAGAATGGATTTAATAG...,2265,NZ_CP084787.1
281,18,53,ATGATGATG,9,3561224,3561232,19,57,AATAATAATAAT,12,3563293,3563304,>TCGCCTTTGTCGAGGTAAGGCTTCAGGGAATCGATGGCGCTGTCG...,2060,NC_016845.1
755,20,58,TATTATTAT,9,4233717,4233725,18,52,GATGATGAT,9,4235782,4235790,>TTTTACGTGTGAAAAAATAAATGACTTTCAAAATTTCTTTAAGGG...,2056,NZ_CP084876.1


In [23]:
least_similar.apply(lambda x: x/13501)*100

Unnamed: 0_level_0,score
index,Unnamed: 1_level_1
1248,53.951559
2640,53.73676
2666,53.277535
1473,52.870158
2776,52.44056
2930,52.173913
1737,51.677654
1428,49.922228
281,48.981557
755,46.203985


## 10 most similar sequences

In [16]:
sa.Combined.loc[most_similar.index, :]

Unnamed: 0_level_0,L-NoClass,L-No,LFS,Len(LFS),L-POS(LFS),R-POS(LFS),R-NoClass,R-No,RFS,Len(RFS),L-POS(RFS),R-POS(RFS),>SEQ,Len(SEQ),GENOME
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,9,26,GCAGCAGCA,9,8016,8024,10,30,TGCTGCTGCTGC,12,10085,10096,>CCAGGGAAATACCATTGAAATTCGTTACACCAGCCATGAGCAGTT...,2060,NC_016845.1
2939,1,1,CCGCCGCCG,9,3258622,3258630,10,29,CTGCTGCTG,9,3261618,3261626,>ACGCCCAGGCCTATATGGTCTCTTATCAGCAGGCGATCCATGCCA...,2987,NZ_CP077823.1
1547,1,1,CCGCCGCCG,9,3122019,3122027,10,29,CTGCTGCTG,9,3125015,3125023,>ACGCCCAGGCCTATATGGTCTCTTATCAGCAGGCGATCCACGCCA...,2987,NZ_CP084787.1
2019,1,1,CCGCCGCCG,9,3567402,3567410,10,29,CTGCTGCTG,9,3570398,3570406,>ATGCCCAGGCCTATATGGTCTCTTATCAGCAGGCGATCCACGCCA...,2987,NZ_CP113789.1
682,1,1,CCGCCGCCG,9,3330278,3330286,10,29,CTGCTGCTG,9,3333274,3333282,>ACGCCCAGGCCTATATGGTCTCTTATCAGCAGGCGATCCATGCCA...,2987,NZ_CP084876.1
1131,1,1,CCGCCGCCG,9,3533860,3533868,10,29,CTGCTGCTG,9,3536856,3536864,>ACGCTCAGGCCTATATGGTCTCTTATCAGCAGGCGATCCACGCCA...,2987,NZ_CP084765.1
205,10,28,GCTGCTGCT,9,2479515,2479523,1,3,GCCGCCGCC,9,2482394,2482402,>GGCCGGCCTGCCGGAGAGCGTGCCCGGGAGCACCATCAACCGCCT...,2870,NC_016845.1
1605,10,28,GCTGCTGCT,9,3856031,3856039,9,26,GCAGCAGCA,9,3859021,3859029,>GCGCTACGCCCGCGACCGGGTGAAATATCCTCGCGGCACCCGCCT...,2981,NZ_CP084787.1
707,1,3,GCCGCCGCC,9,3643372,3643380,2,5,GGCGGCGGC,9,3646350,3646358,>CAGTACCAGCGTGCGTTTGCCGGCGCGCTGGCAGGCGGTCACCGC...,2969,NZ_CP084876.1
2999,10,28,GCTGCTGCT,9,4014590,4014598,9,25,AGCAGCAGCAGC,12,4017579,4017590,>GCGCTATGCCCGCGACCGGGTGAAGTATCCTCGCGGTACCCGTCT...,2980,NZ_CP077823.1


In [21]:
#sa.Combined[">SEQ"].unique()

In [20]:
len(sa.Combined[">SEQ"].unique())

916

* podobieństwo sekwencji
* dla genomów avium szukamy najbardziej unikatowej sekwencji dla danego genomu

In [20]:
least_similar_avium = {}
for idx in range(10):
    print("Alignment calculation, %d", idx)
    algns = sa.calculate_all_alignments(idx)
    aa = AlignmentAnalyzer(algns)
    least_similar = aa.get_sorted_scores().sort_values("score", ascending=False)[-10:]
    least_similar_avium[idx] = least_similar

Alignment calculation, %d 0
Alignment calculation, %d 1
Alignment calculation, %d 2
Alignment calculation, %d 3
Alignment calculation, %d 4
Alignment calculation, %d 5
Alignment calculation, %d 6
Alignment calculation, %d 7
Alignment calculation, %d 8
Alignment calculation, %d 9


In [22]:
least_similar_avium

{0:        score
 index       
 1934    6842
 1051    6839
 2666    6761
 1473    6754
 2776    6663
 2930    6650
 1737    6646
 1428    6350
 281     6301
 755     5819,
 1:        score
 index       
 351     7484
 1934    7475
 2640    7425
 2666    7371
 2930    7333
 2776    7228
 1737    7199
 1428    6966
 281     6897
 755     6313,
 2:        score
 index       
 222     7442
 1934    7429
 1051    7419
 2666    7341
 2776    7266
 1737    7249
 2930    7201
 1428    6954
 281     6847
 755     6242,
 3:        score
 index       
 1934    7318
 1622    7287
 351     7276
 2666    7153
 2930    7123
 1737    7069
 2776    7060
 1428    6802
 281     6744
 755     6221,
 4:        score
 index       
 351     7634
 1051    7580
 2666    7544
 1934    7543
 2776    7500
 2930    7442
 1737    7351
 1428    7147
 281     6980
 755     6452,
 5:        score
 index       
 2666    6884
 2640    6882
 351     6860
 2930    6732
 1622    6717
 2776    6669
 1737    6651
 1428    63