In [1]:
import pandas as pd
from __future__ import annotations
from pathlib import Path
from itertools import chain

import Bio
from Bio import AlignIO
from Bio.Align import MultipleSeqAlignment
from Bio.Align.AlignInfo import SummaryInfo
from Bio.Align import AlignInfo

import warnings
warnings.filterwarnings("ignore")
import numpy as np
from Bio.motifs import Motif
import glob 
import itertools
import matplotlib.pyplot as plt

from scipy import stats


Plan:
1. Calculate percent identity of each position in full TF alignments
2. Calculate percent identity of each position in all AD alignments
3. Compare distributions

In [2]:
# Glob pattern matching every AD alignment file (each is read as FASTA below).
AD_ALIGNMENT_GLOB = "../data/zoonomia_toga_mca/all_AD_alignments/*"
# Symbols tallied at each alignment column: the 20 standard amino acids plus "X".
AA_ALPHABET = "ACDEFGHIKLMNPQRSTVWYX"

# glob returns matches in arbitrary order, so list the directory once and sort
# it: this keeps the order of counts_dfs (and of anything concatenated from it)
# identical from run to run.
AD_alignment_paths = sorted(glob.glob(AD_ALIGNMENT_GLOB))
print(str(len(AD_alignment_paths)) + " alignments")

# One count table per AD alignment: rows are alphabet symbols, columns are
# alignment positions.
counts_dfs = []

for i, AD_alignment in enumerate(AD_alignment_paths):
    # Progress report every 25 alignments
    if i % 25 == 0:
        print(i)

    # Get msa
    msa = AlignIO.read(AD_alignment, "fasta")

    # Count each alphabet symbol at every alignment column
    motif = Motif(AA_ALPHABET, msa.alignment)
    counts = motif.counts

    # Transpose so that symbols are rows and positions are columns
    counts_df = pd.DataFrame(counts).T
    counts_dfs.append(counts_df)

656 alignments
0
25
50
75
100
125
150
175
200
225
250
275
300
325
350
375
400
425
450
475
500
525
550
575
600
625
650


In [25]:
# For each AD separately: average the per-position identity within every
# consensus residue, then stack the per-AD results into one long frame.
# Note that "percent identity" holds fractions in [0, 1], not percentages.
per_AD_frames = []

for ad_counts in counts_dfs:
    # Turn counts into per-position proportions (each column divided by its total)
    position_totals = ad_counts.sum()
    proportions = ad_counts / position_totals

    # Most frequent residue at each position, and its share of the counted residues
    top_residue = proportions.idxmax(axis=0)
    top_share = proportions.max(axis=0)
    position_identity = pd.DataFrame({"AA" : top_residue, "percent identity" : top_share})

    # Mean identity of the positions sharing the same consensus residue
    per_AD_frames.append(position_identity.groupby("AA").mean())

per_AD_mean_perc_ident = pd.concat(per_AD_frames)
per_AD_mean_perc_ident

Unnamed: 0_level_0,percent identity
AA,Unnamed: 1_level_1
A,0.958166
D,0.882064
E,1.000000
F,1.000000
G,0.993652
...,...
S,0.954328
T,0.952937
V,0.946533
W,1.000000


In [30]:
# Median across ADs of the per-AD mean identity for each consensus residue,
# ordered from highest to lowest identity
median_identity_by_aa = per_AD_mean_perc_ident.groupby("AA").median()
median_identity_by_aa.sort_values(by = "percent identity", ascending = False)

Unnamed: 0_level_0,percent identity
AA,Unnamed: 1_level_1
W,1.0
C,0.997717
F,0.997418
K,0.997191
Y,0.997187
R,0.995676
L,0.994508
H,0.993258
Q,0.988776
M,0.987527


In [11]:
# Spot-check a single count table. counts_df is the variable assigned inside the
# alignment loop, so after the loop it holds only the last alignment processed
# (rows: alphabet symbols, columns: alignment positions).
counts_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,153,154,155,156,157,158,159,160,161,162
A,0.0,0.0,1.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
D,464.0,0.0,0.0,0.0,0.0,1.0,26.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,479.0,0.0
E,10.0,0.0,0.0,0.0,0.0,478.0,453.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,479.0,0.0,0.0,0.0,0.0,0.0
F,0.0,0.0,0.0,478.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
G,3.0,0.0,473.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
H,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
I,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
K,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
L,0.0,427.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
# Pool every AD's count table side by side into one wide frame.
# Rows are the 21 alphabet symbols (20 amino acids plus X); columns are the
# alignment positions of all ADs. Column labels restart at 0 for each AD, so
# they are not unique in the pooled frame.
combined_counts_df = pd.concat(counts_dfs, axis = "columns")

# Convert counts to proportions: divide each position (column) by its own total
position_totals = combined_counts_df.sum(axis = "index")
combined_percents_df = combined_counts_df.div(position_totals, axis = "columns")
combined_percents_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,153,154,155,156,157,158,159,160,161,162
A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.995772,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
D,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
G,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
H,0.997886,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
I,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
K,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
L,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# For every pooled position: the most frequent residue ("AA") and the
# proportion of counted residues that match it ("percent identity", in [0, 1])
consensus_aa = combined_percents_df.idxmax(axis="index")
consensus_share = combined_percents_df.max(axis="index")
percent_identities = pd.DataFrame({"AA" : consensus_aa, "percent identity" : consensus_share})
percent_identities

Unnamed: 0,AA,percent identity
0,H,0.997886
1,M,1.000000
2,M,1.000000
3,Y,1.000000
4,P,0.993658
...,...,...
158,P,1.000000
159,S,1.000000
160,S,1.000000
161,D,1.000000
