In [2]:
# Standard genetic code: DNA codon -> one-letter amino acid.
# All 64 codons are listed; '_' marks the three stop codons (TAA, TAG, TGA).
# Each whitespace-separated token below is one "CODON:AA" entry.
codon_to_aa = dict(
    entry.split(':')
    for entry in '''
        ATA:I ATC:I ATT:I ATG:M
        ACA:T ACC:T ACG:T ACT:T
        AAC:N AAT:N AAA:K AAG:K
        AGC:S AGT:S AGA:R AGG:R
        CTA:L CTC:L CTG:L CTT:L
        CCA:P CCC:P CCG:P CCT:P
        CAC:H CAT:H CAA:Q CAG:Q
        CGA:R CGC:R CGG:R CGT:R
        GTA:V GTC:V GTG:V GTT:V
        GCA:A GCC:A GCG:A GCT:A
        GAC:D GAT:D GAA:E GAG:E
        GGA:G GGC:G GGG:G GGT:G
        TCA:S TCC:S TCG:S TCT:S
        TTC:F TTT:F TTA:L TTG:L
        TAC:Y TAT:Y TAA:_ TAG:_
        TGC:C TGT:C TGA:_ TGG:W
    '''.split()
)

In [3]:
import glob
import re
import pandas as pd
from collections import Counter

pattern = 'results/*Delta*/build_site_fubar_table/surface_glycoprotein.fubar.tsv'


def parse_run_labels(path):
    """Return the two labels encoded in a result path as ``(group_label, date_label)``.

    The path is split on path separators and underscores; the second token is
    the group label and the third is the date label, so
    ``results/Ja_2021-08_.../...`` gives ``('Ja', '2021-08')``.  Backslashes
    count as separators as well, so paths returned by ``glob`` on Windows are
    parsed the same way as POSIX paths.
    """
    parts = re.split(r'[/\\_]', path)
    return parts[1], parts[2]


def summarise_fubar_table(df, date_label, group_label, codon_table=None):
    """Reduce one per-site FUBAR table to the four columns used downstream.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with the columns ``Position_in_reference``, ``Ref_codons``,
        ``Prob[alpha<beta]`` and any number of ``Seq_*`` columns.
    date_label, group_label : str
        Labels embedded in the two run-specific output column names.
    codon_table : dict, optional
        Codon -> amino-acid mapping; defaults to the notebook-level
        ``codon_to_aa``.  Codons missing from the mapping become ``'X'``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``Position_in_reference`` (first occurrence kept)
        with the columns ``Position_in_reference``, ``Ref_amino_acid``,
        ``{date_label}_Prob[alpha<beta]_{group_label}`` and
        ``{date_label}_amino_counts_{group_label}``.
    """
    if codon_table is None:
        codon_table = codon_to_aa  # defined in an earlier cell

    prob_col = f'{date_label}_Prob[alpha<beta]_{group_label}'
    counts_col = f'{date_label}_amino_counts_{group_label}'

    sites = (
        df.drop_duplicates(subset='Position_in_reference', keep='first')
          .reset_index(drop=True)
    )

    # One dict per site: {value: number of Seq_* columns holding that value}.
    # NOTE: despite the "amino_counts" column name, the Seq_* values are counted
    # as-is (untranslated codons); the name is kept because it is part of the
    # output schema.
    seq_columns = [col for col in sites.columns if col.startswith('Seq_')]
    codon_counts = sites[seq_columns].apply(
        lambda row: row.value_counts().to_dict(), axis=1
    )
    ref_amino_acid = sites['Ref_codons'].apply(
        lambda codon: codon_table.get(codon, 'X')
    )

    return (
        sites
        .rename(columns={'Prob[alpha<beta]': prob_col})
        .assign(**{counts_col: codon_counts, 'Ref_amino_acid': ref_amino_acid})
        [['Position_in_reference', 'Ref_amino_acid', prob_col, counts_col]]
    )


# One summarised table per matching result file, in sorted path order.
df_list = []
for path in sorted(glob.glob(pattern)):
    group_label, date_label = parse_run_labels(path)
    df_list.append(
        summarise_fubar_table(pd.read_csv(path, sep='\t'), date_label, group_label)
    )

In [4]:
# Join the per-run tables side by side on the reference position and amino acid.
if not df_list:
    # Without this guard df_list[0] fails with a bare IndexError that gives no
    # hint that no input tables were loaded.
    raise ValueError(
        'df_list is empty: no FUBAR tables were loaded, so there is nothing to merge'
    )

merged_df = df_list[0]
for site_table in df_list[1:]:
    # Default inner join: a position missing from any one table is dropped.
    # validate='one_to_one' raises pandas.errors.MergeError if a key pair is
    # repeated on either side, instead of silently multiplying rows.
    merged_df = merged_df.merge(
        site_table,
        on=['Position_in_reference', 'Ref_amino_acid'],
        validate='one_to_one',
    )
merged_df

Unnamed: 0,Position_in_reference,Ref_amino_acid,2021-08_Prob[alpha<beta]_Ja,2021-08_amino_counts_Ja,2021-09_Prob[alpha<beta]_Ja,2021-09_amino_counts_Ja,2021-10_Prob[alpha<beta]_Ja,2021-10_amino_counts_Ja,2021-11_Prob[alpha<beta]_Ja,2021-11_amino_counts_Ja,...,2021-11_Prob[alpha<beta]_Nein,2021-11_amino_counts_Nein,2021-12_Prob[alpha<beta]_Nein,2021-12_amino_counts_Nein,2022-01_Prob[alpha<beta]_Nein,2022-01_amino_counts_Nein,2022-02_Prob[alpha<beta]_Nein,2022-02_amino_counts_Nein,2022-03_Prob[alpha<beta]_Nein,2022-03_amino_counts_Nein
0,1,M,0.314286,{'ATG': 748},0.204489,{'ATG': 1478},0.197864,{'ATG': 1924},0.155420,{'ATG': 2626},...,0.168149,"{'ATG': 2560, '---': 1}",0.151949,{'ATG': 2670},0.307846,{'ATG': 439},0.440111,{'ATG': 12},0.463979,{'ATG': 4}
1,2,F,0.355880,{'TTT': 748},0.258370,{'TTT': 1478},0.510612,"{'TTT': 1923, 'TTA': 1}",0.231750,{'TTT': 2626},...,0.506214,"{'TTT': 2559, 'TTA': 1, '---': 1}",0.480591,"{'TTT': 2669, 'TTA': 1}",0.345312,{'TTT': 439},0.444622,{'TTT': 12},0.464341,{'TTT': 4}
2,3,V,0.606076,"{'GTT': 747, 'GGT': 1}",0.536248,"{'GTT': 1477, 'ATT': 1}",0.663954,"{'GTT': 1922, 'GGT': 1, 'GCT': 1}",0.247219,"{'GTT': 2615, 'GNT': 10, 'GNN': 1}",...,0.666692,"{'GTT': 2545, 'GNT': 12, 'GCT': 3, '---': 1}",0.509978,"{'GTT': 2668, 'GCT': 2}",0.357277,{'GTT': 439},0.451105,{'GTT': 12},0.466912,{'GTT': 4}
3,4,F,0.355880,{'TTT': 748},0.258370,{'TTT': 1478},0.263992,{'TTT': 1924},0.231750,"{'TTT': 2624, 'TYT': 1, 'NTT': 1}",...,0.243780,"{'TTT': 2560, '---': 1}",0.216878,{'TTT': 2670},0.345312,{'TTT': 439},0.444622,{'TTT': 12},0.464341,{'TTT': 4}
4,5,L,0.910864,"{'CTT': 734, 'TTT': 14}",0.997586,"{'CTT': 1450, 'TTT': 27, 'ATT': 1}",0.706331,"{'CTT': 1888, 'TTT': 35, 'CTC': 1}",0.978824,"{'CTT': 2556, 'TTT': 57, 'YTT': 9, 'NTT': 3, '...",...,0.990316,"{'CTT': 2502, 'TTT': 49, 'YTT': 7, 'NTT': 2, '...",0.999533,"{'CTT': 2604, 'TTT': 54, 'YTT': 9, 'NTT': 3}",0.992205,"{'CTT': 425, 'TTT': 14}",0.447259,{'CTT': 12},0.462241,{'CTT': 4}
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1268,1269,K,0.369541,{'AAA': 748},0.280843,{'AAA': 1478},0.087295,"{'AAA': 1923, 'AAG': 1}",0.250739,"{'AAA': 2625, 'AAR': 1}",...,0.014343,"{'AAA': 2558, 'AAG': 2, 'NAN': 1}",0.242571,{'AAA': 2670},0.356288,{'AAA': 439},0.450005,{'AAA': 12},0.468169,{'AAA': 4}
1269,1270,L,0.385898,{'TTA': 748},0.303403,{'TTA': 1478},0.314618,{'TTA': 1924},0.294901,"{'TTA': 2625, 'TNA': 1}",...,0.305756,"{'TTA': 2560, 'TNA': 1}",0.274641,{'TTA': 2670},0.120750,"{'TTA': 438, 'CTA': 1}",0.451075,{'TTA': 12},0.467201,{'TTA': 4}
1270,1271,H,0.364519,{'CAT': 748},0.271921,{'CAT': 1478},0.272554,{'CAT': 1924},0.243470,"{'CAT': 2625, 'CAN': 1}",...,0.255262,"{'CAT': 2560, 'CNN': 1}",0.081604,"{'CAT': 2669, 'CAC': 1}",0.353693,{'CAT': 439},0.448553,{'CAT': 12},0.466018,{'CAT': 4}
1271,1272,Y,0.404772,{'TAC': 748},0.322692,{'TAC': 1478},0.343141,{'TAC': 1924},0.058030,"{'TAC': 2621, 'TAT': 3, 'TRC': 1, 'NNC': 1}",...,0.061082,"{'TAC': 2557, 'TAT': 2, 'TRC': 1, 'NNC': 1}",0.307998,"{'TAC': 2656, 'TRC': 9, 'TRY': 5}",0.395000,{'TAC': 439},0.460426,{'TAC': 12},0.473633,{'TAC': 4}


In [5]:
# Group the Prob[alpha<beta] columns by everything before their last
# underscore, so columns that differ only in the trailing label end up together.
prob_pairs = {}
for name in merged_df.columns:
    if '[alpha<beta]' not in name:
        continue
    prefix = name.rsplit('_', 1)[0]
    if prefix not in prob_pairs:
        prob_pairs[prefix] = []
    prob_pairs[prefix].append(name)

# Absolute difference between the two columns of each group; groups that do
# not contain exactly two columns are left out.
diff_df = pd.DataFrame({
    f'{prefix}_diff': (merged_df[names[0]] - merged_df[names[1]]).abs()
    for prefix, names in prob_pairs.items()
    if len(names) == 2
})

# Largest pairwise difference per row, stored on the merged table.
merged_df['[alpha<beta]_max_diff'] = diff_df.max(axis=1)

In [6]:
# Write the merged table as a tab-separated file without the row index.
# Cells that hold dicts (the *_amino_counts_* columns) are written as their string form.
merged_df.to_csv(
    path_or_buf='spike_fubar.tsv',
    index=False,
    sep='\t',
)