In [7]:
# Codon -> one-letter amino-acid lookup for DNA codons (alphabet A/C/G/T).
# Each entry pairs four space-separated codons with the four amino-acid
# letters they translate to, position by position; '_' marks a stop codon.
_CODON_ROWS = (
    ('ATA ATC ATT ATG', 'IIIM'),
    ('ACA ACC ACG ACT', 'TTTT'),
    ('AAC AAT AAA AAG', 'NNKK'),
    ('AGC AGT AGA AGG', 'SSRR'),
    ('CTA CTC CTG CTT', 'LLLL'),
    ('CCA CCC CCG CCT', 'PPPP'),
    ('CAC CAT CAA CAG', 'HHQQ'),
    ('CGA CGC CGG CGT', 'RRRR'),
    ('GTA GTC GTG GTT', 'VVVV'),
    ('GCA GCC GCG GCT', 'AAAA'),
    ('GAC GAT GAA GAG', 'DDEE'),
    ('GGA GGC GGG GGT', 'GGGG'),
    ('TCA TCC TCG TCT', 'SSSS'),
    ('TTC TTT TTA TTG', 'FFLL'),
    ('TAC TAT TAA TAG', 'YY__'),
    ('TGC TGT TGA TGG', 'CC_W'),
)

# Flatten the rows into a single dict, keeping the row order as insertion order.
codon_to_aa = {
    codon: amino_acid
    for codons, amino_acids in _CODON_ROWS
    for codon, amino_acid in zip(codons.split(), amino_acids)
}

In [8]:
import glob
import re
import pandas as pd
from collections import Counter

pattern = 'results/*Delta*/build_site_fubar_table/surface_glycoprotein.fubar.tsv'
date_pattern = re.compile(r'\d{4}-\d{2}')

# NOTE(review): date_pattern, pairs_dict and seq_df_list are not used by this
# cell; they are kept only in case another cell still refers to them.
pairs_dict = {}
seq_df_list = []


def summarise_fubar_table(fubar_df, period, group, codon_table=None):
    """Reduce one FUBAR site table to the four columns used for merging.

    Parameters
    ----------
    fubar_df : pandas.DataFrame
        Table with the columns 'Position_in_reference', 'Ref_codons',
        'Prob[alpha<beta]' and any number of 'Seq_*' codon columns.
        It is not modified.
    period : str
        Prefix for the generated column names (third path token of the
        input file in the loop below).
    group : str
        Suffix for the generated column names (second path token of the
        input file in the loop below).
    codon_table : dict, optional
        Codon -> amino-acid mapping. Defaults to the notebook-level
        ``codon_to_aa`` table.

    Returns
    -------
    pandas.DataFrame
        One row per distinct 'Position_in_reference' (first occurrence kept),
        with the columns 'Position_in_reference', 'Ref_amino_acid',
        '<period>_Prob[alpha<beta]_<group>' and '<period>_amino_counts_<group>'.
        The last column holds, per row, a dict mapping each amino acid seen
        across the 'Seq_*' columns to its count.

    Raises
    ------
    KeyError
        If one of the required columns is missing from ``fubar_df``.
    """
    table = codon_to_aa if codon_table is None else codon_table

    def to_amino_acid(codon):
        # Anything that is not a key of the table (ambiguous codons, gaps,
        # missing values) is reported as 'X'.
        return table.get(codon, 'X')

    prob_col = f'{period}_Prob[alpha<beta]_{group}'
    counts_col = f'{period}_amino_counts_{group}'

    sites = (
        fubar_df
        .drop_duplicates(subset='Position_in_reference', keep='first')
        .reset_index(drop=True)
    )

    seq_columns = [col for col in sites.columns if col.startswith('Seq_')]
    # Column-wise Series.map instead of DataFrame.applymap: applymap is
    # deprecated since pandas 2.1, while this form works on old and new pandas.
    amino_acids = sites[seq_columns].apply(lambda column: column.map(to_amino_acid))
    amino_counts = amino_acids.apply(lambda row: row.value_counts().to_dict(), axis=1)

    return pd.DataFrame({
        'Position_in_reference': sites['Position_in_reference'],
        'Ref_amino_acid': sites['Ref_codons'].map(to_amino_acid),
        prob_col: sites['Prob[alpha<beta]'],
        counts_col: amino_counts,
    })


df_list = []
for path in sorted(glob.glob(pattern)):
    # Splitting on '/' and '_' gives ['results', <group>, <period>, ...].
    # NOTE(review): this assumes the run directory is named
    # '<group>_<period>_...'; confirm against the results layout.
    parts = re.split(r'/|_', path)
    df_list.append(
        summarise_fubar_table(pd.read_csv(path, sep='\t'), period=parts[2], group=parts[1])
    )

In [9]:
# Join the per-file site tables side by side into one wide table keyed on the
# reference position and reference amino acid.
if not df_list:
    # Without this guard an empty list fails below with a bare IndexError.
    raise ValueError('df_list is empty: no FUBAR tables were loaded, so there is nothing to merge')

merged_df = df_list[0]
for site_table in df_list[1:]:
    # how='inner' is the pandas default, spelled out here: a row survives only
    # if its (position, reference amino acid) pair occurs in every table.
    merged_df = merged_df.merge(
        site_table,
        on=['Position_in_reference', 'Ref_amino_acid'],
        how='inner',
    )
merged_df

Unnamed: 0,Position_in_reference,Ref_amino_acid,2021-08_Prob[alpha<beta]_Ja,2021-08_amino_counts_Ja,2021-09_Prob[alpha<beta]_Ja,2021-09_amino_counts_Ja,2021-10_Prob[alpha<beta]_Ja,2021-10_amino_counts_Ja,2021-11_Prob[alpha<beta]_Ja,2021-11_amino_counts_Ja,...,2021-11_Prob[alpha<beta]_Nein,2021-11_amino_counts_Nein,2021-12_Prob[alpha<beta]_Nein,2021-12_amino_counts_Nein,2022-01_Prob[alpha<beta]_Nein,2022-01_amino_counts_Nein,2022-02_Prob[alpha<beta]_Nein,2022-02_amino_counts_Nein,2022-03_Prob[alpha<beta]_Nein,2022-03_amino_counts_Nein
0,1,M,0.314286,{'M': 748},0.204489,{'M': 1478},0.197864,{'M': 1924},0.155420,{'M': 2626},...,0.168149,"{'M': 2560, 'X': 1}",0.151949,{'M': 2670},0.307846,{'M': 439},0.440111,{'M': 12},0.463979,{'M': 4}
1,2,F,0.355880,{'F': 748},0.258370,{'F': 1478},0.510612,"{'F': 1923, 'L': 1}",0.231750,{'F': 2626},...,0.506214,"{'F': 2559, 'L': 1, 'X': 1}",0.480591,"{'F': 2669, 'L': 1}",0.345312,{'F': 439},0.444622,{'F': 12},0.464341,{'F': 4}
2,3,V,0.606076,"{'V': 747, 'G': 1}",0.536248,"{'V': 1477, 'I': 1}",0.663954,"{'V': 1922, 'G': 1, 'A': 1}",0.247219,"{'V': 2615, 'X': 11}",...,0.666692,"{'V': 2545, 'X': 13, 'A': 3}",0.509978,"{'V': 2668, 'A': 2}",0.357277,{'V': 439},0.451105,{'V': 12},0.466912,{'V': 4}
3,4,F,0.355880,{'F': 748},0.258370,{'F': 1478},0.263992,{'F': 1924},0.231750,"{'F': 2624, 'X': 2}",...,0.243780,"{'F': 2560, 'X': 1}",0.216878,{'F': 2670},0.345312,{'F': 439},0.444622,{'F': 12},0.464341,{'F': 4}
4,5,L,0.910864,"{'L': 734, 'F': 14}",0.997586,"{'L': 1450, 'F': 27, 'I': 1}",0.706331,"{'L': 1889, 'F': 35}",0.978824,"{'L': 2556, 'F': 57, 'X': 13}",...,0.990316,"{'L': 2502, 'F': 49, 'X': 10}",0.999533,"{'L': 2604, 'F': 54, 'X': 12}",0.992205,"{'L': 425, 'F': 14}",0.447259,{'L': 12},0.462241,{'L': 4}
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1268,1269,K,0.369541,{'K': 748},0.280843,{'K': 1478},0.087295,{'K': 1924},0.250739,"{'K': 2625, 'X': 1}",...,0.014343,"{'K': 2560, 'X': 1}",0.242571,{'K': 2670},0.356288,{'K': 439},0.450005,{'K': 12},0.468169,{'K': 4}
1269,1270,L,0.385898,{'L': 748},0.303403,{'L': 1478},0.314618,{'L': 1924},0.294901,"{'L': 2625, 'X': 1}",...,0.305756,"{'L': 2560, 'X': 1}",0.274641,{'L': 2670},0.120750,{'L': 439},0.451075,{'L': 12},0.467201,{'L': 4}
1270,1271,H,0.364519,{'H': 748},0.271921,{'H': 1478},0.272554,{'H': 1924},0.243470,"{'H': 2625, 'X': 1}",...,0.255262,"{'H': 2560, 'X': 1}",0.081604,{'H': 2670},0.353693,{'H': 439},0.448553,{'H': 12},0.466018,{'H': 4}
1271,1272,Y,0.404772,{'Y': 748},0.322692,{'Y': 1478},0.343141,{'Y': 1924},0.058030,"{'Y': 2624, 'X': 2}",...,0.061082,"{'Y': 2559, 'X': 2}",0.307998,"{'Y': 2656, 'X': 14}",0.395000,{'Y': 439},0.460426,{'Y': 12},0.473633,{'Y': 4}


In [10]:
# Group the probability columns by everything before their last '_' so that
# columns sharing a prefix end up in the same list.
prob_pairs = {}
for column_name in merged_df.columns:
    if '[alpha<beta]' not in column_name:
        continue
    prefix = column_name.rsplit('_', 1)[0]
    prob_pairs.setdefault(prefix, []).append(column_name)

# Absolute difference between the two columns of every prefix that has exactly
# two members; prefixes with any other number of columns are skipped.
diff_df = pd.DataFrame({
    f'{prefix}_diff': (merged_df[columns[0]] - merged_df[columns[1]]).abs()
    for prefix, columns in prob_pairs.items()
    if len(columns) == 2
})

# Largest of those differences for each row.
merged_df['[alpha<beta]_max_diff'] = diff_df.max(axis=1)

In [11]:
# Save the merged table as a tab-separated file, leaving out the row index.
# NOTE(review): the *_amino_counts_* columns hold Python dicts, which will be
# written out as text; confirm downstream readers expect that.
merged_df.to_csv(path_or_buf='spike_fubar.tsv', sep='\t', index=False)