In [1]:
import pandas as pd
from Bio import SeqIO, Seq
import numpy as np
import datetime
import ete3
import re
import math
import statistics
import random
import scipy


In [2]:
def filt_fas(recdic, acclist, outname):
    fasta = ''
    for rec in list(recdic):
        shrec = rec.split('|')[0]
        if shrec in acclist:
            fasta = fasta + '>%s\n%s\n'%(shrec, str(recdic[rec].seq))
    print('Number of filtered sequences: %i'%fasta.count('>'))
    with open(outname, 'w') as out:
        out.write(fasta[:-1])

In [3]:
#convert entropy site values df to alignment positions

def aln_entr_tab(entrdf_f, asralf, outf):

    entrdf = pd.read_csv(entrdf_f, index_col=0)
    
    aldic = SeqIO.to_dict(SeqIO.parse(asralf, 'fasta'))
    
    newentrdic = {}

    c = 0
    
    for nod in list(entrdf.index):

        c = c+1
        if c%100==0:
            print(c)
            
        newentr = []
        alseq = str(aldic[nod].seq)
    
        tabpoz = 1
        for i in range(len(alseq)):
            # if (i%100==0) and c%100==0:
            #     print('\t\t', i)            
            if alseq[i] != '-':
                newentr.append(entrdf.loc[nod,str(tabpoz)])
                tabpoz = tabpoz + 1
            else:
                newentr.append('')
    
        newentrdic.update({nod:newentr})

    alndf = pd.DataFrame(newentrdic)
    alndf.index = alndf.index +1
    alndf = alndf.T

    alndf.to_csv(outf)

    return alndf
        

In [8]:

def calc_aln_entropy(alfile):

    seqdic = SeqIO.to_dict(SeqIO.parse(alfile, 'fasta'))
    
    #remove internal node seqs from per site entropy calculations
    aldf = pd.DataFrame({x:[a for a in seqdic[x]] for x in list(seqdic) if 'NODE_' not in x})
    
    alnpersite = {}
    
    alentropy = []
    
    for i in range(len(aldf)):
        reslist = aldf.iloc[i].tolist()
    #     print(reslist)
        if '-' in reslist:
            reslist.remove('-')
        proplist = [reslist.count(x)/len(reslist) for x in set(reslist)]
        #calculate alignment site entropy
        alentropy.append(scipy.stats.entropy(proplist, base=2))
        
        #also store the per-residue proportions for each site
        alnpersite.update({i+1:{x:reslist.count(x) for x in set(reslist)}})
    
    alentropy = alentropy

    return alentropy



In [9]:
def al_mod_entr_correl(alentropy, modentr_f):

    model_entr_df = pd.read_csv(modentr_f, index_col=0)
    #remove internal node seqs from per site entropy calculations
    model_entr_df = model_entr_df[~model_entr_df.index.str.contains('NODE_')]
    model_entr_avg = [sum(model_entr_df.get(i).dropna().tolist())/len(model_entr_df) for i in list(model_entr_df.columns)]

    # print(alentropy)
    # print(model_entr_avg)

    
    print( scipy.stats.spearmanr(alentropy, model_entr_avg) )

    # fig = px.scatter(x=alentropy, y=model_entr_avg, color=[i for i in range(len(alentropy))])
    # fig.show()


<br>
<br>

### Embedding and entropy calculation

Example code

In [2]:
#map embedding positions onto alignment

aln_entr_tab('gisaid_h7_270624_filt-ASR-esm2_t33-site_entropy.csv',
             'ancestral_sequences.fasta',
             'gisaid_h7_270624_filt-ASR-esm2_t33-site_entropy_aln.csv')

In [11]:
print('\n####    H7    ####\n')

h7path = '/media/spyros/HD-ADU3/spyros/flu_LLM_evol_data/gisaid_h7_270624_filt/'

#calculate alignment entropy
h7alentropy = calc_aln_entropy(h7path + 'ancestral_sequences.fasta')


#perform Spearman correlations

print('T5 uniref only')
al_mod_entr_correl(h7alentropy, h7path + 'gisaid_h7_270624_filt-ASR-T5_UniRef_e2-site_entropy_aln.csv')
print('T5 uniref HA all')
al_mod_entr_correl(h7alentropy, h7path + 'gisaid_h7_270624_filt-ASR-T5_uniref_HA_221124-site_entropy_aln.csv')
print('T5 uniref H1')
al_mod_entr_correl(h7alentropy, h7path + 'gisaid_h7_270624_filt-ASR-T5_uniref_H1_221124-site_entropy_aln.csv')
print('T5 uniref H3')
al_mod_entr_correl(h7alentropy, h7path + 'gisaid_h7_270624_filt-ASR-T5_uniref_H3_221124-site_entropy_aln.csv')
print('T5 uniref H5')
al_mod_entr_correl(h7alentropy, h7path + 'gisaid_h7_270624_filt-ASR-T5_uniref_H5_221124-site_entropy_aln.csv')
print('T5 uniref H7')
al_mod_entr_correl(h7alentropy, h7path + 'gisaid_h7_270624_filt-ASR-T5_uniref_H7_221124-site_entropy_aln.csv')


print('\n')


print('ESM-2')
al_mod_entr_correl(h7alentropy, h7path + 'gisaid_h7_270624_filt-ASR-esm2_t33-site_entropy_aln.csv')
print('ESM-2 HA all')
al_mod_entr_correl(h7alentropy, h7path + 'gisaid_h7_270624_filt-ASR-esm2_t33_e10-site_entropy_aln.csv')
print('H1')
al_mod_entr_correl(h7alentropy, h7path + 'gisaid_h7_270624_filt-ASR-esm2_t33-H1_071024-site_entropy_aln.csv')
print('H3')
al_mod_entr_correl(h7alentropy, h7path + 'gisaid_h7_270624_filt-ASR-esm2_t33-H3_071024-site_entropy_aln.csv')
print('H5')
al_mod_entr_correl(h7alentropy, h7path + 'gisaid_h7_270624_filt-ASR-esm2_t33-H5_071024-site_entropy_aln.csv')
print('H7')
al_mod_entr_correl(h7alentropy, h7path + 'gisaid_h7_270624_filt-ASR-esm2_t33-H7_071024-site_entropy_aln.csv')
     



####    H7    ####

T5 uniref only
SignificanceResult(statistic=-0.060873422746474885, pvalue=0.14593398711873576)
T5 uniref HA all
SignificanceResult(statistic=0.8556218291158466, pvalue=3.704164349967701e-165)
T5 uniref H1
SignificanceResult(statistic=0.44260696490402296, pvalue=7.726032381540528e-29)
T5 uniref H3
SignificanceResult(statistic=0.4627984485444651, pvalue=1.0528737760638554e-31)
T5 uniref H5
SignificanceResult(statistic=0.3978985052861403, pvalue=3.859370424488541e-23)
T5 uniref H7
SignificanceResult(statistic=0.778256590532292, pvalue=2.8058442587545153e-117)


ESM-2
SignificanceResult(statistic=0.44655923286728005, pvalue=2.198044027611488e-29)
ESM-2 HA all
SignificanceResult(statistic=0.832047304174184, pvalue=5.230835680114651e-148)
H1
SignificanceResult(statistic=0.4193005723790633, pvalue=9.228760997271712e-26)
H3
SignificanceResult(statistic=0.37291720165641196, pvalue=2.5932924856256982e-20)
H5
SignificanceResult(statistic=0.24808795889723284, pvalue=1.80218508

In [11]:
print('\n####    H5    ####\n')
h5path = '/media/spyros/HD-ADU3/spyros/flu_LLM_evol_data/gisaid_h5_270624_filt/'

h5alentropy = calc_aln_entropy(h5path + 'ancestral_sequences.fasta')

print('T5 uniref only')
al_mod_entr_correl(h5alentropy, h5path + 'gisaid_h5_270624_filt-ASR-T5_UniRef_e2-site_entropy_aln.csv')
print('T5 uniref HA all')
al_mod_entr_correl(h5alentropy, h5path + 'gisaid_h5_270624_filt-ASR-T5_uniref_HA_221124-site_entropy_aln.csv')
print('T5 uniref H1')
al_mod_entr_correl(h5alentropy, h5path + 'gisaid_h5_270624_filt-ASR-T5_uniref_H1_221124-site_entropy_aln.csv')
print('T5 uniref H3')
al_mod_entr_correl(h5alentropy, h5path + 'gisaid_h5_270624_filt-ASR-T5_uniref_H3_221124-site_entropy_aln.csv')
print('T5 uniref H5')
al_mod_entr_correl(h5alentropy, h5path + 'gisaid_h5_270624_filt-ASR-T5_uniref_H5_221124-site_entropy_aln.csv')
print('T5 uniref H7')
al_mod_entr_correl(h5alentropy, h5path + 'gisaid_h5_270624_filt-ASR-T5_uniref_H7_221124-site_entropy_aln.csv')

print('\n')

print('ESM-2')
al_mod_entr_correl(h5alentropy, h5path + 'gisaid_h5_270624_filt-ASR-esm2_t33-site_entropy_aln.csv')
print('ESM-2 HA all')
al_mod_entr_correl(h5alentropy, h5path + 'gisaid_h5_270624_filt-ASR-esm2_t33_e10-site_entropy_aln.csv')
print('H1')
al_mod_entr_correl(h5alentropy, h5path + 'gisaid_h5_270624_filt-ASR-esm2_t33-H1_071024-site_entropy_aln.csv')
print('H3')
al_mod_entr_correl(h5alentropy, h5path + 'gisaid_h5_270624_filt-ASR-esm2_t33-H3_071024-site_entropy_aln.csv')
print('H5')
al_mod_entr_correl(h5alentropy, h5path + 'gisaid_h5_270624_filt-ASR-esm2_t33-H5_071024-site_entropy_aln.csv')
print('H7')
al_mod_entr_correl(h5alentropy, h5path + 'gisaid_h5_270624_filt-ASR-esm2_t33-H7_071024-site_entropy_aln.csv')



####    H5    ####

T5 uniref only
SignificanceResult(statistic=0.023619211042106993, pvalue=0.5699200345942773)
T5 uniref HA all
SignificanceResult(statistic=0.8870411588015424, pvalue=1.7036516166306259e-196)
T5 uniref H1
SignificanceResult(statistic=0.7603354511278257, pvalue=1.3615771395698607e-110)
T5 uniref H3
SignificanceResult(statistic=0.48471857534352314, pvalue=1.4467075701928022e-35)
T5 uniref H5
SignificanceResult(statistic=0.85550466902374, pvalue=1.2171961658634066e-167)
T5 uniref H7
SignificanceResult(statistic=0.436585522330974, pvalue=1.9471406207915845e-28)


ESM-2
SignificanceResult(statistic=0.6723379963920078, pvalue=1.1483906214803146e-77)
ESM-2 HA all
SignificanceResult(statistic=0.8862272655457288, pvalue=1.202791223989157e-195)
H1
SignificanceResult(statistic=0.7027494209294508, pvalue=1.1481905176105168e-87)
H3
SignificanceResult(statistic=0.36797076889780594, pvalue=4.5456791705610114e-20)
H5
SignificanceResult(statistic=0.8370301212152663, pvalue=9.2064309

In [11]:
print('\n####    H1    ####\n')
h1path = '/media/spyros/HD-ADU3/spyros/flu_LLM_evol_data/ncbi_h1_110424_filt/'
h1alentropy = calc_aln_entropy(h1path + 'ancestral_sequences.fasta')

print('T5 uniref only')
al_mod_entr_correl(h1alentropy, h1path + 'ncbi_h1_110424_filt-ASR-T5_UniRef_e2-site_entropy_aln.csv')
print('T5 uniref HA all')
al_mod_entr_correl(h1alentropy, h1path + 'ncbi_h1_110424_filt-ASR-T5_uniref_HA_221124-site_entropy_aln.csv')
print('T5 uniref H1')
al_mod_entr_correl(h1alentropy, h1path + 'ncbi_h1_110424_filt-ASR-T5_uniref_H1_221124-site_entropy_aln.csv')
print('T5 uniref H3')
al_mod_entr_correl(h1alentropy, h1path + 'ncbi_h1_110424_filt-ASR-T5_uniref_H3_221124-site_entropy_aln.csv')
print('T5 uniref H5')
al_mod_entr_correl(h1alentropy, h1path + 'ncbi_h1_110424_filt-ASR-T5_uniref_H5_221124-site_entropy_aln.csv')
print('T5 uniref H7')
al_mod_entr_correl(h1alentropy, h1path + 'ncbi_h1_110424_filt-ASR-T5_uniref_H7_221124-site_entropy_aln.csv')


print('\n')


print('ESM-2')
al_mod_entr_correl(h1alentropy, h1path + 'ncbi_h1_110424_filt-ASR-esm2_t33-site_entropy_aln.csv')
print('ESM-2 HA all')
al_mod_entr_correl(h1alentropy, h1path + 'ncbi_h1_110424_filt-ASR-esm2_t33_e10-site_entropy_aln.csv')
print('H1')
al_mod_entr_correl(h1alentropy, h1path + 'ncbi_h1_110424_filt-ASR-esm2_t33-H1_071024-site_entropy_aln.csv')
print('H3')
al_mod_entr_correl(h1alentropy, h1path + 'ncbi_h1_110424_filt-ASR-esm2_t33-H3_071024-site_entropy_aln.csv')
print('H5')
al_mod_entr_correl(h1alentropy, h1path + 'ncbi_h1_110424_filt-ASR-esm2_t33-H5_071024-site_entropy_aln.csv')
print('H7')
al_mod_entr_correl(h1alentropy, h1path + 'ncbi_h1_110424_filt-ASR-esm2_t33-H7_071024-site_entropy_aln.csv')





####    H1    ####

T5 uniref only
SignificanceResult(statistic=0.10165227419604277, pvalue=0.012581821526446007)
T5 uniref HA all
SignificanceResult(statistic=0.8501328916976791, pvalue=2.8555899971925044e-169)
T5 uniref H1
SignificanceResult(statistic=0.8714150342600708, pvalue=9.557017930881812e-188)
T5 uniref H3
SignificanceResult(statistic=0.5701928830117747, pvalue=3.3354846122684725e-53)
T5 uniref H5
SignificanceResult(statistic=0.7471584164539867, pvalue=1.5593286375551177e-108)
T5 uniref H7
SignificanceResult(statistic=0.5679706501537386, pvalue=1.0285130816361125e-52)


ESM-2
SignificanceResult(statistic=0.7366969802459624, pvalue=4.9975319493947804e-104)
ESM-2 HA all
SignificanceResult(statistic=0.8682187593364379, pvalue=9.082704490737243e-185)
H1
SignificanceResult(statistic=0.8582754499206481, pvalue=5.563551910701364e-176)
H3
SignificanceResult(statistic=0.47340177069556794, pvalue=5.958889550616747e-35)
H5
SignificanceResult(statistic=0.7200810041306905, pvalue=2.68544

In [13]:
print('\n####    H3    ####\n')
h3path = '/media/spyros/HD-ADU3/spyros/flu_LLM_evol_data/ncbi_h3n2_110424_filt/'
h3alentropy = calc_aln_entropy(h3path + 'ancestral_sequences.fasta')


print('T5 uniref only')
al_mod_entr_correl(h3alentropy, h3path + 'ncbi_h3n2_110424_filt-ASR-T5_UniRef_e2-site_entropy_aln.csv')
print('T5 uniref HA all')
al_mod_entr_correl(h3alentropy, h3path + 'ncbi_h3n2_110424_filt-ASR-T5_uniref_HA_221124-site_entropy_aln.csv')
print('T5 uniref H1')
al_mod_entr_correl(h3alentropy, h3path + 'ncbi_h3n2_110424_filt-ASR-T5_uniref_H1_221124-site_entropy_aln.csv')
print('T5 uniref H3')
al_mod_entr_correl(h3alentropy, h3path + 'ncbi_h3n2_110424_filt-ASR-T5_uniref_H3_221124-site_entropy_aln.csv')
print('T5 uniref H5')
al_mod_entr_correl(h3alentropy, h3path + 'ncbi_h3n2_110424_filt-ASR-T5_uniref_H5_221124-site_entropy_aln.csv')
print('T5 uniref H7')
al_mod_entr_correl(h3alentropy, h3path + 'ncbi_h3n2_110424_filt-ASR-T5_uniref_H7_221124-site_entropy_aln.csv')

print('\n')


print('ESM-2')
al_mod_entr_correl(h3alentropy, h3path + 'ncbi_h3n2_110424_filt-ASR-esm2_t33-site_entropy_aln.csv')
print('ESM-2 HA all')
al_mod_entr_correl(h3alentropy, h3path + 'ncbi_h3n2_110424_filt-ASR-esm2_t33_e10-site_entropy_aln.csv')
print('H1')
al_mod_entr_correl(h3alentropy, h3path + 'ncbi_h3n2_110424_filt-ASR-esm2_t33-H1_071024-site_entropy_aln.csv')
print('H3')
al_mod_entr_correl(h3alentropy, h3path + 'ncbi_h3n2_110424_filt-ASR-esm2_t33-H3_071024-site_entropy_aln.csv')
print('H5')
al_mod_entr_correl(h3alentropy, h3path + 'ncbi_h3n2_110424_filt-ASR-esm2_t33-H5_071024-site_entropy_aln.csv')
print('H7')
al_mod_entr_correl(h3alentropy, h3path + 'ncbi_h3n2_110424_filt-ASR-esm2_t33-H7_071024-site_entropy_aln.csv')
    


####    H3    ####

T5 uniref only
SignificanceResult(statistic=0.0005563280859932967, pvalue=0.9894166376987367)
T5 uniref HA all
SignificanceResult(statistic=0.8581243051579084, pvalue=7.243823771640094e-167)
T5 uniref H1
SignificanceResult(statistic=0.4031941378083206, pvalue=9.870968169173e-24)
T5 uniref H3
SignificanceResult(statistic=0.8660027826167839, pvalue=2.087928525670729e-173)
T5 uniref H5
SignificanceResult(statistic=0.3693740584870011, pvalue=6.724522245265784e-20)
T5 uniref H7
SignificanceResult(statistic=0.4555885018138892, pvalue=1.3139034512049708e-30)


ESM-2
SignificanceResult(statistic=0.43142888432193116, pvalue=2.746593303226615e-27)
ESM-2 HA all
SignificanceResult(statistic=0.8714828205978321, pvalue=3.311493097123705e-178)
H1
SignificanceResult(statistic=0.6164979538887408, pvalue=4.5140761790528986e-61)
H3
SignificanceResult(statistic=0.8259017991818154, pvalue=1.0074175008487974e-143)
H5
SignificanceResult(statistic=0.28059955811766585, pvalue=8.60760728668

<br>
<br>


### 80-20 split

In [10]:
def split_8020(hapath, date_cutoff):
    nodedats = pd.read_csv(hapath + 'dates.tsv', sep='\t')
    nodedats = nodedats[nodedats['numeric date'] != "--"]
    nodedats['numeric date'] = pd.to_numeric(nodedats['numeric date'])
    
    nodedats_20 = nodedats[nodedats['numeric date'] > date_cutoff]
    print(nodedats_20.sort_values(by='numeric date'))
    
    filt_fas(SeqIO.to_dict(SeqIO.parse(hapath + 'ancestral_sequences.fasta', 'fasta')), list(nodedats_20['#node']), hapath + 'present20_seqs.fasta')


In [4]:
#protT5MLM example


d_cutoff = 2015.169406392694

#make present20 entropy aln files for T5 HA-80 model and prez 20 aln files 

for thispath in ['gisaid_h7_270624_filt', 'gisaid_h5_270624_filt', 'ncbi_h1_110424_filt', 'ncbi_h3n2_110424_filt']:
    
    entrdf = pd.read_csv('/media/spyros/HD-ADU3/spyros/flu_LLM_evol_data/%s/%s-ASR-T5_uniref_8020_221124-site_entropy_aln.csv'%(thispath, thispath), index_col=0)
    
    nodedats = pd.read_csv('/media/spyros/HD-ADU3/spyros/flu_LLM_evol_data/%s/dates.tsv'%thispath, sep='\t')
    nodedats = nodedats[nodedats['numeric date'] != "--"]
    nodedats['numeric date'] = pd.to_numeric(nodedats['numeric date'])
    nodedats_20 = nodedats[nodedats['numeric date'] > d_cutoff]
    entrdf_20 = entrdf[entrdf.index.isin(list(nodedats_20['#node']))]
    entrdf_20.to_csv('/media/spyros/HD-ADU3/spyros/flu_LLM_evol_data/%s/%s-ASR-T5_uniref_8020_221124-site_entropy_aln_prez20.csv'%(thispath, thispath))

    split_8020('/media/spyros/HD-ADU3/spyros/flu_LLM_evol_data/%s/'%thispath, d_cutoff)


#make present20 entropy aln files for T5 HA-all model and prez 20 aln files 

for thispath in ['gisaid_h7_270624_filt', 'gisaid_h5_270624_filt', 'ncbi_h1_110424_filt', 'ncbi_h3n2_110424_filt']:
    
    entrdf = pd.read_csv('/media/spyros/HD-ADU3/spyros/flu_LLM_evol_data/%s/%s-ASR-T5_uniref_HA_221124-site_entropy_aln.csv'%(thispath, thispath), index_col=0)
    
    nodedats = pd.read_csv('/media/spyros/HD-ADU3/spyros/flu_LLM_evol_data/%s/dates.tsv'%thispath, sep='\t')
    nodedats = nodedats[nodedats['numeric date'] != "--"]
    nodedats['numeric date'] = pd.to_numeric(nodedats['numeric date'])
    nodedats_20 = nodedats[nodedats['numeric date'] > d_cutoff]
    entrdf_20 = entrdf[entrdf.index.isin(list(nodedats_20['#node']))]
    entrdf_20.to_csv('/media/spyros/HD-ADU3/spyros/flu_LLM_evol_data/%s/%s-ASR-T5_uniref_HA_221124-site_entropy_aln_prez20.csv'%(thispath, thispath))

    split_8020('/media/spyros/HD-ADU3/spyros/flu_LLM_evol_data/%s/'%thispath, d_cutoff)



In [13]:

h7path = '/media/spyros/HD-ADU3/spyros/flu_LLM_evol_data/gisaid_h7_270624_filt/'

h7prez20_alentr = calc_aln_entropy(h7path + 'present20_seqs.fasta')

print('ESM-2 80-20 split H7')
al_mod_entr_correl(h7prez20_alentr, h7path + 'gisaid_h7_270624_filt-ASR-esm2_t33-8020_e10_241024-site_entropy_aln_prez20.csv')

print('ESM-2 HA-all H7 test 80-20')
al_mod_entr_correl(h7prez20_alentr, h7path + 'gisaid_h7_270624_filt-ASR-esm2_t33_e10-site_entropy_aln_prez20.csv')

print('T5 80-20 split H7')
al_mod_entr_correl(h7prez20_alentr, h7path + 'gisaid_h7_270624_filt-ASR-T5_uniref_8020_221124-site_entropy_aln_prez20.csv')

print('T5 HA-all H7 test 80-20')
al_mod_entr_correl(h7prez20_alentr, h7path + 'gisaid_h7_270624_filt-ASR-T5_uniref_HA_221124-site_entropy_aln_prez20.csv')


ESM-2 80-20 split H7
SignificanceResult(statistic=0.772187726914183, pvalue=2.345992686930245e-114)
ESM-2 HA-all H7 test 80-20
SignificanceResult(statistic=0.7900857584863945, pvalue=2.996873357297004e-123)
T5 80-20 split H7
SignificanceResult(statistic=0.813421010280231, pvalue=3.027629704528037e-136)
T5 HA-all H7 test 80-20
SignificanceResult(statistic=0.8126458569846052, pvalue=8.74493282049748e-136)


In [14]:

h5path = '/media/spyros/HD-ADU3/spyros/flu_LLM_evol_data/gisaid_h5_270624_filt/'

h5prez20_alentr = calc_aln_entropy(h5path + 'present20_seqs.fasta')

print('ESM-2 80-20 split H5')
al_mod_entr_correl(h5prez20_alentr, h5path + 'gisaid_h5_270624_filt-ASR-esm2_t33-8020_e10_241024-site_entropy_aln_prez20.csv')

print('ESM-2 HA-all H5 test 80-20')
al_mod_entr_correl(h5prez20_alentr, h5path + 'gisaid_h5_270624_filt-ASR-esm2_t33_e10-site_entropy_aln_prez20.csv')

print('T5 80-20 split H5')
al_mod_entr_correl(h5prez20_alentr, h5path + 'gisaid_h5_270624_filt-ASR-T5_uniref_8020_221124-site_entropy_aln_prez20.csv')

print('T5 HA-all H5 test 80-20')
al_mod_entr_correl(h5prez20_alentr, h5path + 'gisaid_h5_270624_filt-ASR-T5_uniref_HA_221124-site_entropy_aln_prez20.csv')


ESM-2 80-20 split H5
SignificanceResult(statistic=0.7837190585855821, pvalue=7.457210548659226e-122)
ESM-2 HA-all H5 test 80-20
SignificanceResult(statistic=0.7955803590325161, pvalue=4.05413880423931e-128)
T5 80-20 split H5
SignificanceResult(statistic=0.8103322759114011, pvalue=1.6273658040831611e-136)
T5 HA-all H5 test 80-20
SignificanceResult(statistic=0.8045075594526581, pvalue=4.0968289916119846e-133)


In [15]:

h1path = '/media/spyros/HD-ADU3/spyros/flu_LLM_evol_data/ncbi_h1_110424_filt/'

h1prez20_alentr = calc_aln_entropy(h1path + 'present20_seqs.fasta')

print('ESM-2 80-20 split H1')
al_mod_entr_correl(h1prez20_alentr, h1path + 'ncbi_h1_110424_filt-ASR-esm2_t33-8020_e10_241024-site_entropy_aln_prez20.csv')

print('ESM-2 HA-all H1 test 80-20')
al_mod_entr_correl(h1prez20_alentr, h1path + 'ncbi_h1_110424_filt-ASR-esm2_t33_e10-site_entropy_aln_prez20.csv')

print('T5 80-20 split H1')
al_mod_entr_correl(h1prez20_alentr, h1path + 'ncbi_h1_110424_filt-ASR-T5_uniref_8020_221124-site_entropy_aln_prez20.csv')

print('T5 HA-all H1 test 80-20')
al_mod_entr_correl(h1prez20_alentr, h1path + 'ncbi_h1_110424_filt-ASR-T5_uniref_HA_221124-site_entropy_aln_prez20.csv')



ESM-2 80-20 split H1
SignificanceResult(statistic=0.7898698317229611, pvalue=1.6130619542349655e-129)
ESM-2 HA-all H1 test 80-20
SignificanceResult(statistic=0.8007254151430245, pvalue=1.1984480354181272e-135)
T5 80-20 split H1
SignificanceResult(statistic=0.7906097853542161, pvalue=6.33126234544866e-130)
T5 HA-all H1 test 80-20
SignificanceResult(statistic=0.7821245270912657, pvalue=2.3060456537940815e-125)


In [16]:

h3path = '/media/spyros/HD-ADU3/spyros/flu_LLM_evol_data/ncbi_h3n2_110424_filt/'

h3prez20_alentr = calc_aln_entropy(h3path + 'present20_seqs.fasta')

print('ESM-2 80-20 split H3')
al_mod_entr_correl(h3prez20_alentr, h3path + 'ncbi_h3n2_110424_filt-ASR-esm2_t33-8020_e10_241024-site_entropy_aln_prez20.csv')

print('ESM-2 HA-all H3 test 80-20')
al_mod_entr_correl(h3prez20_alentr, h3path + 'ncbi_h3n2_110424_filt-ASR-esm2_t33_e10-site_entropy_aln_prez20.csv')

print('T5 80-20 split H3')
al_mod_entr_correl(h3prez20_alentr, h3path + 'ncbi_h3n2_110424_filt-ASR-T5_uniref_8020_221124-site_entropy_aln_prez20.csv')

print('T5 HA-all H3 test 80-20')
al_mod_entr_correl(h3prez20_alentr, h3path + 'ncbi_h3n2_110424_filt-ASR-T5_uniref_HA_221124-site_entropy_aln_prez20.csv')


ESM-2 80-20 split H3
SignificanceResult(statistic=0.7727220900608174, pvalue=2.0627942622193503e-114)
ESM-2 HA-all H3 test 80-20
SignificanceResult(statistic=0.8366837156518955, pvalue=6.698482152430933e-151)
T5 80-20 split H3
SignificanceResult(statistic=0.8221894809244479, pvalue=2.2941015718743894e-141)
T5 HA-all H3 test 80-20
SignificanceResult(statistic=0.8362341615711295, pvalue=1.366485929209064e-150)
