In [1]:
import os
# from esm.models.esmc import ESMC
# from esm.sdk.api import ESMProtein, LogitsConfig
import torch
from scipy import spatial
from matplotlib import pyplot as plt
from eba import methods 
from eba import score_matrices as sm
from eba import plm_extractor as plm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
working_dir = '/scicore/home/schwede/pantol0000/repositories/alphabeta_classic'
scop_lookup_file = os.path.join(working_dir, 'data/scop_lookup.fix.tsv')
scop_fasta_file = os.path.join(working_dir, 'data/SCOPe40.fasta')

In [3]:
scop_sequences = dict()
with open(scop_lookup_file, 'r') as file:
    for line in file:
        seq_id,label = line.split()
        scop_sequences[seq_id] = {'label':label, 'seq':'', 'emb':0} 

In [4]:
with open(scop_fasta_file, 'r') as file:
    seq_id = ''
    for line in file:
        if line[0]=='>':
            seq_id = line[1:].strip()
        else:
            scop_sequences[seq_id]['seq'] = line.strip()

In [18]:
scop_sequences.keys()

dict_keys(['d1dlwa_', 'd2gkma_', 'd2qrwa_', 'd1s69a_', 'd2bkma_', 'd1asha_', 'd1urva_', 'd3lb2a_', 'd1ecaa_', 'd1x9fd_', 'd1x9fc_', 'd1cqxa1', 'd1jl7a_', 'd1it2a_', 'd1or4a_', 'd1itha_', 'd3g46a_', 'd1b0ba_', 'd1cg5a_', 'd1cg5b_', 'd2w72b_', 'd1hlba_', 'd1tu9a_', 'd2gdma_', 'd1mbaa_', 'd1naza_', 'd1q1fa_', 'd1h97a_', 'd3mkbb_', 'd3boma_', 'd2nrla_', 'd1alla_', 'd1allb_', 'd3l0fa_', 'd1b8da_', 'd1xg0c_', 'd2xkia_', 'd2wy4a_', 'd2ig3a_', 'd3pt8a_', 'd2g3ha_', 'd2wtga_', 'd2zs0a_', 'd2zs0b_', 'd2zs0c_', 'd2zs0d_', 'd1x46a_', 'd1kf6b1', 'd2bs2b1', 'd1nekb1', 'd1gtea1', 'd1grja1', 'd2f23a1', 'd2qamx1', 'd1vq8v1', 'd2j0121', 'd1nz6a_', 'd1wjza_', 'd1fpoa1', 'd1iura_', 'd1fafa_', 'd1gh6a_', 'd2o37a_', 'd1fxkc_', 'd1fxka_', 'd1cxzb_', 'd1urfa_', 'd1srya1', 'd2iy5a1', 'd1ivsa1', 'd1lrza1', 'd1k4ta1', 'd1qoja_', 'd2jdih1', 'd1aqta1', 'd1coja1', 'd1wb8a1', 'd1my6a1', 'd1ix9a1', 'd3d36c_', 'd1rfya_', 'd1tjla1', 'd1x4ta1', 'd2a26a1', 'd2f6ma_', 'd2f6mb_', 'd2f66c1', 'd1z0pa1', 'd1z0jb1', 'd1z0kb1',

In [21]:
scop_sequences['d1tu1a_']

{'label': 'd.107.1.3',
 'seq': 'GHMTLYRLHEADLEIPDAWQDQSINIFKLPASGPAREASFVISRDASQGDAPFADYVARQLENAEKQLPGFKLHKRWDINIHGHAAVLLDYQWQREGRDLMLRQVFIERRPAVLITTLTTTPADLPHHEPAWKQAMQTLVPRPT',
 'emb': 0}

In [48]:
labels = ['a.1.1.1', 'd.107.1.3']
alphabet_path = '/scicore/home/schwede/pantol0000/repositories/alphabeta_classic/alphabets/kmeans_20'

In [49]:
codebook = torch.load(f'{alphabet_path}/codebook.pt')
ab_sequences = torch.load(f'{alphabet_path}/alphabeta.pt')

  codebook = torch.load(f'{alphabet_path}/codebook.pt')
  ab_sequences = torch.load(f'{alphabet_path}/alphabeta.pt')


In [50]:
###build embeddings
alphabeta_embeddings = dict()
for s in scop_sequences:
    if scop_sequences[s]['label'] in labels:
        emb_list = [torch.tensor(codebook[x]).unsqueeze(dim=0) for x in ab_sequences[s]]
        alphabeta_embeddings[s] = torch.cat(emb_list, dim=0)

In [80]:
alphabeta_embeddings.keys()

dict_keys(['d1dlwa_', 'd2gkma_', 'd2qrwa_', 'd1s69a_', 'd2bkma_', 'd1tu1a_'])

In [55]:
scores0 = dict()
scores1 = dict()

for i in alphabeta_embeddings:
    for j in alphabeta_embeddings:
        if i!=j:
            sim_matrix = sm.compute_similarity_matrix(alphabeta_embeddings[i], alphabeta_embeddings[j])
            scores0[(i,j)] = methods.compute_eba(sim_matrix, gap_open_penalty=0.0, gap_extend_penalty=0.0)
            scores1[(i,j)] = methods.compute_eba(sim_matrix, gap_open_penalty=1.0, gap_extend_penalty=1.0)


In [81]:
sim_matrix = sm.compute_similarity_matrix(alphabeta_embeddings['d1dlwa_'], alphabeta_embeddings['d2gkma_'])

In [59]:
for i in scores0:
    print(i)
    print(i, scores0[i], scop_sequences[i[0]]['label'], scop_sequences[i[1]]['label'])

('d1dlwa_', 'd2gkma_')
('d1dlwa_', 'd2gkma_') {'EBA_raw': 228.37051838636398, 'EBA_min': 1.7981930581603462, 'EBA_max': 1.9687113653996895} a.1.1.1 a.1.1.1
('d1dlwa_', 'd2qrwa_')
('d1dlwa_', 'd2qrwa_') {'EBA_raw': 167.44764368981123, 'EBA_min': 1.3289495530937399, 'EBA_max': 1.443514169739752} a.1.1.1 a.1.1.1
('d1dlwa_', 'd1s69a_')
('d1dlwa_', 'd1s69a_') {'EBA_raw': 199.22367810457945, 'EBA_min': 1.6197047000372313, 'EBA_max': 1.717445500901547} a.1.1.1 a.1.1.1
('d1dlwa_', 'd2bkma_')
('d1dlwa_', 'd2bkma_') {'EBA_raw': 186.03451731055975, 'EBA_min': 1.453394666488748, 'EBA_max': 1.6037458388841357} a.1.1.1 a.1.1.1
('d1dlwa_', 'd1tu1a_')
('d1dlwa_', 'd1tu1a_') {'EBA_raw': 140.23160600662231, 'EBA_min': 0.9738305972682105, 'EBA_max': 1.2088931552295028} a.1.1.1 d.107.1.3
('d2gkma_', 'd1dlwa_')
('d2gkma_', 'd1dlwa_') {'EBA_raw': 228.37095573544502, 'EBA_min': 1.798196501853898, 'EBA_max': 1.968715135650388} a.1.1.1 a.1.1.1
('d2gkma_', 'd2qrwa_')
('d2gkma_', 'd2qrwa_') {'EBA_raw': 186.02726

In [64]:
scores0.keys()

dict_keys([('d1dlwa_', 'd2gkma_'), ('d1dlwa_', 'd2qrwa_'), ('d1dlwa_', 'd1s69a_'), ('d1dlwa_', 'd2bkma_'), ('d1dlwa_', 'd1tu1a_'), ('d2gkma_', 'd1dlwa_'), ('d2gkma_', 'd2qrwa_'), ('d2gkma_', 'd1s69a_'), ('d2gkma_', 'd2bkma_'), ('d2gkma_', 'd1tu1a_'), ('d2qrwa_', 'd1dlwa_'), ('d2qrwa_', 'd2gkma_'), ('d2qrwa_', 'd1s69a_'), ('d2qrwa_', 'd2bkma_'), ('d2qrwa_', 'd1tu1a_'), ('d1s69a_', 'd1dlwa_'), ('d1s69a_', 'd2gkma_'), ('d1s69a_', 'd2qrwa_'), ('d1s69a_', 'd2bkma_'), ('d1s69a_', 'd1tu1a_'), ('d2bkma_', 'd1dlwa_'), ('d2bkma_', 'd2gkma_'), ('d2bkma_', 'd2qrwa_'), ('d2bkma_', 'd1s69a_'), ('d2bkma_', 'd1tu1a_'), ('d1tu1a_', 'd1dlwa_'), ('d1tu1a_', 'd2gkma_'), ('d1tu1a_', 'd2qrwa_'), ('d1tu1a_', 'd1s69a_'), ('d1tu1a_', 'd2bkma_')])

In [65]:
same_label = [scores0[x]['EBA_min'] for x in scores0 if scop_sequences[x[0]]['label']==scop_sequences[x[1]]['label']]
different_label = [scores0[x]['EBA_min'] for x in scores0 if scop_sequences[x[0]]['label']!=scop_sequences[x[1]]['label']]

In [66]:
len(same_label)

20

In [73]:
sum(same_label)/len(same_label)

1.5824811838378443

In [68]:
len(different_label)

10

In [74]:
sum(different_label)/len(different_label)

1.0064698834360264

In [77]:
same_label = [scores1[x]['EBA_min'] for x in scores1 if scop_sequences[x[0]]['label']==scop_sequences[x[1]]['label']]
different_label = [scores1[x]['EBA_min'] for x in scores1 if scop_sequences[x[0]]['label']!=scop_sequences[x[1]]['label']]

In [78]:
sum(different_label)/len(different_label)

0.4840842536081456

In [79]:
sum(same_label)/len(same_label)

1.3106526807042223

In [71]:
for i in alphabeta_embeddings:
    print(i,scop_sequences[i]['label'])

d1dlwa_ a.1.1.1
d2gkma_ a.1.1.1
d2qrwa_ a.1.1.1
d1s69a_ a.1.1.1
d2bkma_ a.1.1.1
d1tu1a_ d.107.1.3


In [61]:
for i in scores1:
    print(i)
    print(i, scores0[i]['EBA_min'], scop_sequences[i[0]]['label'], scop_sequences[i[1]]['label'])

('d1dlwa_', 'd2gkma_')
('d1dlwa_', 'd2gkma_') 1.7981930581603462 a.1.1.1 a.1.1.1
('d1dlwa_', 'd2qrwa_')
('d1dlwa_', 'd2qrwa_') 1.3289495530937399 a.1.1.1 a.1.1.1
('d1dlwa_', 'd1s69a_')
('d1dlwa_', 'd1s69a_') 1.6197047000372313 a.1.1.1 a.1.1.1
('d1dlwa_', 'd2bkma_')
('d1dlwa_', 'd2bkma_') 1.453394666488748 a.1.1.1 a.1.1.1
('d1dlwa_', 'd1tu1a_')
('d1dlwa_', 'd1tu1a_') 0.9738305972682105 a.1.1.1 d.107.1.3
('d2gkma_', 'd1dlwa_')
('d2gkma_', 'd1dlwa_') 1.798196501853898 a.1.1.1 a.1.1.1
('d2gkma_', 'd2qrwa_')
('d2gkma_', 'd2qrwa_') 1.4647816222777048 a.1.1.1 a.1.1.1
('d2gkma_', 'd1s69a_')
('d2gkma_', 'd1s69a_') 1.6813567551806217 a.1.1.1 a.1.1.1
('d2gkma_', 'd2bkma_')
('d2gkma_', 'd2bkma_') 1.5669016799656674 a.1.1.1 a.1.1.1
('d2gkma_', 'd1tu1a_')
('d2gkma_', 'd1tu1a_') 1.0298667756012745 a.1.1.1 d.107.1.3
('d2qrwa_', 'd1dlwa_')
('d2qrwa_', 'd1dlwa_') 1.3289482796357737 a.1.1.1 a.1.1.1
('d2qrwa_', 'd2gkma_')
('d2qrwa_', 'd2gkma_') 1.4647815916247255 a.1.1.1 a.1.1.1
('d2qrwa_', 'd1s69a_')
('d

In [13]:
labels

['d1dlwa_', 'd2gkma_']

In [None]:
scop_sequences[s]['label']

In [None]:
scores = dict()

for i in a111_seq:
    for j in a111_seq:
        if i!=j and (i,j) not in scores.keys() and (j,i) not in scores.keys():
            sm.compute