In [1]:
import torch
from transformers import MT5ForConditionalGeneration, MT5Config, MT5EncoderModel, MT5Tokenizer, Trainer, TrainingArguments
from progeny_tokenizer import TAPETokenizer
import numpy as np
import math
import random
import scipy
import time
import pandas as pd
from torch.utils.data import DataLoader, RandomSampler, Dataset, BatchSampler
import typing
from pathlib import Path
import argparse
from collections import OrderedDict
import pickle
import matplotlib.pyplot as plt

In [2]:
# Stage switch: when True, the cells below write the FoldX input files
# (top-K TSV, E[min] round pickle, and E[min] TSV) to disk.
before_foldx = True

In [3]:
# Seed every random number generator used in this notebook from one value.
seed = 30
for reseed in (np.random.seed, random.seed):
    reseed(seed)
# Kept as the last expression so the cell still displays the torch Generator.
torch.manual_seed(seed)


<torch._C.Generator at 0x7f29cc08a250>

# Analyze 250K gen seqs and prepare for FoldX

Save the output TSV files used to run FoldX inference.

In [4]:
# Wild-type reference sequence that the generated MT_seq values are checked against below.
wt_seq = 'STIEEQAKTFLDKFNHEAEDLFYQSSLASWNYNTNITEENVQNMNNAGDKWSAFLKEQSTLAQMYPLQEIQNLTVKLQLQALQ'
# Motif that every kept sequence must contain, at the same offset as in wt_seq.
constant_region = 'NTNITEEN'
# Offset of the first occurrence of the motif in the wild type
# (str.index raises ValueError if the motif were absent).
wt_cs_ind = wt_seq.index(constant_region)

In [5]:
# Path of the TSV with the generated sequences that is loaded in the next cell.
# The commented-out line is an alternative input file.
# gen250k_tsv_name = 'generated_seqs/mcmc_ACE/top12500input1Kiter_temp01_trustr18/top12500input1Kiter_temp001-mcmc_seqs.tsv'
gen250k_tsv_name = 'generated_seqs/mcmc_ACE/top12500input1Kiter_temp01_trustr18/top12500input1Kiter_temp01_trustr18-mcmc_seqs.tsv'

In [6]:
# Load the tab-separated table of generated sequences into a DataFrame.
gen250k_df = pd.read_table(gen250k_tsv_name)

In [7]:
# Preview the loaded table (rich display of the cell's last expression).
gen250k_df

Unnamed: 0,disc_pred,MT_seq,PDB,Chain,Start_index,WT_seq,MT_edit_dist_vs_WT,accepted
0,2.795303,SDIEEQAKIFLDKFNNEREDLFYQSKLASWRYNTNITEENVQNMNI...,template2.pdb,A,19,STIEEQAKTFLDKFNHEAEDLFYQSSLASWNYNTNITEENVQNMNN...,18,1
1,2.795731,SSIEEQAKTFLAKFAHEAEQLKYQSELALWNFNTNITEENVTNMNN...,template2.pdb,A,19,STIEEQAKTFLDKFNHEAEDLFYQSSLASWNYNTNITEENVQNMNN...,19,0
2,2.797057,SDIEEQAKIFLDKFNNEREDLFYMSKLASWRYNTNITEENVQNMNI...,template2.pdb,A,19,STIEEQAKTFLDKFNHEAEDLFYQSSLASWNYNTNITEENVQNMNN...,19,0
3,2.797315,SSLEEQARIFLDKFNHESEDLFYQSALALMNYNTNITEENYQNMNN...,template2.pdb,A,19,STIEEQAKTFLDKFNHEAEDLFYQSSLASWNYNTNITEENVQNMNN...,19,0
4,2.798953,SSIEEQAKTFLEKFNHEAHDIFYLMELESATYNTNITEENVQNMNM...,template2.pdb,A,19,STIEEQAKTFLDKFNHEAEDLFYQSSLASWNYNTNITEENVQNMNN...,18,1
...,...,...,...,...,...,...,...,...
249995,32.436077,VTIEEQYKTFLLKFNHEAVRLFYISYLAQWRYNTNITEENLQNMYQ...,template2.pdb,A,19,STIEEQAKTFLDKFNHEAEDLFYQSSLASWNYNTNITEENVQNMNN...,21,0
249996,32.686634,PTMEEQFKTFLIKFNHEAQDLFYQYWLASRNYNTNITEENVQNMNN...,template2.pdb,A,19,STIEEQAKTFLDKFNHEAEDLFYQSSLASWNYNTNITEENVQNMNN...,21,0
249997,32.965042,FFIEEQYKMFLDKFNHEAYDLFYSSSLARNIYNTNITEENEQRMNN...,template2.pdb,A,19,STIEEQAKTFLDKFNHEAEDLFYQSSLASWNYNTNITEENVQNMNN...,21,0
249998,32.981464,SDIEEQYKTFLDKFNDEAEMLFFQSYLASIVYNTNITEENFQNMNW...,template2.pdb,A,19,STIEEQAKTFLDKFNHEAEDLFYQSSLASWNYNTNITEENVQNMNN...,21,0


Filter out sequences that lack the constant region or carry it at a different position than the wild type.

In [8]:
# Collect every row whose MT_seq lacks the constant region or carries it at a
# different offset than the wild type. str.find returns -1 when the motif is
# absent, which can never equal wt_cs_ind (a valid offset from str.index), so a
# single comparison covers both cases.
misplaced = [
    (row_label, mt_seq)
    for row_label, mt_seq in zip(gen250k_df.index, gen250k_df['MT_seq'])
    if mt_seq.find(constant_region) != wt_cs_ind
]
indices_to_drop = [row_label for row_label, _ in misplaced]
dropped_seqs = [mt_seq for _, mt_seq in misplaced]

In [9]:
# Report how many rows were flagged, then their index labels and sequences,
# one item per line.
print(len(indices_to_drop), indices_to_drop, dropped_seqs, sep="\n")

0
[]
[]


In [10]:
# New frame without the rows flagged by the constant-region check; the loaded
# frame itself is left untouched.
gen250k_df_dropped_nocon = gen250k_df.drop(index=indices_to_drop)

In [11]:
# Preview the frame after the constant-region filter.
gen250k_df_dropped_nocon

Unnamed: 0,disc_pred,MT_seq,PDB,Chain,Start_index,WT_seq,MT_edit_dist_vs_WT,accepted
0,2.795303,SDIEEQAKIFLDKFNNEREDLFYQSKLASWRYNTNITEENVQNMNI...,template2.pdb,A,19,STIEEQAKTFLDKFNHEAEDLFYQSSLASWNYNTNITEENVQNMNN...,18,1
1,2.795731,SSIEEQAKTFLAKFAHEAEQLKYQSELALWNFNTNITEENVTNMNN...,template2.pdb,A,19,STIEEQAKTFLDKFNHEAEDLFYQSSLASWNYNTNITEENVQNMNN...,19,0
2,2.797057,SDIEEQAKIFLDKFNNEREDLFYMSKLASWRYNTNITEENVQNMNI...,template2.pdb,A,19,STIEEQAKTFLDKFNHEAEDLFYQSSLASWNYNTNITEENVQNMNN...,19,0
3,2.797315,SSLEEQARIFLDKFNHESEDLFYQSALALMNYNTNITEENYQNMNN...,template2.pdb,A,19,STIEEQAKTFLDKFNHEAEDLFYQSSLASWNYNTNITEENVQNMNN...,19,0
4,2.798953,SSIEEQAKTFLEKFNHEAHDIFYLMELESATYNTNITEENVQNMNM...,template2.pdb,A,19,STIEEQAKTFLDKFNHEAEDLFYQSSLASWNYNTNITEENVQNMNN...,18,1
...,...,...,...,...,...,...,...,...
249995,32.436077,VTIEEQYKTFLLKFNHEAVRLFYISYLAQWRYNTNITEENLQNMYQ...,template2.pdb,A,19,STIEEQAKTFLDKFNHEAEDLFYQSSLASWNYNTNITEENVQNMNN...,21,0
249996,32.686634,PTMEEQFKTFLIKFNHEAQDLFYQYWLASRNYNTNITEENVQNMNN...,template2.pdb,A,19,STIEEQAKTFLDKFNHEAEDLFYQSSLASWNYNTNITEENVQNMNN...,21,0
249997,32.965042,FFIEEQYKMFLDKFNHEAYDLFYSSSLARNIYNTNITEENEQRMNN...,template2.pdb,A,19,STIEEQAKTFLDKFNHEAEDLFYQSSLASWNYNTNITEENVQNMNN...,21,0
249998,32.981464,SDIEEQYKTFLDKFNDEAEMLFFQSYLASIVYNTNITEENFQNMNW...,template2.pdb,A,19,STIEEQAKTFLDKFNHEAEDLFYQSSLASWNYNTNITEENVQNMNN...,21,0


Filter out sequences that contain non-amino-acid (special) tokens.

In [12]:
# Special-token strings; any MT_seq containing one of them as a substring is dropped below.
rejected_tokens = ["<pad>", "<sep>", "<cls>", "<mask>", "<unk>"]

In [13]:
# Flag each row whose MT_seq contains at least one rejected token; a row is
# recorded once no matter how many tokens it contains.
token_hits = [
    (row_label, mt_seq)
    for row_label, mt_seq in zip(gen250k_df_dropped_nocon.index, gen250k_df_dropped_nocon['MT_seq'])
    if any(token in mt_seq for token in rejected_tokens)
]
indices_to_drop = [row_label for row_label, _ in token_hits]
dropped_seqs = [mt_seq for _, mt_seq in token_hits]
            

In [14]:
# Report how many rows were flagged by the token check, then their index
# labels and sequences, one item per line.
print(len(indices_to_drop), indices_to_drop, dropped_seqs, sep="\n")

0
[]
[]


In [15]:
# Remove the rows flagged by the token check and show how many rows remain.
gen250k_df_dropped = gen250k_df_dropped_nocon.drop(index=indices_to_drop)
print(gen250k_df_dropped.shape[0])

250000


In [16]:
# Sanity check on the filtered frame: re-run the rejected-token scan and print
# how many rows would still be flagged.
token_hits = [
    (row_label, mt_seq)
    for row_label, mt_seq in zip(gen250k_df_dropped.index, gen250k_df_dropped['MT_seq'])
    if any(token in mt_seq for token in rejected_tokens)
]
indices_to_drop = [row_label for row_label, _ in token_hits]
dropped_seqs = [mt_seq for _, mt_seq in token_hits]

print(len(indices_to_drop))

0


In [17]:
# Sanity check on the filtered frame: re-run the constant-region scan and print
# how many rows would still be flagged. str.find returns -1 for a missing
# motif, which never equals wt_cs_ind, so one comparison covers "absent" and
# "present at another offset".
misplaced = [
    (row_label, mt_seq)
    for row_label, mt_seq in zip(gen250k_df_dropped.index, gen250k_df_dropped['MT_seq'])
    if mt_seq.find(constant_region) != wt_cs_ind
]
indices_to_drop = [row_label for row_label, _ in misplaced]
dropped_seqs = [mt_seq for _, mt_seq in misplaced]

print(len(indices_to_drop))

0


In [18]:
# Preview the frame after both filters.
gen250k_df_dropped

Unnamed: 0,disc_pred,MT_seq,PDB,Chain,Start_index,WT_seq,MT_edit_dist_vs_WT,accepted
0,2.795303,SDIEEQAKIFLDKFNNEREDLFYQSKLASWRYNTNITEENVQNMNI...,template2.pdb,A,19,STIEEQAKTFLDKFNHEAEDLFYQSSLASWNYNTNITEENVQNMNN...,18,1
1,2.795731,SSIEEQAKTFLAKFAHEAEQLKYQSELALWNFNTNITEENVTNMNN...,template2.pdb,A,19,STIEEQAKTFLDKFNHEAEDLFYQSSLASWNYNTNITEENVQNMNN...,19,0
2,2.797057,SDIEEQAKIFLDKFNNEREDLFYMSKLASWRYNTNITEENVQNMNI...,template2.pdb,A,19,STIEEQAKTFLDKFNHEAEDLFYQSSLASWNYNTNITEENVQNMNN...,19,0
3,2.797315,SSLEEQARIFLDKFNHESEDLFYQSALALMNYNTNITEENYQNMNN...,template2.pdb,A,19,STIEEQAKTFLDKFNHEAEDLFYQSSLASWNYNTNITEENVQNMNN...,19,0
4,2.798953,SSIEEQAKTFLEKFNHEAHDIFYLMELESATYNTNITEENVQNMNM...,template2.pdb,A,19,STIEEQAKTFLDKFNHEAEDLFYQSSLASWNYNTNITEENVQNMNN...,18,1
...,...,...,...,...,...,...,...,...
249995,32.436077,VTIEEQYKTFLLKFNHEAVRLFYISYLAQWRYNTNITEENLQNMYQ...,template2.pdb,A,19,STIEEQAKTFLDKFNHEAEDLFYQSSLASWNYNTNITEENVQNMNN...,21,0
249996,32.686634,PTMEEQFKTFLIKFNHEAQDLFYQYWLASRNYNTNITEENVQNMNN...,template2.pdb,A,19,STIEEQAKTFLDKFNHEAEDLFYQSSLASWNYNTNITEENVQNMNN...,21,0
249997,32.965042,FFIEEQYKMFLDKFNHEAYDLFYSSSLARNIYNTNITEENEQRMNN...,template2.pdb,A,19,STIEEQAKTFLDKFNHEAEDLFYQSSLASWNYNTNITEENVQNMNN...,21,0
249998,32.981464,SDIEEQYKTFLDKFNDEAEMLFFQSYLASIVYNTNITEENFQNMNW...,template2.pdb,A,19,STIEEQAKTFLDKFNHEAEDLFYQSSLASWNYNTNITEENVQNMNN...,21,0


In [19]:
# Number of lowest-scoring rows to keep, and the TSV they are later written to.
topK_saved = 10000
top10K_Dscore_gen250k_tsv_name = 'generated_seqs/mcmc_ACE/top12500input1Kiter_temp01_trustr18/top12500input1Kiter_temp01_trustr18-mcmc_seqs_top10Kdiscfiltered.tsv'
# Alternative output path:
# top10K_Dscore_gen250k_tsv_name = 'generated_seqs/tophalf-basegen_top10K-Dscore_250Kgen_dropped.tsv'

# Cap the pool at the first 250000 rows, then order it by ascending disc_pred.
# (Alternative sort key: 'latent_head_pred'.)
gen250k_df_dropped = (
    gen250k_df_dropped
    .iloc[:250000]
    .sort_values('disc_pred')
)
# The first topK_saved rows of the sorted pool.
topK_gen250k_df_dropped = gen250k_df_dropped.head(topK_saved)

In [20]:
# Preview the top-K rows selected by ascending disc_pred.
topK_gen250k_df_dropped

Unnamed: 0,disc_pred,MT_seq,PDB,Chain,Start_index,WT_seq,MT_edit_dist_vs_WT,accepted
0,2.795303,SDIEEQAKIFLDKFNNEREDLFYQSKLASWRYNTNITEENVQNMNI...,template2.pdb,A,19,STIEEQAKTFLDKFNHEAEDLFYQSSLASWNYNTNITEENVQNMNN...,18,1
1,2.795731,SSIEEQAKTFLAKFAHEAEQLKYQSELALWNFNTNITEENVTNMNN...,template2.pdb,A,19,STIEEQAKTFLDKFNHEAEDLFYQSSLASWNYNTNITEENVQNMNN...,19,0
2,2.797057,SDIEEQAKIFLDKFNNEREDLFYMSKLASWRYNTNITEENVQNMNI...,template2.pdb,A,19,STIEEQAKTFLDKFNHEAEDLFYQSSLASWNYNTNITEENVQNMNN...,19,0
3,2.797315,SSLEEQARIFLDKFNHESEDLFYQSALALMNYNTNITEENYQNMNN...,template2.pdb,A,19,STIEEQAKTFLDKFNHEAEDLFYQSSLASWNYNTNITEENVQNMNN...,19,0
4,2.798953,SSIEEQAKTFLEKFNHEAHDIFYLMELESATYNTNITEENVQNMNM...,template2.pdb,A,19,STIEEQAKTFLDKFNHEAEDLFYQSSLASWNYNTNITEENVQNMNN...,18,1
...,...,...,...,...,...,...,...,...
9995,2.899484,SDIEEQAKTFLQKYNAEAQDMFYFKLLASWNYNTNITEENVYNMNN...,template2.pdb,A,19,STIEEQAKTFLDKFNHEAEDLFYQSSLASWNYNTNITEENVQNMNN...,19,0
9996,2.899485,STLEEQAKTFLWKWDHEAEILMYQFSKAEINYNTNITEENVYNMNN...,template2.pdb,A,19,STIEEQAKTFLDKFNHEAEDLFYQSSLASWNYNTNITEENVQNMNN...,19,0
9997,2.899489,STYEEFAKTFLTKFIHWADDLAYQSRLASWNYNTNITEENFQMMNQ...,template2.pdb,A,19,STIEEQAKTFLDKFNHEAEDLFYQSSLASWNYNTNITEENVQNMNN...,20,0
9998,2.899493,SDIEEQAKTFLDKFLHMAEDLFYQSSLASWNYNTNITEENIWNKNN...,template2.pdb,A,19,STIEEQAKTFLDKFNHEAEDLFYQSSLASWNYNTNITEENVQNMNN...,19,0


# Save top 10K seqs for FoldX Evaluation

In [21]:
# Write the top-K table as a tab-separated file (no index column), only in the
# pre-FoldX stage.
if before_foldx:
    topK_gen250k_df_dropped.to_csv(
        path_or_buf=top10K_Dscore_gen250k_tsv_name,
        index=False,
        sep="\t",
    )

# Sample for E[min] FoldX Computation

In [22]:
# Preview the sorted pool that the E[min] rounds below sample from.
gen250k_df_dropped

Unnamed: 0,disc_pred,MT_seq,PDB,Chain,Start_index,WT_seq,MT_edit_dist_vs_WT,accepted
0,2.795303,SDIEEQAKIFLDKFNNEREDLFYQSKLASWRYNTNITEENVQNMNI...,template2.pdb,A,19,STIEEQAKTFLDKFNHEAEDLFYQSSLASWNYNTNITEENVQNMNN...,18,1
1,2.795731,SSIEEQAKTFLAKFAHEAEQLKYQSELALWNFNTNITEENVTNMNN...,template2.pdb,A,19,STIEEQAKTFLDKFNHEAEDLFYQSSLASWNYNTNITEENVQNMNN...,19,0
2,2.797057,SDIEEQAKIFLDKFNNEREDLFYMSKLASWRYNTNITEENVQNMNI...,template2.pdb,A,19,STIEEQAKTFLDKFNHEAEDLFYQSSLASWNYNTNITEENVQNMNN...,19,0
3,2.797315,SSLEEQARIFLDKFNHESEDLFYQSALALMNYNTNITEENYQNMNN...,template2.pdb,A,19,STIEEQAKTFLDKFNHEAEDLFYQSSLASWNYNTNITEENVQNMNN...,19,0
4,2.798953,SSIEEQAKTFLEKFNHEAHDIFYLMELESATYNTNITEENVQNMNM...,template2.pdb,A,19,STIEEQAKTFLDKFNHEAEDLFYQSSLASWNYNTNITEENVQNMNN...,18,1
...,...,...,...,...,...,...,...,...
249995,32.436077,VTIEEQYKTFLLKFNHEAVRLFYISYLAQWRYNTNITEENLQNMYQ...,template2.pdb,A,19,STIEEQAKTFLDKFNHEAEDLFYQSSLASWNYNTNITEENVQNMNN...,21,0
249996,32.686634,PTMEEQFKTFLIKFNHEAQDLFYQYWLASRNYNTNITEENVQNMNN...,template2.pdb,A,19,STIEEQAKTFLDKFNHEAEDLFYQSSLASWNYNTNITEENVQNMNN...,21,0
249997,32.965042,FFIEEQYKMFLDKFNHEAYDLFYSSSLARNIYNTNITEENEQRMNN...,template2.pdb,A,19,STIEEQAKTFLDKFNHEAEDLFYQSSLASWNYNTNITEENVQNMNN...,21,0
249998,32.981464,SDIEEQYKTFLDKFNDEAEMLFFQSYLASIVYNTNITEENFQNMNW...,template2.pdb,A,19,STIEEQAKTFLDKFNHEAEDLFYQSSLASWNYNTNITEENVQNMNN...,21,0


In [23]:
# Get topk seqs
# Draw `num_rounds` random pools of `round_pool_size` rows and keep each pool's
# `topk` lowest-scoring rows. `round_topk[col][round_ind]` stores the per-round
# top-K frames; `foldx_df` is their union with repeated MT_seq values removed
# (first occurrence kept); `in_count` is the number of rows removed as repeats.
num_rounds = 100 # N
round_pool_size = 10000
topk = 10 # K

round_topk = {}
# cols_to_sort = ['latent_head_pred']
cols_to_sort = ['disc_pred']
# cols_to_sort = ['disc_pred', 'latent_head_pred']

foldx_df = None
in_count = 0
pooled_round_frames = []  # every per-round top-K frame seen so far, in order
for col_to_sort in cols_to_sort:
    print("col_to_sort: ", col_to_sort)
    round_topk[col_to_sort] = {}
    for round_ind in range(num_rounds):
        # sample() is called without random_state, so the draw comes from
        # NumPy's global RNG; reproducibility relies on the earlier
        # np.random.seed(seed) call.
        sampled_rows = gen250k_df_dropped.sample(n=round_pool_size)
        topk_rows = sampled_rows.sort_values(by=col_to_sort, ascending=True).iloc[:topk]
        round_topk[col_to_sort][round_ind] = topk_rows

    # Build the de-duplicated union in one concat instead of appending row by
    # row: DataFrame.append was removed in pandas 2.0 and was quadratic anyway.
    pooled_round_frames.extend(round_topk[col_to_sort].values())
    if pooled_round_frames:
        pooled_topk = pd.concat(pooled_round_frames)
    else:
        # num_rounds == 0: keep an empty frame with the pool's columns.
        pooled_topk = gen250k_df_dropped.iloc[:0]
    foldx_df = pooled_topk.drop_duplicates(subset='MT_seq', keep='first')
    in_count = len(pooled_topk) - len(foldx_df)

    # Invariant: unique rows + removed repeats == total top-K rows collected.
    print("len(foldx_df)+in_count: ", len(foldx_df)+in_count)

col_to_sort:  disc_pred
len(foldx_df)+in_count:  1000


In [24]:
# Preview the union of per-round top-K rows that will be scored with FoldX.
foldx_df

Unnamed: 0,disc_pred,MT_seq,PDB,Chain,Start_index,WT_seq,MT_edit_dist_vs_WT,accepted
11,2.802795,SDIEEQAKTFLDKFYHEAEDLWYISREARVYYNTNITEENFRNMMN...,template2.pdb,A,19,STIEEQAKTFLDKFNHEAEDLFYQSSLASWNYNTNITEENVQNMNN...,18,1
31,2.810106,SSIEEQAKTFLEKFNHEAHDIFYLMELESATYNTNITEENVQNMNM...,template2.pdb,A,19,STIEEQAKTFLDKFNHEAEDLFYQSSLASWNYNTNITEENVQNMNN...,19,0
109,2.819377,SSIEERAKTFLDKFNHEAYFLFYQSSLALMNYNTNITEENEFNMNY...,template2.pdb,A,19,STIEEQAKTFLDKFNHEAEDLFYQSSLASWNYNTNITEENVQNMNN...,19,0
111,2.819815,ADIEEIAKFFLDKFNHEAEDLFYISALASWFYNTNITEENVYDMMN...,template2.pdb,A,19,STIEEQAKTFLDKFNHEAEDLFYQSSLASWNYNTNITEENVQNMNN...,19,0
121,2.820358,SSLEEQAATFLDYFNNAVEPLFEQASLKSWSYNTNITEENWQNMNM...,template2.pdb,A,19,STIEEQAKTFLDKFNHEAEDLFYQSSLASWNYNTNITEENVQNMNN...,19,0
...,...,...,...,...,...,...,...,...
246,2.826724,SDLEAQARTFLDNFNHEIEDLFYQASLASWNYNTNITEENIQNMNN...,template2.pdb,A,19,STIEEQAKTFLDKFNHEAEDLFYQSSLASWNYNTNITEENVQNMNN...,17,1
283,2.828326,SDIEEQAKTFLDKFNSWAAIVFYQSSLASWNYNTNITEENIEMMNN...,template2.pdb,A,19,STIEEQAKTFLDKFNHEAEDLFYQSSLASWNYNTNITEENVQNMNN...,17,1
219,2.825514,SDIERQAKLFLFKFKFKAEQLFYQSTLASWNYNTNITEENVQNMDN...,template2.pdb,A,19,STIEEQAKTFLDKFNHEAEDLFYQSSLASWNYNTNITEENVQNMNN...,18,1
333,2.830192,IDIEEQAKTFLMKFNAEFEDLFYQSSLQSWLYNTNITEENVQIMMG...,template2.pdb,A,19,STIEEQAKTFLDKFNHEAEDLFYQSSLASWNYNTNITEENVQNMNN...,18,1


In [25]:
# Number of per-round top-K rows skipped because their MT_seq was already collected.
in_count

671

# save E[min] seqs to do FoldX

In [26]:
# Pickle file holding the per-round top-K frames (round_topk) so the E[min]
# rounds can be reloaded later.
seqsforEmin_dict_name = 'generated_seqs/mcmc_ACE/top12500input1Kiter_temp01_trustr18/top12500input1Kiter_temp01_trustr18-mcmc_seqs_discfiltered_seqsforEmin_df.pkl'

# Only written in the pre-FoldX stage.
if before_foldx:
    with Path(seqsforEmin_dict_name).open('wb') as pickle_file:
        pickle.dump(round_topk, pickle_file)

# with open(seqsforEmin_dict_name, 'rb') as f:
#     b = pickle.load(f)

In [27]:
# TSV with the unique E[min] candidate rows that FoldX has to score.
seqsforEmin_tsv_name = 'generated_seqs/mcmc_ACE/top12500input1Kiter_temp01_trustr18/top12500input1Kiter_temp01_trustr18-mcmc_seqs_discfiltered_seqsforEmin_foldx.tsv'

# Tab-separated, without the index column; only written in the pre-FoldX stage.
if before_foldx:
    foldx_df.to_csv(
        path_or_buf=seqsforEmin_tsv_name,
        index=False,
        sep="\t",
    )

# <<===== After Foldx Computation =====>>

In [28]:
# Path of the FoldX results table loaded in the next cell; the commented-out
# lines are alternative result files.
foldx_results_name = "foldx_sim_results/top12500input1Kiter_temp01_trustr18-mcmc_seqs_discfiltered_seqsforEmin/results_full.tsv"
# foldx_results_name = "foldx_sim_results/top12500input1Kiter_temp01_trustr18-mcmc_seqs_top10Kdiscfiltered/results_full.tsv"
# Emin_results_tsv_name = "foldx_sim_results/tophalf-basegen_seqsforEmin_foldx_results/results_full.tsv"


In [29]:
# Load the tab-separated FoldX results table; the E[min] cell below reads its
# 'MT_seq' and 'ddG' columns.
foldx_results_df = pd.read_table(foldx_results_name)



In [30]:
# Preview the loaded FoldX results.
foldx_results_df

Unnamed: 0,disc_pred,MT_seq,PDB,Chain,Start_index,WT_seq,MT_edit_dist_vs_WT,accepted,ddG
0,2.802795,SDIEEQAKTFLDKFYHEAEDLWYISREARVYYNTNITEENFRNMMN...,template2.pdb,A,19,STIEEQAKTFLDKFNHEAEDLFYQSSLASWNYNTNITEENVQNMNN...,18,1,-5.24262
1,2.810106,SSIEEQAKTFLEKFNHEAHDIFYLMELESATYNTNITEENVQNMNM...,template2.pdb,A,19,STIEEQAKTFLDKFNHEAEDLFYQSSLASWNYNTNITEENVQNMNN...,19,0,-6.01953
2,2.819377,SSIEERAKTFLDKFNHEAYFLFYQSSLALMNYNTNITEENEFNMNY...,template2.pdb,A,19,STIEEQAKTFLDKFNHEAEDLFYQSSLASWNYNTNITEENVQNMNN...,19,0,-3.02263
3,2.819815,ADIEEIAKFFLDKFNHEAEDLFYISALASWFYNTNITEENVYDMMN...,template2.pdb,A,19,STIEEQAKTFLDKFNHEAEDLFYQSSLASWNYNTNITEENVQNMNN...,19,0,-4.48885
4,2.820358,SSLEEQAATFLDYFNNAVEPLFEQASLKSWSYNTNITEENWQNMNM...,template2.pdb,A,19,STIEEQAKTFLDKFNHEAEDLFYQSSLASWNYNTNITEENVQNMNN...,19,0,-6.25656
...,...,...,...,...,...,...,...,...,...
324,2.826724,SDLEAQARTFLDNFNHEIEDLFYQASLASWNYNTNITEENIQNMNN...,template2.pdb,A,19,STIEEQAKTFLDKFNHEAEDLFYQSSLASWNYNTNITEENVQNMNN...,17,1,-6.51907
325,2.828326,SDIEEQAKTFLDKFNSWAAIVFYQSSLASWNYNTNITEENIEMMNN...,template2.pdb,A,19,STIEEQAKTFLDKFNHEAEDLFYQSSLASWNYNTNITEENVQNMNN...,17,1,-6.39551
326,2.825514,SDIERQAKLFLFKFKFKAEQLFYQSTLASWNYNTNITEENVQNMDN...,template2.pdb,A,19,STIEEQAKTFLDKFNHEAEDLFYQSSLASWNYNTNITEENVQNMNN...,18,1,-5.18250
327,2.830192,IDIEEQAKTFLMKFNAEFEDLFYQSSLQSWLYNTNITEENVQIMMG...,template2.pdb,A,19,STIEEQAKTFLDKFNHEAEDLFYQSSLASWNYNTNITEENVQNMNN...,18,1,-5.79957


In [31]:
# # for debug
# foldx_results_df = foldx_df

In [32]:
# Compute Emin from foldx values
# For every score column in round_topk: look up the FoldX ddG of each row in a
# round's top-K, take the minimum ddG of that round, and average those minima
# over all rounds. Results go to Emin_results_dict[col]. Rows whose MT_seq has
# no FoldX result are collected in rows_to_patch (a DataFrame, or None when
# nothing is missing).
missing_rows = []
Emin_results_dict = {}
for col_to_sort in round_topk:
    print(col_to_sort)
    current_score_round_topk = round_topk[col_to_sort]

    round_min_list = []

    for round_ind in current_score_round_topk:
        round_topk_df = current_score_round_topk[round_ind]

        round_ddG = []
        for row_ind, row in round_topk_df.iterrows():
            row_seq = row['MT_seq']
            matched_row = foldx_results_df.loc[foldx_results_df['MT_seq'] == row_seq]
            if len(matched_row) != 1:
                # Diagnostic for sequences with zero or several FoldX entries.
                print("matched_row: ", matched_row)
            if len(matched_row) == 0:
                # Series.append returned a new object (and is gone in pandas
                # 2.0), so missing rows are gathered in a plain list instead.
                missing_rows.append(row)
                continue
            # Always store a scalar (first match) so np.min never receives a
            # mix of one-element Series and floats.
            round_ddG.append(float(matched_row['ddG'].iloc[0]))

        if not round_ddG:
            # np.min raises on an empty list; a round with no FoldX results
            # cannot contribute a minimum, so it is skipped and reported.
            print("no FoldX results for round: ", round_ind)
            continue

        round_min = np.min(round_ddG)
        round_min_list.append(round_min)

    Emin = np.mean(round_min_list)

    Emin_results_dict[col_to_sort] = Emin

# Keep the None sentinel when every sequence had a FoldX result.
rows_to_patch = pd.DataFrame(missing_rows) if missing_rows else None

disc_pred


In [33]:
# Show the rows that had no FoldX match in the cell above (None when there were none).
print(rows_to_patch)

None


# Save Emin Results

In [34]:
# Text file that receives the E[min] summary written in the next cell.
Emin_results_name = 'generated_seqs/mcmc_ACE/top12500input1Kiter_temp01_trustr18/discfiltered_tophalf-basegen_seqsforEmin_results.txt'

In [35]:
# Assemble the report first (header, the pickle the rounds were saved to, then
# one "name = value" line per score column in sorted order) and write it in a
# single call.
report_lines = [
    "***** E[min] results *****\n",
    "seqsforEmin_dict_name: {}\n".format(seqsforEmin_dict_name),
]
report_lines.extend(
    "%s = %s\n" % (score_name, str(Emin_results_dict[score_name]))
    for score_name in sorted(Emin_results_dict)
)
with open(Emin_results_name, "w") as writer:
    writer.writelines(report_lines)