# WITH PROLINE

In [None]:
# for protein structural modelling
from modeller import *
from modeller.automodel import *
from Bio.SVDSuperimposer import SVDSuperimposer
import numpy as np
import biovec
import pickle
import glob
from tqdm import tqdm
# from utils functions
from utils.encoder_decoder import *
from utils.sequence import *
from utils.reward import *
from utils.environment import *

# for deep learning
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

# for envronment creation
import gymnasium as gym
from gymnasium import Env
from gymnasium.spaces import MultiDiscrete
from gymnasium.spaces import Discrete
from gymnasium.spaces import Box

#for reading PDB files and processing them
from biopandas.pdb import PandasPdb
import pandas as pd
from utils.sequence import *

# for generating structures through esm instead of modeller
import esm
import biotite.structure as struc
import biotite.structure.io as strucio

# for general utility
import random
import os
import subprocess
import time
import matplotlib.pyplot as plt
from datetime import datetime
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams['figure.dpi'] = 200
plt.style.use('seaborn')
plt.rcParams.update({"text.usetex": True})

# WITHOUT PROLINE

In [1]:
# for protein structural modelling
from modeller import *
from modeller.automodel import *
from Bio.SVDSuperimposer import SVDSuperimposer
import numpy as np
import biovec
import pickle
import glob
from tqdm import tqdm
# from utils functions
from utils.encoder_decoder_no_proline import *
from utils.sequence import *
from utils.reward import *
from utils.environment_no_proline import *

# for deep learning
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

# for envronment creation
import gymnasium as gym
from gymnasium import Env
from gymnasium.spaces import MultiDiscrete
from gymnasium.spaces import Discrete
from gymnasium.spaces import Box

#for reading PDB files and processing them
from biopandas.pdb import PandasPdb
import pandas as pd
from utils.sequence import *

# for generating structures through esm instead of modeller
import esm
import biotite.structure as struc
import biotite.structure.io as strucio

# for general utility
import random
import os
import subprocess
import time
import matplotlib.pyplot as plt
from datetime import datetime
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams['figure.dpi'] = 200
plt.style.use('seaborn')
plt.rcParams.update({"text.usetex": True})

In [2]:
def plot_rewards(array_name,window_size):
    df = pd.DataFrame(array_name,columns=['reward'])
    rolling_average_reward = df['reward'].rolling(window=window_size).mean().dropna()
    plt.plot(rolling_average_reward)
    plt.ylabel(f'Rolling average of total rewards (window = {window_size})')
    plt.xlabel('Number of episodes')

In [3]:
class PolicyNetwork():
    def __init__(self, n_state, n_action, n_hidden=50,lr=0.001,entropy_weight=0.01):
        self.model = nn.Sequential(nn.Linear(n_state, n_hidden),
                                   nn.ReLU(),
                                   nn.Linear(n_hidden, n_action),
                                   nn.Softmax(dim=-1), )
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr)
        self.entropy_weight = entropy_weight
    def predict(self, s):
        return self.model(torch.Tensor(s))
    def update(self, returns, log_probs,entropies):
        policy_gradient = []
        for log_prob, Gt, entropy in zip(log_probs, returns, entropies):
            policy_gradient.append((-log_prob * Gt) + (self.entropy_weight * entropy))
        loss = torch.stack(policy_gradient).sum()
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        
    def get_action(self, s):

        probs = self.predict(s)
        action = torch.multinomial(probs, 1).item()
        log_prob = torch.log(probs[action])
        entropy = -torch.sum(probs * torch.log(probs + 1e-9))  # Calculate entropy
        return action, log_prob, entropy

In [4]:
# this is the evaluation model
def evaluate_model(estimator, env, num_episodes):
    total_reward_episode = [0] * num_episodes
    actions_taken_in_episodes = {}
    number_of_mutations_per_episode = [0] * num_episodes
    file_chosen_for_mutations = {}
    for episode in tqdm(range(num_episodes)):
        actions_taken_in_episodes[episode] = []
        file_chosen_for_mutations[episode] = [0]
        rewards = []
        
        state, info,dummy = env.reset()
        while True:
            action, log_prob, entropy = estimator.get_action(state)
            next_state, reward, terminated, truncated, info = env.step(action)
            total_reward_episode[episode] += reward
            rewards.append(reward)
            name_of_the_protein = os.path.basename(env.path_of_template_pdb_file),
            actions_taken_in_episodes[episode].append(info)
            number_of_mutations_per_episode[episode]+=1
            file_chosen_for_mutations[episode][0] = name_of_the_protein 
            if terminated or truncated:
                break
            state = next_state
#         print(f'{episode} episode done')
    return total_reward_episode,np.mean(total_reward_episode),np.std(total_reward_episode),actions_taken_in_episodes,number_of_mutations_per_episode,file_chosen_for_mutations

In [13]:
def validate_model(entropy,percentage,training_folder_name,validation_folder_path,validation_folder_code,use_proline = False):
    validation_folder_base_path = os.path.basename(validation_folder_path)
    n_hidden = 128
    lr = 0.0007
    gamma = 0.95
    entropic_factor = float(entropy)
    if use_proline == False:
#         saved_rl_model_1_lr_0.0007_gamma_0.95_ep_8000_entropic_factor_0.0_trained_on_tp_30_training_no_proline.pth
        saved_model_file_path = f'saved_models/saved_rl_model_1_lr_0.0007_gamma_0.95_ep_8000_entropic_factor_{entropy}_co_{percentage}_trained_on_{training_folder_name}_no_proline.pth'
        validation_structures_to_save_path = f'validation_structures/entropy_{entropy}_co_{percentage}_trained_on_{training_folder_name}_validated_on_{validation_folder_code}_no_proline'
        os.makedirs(validation_structures_to_save_path,exist_ok=True)
        env = PeptideEvolutionNoProline(folder_containing_pdb_files=validation_folder_path,
               structure_generator='esm_sse',
               validation=True,
               reward_cutoff=float(percentage),
               unique_path_to_give_for_file='valid',
               folder_to_save_validation_files=validation_structures_to_save_path)
        folder_for_saving_stuff = f'validation_results/entropy_{entropic_factor}_co_{percentage}_trained_on_{training_folder_name}_validated_on_{validation_folder_code}_no_proline'
        os.makedirs(folder_for_saving_stuff,exist_ok=True)
    if use_proline == True:
        saved_model_file_path = f'saved_models/saved_rl_model_1_lr_0.0007_gamma_0.95_ep_8000_entropic_factor_{entropy}_co_{percentage}_trained_on_{training_folder_name}.pth'
        validation_structures_to_save_path = f'validation_structures/entropy_{entropy}_co_{percentage}_trained_on_{training_folder_name}_validated_on_{validation_folder_code}'
        os.makedirs(validation_structures_to_save_path,exist_ok=True)
        env = PeptideEvolution(folder_containing_pdb_files=validation_folder_path,
               structure_generator='esm_sse',
               validation=True,
               reward_cutoff=float(percentage),
               unique_path_to_give_for_file='valid',
               folder_to_save_validation_files=validation_structures_to_save_path)
        folder_for_saving_stuff = f'validation_results/entropy_{entropic_factor}_co_{percentage}_trained_on_{training_folder_name}_validated_on_{validation_folder_code}'
        os.makedirs(folder_for_saving_stuff,exist_ok=True)
        
#     validation_structures_to_save_path = f'validation_structures/entropy_{entropy}_co_{percentage}_trained_on_{training_folder_name}_validated_on_{validation_folder_code}'



    n_state = env.observation_space.shape[0]
    n_action = env.action_space.n

    loaded_estimator = PolicyNetwork(n_state, n_action, n_hidden, lr, entropy_weight=entropic_factor)
    loaded_estimator.model.load_state_dict(torch.load(saved_model_file_path))
    total_reward_array,mean_validation_reward, standard_deviation,actions_taken, mutations_array, files_chosen_for_mutations = evaluate_model(estimator=loaded_estimator, env=env,num_episodes=500)
    folder_for_saving_stuff = f'validation_results/entropy_{entropic_factor}_co_{percentage}_trained_on_{training_folder_name}_validated_on_{validation_folder_code}'
    os.makedirs(folder_for_saving_stuff,exist_ok=True)
    with open(f'{folder_for_saving_stuff}/entropy_{entropic_factor}_co_{percentage}_mutations.pkl','wb') as file:
        pickle.dump(actions_taken,file)
    with open(f'{folder_for_saving_stuff}/entropy_{entropic_factor}_co_{percentage}_files.pkl','wb') as file:
        pickle.dump(files_chosen_for_mutations,file)
    
    
    



In [None]:
def validate_model_no_proline(entropy,percentage,training_folder_name,validation_folder_path,validation_folder_code):
    

In [15]:
entropy_no_proline = 0.0
percentage_no_proline = '50.0'
validate_model(entropy=entropy_no_proline,
              percentage=percentage_no_proline,
              training_folder_name='tp_30_training',
            validation_folder_path='../DrugResistance/folder_for_machine_learning/tp_30_validate/',
              validation_folder_code='tp_30_validate',
              use_proline=False)

100%|███████████████████████████████████████████████████████████████████████████████████████| 500/500 [57:43<00:00,  6.93s/it]


In [30]:
list_of_percentages = ['30.0','40.0','50.0']
list_of_entropies = ['0.0','0.001','0.0001','1e-05']

In [32]:
for entropy in list_of_entropies:
    for percentage in list_of_percentages:
        try:
            validate_model(entropy, 
                           percentage,
                           training_folder_name = 'membranome_30_training',
                           validation_folder_path='../DrugResistance/folder_for_machine_learning/tp_30_validate/',
                           validation_folder_code='tp_30_validate')
        except Exception as e:
            print(e)
            print(entropy,percentage)

100%|███████████████████████████████████████████████████████████████████████████████████████| 500/500 [48:54<00:00,  5.87s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████| 500/500 [37:20<00:00,  4.48s/it]


[Errno 2] No such file or directory: 'saved_models/saved_rl_model_1_lr_0.0007_gamma_0.95_ep_8000_entropic_factor_0.0_co_50.0_trained_on_membranome_30_training.pth'
0.0 50.0
[Errno 2] No such file or directory: 'saved_models/saved_rl_model_1_lr_0.0007_gamma_0.95_ep_8000_entropic_factor_0.001_co_30.0_trained_on_membranome_30_training.pth'
0.001 30.0
[Errno 2] No such file or directory: 'saved_models/saved_rl_model_1_lr_0.0007_gamma_0.95_ep_8000_entropic_factor_0.001_co_40.0_trained_on_membranome_30_training.pth'
0.001 40.0


100%|███████████████████████████████████████████████████████████████████████████████████████| 500/500 [28:57<00:00,  3.48s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████| 500/500 [41:48<00:00,  5.02s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████| 500/500 [42:41<00:00,  5.12s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████| 500/500 [29:37<00:00,  3.56s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████| 500/500 [44:21<00:00,  5.32s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████| 500/500 [31:28<00:00,  3.78s/it]

[Errno 2] No such file or directory: 'saved_models/saved_rl_model_1_lr_0.0007_gamma_0.95_ep_8000_entropic_factor_1e-05_co_50.0_trained_on_membranome_30_training.pth'
1e-05 50.0





In [34]:
validate_model(entropy='0.001', 
               percentage='40.0',
               training_folder_name = 'membranome_30_training',
               validation_folder_path='../DrugResistance/folder_for_machine_learning/tp_30_validate/',
               validation_folder_code='tp_30_validate')

100%|███████████████████████████████████████████████████████████████████████████████████████| 500/500 [39:42<00:00,  4.76s/it]


In [38]:
validate_model(entropy='0.001', 
               percentage='30.0',
               training_folder_name = 'membranome_30_training',
               validation_folder_path='../DrugResistance/folder_for_machine_learning/tp_30_validate/',
               validation_folder_code='tp_30_validate')

100%|███████████████████████████████████████████████████████████████████████████████████████| 500/500 [47:54<00:00,  5.75s/it]


In [40]:
validate_model(entropy='0.0', 
               percentage='50.0',
               training_folder_name = 'membranome_30_training',
               validation_folder_path='../DrugResistance/folder_for_machine_learning/tp_30_validate/',
               validation_folder_code='tp_30_validate')

100%|███████████████████████████████████████████████████████████████████████████████████████| 500/500 [33:37<00:00,  4.04s/it]


In [41]:
validate_model(entropy='1e-05', 
               percentage='50.0',
               training_folder_name = 'membranome_30_training',
               validation_folder_path='../DrugResistance/folder_for_machine_learning/tp_30_validate/',
               validation_folder_code='tp_30_validate')

100%|███████████████████████████████████████████████████████████████████████████████████████| 500/500 [33:38<00:00,  4.04s/it]


In [46]:
for files in glob.glob('validation_structures/entropy*/*.pdb'):
    if percentage_of_b(get_structural_annotations(files)) >40 :
        if 'membranome' not in files:
            print(files,percentage_of_b(get_structural_annotations(files)))

validation_structures/entropy_0.0_co_30.0/3mde_A_1_292_321_2024-01-22 12:40:46.pdb 46.666666666666664
validation_structures/entropy_0.0_co_50.0/2h89_C_1_80_109_2024-01-22 13:35:16.pdb 43.333333333333336
validation_structures/entropy_0.001_co_30.0/3ayg_A_1_434_463_2024-01-21 00:40:46.pdb 50.0
validation_structures/entropy_0.001_co_30.0/4kgj_A_1_145_174_2024-01-21 00:32:56.pdb 43.333333333333336
validation_structures/entropy_0.001_co_30.0/2ycz_A_1_75_104_2024-01-21 00:43:49.pdb 43.333333333333336
validation_structures/entropy_0.0001_co_30.0/4il6_D_1_108_137_2024-01-21 02:20:30.pdb 60.0
validation_structures/entropy_0.0001_co_30.0/4il6_D_1_108_137_2024-01-21 02:45:08.pdb 50.0
validation_structures/entropy_0.0001_co_50.0/4o9r_A_1_312_341_2024-01-21 03:25:57.pdb 66.66666666666666
validation_structures/entropy_1e-05_co_30.0/1zrt_C_1_89_118_2024-01-21 04:10:48.pdb 43.333333333333336
validation_structures/entropy_0.001_co_50.0/4il6_D_1_108_137_2024-01-21 01:56:17.pdb 50.0


In [17]:
env = PeptideEvolution(folder_containing_pdb_files='../DrugResistance/folder_for_machine_learning/tp_30_validate/',
                       structure_generator='esm_sse',
                       validation=True,
                       reward_cutoff=40,
                       unique_path_to_give_for_file='valid',
                       folder_to_save_validation_files='validation_structures')

In [18]:
n_state = env.observation_space.shape[0]
n_action = env.action_space.n
n_hidden = 128
lr = 0.001
gamma = 0.95
entropic_factor = 0.0

In [19]:
loaded_estimator = PolicyNetwork(n_state, n_action, n_hidden, lr, entropy_weight=entropic_factor)
loaded_estimator.model.load_state_dict(torch.load('saved_models/saved_rl_model_1_lr_0.001_gamma_0.95_ep_5000_entropic_factor_0.0_trained_on_29_usable.pth'))


<All keys matched successfully>

In [20]:
total_reward_array,mean_validation_reward, standard_deviation,actions_taken, mutations_array, files_chosen_for_mutations = evaluate_model(estimator=loaded_estimator, env=env,num_episodes=500)


100%|███████████████████████████████████████████████████████████████████████████████████████| 500/500 [40:10<00:00,  4.82s/it]


In [21]:
with open(f'validation_results/0.0.pkl','wb') as file:
    pickle.dump(actions_taken,file)

In [54]:
files_chosen_for_mutations

[('Q54GX9_DICDI.pdb',),
 ('Q54GX9_DICDI.pdb',),
 ('Q54GX9_DICDI.pdb',),
 ('Q54GX9_DICDI.pdb',),
 ('Q54GX9_DICDI.pdb',),
 ('Q54GX9_DICDI.pdb',),
 ('Q54GX9_DICDI.pdb',),
 ('Q54GX9_DICDI.pdb',),
 ('CHSTC_HUMAN.pdb',),
 ('CHSTC_HUMAN.pdb',),
 ('CHSTC_HUMAN.pdb',),
 ('CHSTC_HUMAN.pdb',),
 ('CHSTC_HUMAN.pdb',),
 ('CHSTC_HUMAN.pdb',),
 ('CHSTC_HUMAN.pdb',),
 ('SIGL9_HUMAN.pdb',),
 ('SIGL9_HUMAN.pdb',),
 ('SIGL9_HUMAN.pdb',),
 ('SIGL9_HUMAN.pdb',),
 ('CHSTC_HUMAN.pdb',),
 ('CHSTC_HUMAN.pdb',),
 ('CHSTC_HUMAN.pdb',),
 ('CHSTC_HUMAN.pdb',),
 ('CHSTC_HUMAN.pdb',),
 ('CHSTC_HUMAN.pdb',),
 ('CHSTC_HUMAN.pdb',),
 ('CHSTC_HUMAN.pdb',),
 ('Q54BE6_DICDI.pdb',),
 ('Q54BE6_DICDI.pdb',),
 ('Q54BE6_DICDI.pdb',),
 ('Q54BE6_DICDI.pdb',),
 ('Q54BE6_DICDI.pdb',),
 ('Q54BE6_DICDI.pdb',),
 ('Q54BE6_DICDI.pdb',),
 ('MBHT_ECOLI.pdb',),
 ('MBHT_ECOLI.pdb',),
 ('BZP49_ARATH.pdb',),
 ('BZP49_ARATH.pdb',),
 ('BZP49_ARATH.pdb',),
 ('Y2923_ARATH.pdb',),
 ('Y2923_ARATH.pdb',),
 ('Y2923_ARATH.pdb',),
 ('Y2923_ARATH.pdb',)

In [38]:
pd.concat([pd.DataFrame(actions_taken[i],columns=['position','amino_acid']) for i in actions_taken.keys()]).value_counts()

position  amino_acid
12        D             736
14        P             547
6         P             411
19        P             386
7         D             358
                       ... 
13        G               1
1         R               1
16        S               1
          T               1
15        D               1
Length: 63, dtype: int64

In [42]:
pd.DataFrame([actions_taken[i][-1] for i in actions_taken.keys()],columns=['position','amino_acid'])['amino_acid'].value_counts()

P    327
D    115
N     32
K     12
E      7
H      3
S      1
Y      1
W      1
V      1
Name: amino_acid, dtype: int64

In [55]:
for file in glob.glob('validation_structures/*.pdb'):
    if percentage_of_b(get_structural_annotations(file))>30:
        print(file,percentage_of_b(get_structural_annotations(file)))

validation_structures/1sb3_A_1_524_553_2024-01-09 19:57:28.pdb 33.33333333333333
validation_structures/2ycz_A_1_75_104_2024-01-09 19:58:59.pdb 31.03448275862069
validation_structures/RAE1E_HUMAN_2024-01-19 17:25:07.pdb 62.06896551724138
validation_structures/4q65_A_1_452_481_2024-01-12 19:19:37.pdb 40.0
validation_structures/2ash_C_1_320_349_2024-01-12 20:20:08.pdb 56.666666666666664
validation_structures/CASP_ARATH_2024-01-19 17:12:09.pdb 41.37931034482759
validation_structures/SINE3_ARATH_2024-01-19 17:41:17.pdb 31.03448275862069
validation_structures/4c5u_A_1_321_350_2024-01-12 21:13:47.pdb 43.333333333333336
validation_structures/CTSRD_HUMAN_2024-01-19 17:35:18.pdb 62.06896551724138
validation_structures/NDUA1_ARATH_2024-01-19 17:16:19.pdb 31.03448275862069
validation_structures/COX41_HUMAN_2024-01-19 17:26:27.pdb 37.93103448275862
validation_structures/4xnj_A_1_449_478_2024-01-12 20:35:58.pdb 36.666666666666664
validation_structures/STX1A_HUMAN_2024-01-19 17:02:35.pdb 31.034482758

In [None]:
with open(f'validation_results/{entropy}.pkl','wb') as file:
    pickle.dump(actions_taken,file)

In [7]:
given_entropies = ['0.01','0.001','0.0001','1e-05']

In [8]:
mutations_array_for_all_entropies = []

In [9]:
for entropy in given_entropies:
    saved_model_file_path = f'saved_models/saved_rl_model_1_lr_0.0007_gamma_0.95_ep_8000_entropic_factor_{entropy}_co_30.0_trained_on_tp_30_training.pth'
    loaded_estimator_each_entropy = PolicyNetwork(n_state, n_action, n_hidden, lr, entropy_weight=entropy)
    loaded_estimator_each_entropy.model.load_state_dict(torch.load(saved_model_file_path))
    total_reward_array,mean_validation_reward, standard_deviation,actions_taken, mutations_array = evaluate_model(estimator=loaded_estimator_each_entropy, env=env,num_episodes=500)
    with open(f'validation_results/{entropy}.pkl','wb') as file:
        pickle.dump(actions_taken,file)
    mutations_array_for_all_entropies.append([entropy,mutations_array])

100%|███████████████████████████████████████████████████████████████████████████████████████| 500/500 [42:36<00:00,  5.11s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████| 500/500 [41:05<00:00,  4.93s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████| 500/500 [37:16<00:00,  4.47s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████| 500/500 [33:23<00:00,  4.01s/it]


In [3]:
def validate_entropy(entropy_file_path):
    saved_model_file_path = entropy_file_path
    entropy_base_path = os.path.basename(entropy_file_path).split('.')[0]
    loaded_estimator_each_entropy = PolicyNetwork(n_state, n_action, n_hidden, lr, entropy_weight=entropy)
    loaded_estimator_each_entropy.model.load_state_dict(torch.load(saved_model_file_path))
    total_reward_array,mean_validation_reward, standard_deviation,actions_taken, mutations_array = evaluate_model(estimator=loaded_estimator_each_entropy, env=env,num_episodes=500)
    with open(f'validation_results/{entropy_base_path}.pkl','wb') as file:
        pickle.dump(actions_taken,file)
    mutations_array_for_all_entropies.append([entropy,mutations_array])

In [14]:
mutations_array_for_all_entropies
with open('no_of_mutations_array_all_entropies.pkl', 'wb') as file:
    pickle.dump(mutations_array_for_all_entropies, file)

In [37]:
with open('delete.pkl', 'wb') as file:
    pickle.dump(actions_taken, file)

In [1]:
with open('no_of_mutations_array_all_entropies.pkl','rb') as file:
    loaded_dict = pickle.load(file)

NameError: name 'pickle' is not defined

In [16]:
loaded_dict

[['0.01',
  [6,
   5,
   2,
   8,
   3,
   9,
   15,
   8,
   2,
   6,
   2,
   5,
   5,
   7,
   15,
   6,
   7,
   5,
   7,
   5,
   13,
   3,
   2,
   4,
   3,
   2,
   3,
   15,
   4,
   5,
   3,
   7,
   3,
   7,
   15,
   3,
   4,
   11,
   3,
   4,
   11,
   15,
   15,
   6,
   2,
   15,
   6,
   15,
   4,
   7,
   15,
   5,
   2,
   2,
   5,
   11,
   7,
   5,
   5,
   9,
   5,
   5,
   2,
   2,
   5,
   2,
   2,
   9,
   3,
   5,
   3,
   15,
   2,
   4,
   15,
   15,
   5,
   15,
   15,
   15,
   9,
   3,
   7,
   3,
   14,
   15,
   2,
   15,
   2,
   2,
   5,
   4,
   2,
   3,
   9,
   2,
   4,
   15,
   6,
   2,
   12,
   8,
   4,
   2,
   4,
   2,
   7,
   6,
   4,
   2,
   5,
   5,
   6,
   15,
   15,
   7,
   4,
   15,
   9,
   6,
   7,
   15,
   6,
   5,
   8,
   6,
   3,
   4,
   3,
   4,
   7,
   15,
   8,
   5,
   8,
   6,
   2,
   2,
   10,
   3,
   4,
   7,
   5,
   3,
   15,
   7,
   15,
   7,
   6,
   6,
   9,
   15,
   14,
   6,
   2,
   5,
   3,
   15,
   15,


In [34]:
actions_taken

{0: [[18, 'D'],
  [23, 'P'],
  [10, 'P'],
  [12, 'P'],
  [23, 'P'],
  [12, 'P'],
  [18, 'D'],
  [12, 'P'],
  [12, 'P'],
  [12, 'P'],
  [24, 'F'],
  [25, 'P']],
 1: [[25, 'P'], [10, 'P']],
 2: [[12, 'P'],
  [10, 'P'],
  [10, 'P'],
  [10, 'P'],
  [25, 'P'],
  [25, 'P'],
  [10, 'D'],
  [22, 'T'],
  [10, 'P'],
  [18, 'D'],
  [26, 'P'],
  [23, 'P'],
  [12, 'P'],
  [18, 'D'],
  [23, 'P']],
 3: [[23, 'P'],
  [25, 'P'],
  [25, 'P'],
  [12, 'P'],
  [10, 'P'],
  [25, 'P'],
  [23, 'P'],
  [10, 'P'],
  [25, 'P'],
  [10, 'P'],
  [10, 'P'],
  [23, 'P'],
  [23, 'P'],
  [23, 'P'],
  [23, 'P']],
 4: [[10, 'P'], [18, 'D'], [25, 'P']],
 5: [[10, 'P'],
  [18, 'D'],
  [23, 'P'],
  [10, 'P'],
  [12, 'P'],
  [12, 'P'],
  [25, 'P']],
 6: [[25, 'P'], [11, 'M'], [12, 'P'], [18, 'D'], [10, 'P']],
 7: [[12, 'P'],
  [23, 'P'],
  [12, 'P'],
  [23, 'P'],
  [18, 'D'],
  [18, 'D'],
  [23, 'P'],
  [10, 'P']],
 8: [[25, 'P'], [18, 'D'], [18, 'D'], [10, 'P']],
 9: [[12, 'P'], [23, 'P']],
 10: [[18, 'D'],
  [12, 'P'],
  [

In [25]:
df_of_actions = pd.concat([pd.DataFrame(actions_taken[i],columns=['position','amino_acid']) for i in actions_taken.keys()])

In [None]:
with open('delete.pl')

In [35]:
actions_taken

{0: [[18, 'D'],
  [23, 'P'],
  [10, 'P'],
  [12, 'P'],
  [23, 'P'],
  [12, 'P'],
  [18, 'D'],
  [12, 'P'],
  [12, 'P'],
  [12, 'P'],
  [24, 'F'],
  [25, 'P']],
 1: [[25, 'P'], [10, 'P']],
 2: [[12, 'P'],
  [10, 'P'],
  [10, 'P'],
  [10, 'P'],
  [25, 'P'],
  [25, 'P'],
  [10, 'D'],
  [22, 'T'],
  [10, 'P'],
  [18, 'D'],
  [26, 'P'],
  [23, 'P'],
  [12, 'P'],
  [18, 'D'],
  [23, 'P']],
 3: [[23, 'P'],
  [25, 'P'],
  [25, 'P'],
  [12, 'P'],
  [10, 'P'],
  [25, 'P'],
  [23, 'P'],
  [10, 'P'],
  [25, 'P'],
  [10, 'P'],
  [10, 'P'],
  [23, 'P'],
  [23, 'P'],
  [23, 'P'],
  [23, 'P']],
 4: [[10, 'P'], [18, 'D'], [25, 'P']],
 5: [[10, 'P'],
  [18, 'D'],
  [23, 'P'],
  [10, 'P'],
  [12, 'P'],
  [12, 'P'],
  [25, 'P']],
 6: [[25, 'P'], [11, 'M'], [12, 'P'], [18, 'D'], [10, 'P']],
 7: [[12, 'P'],
  [23, 'P'],
  [12, 'P'],
  [23, 'P'],
  [18, 'D'],
  [18, 'D'],
  [23, 'P'],
  [10, 'P']],
 8: [[25, 'P'], [18, 'D'], [18, 'D'], [10, 'P']],
 9: [[12, 'P'], [23, 'P']],
 10: [[18, 'D'],
  [12, 'P'],
  [

In [32]:
df_of_actions

Unnamed: 0,position,amino_acid
0,18,D
1,23,P
2,10,P
3,12,P
4,23,P
...,...,...
0,25,P
1,25,P
2,12,P
3,10,P


In [31]:
df_of_actions

Unnamed: 0,position,amino_acid
0,18,D
1,23,P
2,10,P
3,12,P
4,23,P
...,...,...
0,25,P
1,25,P
2,12,P
3,10,P


In [13]:
df_of_actions

Unnamed: 0,position,amino_acid
0,18,D
1,23,P
2,10,P
3,12,P
4,23,P
...,...,...
0,25,P
1,25,P
2,12,P
3,10,P


In [10]:
pd.DataFrame(df_of_actions['amino_acid'].value_counts()).reset_index().rename(columns={'index':'amino_acid','amino_acid':'frequency'})

Unnamed: 0,amino_acid,frequency
0,P,1360
1,D,282
2,T,17
3,G,13
4,V,4
5,F,3
6,Y,3
7,L,2
8,N,2
9,E,2


In [9]:
list_of_beta_percentages = []
for file in glob.glob('validation_structures/entropy_*/*.pdb'):
    beta_percentage = percentage_of_b(get_structural_annotations(file))
    list_of_beta_percentages.append([file,beta_percentage])

In [11]:
df_beta = pd.DataFrame(list_of_beta_percentages,columns=['path','percentage'])

In [28]:
df_beta[df_beta['percentage']>40].sort_values(by='percentage')

Unnamed: 0,path,percentage
1056,validation_structures/entropy_0.0_co_50.0/2h89...,43.333333
1857,validation_structures/entropy_0.001_co_30.0/4k...,43.333333
1977,validation_structures/entropy_0.001_co_30.0/2y...,43.333333
4913,validation_structures/entropy_1e-05_co_30.0/1z...,43.333333
580,validation_structures/entropy_0.0_co_30.0/3mde...,46.666667
1677,validation_structures/entropy_0.001_co_30.0/3a...,50.0
2739,validation_structures/entropy_0.0001_co_30.0/4...,50.0
5395,validation_structures/entropy_0.001_co_50.0/4i...,50.0
2670,validation_structures/entropy_0.0001_co_30.0/4...,60.0
3604,validation_structures/entropy_0.0001_co_50.0/4...,66.666667


In [22]:
df_beta.iloc[1677]['path']

'validation_structures/entropy_0.001_co_30.0/3ayg_A_1_434_463_2024-01-21 00:40:46.pdb'