In [None]:
from protein_engineering.pipeline import AADataset
from torch_geometric.loader import DataLoader as geom_DataLoader
from protein_engineering.archive.test import GB1Olson 

# Data loading
# Build a dataset for one structure and wrap it in a PyTorch Geometric loader.
data = GB1Olson()  # its `.data` attribute is used as a DataFrame with a 'variant' column in a later cell
protein = '2GI9'  # presumably a PDB identifier — TODO confirm
# NOTE(review): a later cell calls AADataset(gym_dataset, df) instead; confirm this
# (protein, code=protein) call form is still supported by the pipeline.
dataset = AADataset(protein, code=protein)
dataloader = geom_DataLoader(dataset)  # no loader options given, so library defaults apply


In [None]:
"""
/Users/antoniaboca/partIII-amino-acid-prediction/data/ProteinGym_structures_clean/1IGS.pdb : TRPC_SACS2_Chan_2017
/Users/antoniaboca/partIII-amino-acid-prediction/data/ProteinGym_structures_clean/6LQA.pdb : SCN5A_HUMAN_Glazer_2019
"""

In [None]:
import torch
from gvp_antonia.main import ModelWrapper
from protein_engineering.pipeline import Pair, uncertainty_search

# Model loading
# NOTE(review): absolute, machine-specific checkpoint path — the cell only runs on this machine as written.
MODEL_PATH = '/Users/antoniaboca/partIII-amino-acid-prediction/data/eqgat_debug.ckpt'
# The positional arguments are passed to ModelWrapper as-is; their meaning is defined in
# gvp_antonia.main (1e-3 is presumably a learning rate and 0.0 a dropout rate — TODO confirm).
best_model = ModelWrapper('eqgat', 1e-3, None, 0.0, n_layers=5)
# Deserialize the checkpoint onto the CPU and restore only its 'state_dict' entry.
# NOTE(review): torch.load unpickles the file — only load checkpoints from a trusted source.
best_model.load_state_dict(torch.load(MODEL_PATH, map_location=torch.device('cpu'))['state_dict'])
    

In [None]:
# Run the uncertainty search over `dataloader` with k=4 and locs=2.
# NOTE(review): the meaning of `k` and `locs` is defined in protein_engineering.pipeline — confirm before tuning.
uncertainty_search(best_model, dataloader, k=4, locs=2)

In [None]:
import pandas as pd
# Copy the dataset's frame and cast 'variant' to str so it can be compared with the codes read from CSV.
new_df = data.data.copy()
new_df['variant'] = new_df['variant'].astype('str')

# Mutation codes to look up; the CSV is resolved relative to the current working directory.
mutations = pd.read_csv('mutations.csv')
codes = mutations['mutation_codes']
# Display the rows of the original (uncast) frame whose variant is listed in `codes`;
# the boolean mask is computed on the string-typed copy.
data.data[new_df['variant'].isin(codes)]

In [10]:
from pathlib import Path
import pandas as pd
from biopandas.pdb import PandasPdb

# Scan the structure-mapping table and report, for each entry, the first available
# structure file that contains exactly one chain.
base_path = '/Users/antoniaboca/partIII-amino-acid-prediction/data/ProteinGym_structures_clean'

# Load the mapping here: `df` was previously only defined in a later cell, so this
# cell failed with a NameError on a fresh kernel (Restart & Run All).
# The table has 'identifiers' (comma-separated structure ids) and 'name' columns.
df = pd.read_csv('/Users/antoniaboca/partIII-amino-acid-prediction/data/mapping.csv')

monomers = 0  # number of mapping rows for which a single-chain structure was found
for index, row in df.iterrows():
    structs = row['identifiers'].split(',')
    for struct in structs:
        file = Path(f'{base_path}/{struct}.pdb')
        if file.exists():
            # Only ATOM records are inspected; chains are told apart by 'chain_id'.
            pdb = PandasPdb().read_pdb(str(file)).df['ATOM']
            if pdb['chain_id'].nunique() == 1:
                print(f'{file} : {row["name"]}')
                monomers += 1
                # One single-chain structure per row is enough; skip the remaining identifiers.
                break


/Users/antoniaboca/partIII-amino-acid-prediction/data/ProteinGym_structures_clean/1IGS.pdb : TRPC_SACS2_Chan_2017
/Users/antoniaboca/partIII-amino-acid-prediction/data/ProteinGym_structures_clean/6LQA.pdb : SCN5A_HUMAN_Glazer_2019
/Users/antoniaboca/partIII-amino-acid-prediction/data/ProteinGym_structures_clean/8F2I.pdb : P53_HUMAN_Giacomelli_NULL_Nutlin_2018
/Users/antoniaboca/partIII-amino-acid-prediction/data/ProteinGym_structures_clean/6BVC.pdb : AACC1_PSEAI_Dandage_2018
/Users/antoniaboca/partIII-amino-acid-prediction/data/ProteinGym_structures_clean/3VUB.pdb : CCDB_ECOLI_Adkar_2012
/Users/antoniaboca/partIII-amino-acid-prediction/data/ProteinGym_structures_clean/8F2I.pdb : P53_HUMAN_Giacomelli_NULL_Nutlin_2018
/Users/antoniaboca/partIII-amino-acid-prediction/data/ProteinGym_structures_clean/5WDX.pdb : POLG_HCVJF_Qi_2014
/Users/antoniaboca/partIII-amino-acid-prediction/data/ProteinGym_structures_clean/2V01.pdb : CALM1_HUMAN_Weile_2017
/Users/antoniaboca/partIII-amino-acid-predicti

In [10]:
# ProteinGym substitution CSV that a later cell joins onto `base_path` to build the dataset.
# NOTE(review): the name suggests the assay's structure is a homomultimer — confirm against the structure file.
homomultimer = 'HIS7_YEAST_Pokusaeva_2019.csv'

In [11]:
from protein_engineering.protein_gym import ProteinGymDataset
import pandas as pd
from pathlib import Path
from protein_engineering.pipeline import AADataset

# NOTE(review): `file` is not used by any code visible in this notebook; the dataset below
# is built from `homomultimer`, which must already be defined by another cell.
file = 'SCN5A_HUMAN_Glazer_2019.csv'
# Rebinds `base_path` (another cell points it at the structures directory instead).
base_path = '/Users/antoniaboca/partIII-amino-acid-prediction/data/ProteinGym_substitutions'

# Mapping table; other cells read its 'identifiers' and 'name' columns.
df = pd.read_csv('/Users/antoniaboca/partIII-amino-acid-prediction/data/mapping.csv')
# Load the substitution data for the assay named by `homomultimer`, then build the dataset
# from it together with the mapping table.
gym_dataset = ProteinGymDataset(Path(f'{base_path}/{homomultimer}'))
dataset = AADataset(gym_dataset, df)

It seems the fitness data is not log transformed.
No wildtype data found. Inferring wildtype data.
/Users/antoniaboca/partIII-amino-acid-prediction/data/ProteinGym_structures_clean/6EZM.pdb
Detected heteromeric assembly. Skipping for now. Are you sure you are using AlphaFold?


In [2]:
from torch_geometric.loader import DataLoader as geom_DataLoader

# Wrap the current `dataset` in a PyTorch Geometric loader; no options given, so library defaults apply.
loader = geom_DataLoader(dataset)

In [3]:
import torch
from gvp_antonia.main import ModelWrapper
from protein_engineering.pipeline import uncertainty_search

# Model loading
# NOTE(review): this cell repeats an earlier model-loading cell verbatim; the path is absolute and machine-specific.
MODEL_PATH = '/Users/antoniaboca/partIII-amino-acid-prediction/data/eqgat_debug.ckpt'
# Positional arguments are forwarded unchanged; their meaning is defined in gvp_antonia.main — TODO confirm.
best_model = ModelWrapper('eqgat', 1e-3, None, 0.0, n_layers=5)
# Deserialize the checkpoint onto the CPU and restore only its 'state_dict' entry.
# NOTE(review): torch.load unpickles the file — only load checkpoints from a trusted source.
best_model.load_state_dict(torch.load(MODEL_PATH, map_location=torch.device('cpu'))['state_dict'])
    

Using Cross Product
With Feature Attention


<All keys matched successfully>

In [4]:
# Run the uncertainty search over `loader` using the function's default settings (no k/locs passed here).
uncertainty_search(best_model, loader)

1it [00:02,  2.33s/it]

tensor([[ 0.4125, -0.2038, -0.5431, -0.4071,  0.9860, -0.0642, -0.1356, -1.1135,
          1.5418,  0.1272,  0.3535, -0.6924,  0.9228, -0.0910, -0.8472,  0.6529,
          0.2967, -0.1092, -0.4290,  0.0823]])
tensor([[8]])


2it [00:04,  2.19s/it]

tensor([[-1.5311, -3.3717, -2.9112, -2.4956,  4.8571, -2.5910, -2.7655, -2.7414,
         -0.1756, -3.4286, -1.4761, -3.7741, -0.2533, -2.4268, -4.9171, -1.0681,
         -2.2037, -2.2754, -2.9614, -3.9457]])
tensor([[4]])


3it [00:06,  2.12s/it]

tensor([[-1.3665, -1.4528, -1.5599, -2.0405,  3.2129, -1.5341, -0.8675, -2.8573,
          0.8822, -0.6912,  1.0495, -2.0805,  1.0369, -0.4792, -1.1437, -0.7933,
         -0.9320, -1.1999, -1.3370, -1.3581]])
tensor([[4]])


4it [00:08,  2.08s/it]

tensor([[-1.2441, -1.4707, -1.6421, -1.4034,  3.3948, -1.2156, -0.9157, -3.5730,
          1.1012, -1.9598,  0.7000, -1.9865,  0.5431, -0.9983, -1.7451, -0.2817,
         -0.8877, -0.9191, -1.5175, -2.3797]])
tensor([[4]])


5it [00:10,  2.09s/it]

tensor([[-1.3384, -2.3301, -2.0038, -1.5211,  4.3319, -1.3846, -1.3358, -3.4052,
          0.5777, -3.2017, -0.3482, -2.6666,  0.2039, -1.1787, -2.4520, -0.5764,
         -2.1993, -1.2948, -1.5960, -3.4225]])
tensor([[4]])


5it [00:10,  2.20s/it]


KeyboardInterrupt: 

In [9]:
# Load the mutation table from the current working directory.
# This rebinds `mutations`, which an earlier cell filled from 'mutations.csv'.
mutations = pd.read_csv('mutations_all.csv')

In [10]:
# Flag rows whose mutation code starts and ends with the same character
# (e.g. a code like 'A12A', where the residue letter is unchanged).
mask = mutations['mutation_code'].str.get(0).eq(mutations['mutation_code'].str.get(-1))

In [11]:
# Show only the rows where the first and last character of the mutation code differ.
mutations[~mask]

Unnamed: 0,wildtype,name,mutation_position,mutation_code,mutation_confidence,rank
75,MPRYLKGWLKDVVQLSLRRPSFRASRQRPIISLNERILEFNKRNIT...,TRPC_SACS2_Chan_2017,107,I107V,0.42251700162887573,76
77,MPRYLKGWLKDVVQLSLRRPSFRASRQRPIISLNERILEFNKRNIT...,TRPC_SACS2_Chan_2017,105,I105V,0.421470582485199,78
78,MPRYLKGWLKDVVQLSLRRPSFRASRQRPIISLNERILEFNKRNIT...,TRPC_SACS2_Chan_2017,65,D65N,0.4188465476036072,79
79,MPRYLKGWLKDVVQLSLRRPSFRASRQRPIISLNERILEFNKRNIT...,TRPC_SACS2_Chan_2017,183,D183N,0.4133667051792145,80
83,MPRYLKGWLKDVVQLSLRRPSFRASRQRPIISLNERILEFNKRNIT...,TRPC_SACS2_Chan_2017,208,V208I,0.40425795316696167,84
...,...,...,...,...,...,...
7137,MPRYLKGWLKDVVQLSLRRPSFRASRQRPIISLNERILEFNKRNIT...,TRPC_SACS2_Chan_2017,173,G173V,8.53897286212657e-10,2376
7138,MPRYLKGWLKDVVQLSLRRPSFRASRQRPIISLNERILEFNKRNIT...,TRPC_SACS2_Chan_2017,226,G226P,3.8456759998695134e-10,2377
7139,MPRYLKGWLKDVVQLSLRRPSFRASRQRPIISLNERILEFNKRNIT...,TRPC_SACS2_Chan_2017,153,G153P,1.0999452737525672e-10,2378
7140,MPRYLKGWLKDVVQLSLRRPSFRASRQRPIISLNERILEFNKRNIT...,TRPC_SACS2_Chan_2017,59,G59P,2.3683345215519402e-11,2379


In [24]:
# Scratch check: dropping the first two and last two characters of the identifier leaves 'P06633'.
# NOTE(review): presumably 'AF'/'F1' are the AlphaFold prefix and fragment suffix — confirm.
'AFP06633F1'[2:-2]

'P06633'

In [7]:
# Look up the mapping row(s) whose comma-separated identifier list mentions structure 6EZM.
df[df['identifiers'].str.contains('6EZM')]

Unnamed: 0,wildtype,identifiers,name
0,MTEQKALVKRITNETKIQIAISLKGGPLAIEHSIFPEKEAEAVAEQ...,"6EZM,AFP06633F1",HIS7_YEAST_Pokusaeva_2019


In [14]:
# Display the path built from the current `base_path` and `homomultimer`; both are set in
# other cells, so the result depends on which of those cells ran last.
f'{base_path}/{homomultimer}'

'/Users/antoniaboca/partIII-amino-acid-prediction/data/ProteinGym_substitutions/HIS7_YEAST_Pokusaeva_2019.csv'

In [18]:
from Bio import PDB

# Parse the 6EZM structure file with Biopython; "example" is the id given to the parsed structure.
parser = PDB.PDBParser()
structure = parser.get_structure(
    "example",
    '/Users/antoniaboca/partIII-amino-acid-prediction/data/ProteinGym_structures_clean/6EZM.pdb',
)

# Iterator over the structure's chains; later cells consume it with next().
chains = structure.get_chains()



In [19]:
# Pull the next chain off the `chains` iterator; every run of this cell advances it by one chain.
chain = next(chains)

In [23]:
# Show the chain's repr string (includes its chain id).
repr(chain)

'<Chain id=U>'

In [None]:
# Classify the parsed `structure` by comparing the residue sequences of its chains.
#
# `structure.get_chains()` returns a one-shot iterator, so it cannot be indexed or
# passed to len(), and the `chains` iterator may already have been advanced by next().
# Build a fresh list from the structure instead.
all_chains = list(structure.get_chains())

# Bio.PDB chains have no get_sequence(); represent each chain as the tuple of its
# residue names. Residues with a non-blank hetero flag (waters, ligands) are skipped
# so they do not make otherwise identical chains look different.
chain_sequences = [
    tuple(residue.get_resname() for residue in chain if residue.id[0] == ' ')
    for chain in all_chains
]

seq1 = chain_sequences[0] if chain_sequences else None
all_same = all(sequence == seq1 for sequence in chain_sequences)

if not chain_sequences:
    print('No chains found')
elif len(chain_sequences) == 1:
    # A single chain is a monomer (matches the `monomers` counter used in the scan cell).
    print('Monomer')
elif all_same:
    print('Homomultimer')
else:
    print('Heteromultimer')

In [25]:
from biopandas.pdb import PandasPdb

# Read the ATOM records of 6EZM.pdb into a DataFrame via biopandas (rebinds `pdb`).
pdb = PandasPdb().read_pdb('/Users/antoniaboca/partIII-amino-acid-prediction/data/ProteinGym_structures_clean/6EZM.pdb').df['ATOM']

In [26]:
# Display the ATOM-record frame (one row per atom, with a 'chain_id' column).
pdb

Unnamed: 0,record_name,atom_number,blank_1,atom_name,alt_loc,residue_name,blank_2,chain_id,residue_number,insertion,...,x_coord,y_coord,z_coord,occupancy,b_factor,blank_4,segment_id,element_symbol,charge,line_idx
0,ATOM,1,,N,,MET,,U,1,,...,43.021,118.385,51.809,1.0,0.0,,,N,,2
1,ATOM,2,,CA,,MET,,U,1,,...,44.223,117.574,51.681,1.0,0.0,,,C,,3
2,ATOM,3,,CB,,MET,,U,1,,...,45.341,118.263,50.938,1.0,0.0,,,C,,4
3,ATOM,4,,CG,,MET,,U,1,,...,45.228,119.493,50.039,1.0,0.0,,,C,,5
4,ATOM,5,,SD,,MET,,U,1,,...,46.859,120.044,49.445,1.0,0.0,,,S,,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40171,ATOM,40195,,SD,,MET,,X,220,,...,137.896,126.936,109.159,1.0,0.0,,,S,,40196
40172,ATOM,40196,,CE,,MET,,X,220,,...,136.639,126.872,107.822,1.0,0.0,,,C,,40197
40173,ATOM,40197,,C,,MET,,X,220,,...,140.572,130.942,107.519,1.0,0.0,,,C,,40198
40174,ATOM,40198,,O,,MET,,X,220,,...,140.534,131.689,106.527,1.0,0.0,,,O,,40199
