<a href="https://colab.research.google.com/github/prathithbhargav/AlphaMut/blob/master/3_inference_of_Helix-in-protein_trained_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installation

In [None]:
!pip install stable-baselines3
!pip install biotite
!pip install biopandas

Collecting stable-baselines3
  Downloading stable_baselines3-2.3.2-py3-none-any.whl.metadata (5.1 kB)
Collecting gymnasium<0.30,>=0.28.1 (from stable-baselines3)
  Downloading gymnasium-0.29.1-py3-none-any.whl.metadata (10 kB)
Collecting farama-notifications>=0.0.1 (from gymnasium<0.30,>=0.28.1->stable-baselines3)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl.metadata (558 bytes)
Downloading stable_baselines3-2.3.2-py3-none-any.whl (182 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.3/182.3 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m47.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium, stable-baselines3
Successfully installed farama-notifications-0.0.4 gymnasium-0.29.1 stable-baselines3-2.

In [None]:

#@title Input protein sequence(s), then hit `Runtime` -> `Run all`
from google.colab import files
import os
import re
import hashlib
import random

from sys import version_info
# "<major>.<minor>" of the interpreter running this notebook, e.g. "3.10"
python_version = f"{version_info.major}.{version_info.minor}"

def add_hash(x,y):
  """Return `x`, an underscore, and the first five hex digits of SHA-1(`y`).

  `y` is encoded with the default string encoding before hashing, so the same
  (x, y) pair always yields the same tagged name.
  """
  digest = hashlib.sha1(y.encode()).hexdigest()
  short_tag = digest[:5]
  return "_".join((x, short_tag))

# Colab form fields: each value below is annotated with `#@param` and the
# `#@markdown` lines are the help text attached to the form.
query_sequence = 'PIAQIHILEGRSDEQKETLIREVSEAISRSLDAPLTSVRVIITEMAKGHFGIGGELASK' #@param {type:"string"}
#@markdown  - Use `:` to specify inter-protein chainbreaks for **modeling complexes** (supports homo- and hetero-oligomers). For example **PI...SK:PI...SK** for a homodimer
jobname = 'test' #@param {type:"string"}
# number of top-ranked structures to relax (0 switches relaxation off, see use_amber)
num_relax = 0 #@param [0, 1, 5] {type:"raw"}
#@markdown - specify how many of the top ranked structures to relax using amber
template_mode = "none" #@param ["none", "pdb100","custom"]
#@markdown - `none` = no template information is used. `pdb100` = detect templates in pdb100 (see [notes](#pdb100)). `custom` - upload and search own templates (PDB or mmCIF format, see [notes](#custom_templates))

# relaxation is requested only when at least one structure should be relaxed
use_amber = num_relax > 0

# remove all whitespace (spaces, tabs, newlines) from the pasted sequence
query_sequence = "".join(query_sequence.split())

# Build the job name: drop whitespace, then every non-word character, and
# finally append a short hash of the sequence (see add_hash).
basejobname = "".join(jobname.split())
basejobname = re.sub(r'\W+', '', basejobname)
jobname = add_hash(basejobname, query_sequence)

# check if directory with jobname exists
def check(folder):
  """Return True when `folder` is still free (no such path exists), else False."""
  return not os.path.exists(folder)
# If a folder named `jobname` is already present, probe jobname_0, jobname_1,
# ... and continue with the first name that is still free.
if not check(jobname):
  n = 0
  while True:
    candidate = f"{jobname}_{n}"
    if check(candidate):
      break
    n += 1
  jobname = candidate

# results folder for this job
os.makedirs(jobname, exist_ok=True)

# write the query as a two-column CSV (header line + one "id,sequence" row)
queries_path = os.path.join(jobname, f"{jobname}.csv")
with open(queries_path, "w") as text_file:
  text_file.write(f"id,sequence\n{jobname},{query_sequence}")

# Template handling:
#   "custom"  -> upload files, move them into <jobname>/template, use templates
#   "pdb100"  -> use templates, no custom folder
#   otherwise -> no templates at all
if template_mode == "custom":
  custom_template_path = os.path.join(jobname,f"template")
  os.makedirs(custom_template_path, exist_ok=True)
  uploaded = files.upload()
  use_templates = True
  for fn in uploaded.keys():
    destination = os.path.join(custom_template_path,fn)
    os.rename(fn,destination)
else:
  custom_template_path = None
  use_templates = template_mode == "pdb100"

# short summary of the job; chain-break markers (":") do not count as residues
print("jobname",jobname)
print("sequence",query_sequence)
print("length",len(query_sequence.replace(":","")))



In [None]:
# prompt: generate code that takes a protein PDB code as user input and, when a PDB code is given, downloads and shows the structure. Make it Colab friendly

import requests
from Bio.PDB import PDBParser
from biopandas.pdb import PandasPdb
import matplotlib.pyplot as plt

def download_pdb(pdb_code, timeout=30):
  """
  Downloads a PDB file from the RCSB PDB database.

  Args:
    pdb_code: The PDB code of the protein. Surrounding whitespace is ignored
      and the code is upper-cased before it is inserted into the URL.
    timeout: Seconds to wait for the server before the request is abandoned.

  Returns:
    The PDB file content as a string, or None when the code is empty or the
    download fails (an error message is printed in both cases).
  """
  # Codes typed at an input() prompt often carry stray spaces or a newline.
  pdb_code = pdb_code.strip()
  if not pdb_code:
    print("Error downloading PDB file: no PDB code given")
    return None

  url = f"https://files.rcsb.org/download/{pdb_code.upper()}.pdb"
  try:
    # requests waits indefinitely by default; bound the wait so the cell
    # cannot hang on an unresponsive server.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Raise an exception for bad status codes
    return response.text
  except requests.exceptions.RequestException as e:
    print(f"Error downloading PDB file: {e}")
    return None

def show_pdb_structure(pdb_code):
  """
  Downloads a PDB file and displays its structure.

  The downloaded text is loaded into a biopandas `PandasPdb` object and the
  coordinates of its ATOM records are drawn as a 3D scatter plot.

  Args:
    pdb_code: The PDB code of the protein.
  """
  pdb_content = download_pdb(pdb_code)
  if not pdb_content:
    print("Failed to download PDB file.")
    return

  try:
    # The file is only available as text in memory. PDBParser.get_structure
    # expects a file path or handle rather than the file contents, so the
    # lines are handed to biopandas directly instead.
    ppdb = PandasPdb()
    ppdb.read_pdb_from_list(pdb_content.splitlines(True))
    # Access the atom DataFrame
    atom_df = ppdb.df['ATOM']
    # Plot the structure
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(atom_df['x_coord'], atom_df['y_coord'], atom_df['z_coord'], s=10)
    ax.set_title(pdb_code.strip().upper())
    ax.set_xlabel('X')
    ax.set_ylabel('Y')
    ax.set_zlabel('Z')
    plt.show()
  except Exception as e:
    # Best effort: report the problem instead of aborting the notebook run.
    print(f"Error parsing PDB file: {e}")

# Get user input for PDB code
pdb_code = input("Enter the PDB code: ")

# Download and show the PDB structure
show_pdb_structure(pdb_code)


ModuleNotFoundError: No module named 'Bio'

In [None]:

import torch
from transformers import AutoTokenizer, EsmForProteinFolding
from transformers.models.esm.openfold_utils.protein import to_pdb, Protein as OFProtein
from transformers.models.esm.openfold_utils.feats import atom14_to_atom37
from datetime import datetime
import os
# from biopandas.pdb import PandasPdb
# from whole_protein_utils.sequence import *
# import biotite.structure as struc
# import biotite.structure.io as strucio
import numpy as np

# DEFINING THE MODEL FOR PROTEIN MODELLING
# Enable the TF32 flag for CUDA matmuls, load the pretrained ESMFold tokenizer
# and folding model ("facebook/esmfold_v1"), then move the model to the GPU.
# `.cuda()` means this cell needs a CUDA-capable runtime.
torch.backends.cuda.matmul.allow_tf32 = True
tokenizer = AutoTokenizer.from_pretrained("facebook/esmfold_v1")
model = EsmForProteinFolding.from_pretrained("facebook/esmfold_v1")
model = model.cuda()
def convert_outputs_to_pdb(outputs):
    """Turn a folding-model output mapping into a list of PDB-format strings.

    `outputs` is expected to be a mapping of tensors containing at least the
    keys "positions", "atom37_atom_exists", "aatype", "residue_index" and
    "plddt"; "chain_index" is used when present. One PDB string is produced
    per entry along the first axis of `outputs["aatype"]` (presumably one per
    input sequence; confirm against the model's output format).
    """
    # Expand the last entry of "positions" to the atom37 layout while the data
    # are still tensors, then bring everything to NumPy on the CPU.
    atom37_positions = atom14_to_atom37(outputs["positions"][-1], outputs).cpu().numpy()
    arrays = {key: tensor.to("cpu").numpy() for key, tensor in outputs.items()}

    atom_exists = arrays["atom37_atom_exists"]
    has_chain_index = "chain_index" in arrays

    pdb_strings = []
    for idx, aatype in enumerate(arrays["aatype"]):
        protein = OFProtein(
            aatype=aatype,
            atom_positions=atom37_positions[idx],
            atom_mask=atom_exists[idx],
            # shift the model's residue indices by one for the PDB output
            residue_index=arrays["residue_index"][idx] + 1,
            # the pLDDT values are stored in the B-factor field
            b_factors=arrays["plddt"][idx],
            chain_index=arrays["chain_index"][idx] if has_chain_index else None,
        )
        pdb_strings.append(to_pdb(protein))
    return pdb_strings


# In[4]:


def generate_structure_from_sequence(sequence,name=None):
    '''
    Predict the structure of a protein sequence with the ESM folding model and
    save it as a PDB file.

    sequence: amino-acid sequence string handed to the module-level `tokenizer`.
    name: stem of the output file; the prediction is written to "<name>.pdb"
        relative to the current working directory. NOTE: with the default
        `None` the file is literally named "None.pdb".

    Nothing is returned; the written PDB file is the only result.
    '''

    tokenized_input = tokenizer([sequence], return_tensors="pt", add_special_tokens=False)['input_ids']
    # Place the tokens on whatever device the model lives on, so inputs and
    # weights always match instead of assuming CUDA here.
    tokenized_input = tokenized_input.to(model.device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        output = model(tokenized_input)

    pdb = convert_outputs_to_pdb(output)
    with open(f"{name}.pdb", "w") as f:
        f.write("".join(pdb))


tokenizer_config.json:   0%|          | 0.00/40.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/121 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/2.10k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/8.44G [00:00<?, ?B/s]

Some weights of EsmForProteinFolding were not initialized from the model checkpoint at facebook/esmfold_v1 and are newly initialized: ['esm.contact_head.regression.bias', 'esm.contact_head.regression.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# No `name` is passed, so the predicted structure is saved as "None.pdb".
generate_structure_from_sequence('VTMWRKVERVGVMHFNWNPVEWMEVRLFHAMHPKDYFYYRDETAMKTHHMVLPNDREPRNILIWMDRTEKFKTWTYNFYRYIYIHNCATNETFISMAWWMYMRMQNPDETGDERWFMGTGFGGFMTSCMNEDVQRIGHRRVNDGYWSQYVFNIITRPARKYRYMHGYHTRPMVFKPQRFPCSTLVHWNMGSNPWEHFVHLMKKVCRRKIFQMQMVCSMETDHAYERQVKRASCSPPASC')


In [None]:
from transformers import AutoTokenizer,EsmModel
import torch

# Load the tokenizer and encoder for the "facebook/esm2_t6_8M_UR50D" ESM-2
# checkpoint (presumably used to embed sequences for the inference model
# below; confirm against the later cells).
esm_seq_tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t6_8M_UR50D")
encoder_model_esm = EsmModel.from_pretrained("facebook/esm2_t6_8M_UR50D")


tokenizer_config.json:   0%|          | 0.00/95.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/93.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/775 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/31.4M [00:00<?, ?B/s]

Some weights of EsmModel were not initialized from the model checkpoint at facebook/esm2_t6_8M_UR50D and are newly initialized: ['esm.pooler.dense.bias', 'esm.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Inference Model