<a href="https://colab.research.google.com/github/sokrypton/ColabFold/blob/main/boltz1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [35]:
#@title Input protein sequence(s), then hit `Runtime` -> `Run all`
from google.colab import files
import os
import re
import hashlib
import random
from string import ascii_uppercase

# Function to add a hash to the jobname
def add_hash(x, y):
    """Return `x` followed by "_" and the first five hex digits of SHA-1(`y`).

    `y` is encoded with the default str encoding before hashing, so the same
    text always yields the same five-character suffix.
    """
    digest = hashlib.sha1(y.encode()).hexdigest()
    return "_".join((x, digest[:5]))

# User inputs
# The trailing `#@param` / `#@markdown` annotations are Colab form markup: they
# expose the three values below as editable form fields with help text.
query_sequence = 'PIAQIHILEGRSDEQKETLIREVSEAISRSLDAPLTSVRVIITEMAKGHFGIGGELASK'  #@param {type:"string"}
#@markdown  - Use `:` to specify inter-protein chainbreaks for **modeling complexes** (supports homo- and hetero-oligomers). For example **PI...SK:PI...SK** for a homodimer
ligand_input = 'N[C@@H](Cc1ccc(O)cc1)C(=O)O'  #@param {type:"string"}
#@markdown  - Use `:` to specify multiple ligands as SMILES strings
jobname = 'test'  #@param {type:"string"}

# Normalise the inputs: drop all whitespace from the sequence, reduce the job
# name to word characters only, then tag it with a short hash of the sequence.
query_sequence = "".join(query_sequence.split())
basejobname = re.sub(r'\W+', '', "".join(jobname.split()))
jobname = add_hash(basejobname, query_sequence)

# Check if a directory with jobname exists
def check(folder):
    """Return True when `folder` is free to use, i.e. no such path exists yet."""
    already_present = os.path.exists(folder)
    return not already_present

# If the plain job name is taken, try jobname_0, jobname_1, ... and keep the
# first name that does not exist on disk yet.
if os.path.exists(jobname):
    n = 0
    candidate = f"{jobname}_{n}"
    while os.path.exists(candidate):
        n += 1
        candidate = f"{jobname}_{n}"
    jobname = candidate

# Make directory to save results
os.makedirs(jobname, exist_ok=True)

from string import ascii_uppercase

# Split sequences on chain breaks
protein_sequences = query_sequence.strip().split(':')
ligand_sequences = ligand_input.strip().split(':')

# Each non-empty protein or ligand entry consumes one single-letter chain label.
# Check up front so that running out of labels gives a readable error instead of
# a bare StopIteration from next() part-way through the loops below.
num_entities = sum(1 for entry in protein_sequences + ligand_sequences if entry.strip())
if num_entities > len(ascii_uppercase):
    raise ValueError(
        f"Too many chains: {num_entities} protein/ligand entries were given, "
        f"but only {len(ascii_uppercase)} chain labels (A-Z) are available."
    )

# Initialize chain labels starting from 'A'
chain_labels = iter(ascii_uppercase)

fasta_entries = []  # (header, sequence) pairs: proteins first, then ligands
csv_entries = []  # (seq_id, sequence) pairs: proteins only
chain_label_to_seq_id = {}

# Process protein sequences
for i, seq in enumerate(protein_sequences):
    seq = seq.strip()
    if not seq:
        continue  # Skip empty sequences
    chain_label = next(chain_labels)
    # i is the position among all ':'-separated slots, so skipped empty slots
    # leave gaps in the seq_id numbering.
    seq_id = f"{jobname}_{i}"
    chain_label_to_seq_id[chain_label] = seq_id
    # For CSV file (for ColabFold)
    csv_entries.append((seq_id, seq))
    # For FASTA file: the header's third field is the path where this chain's
    # .a3m alignment is expected to be found.
    msa_path = os.path.join(jobname, f"{seq_id}.a3m")
    header = f">{chain_label}|protein|{msa_path}"
    sequence = seq
    fasta_entries.append((header, sequence))

# Process ligand sequences (assumed to be SMILES strings)
for lig in ligand_sequences:
    lig = lig.strip()
    if not lig:
        continue  # Skip empty ligands
    chain_label = next(chain_labels)
    lig_type = 'smiles'
    # Ligand headers carry no MSA path, only the chain label and entity type.
    header = f">{chain_label}|{lig_type}"
    sequence = lig
    fasta_entries.append((header, sequence))

# Write the CSV file for ColabFold
queries_path = os.path.join(jobname, f"{jobname}.csv")
with open(queries_path, "w") as text_file:
    text_file.write("id,sequence\n")
    for seq_id, seq in csv_entries:
        text_file.write(f"{seq_id},{seq}\n")

# Write the FASTA file
queries_fasta = os.path.join(jobname, f"{jobname}.fasta")
with open(queries_fasta, 'w') as f:
    for header, sequence in fasta_entries:
        f.write(f"{header}\n{sequence}\n")

# Optionally, print the output for verification
#print(f"Generated FASTA file '{queries_fasta}':\n")
#for header, sequence in fasta_entries:
#    print(f"{header}\n{sequence}\n")


In [36]:
#@title Install dependencies
%%time
import os
# Each install is guarded by a marker file so re-running the cell is cheap.
# The marker is only written when pip reports success (os.system returns the
# command's exit status, 0 on success); otherwise a failed install would be
# skipped on every later run.
if not os.path.isfile("COLABFOLD_READY"):
  print("installing colabfold...")
  colabfold_status = os.system("pip install -q --no-warn-conflicts 'colabfold[alphafold-minus-jax] @ git+https://github.com/sokrypton/ColabFold'")
  if colabfold_status == 0:
    if os.environ.get('TPU_NAME', False) != False:
      os.system("pip uninstall -y jax jaxlib")
      os.system("pip install --no-warn-conflicts --upgrade dm-haiku==0.0.10 'jax[cuda12_pip]'==0.3.25 -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html")
    # Expose the installed packages as ./colabfold and ./alphafold in the working directory.
    os.system("ln -s /usr/local/lib/python3.*/dist-packages/colabfold colabfold")
    os.system("ln -s /usr/local/lib/python3.*/dist-packages/alphafold alphafold")
    os.system("touch COLABFOLD_READY")
  else:
    print(f"WARNING: colabfold installation failed (exit status {colabfold_status}); re-run this cell to retry.")

if not os.path.isfile("BOLZ_READY"):
  boltz_status = os.system("pip install -q --no-warn-conflicts boltz")
  if boltz_status == 0:
    os.system("touch BOLZ_READY")
  else:
    print(f"WARNING: boltz installation failed (exit status {boltz_status}); re-run this cell to retry.")

CPU times: user 30 µs, sys: 6 µs, total: 36 µs
Wall time: 39.1 µs


In [37]:
#@title Generate MSA with ColabFold
# Shell call: run colabfold_batch on the per-chain CSV (queries_path), using the
# job directory as its output location. The FASTA written by the input cell
# points each protein chain at "<jobname>/<seq_id>.a3m", so this step is expected
# to leave those alignment files there. `--msa-only` presumably stops after the
# MSA stage (no structure prediction) — confirm against the colabfold_batch help.
!colabfold_batch "{queries_path}" "{jobname}" --msa-only

2024-11-17 23:21:37,398 Running colabfold 1.5.5 (c21e1768d18e3608e6e6d99c97134317e7e41c75)

limited shared resource only capable of processing a few thousand MSAs per day. Please
submit jobs only from a single IP address. We reserve the right to limit access to the
server case-by-case when usage exceeds fair use. If you require more MSAs: You can 
precompute all MSAs with `colabfold_search` or host your own API and pass it to `--host-url`

2024-11-17 23:21:39,576 Running on GPU
2024-11-17 23:21:40,309 Found 4 citations for tools or databases
2024-11-17 23:21:40,310 Query 1/1: test_a5e17_1_0 (length 59)
COMPLETE: 100% 150/150 [00:22<00:00,  6.53it/s] 
2024-11-17 23:22:03,313 Saved test_a5e17_1/test_a5e17_1_0.pickle
2024-11-17 23:22:04,055 Done


In [38]:
#@title Predict structure using boltz
# Shell call: run boltz on the FASTA written by the input cell, with the job
# directory as --out_dir. The download cell later looks for the predictions under
# "<jobname>/boltz_results_<jobname>/predictions/<jobname>/*.cif".
!boltz predict --out_dir "{jobname}" "{jobname}/{jobname}.fasta"

Downloading data and model to /root/.boltz. You may change this by setting the --cache flag.
Checking input data.
Processing input data.
100% 1/1 [00:00<00:00, 16.14it/s]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA A100-SXM4-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
2024-11-17 23:22:42.568213: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-17 23:22:42.619224: I tensorflow/core/platform/cp

In [42]:
#@title Download results
# Import necessary modules
import os
import zipfile
from google.colab import files
import glob

# Ensure 'jobname' variable is defined
# jobname = 'test_abcde'  # Uncomment and set if not already defined

# Name of the zip file
zip_filename = f"results_{jobname}.zip"

# Create a zip file and add the specified files without preserving directory structure
# (every entry is stored under its bare file name).
with zipfile.ZipFile(zip_filename, 'w') as zipf:
    coverage_png_files = glob.glob(os.path.join(jobname, '*_coverage.png'))
    a3m_files = glob.glob(os.path.join(jobname, '*.a3m'))

    cif_pattern = os.path.join(jobname, f'boltz_results_{jobname}', 'predictions', jobname, '*.cif')
    cif_files = glob.glob(cif_pattern)
    if not cif_files:
        # The predicted structures are the main result; report their absence the
        # same way a missing hparams.yaml is reported below instead of silently
        # shipping an archive without them.
        print(f"Warning: no predicted structures found matching {cif_pattern}.")

    # Archive order: coverage plots, alignments, then predicted structures.
    for file in coverage_png_files + a3m_files + cif_files:
        arcname = os.path.basename(file)  # Use only the file name
        zipf.write(file, arcname=arcname)

    hparams_file = os.path.join(jobname, f'boltz_results_{jobname}', 'lightning_logs', 'version_0', 'hparams.yaml')
    if os.path.exists(hparams_file):
        arcname = os.path.basename(hparams_file)  # Use only the file name
        zipf.write(hparams_file, arcname=arcname)
    else:
        print(f"Warning: {hparams_file} not found.")

# Download the zip file
files.download(zip_filename)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>