# RGN2 Colab
## Instructions
1. Paste your protein sequence in the input field.
2. Run the cells in the Colab one at a time with the play button on the left, or run them all via _Runtime_ > _Run all_.
3. The predicted protein structure will be downloaded after the final "Refinement" cell is executed.

In [1]:
import os
import sys
import subprocess
from IPython import get_ipython
from IPython.utils import io

WORKDIR = './rgn2'
GIT_REPO = 'https://github.com/aqlaboratory/rgn2.git'
ENV_CONFIG = os.path.join(WORKDIR, 'environment.yml')
RGN2_PARAM_SOURCE_URL = 'https://huggingface.co/christinafl/rgn2'
RGN2_PARAMS_DIR = os.path.join(WORKDIR, 'resources')
RGN2_PARAM_RUN_DIR = os.path.join(RGN2_PARAMS_DIR, 'rgn2_runs')
RGN2_RUN_DIR = os.path.join(WORKDIR, 'runs')

AF2_GIT_REPO = 'https://github.com/deepmind/alphafold.git'
AF2_SOURCE_URL = 'https://storage.googleapis.com/alphafold/alphafold_params_2022-03-02.tar'
AF2_PARAMS_DIR = './alphafold/data/params'
AF2_PARAMS_PATH = os.path.join(AF2_PARAMS_DIR, os.path.basename(AF2_SOURCE_URL))
CONDA_INIT = 'source /opt/conda/etc/profile.d/conda.sh && conda init'
AF2_ENV_INIT = f'{CONDA_INIT} && conda activate af2'

REFINER_DIR = os.path.join(WORKDIR, 'ter2pdb')
REFINER_PATH = os.path.join(REFINER_DIR, 'ModRefiner-l.zip')
REFINER_URL = 'https://zhanggroup.org/ModRefiner/ModRefiner-l.zip'




# Provision the runtime in one pass: Miniconda under /opt/conda, the RGN2
# checkout and its conda environment, an AlphaFold v2.2.4 checkout installed
# into a separate `af2` environment, AlphaFold parameters, RGN2 weights, and
# ModRefiner.
# NOTE(review): IPython `!` shell escapes presumably do not raise a Python
# exception when the command exits non-zero, so the `except` at the bottom
# would not report a failed install step — confirm, and check exit codes
# explicitly if failures must be detected.
try:
  # Change to content directory
  %cd '/content'

  # Install Miniconda
  # Any previous /opt/conda is removed first; the installer then runs in
  # batch mode (-b) into that prefix (-p).
  !rm -rf /opt/conda
  !wget -q -P /tmp https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
  !bash /tmp/Miniconda3-latest-Linux-x86_64.sh -b -p /opt/conda
  !rm /tmp/Miniconda3-latest-Linux-x86_64.sh

  # Update PATH
  # Prepending makes the conda binaries win over the runtime's own tools in
  # every later `!` command.
  os.environ['PATH'] = f"/opt/conda/bin:{os.environ['PATH']}"

  # Download RGN2
  # The environment is created from the environment.yml inside the checkout.
  !rm -rf {WORKDIR}
  !git clone {GIT_REPO} {WORKDIR}
  !{CONDA_INIT} && conda env create -f {ENV_CONFIG}

  # Download AlphaFold
  !rm -rf alphafold
  !git clone --branch v2.2.4 {AF2_GIT_REPO} alphafold
  !{CONDA_INIT} && conda create -y -q --name af2 python=3.7

  # Install AlphaFold dependencies with more control
  # Each `!` line runs in its own shell, so the af2 environment has to be
  # re-activated (via AF2_ENV_INIT) on every pip call.
  !{AF2_ENV_INIT} && pip install absl-py==0.13.0 biopython==1.79 chex==0.0.7 dm-haiku==0.0.4 dm-tree==0.1.6 immutabledict==2.0.0 ml-collections==0.1.0 numpy==1.19.5 scipy==1.7.0 tensorflow==2.5.0

  # Install AlphaFold without dependencies
  !{AF2_ENV_INIT} && pip install --no-dependencies ./alphafold

  # Install JAX with CUDA support
  !{AF2_ENV_INIT} && pip install --upgrade jax==0.3.17 jaxlib==0.3.15+cuda11.cudnn805 -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html

  # Download AlphaFold parameters
  # The tarball is extracted in place and then deleted.
  !mkdir -p {AF2_PARAMS_DIR}
  !wget -O {AF2_PARAMS_PATH} {AF2_SOURCE_URL}
  !tar -xvf {AF2_PARAMS_PATH} -C {AF2_PARAMS_DIR} --preserve-permissions
  !rm {AF2_PARAMS_PATH}

  # Download RGN2 weights
  # Clone with GIT_LFS_SKIP_SMUDGE=1 first, then fetch the LFS payloads with
  # `git lfs pull`; finally move the `rgn2_runs` directory to {WORKDIR}/runs.
  !GIT_LFS_SKIP_SMUDGE=1 git clone {RGN2_PARAM_SOURCE_URL} {RGN2_PARAMS_DIR}
  !cd {RGN2_PARAMS_DIR} && git lfs pull
  !mv {RGN2_PARAM_RUN_DIR} {RGN2_RUN_DIR}

  # Download Modrefiner
  # NOTE(review): no `mkdir` precedes the download, so REFINER_DIR is
  # presumably part of the RGN2 checkout — confirm.
  !wget -O {REFINER_PATH} {REFINER_URL}
  !unzip -o {REFINER_PATH} -d {REFINER_DIR}
  !rm {REFINER_PATH}

except Exception as e:
  print(f"Error: {e}")

/content
PREFIX=/opt/conda
Unpacking payload ...

Installing base environment...

Preparing transaction: ...working... done
Executing transaction: ...working... done
installation finished.
    You currently have a PYTHONPATH environment variable set. This may cause
    unexpected behavior when running the Python interpreter in Miniconda3.
    For best results, please verify that your PYTHONPATH only points to
    directories of packages that are compatible with the Python interpreter
    in Miniconda3: /opt/conda
Cloning into './rgn2'...
remote: Enumerating objects: 258, done.[K
remote: Counting objects: 100% (62/62), done.[K
remote: Compressing objects: 100% (49/49), done.[K
remote: Total 258 (delta 31), reused 37 (delta 13), pack-reused 196 (from 1)[K
Receiving objects: 100% (258/258), 17.48 MiB | 38.08 MiB/s, done.
Resolving deltas: 100% (98/98), done.
no change     /opt/conda/condabin/conda
no change     /opt/conda/bin/conda
no change     /opt/conda/bin/conda-env
no change     

In [None]:
#!/bin/bash
#
# Install RGN2, AlphaFold v2.2.4, their parameters, and ModRefiner.
# All relative paths below are resolved against the directory the script is
# started from, so the script never changes its own working directory.
# NOTE(review): inside a notebook this has to run in a shell cell (for
# example under `%%bash`) — confirm how the cell is meant to be executed.

# Exit immediately if any command exits with a non-zero status.
set -e

# Define directories and repository URLs.
WORKDIR="./rgn2"
GIT_REPO="https://github.com/aqlaboratory/rgn2.git"
ENV_CONFIG="${WORKDIR}/environment.yml"
RGN2_PARAM_SOURCE_URL="https://huggingface.co/christinafl/rgn2"
RGN2_PARAMS_DIR="${WORKDIR}/resources"
RGN2_PARAM_RUN_DIR="${RGN2_PARAMS_DIR}/rgn2_runs"
RGN2_RUN_DIR="${WORKDIR}/runs"

AF2_GIT_REPO="https://github.com/deepmind/alphafold.git"
AF2_SOURCE_URL="https://storage.googleapis.com/alphafold/alphafold_params_2022-03-02.tar"
AF2_PARAMS_DIR="./alphafold/data/params"
AF2_PARAMS_PATH="${AF2_PARAMS_DIR}/$(basename "${AF2_SOURCE_URL}")"

CONDA_ROOT="/opt/conda"
CONDA_SH="${CONDA_ROOT}/etc/profile.d/conda.sh"

REFINER_DIR="${WORKDIR}/ter2pdb"
REFINER_PATH="${REFINER_DIR}/ModRefiner-l.zip"
REFINER_URL="https://zhanggroup.org/ModRefiner/ModRefiner-l.zip"

# Step 1: Install Miniconda if not already installed.
if [ ! -x "${CONDA_ROOT}/bin/conda" ]; then
  echo "Installing Miniconda..."
  wget -q -P /tmp https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
  bash /tmp/Miniconda3-latest-Linux-x86_64.sh -b -p "${CONDA_ROOT}"
  rm /tmp/Miniconda3-latest-Linux-x86_64.sh
else
  echo "Miniconda already present at ${CONDA_ROOT}; skipping installer."
fi
export PATH="${CONDA_ROOT}/bin:$PATH"

# `conda activate` is a shell function that only exists after conda.sh has
# been sourced; without this line activation fails in a non-interactive script.
source "${CONDA_SH}"

# Step 2: Set up the RGN2 environment.
# The environment is created from the repository's environment.yml only.
# Pre-creating an `rgn2` environment by hand would either be unused or
# collide with the one `conda env create` wants to build.
echo "Setting up the RGN2 environment..."
git clone "${GIT_REPO}" "${WORKDIR}"
conda env create -f "${ENV_CONFIG}"

# Step 3: Download RGN2 weights using Git LFS.
echo "Downloading RGN2 weights..."
GIT_LFS_SKIP_SMUDGE=1 git clone "${RGN2_PARAM_SOURCE_URL}" "${RGN2_PARAMS_DIR}"
# Run the pull in a subshell so the script's working directory is unchanged;
# the `mv` below and every later step use paths relative to the start directory.
(cd "${RGN2_PARAMS_DIR}" && git lfs pull)
mv "${RGN2_PARAM_RUN_DIR}" "${RGN2_RUN_DIR}"

# Step 4: Set up the AlphaFold environment.
echo "Setting up the AlphaFold environment..."
conda create -y -n af2 python=3.7
conda activate af2
git clone --branch v2.2.4 "${AF2_GIT_REPO}" alphafold

# Install AlphaFold dependencies in the correct order.
pip install absl-py==0.13.0 biopython==1.79 chex==0.0.7 dm-haiku==0.0.4 dm-tree==0.1.6 immutabledict==2.0.0 ml-collections==0.1.0 numpy==1.19.5 scipy==1.7.0 tensorflow==2.5.0 pandas==1.3.3 docker==5.0.3

# Install AlphaFold itself, bypassing dependency resolution.
pip install --no-dependencies ./alphafold

# Install JAX with GPU (CUDA) support.
pip install --upgrade jax==0.3.17 jaxlib==0.3.15+cuda11.cudnn805 -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html

# Step 5: Download and extract AlphaFold parameters.
echo "Downloading AlphaFold parameters..."
mkdir -p "${AF2_PARAMS_DIR}"
wget -O "${AF2_PARAMS_PATH}" "${AF2_SOURCE_URL}"
tar -xvf "${AF2_PARAMS_PATH}" -C "${AF2_PARAMS_DIR}" --preserve-permissions
rm "${AF2_PARAMS_PATH}"

# Step 6: Download and extract ModRefiner.
echo "Downloading ModRefiner..."
mkdir -p "${REFINER_DIR}"
wget -O "${REFINER_PATH}" "${REFINER_URL}"
unzip -o "${REFINER_PATH}" -d "${REFINER_DIR}"
rm "${REFINER_PATH}"

echo "Installation complete!"

In [4]:
# Pip-only setup attempt: installs TensorFlow/JAX/PyTorch into the notebook's
# own Python, clones RGN2 and ColabFold, and downloads AlphaFold parameters.
# Every `%cd` changes the working directory for all following lines.

# ----- Step 1: Install Core Dependencies for Colab Environment -----

# Install TensorFlow (if you need a specific version)
# NOTE(review): the recorded output of this cell shows pip found no
# tensorflow==2.15.0 (versions offered started at 2.16.0rc0) — the pin needs
# updating for the current runtime.
!pip install tensorflow==2.15.0

# Install JAX with GPU support (adapt version as needed; here we use a common CUDA 11 version for Colab)
# NOTE(review): the recorded output shows jaxlib==0.4.10+cuda11.cudnn86 was
# not found either (versions offered started at 0.4.17).
!pip install --upgrade jax jaxlib==0.4.10+cuda11.cudnn86 -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html

# Install other global dependencies
!pip install torch  # For GPU verification using PyTorch in AlphaFold section

# ----- Step 2: Clone and Set Up RGN2 -----

# Clone the RGN2 repository into /content (Google Colab's working directory)
!git clone https://github.com/aqlaboratory/rgn2.git

# Change directory into the RGN2 folder
%cd rgn2

# Install RGN2 dependencies (assuming a requirements.txt file is provided)
!pip install -r requirements.txt

# (Optional: If you need an environment file and have conda installed on Colab, you could try !mamba or !conda commands,
# but typically using pip is simpler within Colab.)

# ----- Step 3: Verify GPU Availability for TensorFlow (used in RGN2) -----

import tensorflow as tf
print("TensorFlow version:", tf.__version__)
print("TensorFlow GPU devices available:", tf.config.list_physical_devices('GPU'))

# ----- Step 4: Set Up AlphaFold (Using ColabFold for compatibility) -----

# Change back to /content directory
%cd /content

# Clone the ColabFold repository (a lightweight version of AlphaFold optimized for Colab)
!git clone https://github.com/sokrypton/ColabFold.git

# Change directory into the ColabFold folder
%cd ColabFold

# Install AlphaFold (ColabFold) dependencies from its requirements file
!pip install -r requirements.txt

# Install additional AlphaFold dependencies (if not already covered by the requirements)
!pip install biopython dm-haiku

# ----- Step 5: Verify GPU Availability for AlphaFold using PyTorch -----
# NOTE(review): this only shows whether PyTorch sees a GPU; it presumably
# says nothing about whether JAX does — confirm which check is wanted here.

import torch
print("PyTorch version:", torch.__version__)
print("PyTorch GPU available:", torch.cuda.is_available())

# ----- Step 6: (Optional) Download and Prepare AlphaFold Parameters -----
# If you intend to use AlphaFold/ColabFold with official parameters:
# The tarball is extracted into /content/alphafold/params and then removed.
!mkdir -p /content/alphafold/params
!wget -O /content/alphafold/params/alphafold_params.tar https://storage.googleapis.com/alphafold/alphafold_params_2022-03-02.tar
!tar -xvf /content/alphafold/params/alphafold_params.tar -C /content/alphafold/params
!rm /content/alphafold/params/alphafold_params.tar

# ----- Step 7: Final Verification & Next Steps -----
# This message is printed unconditionally; it does not check that the steps
# above succeeded.

print("Installation and setup for RGN2 and AlphaFold (via ColabFold) are complete.")

# Optionally, you can now open the provided notebooks to run predictions,
# for RGN2 (e.g., using the 'rgn2_prediction.ipynb' in the rgn2 folder)
# or for AlphaFold (e.g., using the 'AlphaFold2.ipynb' notebook in the ColabFold folder).

[31mERROR: Could not find a version that satisfies the requirement tensorflow==2.15.0 (from versions: 2.16.0rc0, 2.16.1, 2.16.2, 2.17.0rc0, 2.17.0rc1, 2.17.0, 2.17.1, 2.18.0rc0, 2.18.0rc1, 2.18.0rc2, 2.18.0, 2.19.0rc0)[0m[31m
[0m[31mERROR: No matching distribution found for tensorflow==2.15.0[0m[31m
[0mLooking in links: https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
Collecting jax
  Downloading jax-0.5.2-py3-none-any.whl.metadata (22 kB)
[31mERROR: Ignored the following yanked versions: 0.4.32[0m[31m
[0m[31mERROR: Could not find a version that satisfies the requirement jaxlib==0.4.10+cuda11.cudnn86 (from versions: 0.4.17, 0.4.17+cuda11.cudnn86, 0.4.17+cuda12.cudnn89, 0.4.18, 0.4.18+cuda11.cudnn86, 0.4.18+cuda12.cudnn89, 0.4.19, 0.4.19+cuda11.cudnn86, 0.4.19+cuda12.cudnn89, 0.4.20, 0.4.20+cuda11.cudnn86, 0.4.20+cuda12.cudnn89, 0.4.21, 0.4.21+cuda11.cudnn86, 0.4.21+cuda12.cudnn89, 0.4.22, 0.4.22+cuda11.cudnn86, 0.4.22+cuda12.cudnn89, 0.4.23, 0.4.23+cuda11

In [3]:
# Install extra packages into the `af2` conda environment. conda.sh is sourced
# and the environment activated in the same `!` line, so `pip` refers to that
# environment for this command. Only biopython is version-pinned here.
!source /opt/conda/etc/profile.d/conda.sh && conda activate af2 && pip install biopython==1.79 pandas docker immutabledict tensorflow-cpu

Collecting biopython==1.79
  Using cached biopython-1.79-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.6 MB)
Collecting pandas
  Downloading pandas-1.3.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.3/11.3 MB[0m [31m24.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docker
  Downloading docker-6.1.3-py3-none-any.whl (148 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m148.1/148.1 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting immutabledict
  Downloading immutabledict-2.2.5-py3-none-any.whl (4.1 kB)
Collecting tensorflow-cpu
  Downloading tensorflow_cpu-2.11.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (221.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m221.4/221.4 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
Collecting python-dateutil>=2.7.3
  Downloading python_dateutil-2.9.0.post0-py2.py3-none-any.whl (

In [7]:
# Let's try to get more information about the Colab environment
import subprocess

def run_command(cmd):
    """Execute `cmd` through the shell and return `(stdout, stderr)` as text.

    Both streams are captured in full and decoded as UTF-8. The exit status
    is not inspected, so a failing command still returns normally.
    """
    completed = subprocess.run(
        cmd,
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    out_text = completed.stdout.decode('utf-8')
    err_text = completed.stderr.decode('utf-8')
    return out_text, err_text

# Probe the runtime: GPU, CUDA runtime library, installed packages, and the
# TensorFlow / JAX installations visible to the `python` and `pip` on PATH.

# Check if NVIDIA GPU is available
print("Checking for NVIDIA GPU...")
gpu_info, _ = run_command("nvidia-smi")
print(gpu_info if gpu_info else "No GPU found or nvidia-smi not available")

# Check CUDA version in a different way
print("\nChecking CUDA version...")
cuda_version, _ = run_command("ls -l /usr/local/cuda/lib64/libcudart.so*")
print(cuda_version if cuda_version else "CUDA not found in standard location")

# Check currently installed packages
print("\nCurrently installed packages:")
installed_packages, _ = run_command("pip list")
print(installed_packages)

# Check if TensorFlow is already installed
# The one-liner deliberately avoids f-strings: reusing double quotes inside an
# f-string replacement field is a SyntaxError before Python 3.12, which made
# the previous version of this probe report a failure unconditionally there.
print("\nChecking TensorFlow installation...")
tf_check, tf_check_err = run_command(
    "python -c 'import tensorflow as tf; "
    "print(\"TensorFlow version:\", tf.__version__); "
    "print(\"GPU available:\", tf.config.list_physical_devices(\"GPU\"))'"
)
# On failure, show stderr as well so the actual cause is visible.
print(tf_check if tf_check else f"TensorFlow not installed or error importing\n{tf_check_err}")

# Check if JAX is already installed
print("\nChecking JAX installation...")
jax_check, jax_check_err = run_command(
    "python -c 'import jax; "
    "print(\"JAX version:\", jax.__version__); "
    "print(\"JAX devices:\", jax.devices())'"
)
print(jax_check if jax_check else f"JAX not installed or error importing\n{jax_check_err}")

# The three probes below ask pip for an empty version (`<pkg>==`), which can
# never be installed. pip's reply is an error message written to stderr, so
# both streams are printed; printing stdout alone shows nothing.

# Check available TensorFlow versions
print("\nChecking latest available TensorFlow version...")
tf_latest, tf_latest_err = run_command("pip install tensorflow==")
print(tf_latest + tf_latest_err)

# Check available JAX versions
print("\nChecking latest available JAX version...")
jax_latest, jax_latest_err = run_command("pip install jax==")
print(jax_latest + jax_latest_err)

# Check available JAXlib versions
print("\nChecking latest available JAXlib versions...")
jaxlib_latest, jaxlib_latest_err = run_command("pip install jaxlib==")
print(jaxlib_latest + jaxlib_latest_err)

Checking for NVIDIA GPU...
Fri Mar  7 14:36:58 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   32C    P8              9W /   70W |       2MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                     

In [9]:
# Let's create a simpler installation script for Google Colab

# Create the installation script
installation_script = '''
# RGN2 and AlphaFold Installation Script for Google Colab

import os
import sys
import subprocess
import time
from pathlib import Path

def run_command(cmd, description=None):
    """Execute ``cmd`` in a shell, echoing each output line as it arrives.

    stderr is merged into stdout. When ``description`` is given, a banner is
    printed first. Returns True if the command exits with status 0; otherwise
    prints the exit status and returns False.
    """
    if description:
        print(f"\\n===== {description} =====")

    proc = subprocess.Popen(
        cmd,
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        universal_newlines=True,
    )

    # Stream the child's output line by line until it closes the pipe.
    for raw_line in iter(proc.stdout.readline, ""):
        print(raw_line.strip())

    exit_status = proc.wait()
    succeeded = exit_status == 0
    if not succeeded:
        print(f"Command failed with return code {exit_status}")
    return succeeded

def check_package_installed(package_name):
    """Return True if ``package_name`` imports cleanly, False on ImportError.

    ``package_name`` must be the importable module name, which is not always
    the name the package is installed under.
    """
    try:
        __import__(package_name)
    except ImportError:
        return False
    else:
        return True

def install_rgn2_and_alphafold():
    """Install RGN2, AlphaFold v2.2.4, their weights and ModRefiner.

    All paths are relative to the current working directory. Download steps
    are skipped when their target already exists. Every shell step goes
    through ``run_command``; failed steps are collected and reported instead
    of being silently ignored.

    Returns:
        bool: True when every executed step succeeded, False otherwise.
    """
    import glob  # local import: only this function needs it

    # Define variables
    workdir = "./rgn2"
    git_repo = "https://github.com/aqlaboratory/rgn2.git"
    rgn2_param_source_url = "https://huggingface.co/christinafl/rgn2"
    rgn2_params_dir = os.path.join(workdir, "resources")
    rgn2_param_run_dir = os.path.join(rgn2_params_dir, "rgn2_runs")
    rgn2_run_dir = os.path.join(workdir, "runs")

    af2_git_repo = "https://github.com/deepmind/alphafold.git"
    af2_source_url = "https://storage.googleapis.com/alphafold/alphafold_params_2022-03-02.tar"
    af2_params_dir = "./alphafold/data/params"
    af2_params_path = os.path.join(af2_params_dir, os.path.basename(af2_source_url))

    refiner_dir = os.path.join(workdir, "ter2pdb")
    refiner_path = os.path.join(refiner_dir, "ModRefiner-l.zip")
    refiner_url = "https://zhanggroup.org/ModRefiner/ModRefiner-l.zip"

    failed_steps = []

    def run_step(cmd, description):
        """Run one install step and remember its description if it fails."""
        succeeded = run_command(cmd, description)
        if not succeeded:
            failed_steps.append(description)
        return succeeded

    # Check if GPU is available
    try:
        import tensorflow as tf
        gpu_available = len(tf.config.list_physical_devices("GPU")) > 0
        print(f"GPU available for TensorFlow: {gpu_available}")
    except Exception:
        # TensorFlow may be missing or broken at this point; continue with
        # the CPU path rather than aborting the installation.
        gpu_available = False
        print("Could not detect GPU with TensorFlow")

    # Install required packages for RGN2
    print("\\n===== Installing required packages for RGN2 =====")
    run_step("pip install tensorflow==2.17.1", "Installing TensorFlow")

    # Try to install JAX with GPU support if GPU is available
    if gpu_available:
        run_step(
            "pip install --upgrade jax jaxlib -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html",
            "Installing JAX with GPU support"
        )
    else:
        run_step("pip install jax jaxlib", "Installing JAX (CPU version)")

    # Install other dependencies
    run_step(
        "pip install biopython==1.79 dm-haiku==0.0.4 dm-tree==0.1.6 immutabledict==2.0.0 ml-collections==0.1.0",
        "Installing additional dependencies"
    )

    # Download RGN2
    if not os.path.exists(workdir):
        run_step(
            f"git clone {git_repo} {workdir}",
            "Downloading RGN2"
        )
    else:
        print(f"\\n===== RGN2 repository already exists at {workdir} =====")

    # Download AlphaFold
    if not os.path.exists("./alphafold"):
        run_step(
            f"git clone --branch v2.2.4 {af2_git_repo} alphafold",
            "Downloading AlphaFold"
        )
    else:
        print("\\n===== AlphaFold repository already exists =====")

    # Install AlphaFold
    run_step(
        "pip install --no-deps ./alphafold",
        "Installing AlphaFold"
    )

    # Download AlphaFold parameters
    if not os.path.exists(af2_params_dir):
        run_step(
            f"mkdir -p {af2_params_dir} && "
            f"wget -O {af2_params_path} {af2_source_url} && "
            f"tar -xvf {af2_params_path} -C {af2_params_dir} && "
            f"rm {af2_params_path}",
            "Downloading AlphaFold parameters"
        )
    else:
        print("\\n===== AlphaFold parameters already exist =====")

    # Download RGN2 weights
    if not os.path.exists(rgn2_params_dir):
        run_step(
            f"GIT_LFS_SKIP_SMUDGE=1 git clone {rgn2_param_source_url} {rgn2_params_dir} && "
            f"cd {rgn2_params_dir} && git lfs pull",
            "Downloading RGN2 weights"
        )

        # Create runs directory if it doesn't exist
        if os.path.exists(rgn2_param_run_dir) and not os.path.exists(rgn2_run_dir):
            os.makedirs(os.path.dirname(rgn2_run_dir), exist_ok=True)
            run_step(f"mv {rgn2_param_run_dir} {rgn2_run_dir}", "Moving RGN2 runs")
    else:
        print("\\n===== RGN2 weights already exist =====")

    # Download Modrefiner
    # NOTE(review): the ter2pdb directory appears to ship with the RGN2
    # checkout, so testing for the directory itself would always skip this
    # step. Look for extracted ModRefiner content instead (a leftover zip
    # from a failed run does not count). Confirm the archive's top-level
    # name; if it does not start with "ModRefiner" the step simply re-runs.
    refiner_entries = [
        entry
        for entry in glob.glob(os.path.join(refiner_dir, "ModRefiner*"))
        if not entry.endswith(".zip")
    ]
    if not refiner_entries:
        run_step(
            f"mkdir -p {refiner_dir} && "
            f"wget -O {refiner_path} {refiner_url} && "
            f"unzip -o {refiner_path} -d {refiner_dir} && "
            f"rm {refiner_path}",
            "Downloading Modrefiner"
        )
    else:
        print("\\n===== Modrefiner already exists =====")

    # Only claim success when no step reported a non-zero exit status.
    if failed_steps:
        print("\\n===== Installation finished with errors =====")
        for description in failed_steps:
            print(f"Failed step: {description}")
    else:
        print("\\n===== Installation completed successfully =====")

    # Verify installation
    verify_installation()

    return not failed_steps

def verify_installation():
    """Print a checklist of repositories, packages, paths and GPU visibility.

    The function only reports what it finds and returns None. Paths are
    checked relative to the current working directory.
    """
    print("\\n===== Verifying Installation =====")

    # Check if repositories exist
    rgn2_exists = os.path.exists("./rgn2")
    alphafold_exists = os.path.exists("./alphafold")

    print(f"RGN2 repository: {'✓' if rgn2_exists else '✗'}")
    print(f"AlphaFold repository: {'✓' if alphafold_exists else '✗'}")

    # Check if key packages are installed
    # Display label -> importable module name. Several of these packages are
    # imported under a different name than the one they are installed as, so
    # the label cannot be turned into a module name mechanically.
    packages = {
        "tensorflow": "tensorflow",
        "biopython": "Bio",
        "dm_haiku": "haiku",
        "dm_tree": "tree",
        "immutabledict": "immutabledict",
        "ml_collections": "ml_collections",
    }
    print("\\nChecking dependencies:")

    for package, module_name in packages.items():
        is_installed = check_package_installed(module_name)
        print(f"{package}: {'✓' if is_installed else '✗'}")

    # Check if JAX is installed
    jax_installed = check_package_installed("jax")
    print(f"JAX: {'✓' if jax_installed else '✗'}")

    # Check if directories and files exist
    print("\\nChecking directories and files:")
    directories = [
        ("./rgn2", "RGN2 directory"),
        ("./rgn2/runs", "RGN2 runs directory"),
        ("./alphafold", "AlphaFold directory"),
        ("./alphafold/data/params", "AlphaFold parameters directory"),
        ("./rgn2/ter2pdb", "ModRefiner directory")
    ]

    for directory, description in directories:
        exists = os.path.exists(directory)
        print(f"{description}: {'✓' if exists else '✗'}")

    # Check GPU availability for TensorFlow
    try:
        import tensorflow as tf
        gpu_available = len(tf.config.list_physical_devices("GPU")) > 0
        print(f"\\nGPU available for TensorFlow: {'✓' if gpu_available else '✗'}")
    except Exception:
        print("\\nCould not check GPU availability for TensorFlow")

    # Check GPU availability for JAX
    if jax_installed:
        try:
            import jax
            jax_devices = jax.devices()
            # Prefer the device's `platform` attribute and keep the original
            # string match as a fallback.
            # NOTE(review): newer JAX releases presumably render GPU devices
            # as "cuda:N", which the string match alone would miss — confirm.
            gpu_available = any(
                str(getattr(d, "platform", "")).lower() in ("gpu", "cuda", "rocm")
                or "gpu" in str(d).lower()
                for d in jax_devices
            )
            print(f"JAX devices: {jax_devices}")
            print(f"GPU available for JAX: {'✓' if gpu_available else '✗'}")
        except Exception:
            print("Could not check GPU availability for JAX")

    print("\\n===== Verification completed =====")

if __name__ == "__main__":
    install_rgn2_and_alphafold()
'''

# Save the installation script
with open('install_rgn2_af2_colab.py', 'w') as f:
    f.write(installation_script)

print("Installation script for Google Colab saved to 'install_rgn2_af2_colab.py'")
print("You can run it in Colab with: %run install_rgn2_af2_colab.py")

# Create a simple example script
example_script = '''
# Example usage of RGN2 and AlphaFold in Google Colab

import os
import sys
import subprocess

def run_rgn2_example():
    """Run a simple example with RGN2."""
    print("\\n===== Running RGN2 Example =====")

    # Example protein sequence
    sequence = "MQYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDDATKTFTVTE"

    # Save sequence to a file
    with open("example.fasta", "w") as f:
        f.write(">example\\n")
        f.write(sequence)

    # Run RGN2 prediction
    # Note: This is a simplified example. Actual usage may vary.
    try:
        import tensorflow as tf
        print("TensorFlow is available for RGN2")

        # Here you would typically import RGN2 modules and run prediction
        print("To run actual prediction, follow the instructions in the RGN2 repository")
        print("or use the provided Colab notebook: https://colab.research.google.com/github/aqlaboratory/rgn2/blob/master/rgn2_prediction.ipynb")
    except Exception as e:
        print(f"Error running RGN2 example: {e}")

def run_alphafold_example():
    """Run a simple example with AlphaFold."""
    print("\\n===== Running AlphaFold Example =====")

    # Example protein sequence (same as above)
    sequence = "MQYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDDATKTFTVTE"

    # Save sequence to a file
    with open("example.fasta", "w") as f:
        f.write(">example\\n")
        f.write(sequence)

    # Run AlphaFold prediction
    # Note: This is a simplified example. Actual usage may vary.
    try:
        import jax
        print("JAX is available for AlphaFold")

        # Here you would typically import AlphaFold modules and run prediction
        print("To run actual prediction, follow the instructions in the AlphaFold repository")
        print("or use ColabFold: https://colab.research.google.com/github/sokrypton/ColabFold/blob/main/AlphaFold2.ipynb")
    except Exception as e:
        print(f"Error running AlphaFold example: {e}")

if __name__ == "__main__":
    run_rgn2_example()
    run_alphafold_example()
'''

# Save the example script
with open('example_usage.py', 'w') as f:
    f.write(example_script)

print("Example usage script saved to 'example_usage.py'")
print("You can run it in Colab with: %run example_usage.py")



Installation script for Google Colab saved to 'install_rgn2_af2_colab.py'
You can run it in Colab with: %run install_rgn2_af2_colab.py
Example usage script saved to 'example_usage.py'
You can run it in Colab with: %run example_usage.py


In [10]:
%run example_usage.py


===== Running RGN2 Example =====
TensorFlow is available for RGN2
To run actual prediction, follow the instructions in the RGN2 repository
or use the provided Colab notebook: https://colab.research.google.com/github/aqlaboratory/rgn2/blob/master/rgn2_prediction.ipynb

===== Running AlphaFold Example =====
JAX is available for AlphaFold
To run actual prediction, follow the instructions in the AlphaFold repository
or use ColabFold: https://colab.research.google.com/github/sokrypton/ColabFold/blob/main/AlphaFold2.ipynb


In [11]:
#@title Import Python Packages

%cd '/content/rgn2'

import os
import sys
import re
import hashlib
import json
import subprocess
from pathlib import Path
from IPython.utils import io
from google.colab import files

%reload_ext autoreload
%autoreload 2


sys.path.append('/content/alphafold')
from ter2pdb import ter2pdb

/content/rgn2


In [12]:
#@title ### Enter the amino acid sequence to fold ⬇️
sequence = 'MNTPALCVHSEEIKNRINCPQKAVGFPVCLKLPIQESRTGCMHIHCKLCMVQLLNRKRIPD GPLSPQDITKQRLDKRSRKFIVLQGQLNVLCPLKFAKSNTLRMLNTGRKTEVNCIDELKE YLQGAALSKVIQSMGYRNNEKRNLMLQSLPPKLSLATSSVALRSLLPSPSVRLITNRTKI PLKTSQRLQSREESNKVKNTMTTVSELQNSNSGRVETLTIAKDIKNAKVDKTNLLNNNSN ESLSNLNTSGTPGVRNLASSRVLPLTKTMEIEMCIEGTRKDLASLNNLELHTMQRNNQAS PHETLGSQITSKNSKPLESRSMANQLLQEKTTAITQRINPELFKKSSYCIHNNTRGEKKN KGIKNLVEMPSLPRNHLVLVPLLPSLLKKGNLSRLFTLNLVPEPPGLIRTSRIVHLKEQY SKNRSSLIGNNVDNKNNKKQKKLNPNLLMDKRVESLLSFLIKFFSQINLKRRLVTNMGIS LSDNVSELPEVRAFLRVKLLPASLLKHVPILLNLLNNPPEMKTINNLHTSRHQVTVPHPA LNNNRLKRSKPIGLNSKSNLLLPLRNKLLETNNSTPNLSLSRQDKLPKSSHVVPAPKPIL PFLKFKVIKKNMSLLKQKPRFPKPIIPPIIVKPSEIMENFRQLRPLRQSTIIVLPGLIKN IQAVLIQVFPPRRTSFQRLKSKPPHVPLHLKKSKGMLHKKHHKVNGLNPKVNLLLSLLLH ISIRPVNLFPIISLNPKLRILLPQNKLQQNNLTQLPKRLLKNNNNNIIQPPPQPSSPQLS LRKLSSIIIVRNPSLIHLELPRLNRIKRNKIIRKNNLPLILPPSLLNQRSLKIKNNNNIL LPKPSLLPPKLLIRLGLIQPNILKPPIIKNLIPNPSILLPSKLLQLKNIVPKKLIPILRN LPKLHLPPKVLIPPVNNLQINRRKLLKIQQPLNIVLNLKQPLLPPVLSIKKNLLNNRNTL LIRLLINKIRPIPNKPLIKNNRNPLPQIPSKPRIKPKKNTLILLVKIPLIRPRLLIRPSL LPVLKRPLNLKPKLPLRLNQPIKILPKSLILKQNIKPRKPRNIKLLNLIRLKLIPVSLRS PLLPKLNNPLRPILPSPQLPPIRPLLKKLNNLPINKPILIPPVPSSILLKPLNIILKNNP LRQNQSSLLSNLNKPSLPLLIKPLIPLPLPSSLNKIKPSLLIQNPLLKNINLPLLNPLKN'  #@param {type:"string"}
jobname = 'test' #@param {type:"string"}

# Normalise both inputs: every whitespace character is dropped, the sequence
# is upper-cased, and the job name is reduced to word characters only.
sequence = "".join(sequence.split()).upper()
jobname = re.sub(r'\W+', '', "".join(jobname.split()))

# A 3-byte BLAKE2b digest of the sequence is appended to the job name to form
# the identifier used in the file names below.
seq_hash = hashlib.blake2b(sequence.encode(), digest_size=3).hexdigest()
seq_id = f'{jobname}_{seq_hash}'

MAX_SEQUENCE_LENGTH = 1023

# Remove all whitespaces, tabs and end lines; upper-case
sequence = sequence.translate(str.maketrans('', '', ' \n\t')).upper()
aatypes = set('ACDEFGHIKLMNPQRSTVWY')  # 20 standard aatypes

# Reject letters outside the 20 standard amino acids, then over-long input.
if set(sequence) - aatypes:
  raise Exception(f'Input sequence contains non-amino acid letters: {set(sequence) - aatypes}. AlphaFold only supports 20 standard amino acids as inputs.')
if len(sequence) > MAX_SEQUENCE_LENGTH:
  raise Exception(f'Input sequence is too long: {len(sequence)} amino acids, while the maximum is {MAX_SEQUENCE_LENGTH}. Please use the full AlphaFold system for long sequences.')

# Persist the validated inputs to run.json in the current directory.
run_inputs = {'sequence': sequence, 'seq_id': seq_id}
with open("run.json", "w") as f:
    f.write(json.dumps(run_inputs))

# Relative locations of inputs and outputs for this job.
DATA_DIR = 'aminobert_output'
RUN_DIR = 'runs/15106000'
OUTPUT_DIR = 'output'
REFINE_DIR = 'output/refine_model1'
SEQ_PATH = os.path.join(DATA_DIR, f'{seq_id}.fa')
TER_PATH = os.path.join(RUN_DIR, '1', 'outputsTesting', f'{seq_id}.tertiary')

Exception: Input sequence is too long: 1201 amino acids, while the maximum is 1023. Please use the full AlphaFold system for long sequences.

In [None]:
# Let's properly split the sequence into multiple fragments
# The previous script didn't correctly split the sequence

def split_sequence_for_rgn2(sequence, max_length=900, overlap=100):
    """
    Split a long protein sequence into smaller fragments with overlap.

    Consecutive fragments start ``max_length - overlap`` residues apart, so
    each fragment shares ``overlap`` residues with its predecessor. The last
    fragment is shortened to end exactly at the end of the sequence.

    Args:
        sequence (str): The amino acid sequence to split
        max_length (int): Maximum length for each fragment (default: 900)
        overlap (int): Number of amino acids to overlap between fragments (default: 100)

    Returns:
        list: List of dictionaries containing fragment information. Every
        dictionary has the keys ``fragment_id`` (1-based), ``start`` and
        ``end`` (1-based, inclusive), ``sequence``, ``is_first_fragment``,
        ``is_last_fragment`` and ``is_full_sequence``.

    Raises:
        ValueError: If the sequence has to be split and ``max_length`` is not
            positive, or ``overlap`` is negative or not smaller than
            ``max_length`` (the window could never advance).
    """
    seq_length = len(sequence)

    # A sequence that already fits is returned as one fragment. The split
    # parameters are not validated on this path, because they are not used.
    if seq_length <= max_length:
        return [{"fragment_id": 1,
                 "start": 1,
                 "end": seq_length,
                 "sequence": sequence,
                 "is_first_fragment": True,
                 "is_last_fragment": True,
                 "is_full_sequence": True}]

    if max_length <= 0:
        raise ValueError(f"max_length must be positive, got {max_length}")
    if not 0 <= overlap < max_length:
        # overlap >= max_length would keep the window in place forever;
        # a negative overlap would leave gaps between fragments.
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < max_length, "
            f"got overlap={overlap}, max_length={max_length}"
        )

    step = max_length - overlap
    fragments = []

    for fragment_id, start_idx in enumerate(range(0, seq_length, step), start=1):
        end_idx = min(start_idx + max_length, seq_length)

        fragments.append({
            "fragment_id": fragment_id,
            "start": start_idx + 1,  # 1-indexed for biologists
            "end": end_idx,
            "sequence": sequence[start_idx:end_idx],
            "is_first_fragment": fragment_id == 1,
            "is_last_fragment": end_idx == seq_length,
            "is_full_sequence": False,
        })

        # Once a fragment reaches the end, any further window would only
        # repeat residues that are already covered.
        if end_idx == seq_length:
            break

    return fragments

# Test with the provided sequence
test_sequence = "MNTPALCVHSEEIKNRINCPQKAVGFPVCLKLPIQESRTGCMHIHCKLCMVQLLNRKRIPD" + \
                "GPLSPQDITKQRLDKRSRKFIVLQGQLNVLCPLKFAKSNTLRMLNTGRKTEVNCIDELKE" + \
                "YLQGAALSKVIQSMGYRNNEKRNLMLQSLPPKLSLATSSVALRSLLPSPSVRLITNRTKI" + \
                "PLKTSQRLQSREESNKVKNTMTTVSELQNSNSGRVETLTIAKDIKNAKVDKTNLLNNNSN" + \
                "ESLSNLNTSGTPGVRNLASSRVLPLTKTMEIEMCIEGTRKDLASLNNLELHTMQRNNQAS" + \
                "PHETLGSQITSKNSKPLESRSMANQLLQEKTTAITQRINPELFKKSSYCIHNNTRGEKKN" + \
                "KGIKNLVEMPSLPRNHLVLVPLLPSLLKKGNLSRLFTLNLVPEPPGLIRTSRIVHLKEQY" + \
                "SKNRSSLIGNNVDNKNNKKQKKLNPNLLMDKRVESLLSFLIKFFSQINLKRRLVTNMGIS" + \
                "LSDNVSELPEVRAFLRVKLLPASLLKHVPILLNLLNNPPEMKTINNLHTSRHQVTVPHPA" + \
                "LNNNRLKRSKPIGLNSKSNLLLPLRNKLLETNNSTPNLSLSRQDKLPKSSHVVPAPKPIL" + \
                "PFLKFKVIKKNMSLLKQKPRFPKPIIPPIIVKPSEIMENFRQLRPLRQSTIIVLPGLIKN" + \
                "IQAVLIQVFPPRRTSFQRLKSKPPHVPLHLKKSKGMLHKKHHKVNGLNPKVNLLLSLLLH" + \
                "ISIRPVNLFPIISLNPKLRILLPQNKLQQNNLTQLPKRLLKNNNNNIIQPPPQPSSPQLS" + \
                "LRKLSSIIIVRNPSLIHLELPRLNRIKRNKIIRKNNLPLILPPSLLNQRSLKIKNNNNIL" + \
                "LPKPSLLPPKLLIRLGLIQPNILKPPIIKNLIPNPSILLPSKLLQLKNIVPKKLIPILRN" + \
                "LPKLHLPPKVLIPPVNNLQINRRKLLKIQQPLNIVLNLKQPLLPPVLSIKKNLLNNRNTL" + \
                "LIRLLINKIRPIPNKPLIKNNRNPLPQIPSKPRIKPKKNTLILLVKIPLIRPRLLIRPSL" + \
                "LPVLKRPLNLKPKLPLRLNQPIKILPKSLILKQNIKPRKPRNIKLLNLIRLKLIPVSLRS" + \
                "PLLPKLNNPLRPILPSPQLPPIRPLLKKLNNLPINKPILIPPVPSSILLKPLNIILKNNP" + \
                "LRQNQSSLLSNLNKPSLPLLIKPLIPLPLPSSLNKIKPSLLIQNPLLKNINLPLLNPLKN"

# Remove whitespace and standardize
test_sequence = "".join(test_sequence.split()).upper()

# Split the sequence into fragments with maximum length of 900 and overlap of 100
fragments = split_sequence_for_rgn2(test_sequence, max_length=900, overlap=100)

# Print summary
print(f"Original sequence length: {len(test_sequence)} amino acids")
print(f"Split into {len(fragments)} fragments with maximum length of 900 and overlap of 100 amino acids")

# Create a modified version of the RGN2 notebook cell to handle fragments
#
# `modified_cell` is a plain (non-f, non-raw) string holding the source of a
# sequence-input cell. It carries two placeholder tokens, FRAGMENT_SEQUENCE and
# FRAGMENT_NAME, which the loop further down substitutes with `str.replace`.
# Because this is not an f-string, the `{...}` groups inside it are written out
# literally and only become f-string fields when the generated script runs.
# Backslashes are doubled (`\\W`, `\\n`, `\\t`) so that the generated script
# receives a single backslash in each of those escapes.
modified_cell = """
#@title ### Enter the amino acid sequence to fold ⬇️
sequence = 'FRAGMENT_SEQUENCE'  #@param {type:"string"}
jobname = 'FRAGMENT_NAME' #@param {type:"string"}

import re
import hashlib
import json
import os

# Remove whitespace
sequence = "".join(sequence.split()).upper()
jobname = "".join(jobname.split())

jobname = re.sub(r'\\W+', '', jobname)
seq_hash = hashlib.blake2b(sequence.encode(), digest_size=3).hexdigest()
seq_id = f'{jobname}_{seq_hash}'

MAX_SEQUENCE_LENGTH = 1023

# Remove all whitespaces, tabs and end lines; upper-case
sequence = sequence.translate(str.maketrans('', '', ' \\n\\t')).upper()
aatypes = set('ACDEFGHIKLMNPQRSTVWY')  # 20 standard aatypes
if not set(sequence).issubset(aatypes):
  raise Exception(f'Input sequence contains non-amino acid letters: {set(sequence) - aatypes}. AlphaFold only supports 20 standard amino acids as inputs.')
if len(sequence) > MAX_SEQUENCE_LENGTH:
  raise Exception(f'Input sequence is too long: {len(sequence)} amino acids, while the maximum is {MAX_SEQUENCE_LENGTH}. Please use the full AlphaFold system for long sequences.')

run_inputs = {'sequence': sequence, 'seq_id': seq_id}
with open("run.json", "w") as f:
    json.dump(run_inputs, f)

DATA_DIR = 'aminobert_output'
RUN_DIR = 'runs/15106000'
OUTPUT_DIR = 'output'
REFINE_DIR = 'output/refine_model1'
SEQ_PATH = os.path.join(DATA_DIR, f'{seq_id}.fa')
TER_PATH = os.path.join(RUN_DIR, '1', 'outputsTesting', f'{seq_id}.tertiary')
"""

# Generate modified cells for each fragment: one `rgn2_fragment_<id>.py` file
# per entry in `fragments`, written to the current working directory.
for fragment in fragments:
    fragment_id = fragment["fragment_id"]
    start = fragment["start"]
    end = fragment["end"]
    sequence = fragment["sequence"]

    fragment_name = f"fragment_{fragment_id}_{start}_{end}"

    # Create the modified cell for this fragment by filling in both
    # placeholder tokens of the template.
    current_cell = (
        modified_cell
        .replace("FRAGMENT_SEQUENCE", sequence)
        .replace("FRAGMENT_NAME", fragment_name)
    )

    # Save to a file. The template contains non-ASCII characters (the emoji in
    # its title line), so the encoding is pinned to UTF-8 instead of relying
    # on the locale default, which may be unable to encode them.
    with open(f"rgn2_fragment_{fragment_id}.py", "w", encoding="utf-8") as f:
        f.write(current_cell)

    print(f"Created script for fragment {fragment_id} (residues {start}-{end}, length: {len(sequence)})")

# Create a guide for using the fragments with RGN2
rgn2_guide = """
# Guide for Using RGN2 with Long Protein Sequences

## Overview
This guide explains how to predict the structure of a long protein sequence (>1023 amino acids) using RGN2 by splitting it into smaller fragments.

## Step 1: Fragment Preparation
Your long protein sequence has been split into overlapping fragments, each under the 1023 amino acid limit. Each fragment has been saved as a separate Python script that you can use in the RGN2 Colab notebook.

## Step 2: Predict Each Fragment
For each fragment:

1. Open the RGN2 Colab notebook: https://colab.research.google.com/github/aqlaboratory/rgn2/blob/master/rgn2_prediction.ipynb
2. Run the first cell to set up the environment
3. Replace the sequence input cell with the content of the fragment script (e.g., `rgn2_fragment_1.py`)
4. Run the remaining cells to generate the prediction
5. Download the resulting PDB file
6. Repeat for each fragment

## Step 3: Combine the Fragments
After predicting all fragments:

1. Use a structural alignment tool like PyMOL, UCSF Chimera, or ChimeraX
2. Align the overlapping regions of adjacent fragments
3. Merge the aligned structures to create a complete model

## Step 4: Refine the Combined Structure
1. Perform energy minimization on the combined structure
2. Check for steric clashes and resolve them
3. Validate the final structure

## Tips for Success
- Ensure significant overlap between fragments (100+ residues) for accurate alignment
- Pay attention to the quality of the predicted structures in the overlapping regions
- If a fragment fails to predict properly, try adjusting the fragment boundaries
- For very large proteins, consider domain-based modeling instead of sequential fragments

## Merging Fragments with PyMOL
You can use the provided `merge_fragments.py` script with PyMOL to merge your fragments:

In [None]:
#@title Generate AminoBERT Embeddings

%%bash
# Shell portion: load conda's shell integration and activate the `rgn2`
# environment before starting the interpreter.
source /opt/conda/etc/profile.d/conda.sh && conda init
conda activate rgn2
# NOTE(review): `python` is started without a script argument; the Python
# source below is presumably consumed from the remainder of this cell's
# stdin — confirm before reordering or inserting shell commands after it.
python

import os
import sys
import json
# Make the `aminobert` directory under the current working directory importable.
sys.path.append(os.path.join(os.getcwd(), 'aminobert'))

import shutil
from aminobert.prediction import aminobert_predict_sequence
from data_processing.aminobert_postprocessing import aminobert_postprocess

# Settings passed to the two AminoBERT calls at the end of this cell.
DATA_DIR = 'aminobert_output'
DATASET_NAME = '1'
PREPEND_M = True
AMINOBERT_CHKPT_DIR = 'resources/aminobert_checkpoint/AminoBERT_runs_v2_uniparc_dataset_v2_5-1024_fresh_start_model.ckpt-1100000'

# Load the run inputs from run.json in the current directory; the 'sequence'
# and 'seq_id' entries are the ones read below.
with open("run.json", "r") as f:
    run_inputs = json.load(f)

# Remove old data since AminoBERT combines entire directory to create dataset.
if os.path.exists(DATA_DIR):
  shutil.rmtree(DATA_DIR)
os.makedirs(DATA_DIR)

# The sequence is passed as `seq` and its id as `header`; PREPEND_M is
# forwarded to both the prediction and the post-processing call.
aminobert_predict_sequence(seq=run_inputs['sequence'], header=run_inputs['seq_id'],
                           prepend_m=PREPEND_M, checkpoint=AMINOBERT_CHKPT_DIR,
                           data_dir=DATA_DIR)
aminobert_postprocess(data_dir=DATA_DIR, dataset_name=DATASET_NAME, prepend_m=PREPEND_M)

In [None]:
#@title Run RGN2
#@markdown This step generates the raw RGN2-predicted C-alpha trace.

# Shell prefix that loads conda and activates the `rgn2` environment; it is
# chained in front of the prediction command below.
# This cell uses `io`, `os`, `subprocess` and `RUN_DIR` without defining them,
# so they must already exist from previously executed cells.
rgn2_env_init = 'source /opt/conda/etc/profile.d/conda.sh && conda init && conda activate rgn2'
try:
  # Capture the cell output so it is only printed if the command fails.
  with io.capture_output() as captured:
    # rgn/protling.py is given the `configuration` path inside RUN_DIR.
    cmd = (f"python rgn/protling.py {os.path.join(RUN_DIR, 'configuration')} "
           f"-p -e 'weighted_testing' -a -g 0")
    %shell {rgn2_env_init} && {cmd}
except subprocess.CalledProcessError:
  # NOTE(review): presumably raised by `%shell` on a non-zero exit status —
  # confirm. The captured output is shown before the error is re-raised.
  print(captured)
  raise

print('Prediction completed!')

In [None]:
#@title Refinement and Structure Download
#@markdown Once this cell has been executed, a PDB file with the refined
#@markdown structure will be automatically downloaded to your computer.
#@markdown **Note**: Notebook refinement pipeline is ~2x slower compared
#@markdown to local execution.
recycles = 1 #@param {type:"integer"}

# This cell uses `ter2pdb`, `Path`, `files`, `io`, `os`, `subprocess`, `seq_id`,
# SEQ_PATH, TER_PATH, OUTPUT_DIR and REFINE_DIR without defining them, so they
# must already exist from previously executed cells.

# NOTE(review): judging by its name, this presumably builds an all-atom model
# from the C-alpha trace at TER_PATH — confirm against ter2pdb.
ter2pdb.run_ca_to_allatom(seq_path=SEQ_PATH, ter_path=TER_PATH,
                          output_dir=OUTPUT_DIR, seq_id=seq_id)

# `out_suffix` is passed to the refinement script and is also used below to
# build the name of the file that gets downloaded.
out_suffix = '_prediction'
# Shell prefix that loads conda and activates the `af2` environment.
af2_env_init = 'source /opt/conda/etc/profile.d/conda.sh && conda init && conda activate af2'
# Environment variable assignments placed directly in front of the python command.
jax_env_vars = 'TF_FORCE_UNIFIED_MEMORY=1 XLA_PYTHON_CLIENT_MEM_FRACTION=2.0'
# Command line for ter2pdb/run_af2rank.py; the sequence directory is the
# parent directory of SEQ_PATH, and `recycles` comes from the form field above.
cmd = (f"{jax_env_vars} python ter2pdb/run_af2rank.py refine_model1 "
       f"--target_list {seq_id} --af2_dir /content/alphafold/ "
       f"--out_suffix {out_suffix} --seq_dir {Path(SEQ_PATH).parent} "
       f"--pdb_dir {OUTPUT_DIR} --output_dir {OUTPUT_DIR} --deterministic "
       f"--seq_replacement - --mask_sidechains_add_cb --model_num 1 "
       f"--recycles {recycles}")
try:
  # Capture the cell output so it is only printed if the command fails.
  with io.capture_output() as captured:
    %shell {af2_env_init} && {cmd}
except subprocess.CalledProcessError:
  # Show the captured output before re-raising the error.
  print(captured)
  raise

print('Refinement completed!')

# Download `<seq_id><out_suffix>.pdb` from REFINE_DIR.
files.download(os.path.join(REFINE_DIR, f'{seq_id}{out_suffix}.pdb'))