# Prepare input json

## Setup

In [None]:
#| default_exp core

In [None]:
#| export
import os, json, shutil, pandas as pd
from pathlib import Path
from tqdm import tqdm
from itertools import combinations
from pathlib import Path

## Instructions

### Docker image

To build docker image:

```sh
git clone https://github.com/google-deepmind/alphafold3.git
cd alphafold3
docker build -t alphafold3 -f docker/Dockerfile .
```

I also regularly update the alphafold3 Docker image on Docker Hub, so you can pull the image from there instead:

```sh
docker pull sky1ove/alphafold3
```

### Docker command

Before running, make sure the `af_input`, `af_model`, `af_output` and `af_database` folders are prepared under your home directory: the generated command mounts each folder as `$HOME/<folder>`.

In [None]:
#| export
def get_docker_command(
    input_dir="af_input", # Directory for input data (str or Path); only its first component is mounted
    output_dir="af_output", # Directory for output results
    model_dir="af_model", # Directory containing models
    db_dir="af_database", # Directory for databases. If None, this option is omitted
    cache_dir="cache", # Directory for JAX compilation cache. If None, this option is omitted
    gpus=0, # GPU devices to allocate (e.g., 0,1). If None, this option is omitted
    docker_name="sky1ove/alphafold3", # Docker image name
    embedding=True, # If True, includes the --save_embeddings=true flag
    skip_search=False, # If True, includes the --norun_data_pipeline flag
    search_only=False, # If True, includes the --norun_inference flag and takes precedence over skip_search
    json_path=None  # Path to JSON file. If not None, it is used instead of input_dir
):
    """Generate a Docker run command for Alphafold with customizable parameters.

    Every host directory is mounted relative to `$HOME`. With `json_path`, the parent
    folder of the file is mounted at `/root/af_input`; otherwise only the first
    component of `input_dir` is mounted there and the remaining components are
    appended to `--input_dir` inside the container.
    The command is printed to stdout; nothing is returned.
    """
    # The input is either a single JSON file or a directory of JSON files
    if json_path:
        json_path = Path(json_path)
        input_mount = json_path.parent
        input_flag = f'--json_path=/root/af_input/{json_path.name}'
    else:
        # str() lets `input_dir` be a pathlib.Path as well as a plain string
        input_mount, has_subdir, sub_dir = str(input_dir).partition('/')
        if has_subdir:
            input_flag = f'--input_dir=/root/af_input/{sub_dir}'
        else:
            input_flag = '--input_dir=/root/af_input'

    # One entry per line of the command; continuation backslashes are added when joining
    lines = ['docker run --rm',
             f'    --volume "$HOME/{input_mount}:/root/af_input"',
             f'    --volume "$HOME/{output_dir}:/root/af_output"',
             f'    --volume "$HOME/{model_dir}:/root/models"']

    if db_dir:
        lines.append(f'    --volume "$HOME/{db_dir}:/root/public_databases"')

    if cache_dir:
        lines.append(f'    --volume "$HOME/{cache_dir}:/root/cache"')

    # `gpus=0` is a valid device id, so test against None rather than truthiness
    if gpus is not None:
        lines.append(f'    --gpus "device={gpus}"')

    lines.extend([f'    {docker_name}',
                  '    python run_alphafold.py',
                  f'    {input_flag}',
                  '    --output_dir=/root/af_output',
                  '    --model_dir=/root/models'])

    if cache_dir:
        lines.append('    --jax_compilation_cache_dir=/root/cache')

    if embedding:
        lines.append('    --save_embeddings=true')

    # search_only wins over skip_search: the two flags are never emitted together
    if search_only:
        lines.append('    --norun_inference')
    elif skip_search:
        lines.append('    --norun_data_pipeline')

    # Every line except the last one ends with a shell line continuation
    docker_command = " \\\n".join(lines)
    print(docker_command)

Single json file:

In [None]:
# For a single json file the JAX compilation cache is skipped: a falsy `cache_dir`
# drops both the cache volume and the --jax_compilation_cache_dir flag
get_docker_command(json_path="path/to/your/data.json",
                   output_dir="af_output/subfolder",
                   cache_dir=False)

docker run --rm \
    --volume "$HOME/path/to/your:/root/af_input" \
    --volume "$HOME/af_output/subfolder:/root/af_output" \
    --volume "$HOME/af_model:/root/models" \
    --volume "$HOME/af_database:/root/public_databases" \
    --gpus "device=0" \
    sky1ove/alphafold3 \
    python run_alphafold.py \
    --json_path=/root/af_input/data.json \
    --output_dir=/root/af_output \
    --model_dir=/root/models \
    --save_embeddings=true


Input directory with json files:

In [None]:
# For a number of json files in the input folder: only the first path component
# ("path") is mounted, the rest is appended to --input_dir inside the container
get_docker_command(input_dir="path/to/your/folder",
                   output_dir="af_output/subfolder")

docker run --rm \
    --volume "$HOME/path:/root/af_input" \
    --volume "$HOME/af_output/subfolder:/root/af_output" \
    --volume "$HOME/af_model:/root/models" \
    --volume "$HOME/af_database:/root/public_databases" \
    --volume "$HOME/cache:/root/cache" \
    --gpus "device=0" \
    sky1ove/alphafold3 \
    python run_alphafold.py \
    --input_dir=/root/af_input/to/your/folder \
    --output_dir=/root/af_output \
    --model_dir=/root/models \
    --jax_compilation_cache_dir=/root/cache \
    --save_embeddings=true


Both commands above first search the databases and then run the GPU inference.

## MSA

In [None]:
#| export
def copy_a3m(a3m_dir: str, # Path to the source directory containing .a3m files.
             dest_dir: str, # Path to the destination directory where files will be copied
             ):
    "Copy every .a3m file found directly inside `a3m_dir` into `dest_dir`, creating `dest_dir` if needed."

    a3m_dir = Path(a3m_dir)
    dest_dir = Path(dest_dir)
    # Missing parents are created; an already existing destination is fine
    dest_dir.mkdir(parents=True, exist_ok=True)

    # Materialise the glob so the progress bar knows the total and the count can be reported
    a3m_files = [path for path in a3m_dir.glob('*.a3m')]

    for src in tqdm(a3m_files, desc="Copying files", unit="file"):
        shutil.copy(src, dest_dir / src.name)

    print(f"Copied {len(a3m_files)} a3m files from {a3m_dir} to {dest_dir}")

In [None]:
# Copy the example .a3m file(s) from data/ into af_input/ (created if missing)
copy_a3m(a3m_dir='data',dest_dir='af_input')

Copying files: 100%|██████████| 1/1 [00:00<00:00, 937.48file/s]

Copied 1 a3m files from data to af_input





## Utils

In [None]:
#| export
def split_files_into_folders(input_folder: str, nfolder: int = 4):
    """
    Splits `.a3m` files in a folder into subfolders (folder_0, folder_1, ..., folder_{nfolder-1}).

    Files are sorted by name and dealt out round-robin, so subfolder sizes differ by at
    most one. The files are moved (not copied) into subfolders created inside `input_folder`.

    Args:
        input_folder (str): Path to the folder containing `.a3m` files.
        nfolder (int): Number of subfolders to create. Default is 4. Must be >= 1.

    Raises:
        ValueError: If `input_folder` is not an existing directory or `nfolder` is < 1.
    """
    input_path = Path(input_folder)
    if not input_path.is_dir():
        raise ValueError(f"Input folder {input_folder} does not exist or is not a directory.")
    # Reject bad counts before touching the filesystem: 0 would fail later with
    # ZeroDivisionError and negative values with IndexError
    if nfolder < 1:
        raise ValueError(f"nfolder must be a positive integer, got {nfolder}.")

    # List all `.a3m` files
    a3m_files = sorted(input_path.glob("*.a3m"))
    if not a3m_files:
        print("No `.a3m` files found in the input folder.")
        return

    # Create the subfolders
    subfolders = [input_path / f"folder_{i}" for i in range(nfolder)]
    for folder in subfolders:
        folder.mkdir(exist_ok=True)

    # Distribute the files into the subfolders (round-robin)
    for idx, file in enumerate(a3m_files):
        target_folder = subfolders[idx % nfolder]
        shutil.move(str(file), target_folder / file.name)

    print(f"Distributed {len(a3m_files)} files into {nfolder} folders.")


## Sequence only input
> default pipeline

In [None]:
#| export
def get_AF_input_seq(name, seq):
    "Build the AF input dict for a single protein chain (id 'A') given only its sequence"
    # The only entity in the job: one protein chain
    protein_entry = {"protein": {"id": "A", "sequence": seq}}
    return {
        "name": name,
        "modelSeeds": [1],
        "sequences": [protein_entry],
        "bondedAtomPairs": [],
        "dialect": "alphafold3",
        "version": 2,
    }

In [None]:
# Minimal example: a job named 'proteinA' with a three-residue sequence
data = get_AF_input_seq('proteinA','AAA')
data

{'name': 'proteinA',
 'modelSeeds': [1],
 'sequences': [{'protein': {'id': 'A', 'sequence': 'AAA'}}],
 'bondedAtomPairs': [],
 'dialect': 'alphafold3',
 'version': 2}

In [None]:
#| export
def dump_json(data, save_path):
    "Write `data` as JSON (indented by 4 spaces) to `save_path`, overwriting any existing file"
    with open(save_path, mode='w') as out_file:
        json.dump(data, out_file, indent=4)

In [None]:
# Persist the sequence-only input built above; the data/ folder must already exist
dump_json(data,'data/proteinA_seq_only.json')

In [None]:
#| export
def read_json(file_path):
    "Load the JSON file at `file_path` and return the parsed object"
    with open(file_path, mode='r') as in_file:
        return json.load(in_file)

In [None]:
# Round-trip check: read back the file written by dump_json above
read_json('data/proteinA_seq_only.json')

{'name': 'proteinA',
 'modelSeeds': [1],
 'sequences': [{'protein': {'id': 'A', 'sequence': 'AAA'}}],
 'bondedAtomPairs': [],
 'dialect': 'alphafold3',
 'version': 2}

## Protein-protein input

:::{.callout-important}
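Make sure the a3m files are located under `af_input`; otherwise AlphaFold will not find them inside the container, because the MSA path written to the json is `/root/<a3m_dir>/<gene_name>.a3m`.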
:::

In [None]:
#| export
def a3m_to_seq(file_path:Path):
    """Get the query protein sequence (the first record) from an a3m file.

    Leading `#` comment lines and blank lines are skipped, so the file may or may not
    start with a `#...` header line before the first `>` record. `file_path` may be a
    `Path` or a string.

    Raises `ValueError` if the file contains no sequence record.
    """
    seq_lines = []
    in_query = False
    for line in Path(file_path).read_text().splitlines():
        line = line.strip()
        if not line or line.startswith('#'):
            continue
        if line.startswith('>'):
            # The second header marks the end of the query record
            if in_query:
                break
            in_query = True
            continue
        if in_query:
            seq_lines.append(line)
    if not seq_lines:
        raise ValueError(f"No sequence record found in {file_path}")
    # Join in case the query sequence is wrapped over several lines
    return ''.join(seq_lines)

In [None]:
# Example: read the CD8A sequence; this a3m file must already exist at this path
a3m_to_seq(Path('af_input/subfolder/a3m/CD8A.a3m'))

'SQFRVSPLDRTWNLGETVELKCQVLLSNPTSGCSWLFQPRGAAASPTFLLYLSQNKPKAAEGLDTQRFSGKRLGDTFVLTLSDFRRENEGYYFCSALSNSIMYFSHFVPVFLPAKPTTTPAPRPPTPAPTIASQPLSLRPEACRPAAGGAVHTRGLDFACD'

In [None]:
#| export
def get_protein_json(gene_name, a3m_dir=".",idx = 'A',run_template=True):
    "Build the alphafold protein entry for `gene_name` from `<a3m_dir>/<gene_name>.a3m`; make sure a3m_dir is under af_input"
    a3m_file = Path(a3m_dir) / f"{gene_name}.a3m"

    # `None` when templates should be run, an empty list otherwise
    if run_template:
        templates = None
    else:
        templates = []

    return dict(
        id=idx,
        sequence=a3m_to_seq(a3m_file),
        modifications=[],
        # Path as seen inside the docker container: the a3m path prefixed with /root
        unpairedMsaPath=str(Path("/root") / a3m_file),
        pairedMsa='',
        templates=templates,
    )

In [None]:
# Build the protein entry for CD8A from its a3m file (chain id defaults to 'A')
protein_json = get_protein_json('CD8A',a3m_dir='af_input/subfolder/a3m')

In [None]:
# Inspect the entry: note that unpairedMsaPath is the a3m path prefixed with /root
protein_json

{'id': 'A',
 'sequence': 'SQFRVSPLDRTWNLGETVELKCQVLLSNPTSGCSWLFQPRGAAASPTFLLYLSQNKPKAAEGLDTQRFSGKRLGDTFVLTLSDFRRENEGYYFCSALSNSIMYFSHFVPVFLPAKPTTTPAPRPPTPAPTIASQPLSLRPEACRPAAGGAVHTRGLDFACD',
 'modifications': [],
 'unpairedMsaPath': '/root/af_input/subfolder/a3m/CD8A.a3m',
 'pairedMsa': '',
 'templates': None}

In [None]:
#| export
def save_json(json_data, folder):
    "Write `json_data` to `<folder>/<name>.json`, where `<name>` is `json_data['name']`"
    out_name = "{}.json".format(json_data['name'])
    with open(Path(folder) / out_name, mode='w') as out_file:
        json.dump(json_data, out_file, indent=4)

In [None]:
# Writes data/proteinA.json: the file name comes from data['name']
save_json(data,'data')

In [None]:
#| export
def _chain_id(index):
    "Chain ID for a 0-based chain index: A..Z, then AA, BA, CA, ... (least-significant letter first)"
    alphabets = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    chain_id, n = '', index + 1
    while n > 0:
        n, rem = divmod(n - 1, 26)
        chain_id += alphabets[rem]
    return chain_id

def get_AF_input(gene_list,a3m_dir,run_template=True,save_folder=None):
    """Get AF3 input json data, allows multiple genes/proteins.

    Each gene in `gene_list` becomes one protein chain built from `<a3m_dir>/<gene>.a3m`.
    Chains are labelled A, B, ..., Z and then with multi-letter IDs (AA, BA, ...), so
    more than 26 chains no longer raise an IndexError. The job name is the gene names
    joined with '_'. If `save_folder` is given, the json is also written to
    `<save_folder>/<name>.json`. Returns the input dict.
    """
    sequences = [
        {'protein': get_protein_json(gene, a3m_dir, idx=_chain_id(index), run_template=run_template)}
        for index, gene in enumerate(gene_list)
    ]
    name = '_'.join(gene_list)
    json_data = {
            "name": name,
            "modelSeeds": [1],
            "sequences": sequences,
            "bondedAtomPairs": [],
            "dialect": "alphafold3",
            "version": 2
        }
    if save_folder:
        save_json(json_data,save_folder)
    return json_data

In [None]:
# CD8A homodimer: the same gene twice gives chains A and B; because save_folder is set,
# the input is also written to af_input/subfolder/CD8A_CD8A.json
AF_input = get_AF_input(['CD8A','CD8A'],
                        a3m_dir='af_input/subfolder/a3m',
                        save_folder='af_input/subfolder')

You can generate a list of json files under a folder.

In [None]:
# Top-level keys of the input and the number of chains in it
AF_input.keys(), len(AF_input['sequences'])

(dict_keys(['name', 'modelSeeds', 'sequences', 'bondedAtomPairs', 'dialect', 'version']),
 2)

In [None]:
#| export
def generate_pair_df(gene_list,self_pair=True):
    "All unordered gene pairs from `gene_list` as a Gene1/Gene2 dataframe, optionally followed by each gene paired with itself"
    # Cross pairs come first, in itertools.combinations order
    frames = [pd.DataFrame(list(combinations(gene_list, 2)), columns=["Gene1", "Gene2"])]

    # Self pairs (A-A, B-B, ...) are appended after the cross pairs
    if self_pair:
        frames.append(pd.DataFrame({'Gene1':gene_list, 'Gene2':gene_list}))

    return pd.concat(frames).reset_index(drop=True)

In [None]:
# Three genes give three cross pairs followed by three self pairs
generate_pair_df(list('ABC'))

Unnamed: 0,Gene1,Gene2
0,A,B
1,A,C
2,B,C
3,A,A
4,B,B
5,C,C


In [None]:
# A single gene yields only its self pair; `df` is consumed by the json-generation loop below
df = generate_pair_df(['CD8A'])
df

Unnamed: 0,Gene1,Gene2
0,CD8A,CD8A


In [None]:
#| export
def split_nfolder(folder_dir, 
                  n=4):
    """Splits json files in a folder into subfolders (folder_0, folder_1, ..., folder_{n-1}).

    Files are sorted by name and dealt out round-robin, so subfolder sizes differ by at
    most one. The files are moved (not copied) into subfolders created inside `folder_dir`.
    Raises `ValueError` if `n` is smaller than 1.
    """
    # Reject bad counts before touching the filesystem: 0 would fail later with
    # ZeroDivisionError and negative values with IndexError
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n}.")

    folder_dir = Path(folder_dir)

    files = sorted(folder_dir.glob("*.json"))
    subfolders = [folder_dir / f"folder_{i}" for i in range(n)]
    for folder in subfolders:
        folder.mkdir(exist_ok=True)

    # Round-robin assignment of the sorted files
    for idx, file in enumerate(files):
        target_folder = subfolders[idx % n]
        shutil.move(str(file), target_folder / file.name)

    print(f"Distributed {len(files)} files into {n} folders.")

Generate json files first:

In [None]:
# One AF3 input json per gene pair, written to af_input/subfolder/<Gene1>_<Gene2>.json
for idx, row in tqdm(df.iterrows(),total=len(df)):
    json_data = get_AF_input([row['Gene1'], row['Gene2']], 
                             a3m_dir='af_input/subfolder/a3m', 
                             save_folder='af_input/subfolder') 

100%|██████████| 1/1 [00:00<00:00, 190.95it/s]


Split them to subfolder:

In [None]:
# Move the json files in af_input/subfolder into folder_0 ... folder_3 (default n=4)
split_nfolder('af_input/subfolder')

Distributed 1 files into 4 folders.


## Protein-SMILES

- First run the normal `sequence only` pipeline for the protein
- Read the `data.json` file produced by that run with `read_json`; `get_AF_input_smi` reuses its `["sequences"][0]["protein"]` entry as the protein input

In [None]:
#| export
def get_AF_input_smi(smi_idx, smiles, protein_json):
    "Build the AF input dict for docking one SMILES ligand (id 'L') to the first protein of `protein_json`"
    ligand_entry = {"ligand": {"id": "L", "smiles": smiles}}
    # The protein entry is reused as-is (same object, not a copy) from the given input json
    protein_entry = {"protein": protein_json["sequences"][0]["protein"]}
    return {
        "name": smi_idx,
        "modelSeeds": [1],
        "sequences": [ligand_entry, protein_entry],
        "bondedAtomPairs": [],
        "dialect": "alphafold3",
        "version": 2,
    }

In [None]:
# NOTE(review): no cell in this notebook writes 'data/seq_only_data.json'; presumably it is
# the data.json output of an earlier sequence-only run copied into data/ — confirm it exists.
# This also rebinds `protein_json`, which earlier held the single protein entry for CD8A.
fname = 'data/seq_only_data.json'
protein_json = read_json(fname)

In [None]:
# Show only the first 100 characters of the generated input
str(get_AF_input_smi('smi_name','CCC',protein_json))[:100]

"{'name': 'smi_name', 'modelSeeds': [1], 'sequences': [{'ligand': {'id': 'L', 'smiles': 'CCC'}}, {'pr"

In [None]:
#| export
def save_input_smi(df, id_col, smi_col, protein_json,save_dir):
    "Write one protein-ligand AF input json per row of `df` to `<save_dir>/<id>.json`"
    for _, row in tqdm(df.iterrows(),total=len(df)):
        entry_id = row[id_col]
        # The row id is used both as the job name and as the file name
        out_path = Path(save_dir)/f"{entry_id}.json"
        dump_json(get_AF_input_smi(entry_id, row[smi_col], protein_json), out_path)

In [None]:
# Example ligand table: one id and one SMILES string per row.
# Note: this rebinds `df`, which earlier in the notebook held the gene-pair table.
df = pd.DataFrame({'idx':['a','b'],'smi':['CCC','OCO']})
df

Unnamed: 0,idx,smi
0,a,CCC
1,b,OCO


In [None]:
# Writes data/a.json and data/b.json, one input file per row of df
save_input_smi(df,'idx','smi',protein_json,save_dir='data')

100%|██████████| 2/2 [00:00<00:00, 122.72it/s]


## End

In [None]:
#| hide
# Export the cells marked `#| export` to the module named by `#| default_exp`
import nbdev; nbdev.nbdev_export()