# Generate json

In [None]:
#| default_exp json

In [None]:
#| export
import shutil, json, pandas as pd
from pathlib import Path

## Single protein sequence (default)
> Default pipeline: runs the MSA and template search.

In [None]:
#| export
def dump_json(data, save_path):
    "Write `data` to `save_path` as json, indented by 4 spaces"
    with open(save_path, mode='w') as out_file:
        json.dump(data, out_file, indent=4)

In [None]:
#| export
def get_protein_json(name, # job name
                     seq, # aa sequence
                     save_path=None, # .json
                     seeds=[1] # model seeds; copied, never stored by reference
                     ):
    """Generate json of single protein sequence for input of docker command

    Returns the json data as a dict. When `save_path` is given, its parent
    directory is created if needed and the dict is also written there.
    """

    json_data = {
        "name": name,
        # Copy the seeds so the returned dict aliases neither the caller's list
        # nor the shared default `[1]`; otherwise mutating
        # `result["modelSeeds"]` would leak into later calls.
        "modelSeeds": list(seeds),
        "sequences": [
            {
                "protein": {
                    "id": "A",
                    "sequence": seq,
                }
            },
        ],
        "bondedAtomPairs": [],
        "dialect": "alphafold3",
        "version": 2
    }
    if save_path:
        Path(save_path).parent.mkdir(parents=True, exist_ok=True)
        dump_json(json_data,save_path)
    return json_data

In [None]:
# Build the input for the sequence 'AAA' with three seeds; the dict is also saved to data/proteinA.json
data = get_protein_json('proteinA','AAA','data/proteinA.json',seeds=[1,2,3])
data

{'name': 'proteinA',
 'modelSeeds': [1, 2, 3],
 'sequences': [{'protein': {'id': 'A', 'sequence': 'AAA'}}],
 'bondedAtomPairs': [],
 'dialect': 'alphafold3',
 'version': 2}

## Protein-SMILES

- First, run the normal `sequence only` pipeline for the protein.
- Then take the output `data.json` file, read it, and reuse its `["sequences"][0]["protein"]` entry.

In [None]:
#| export
def read_json(file_path):
    "Load the json file at `file_path` and return the parsed data"
    with open(file_path) as in_file:
        return json.load(in_file)

In [None]:
# Load a saved data json (presumably the output of a sequence-only run) to reuse its protein entry
protein_json = read_json('data/seq_only_data.json')

In [None]:
#| export
def get_protein_smiles_json(smi_id:str, # job name
                            SMILES:str, # ligand SMILES string
                            protein_json, # json type
                            save_path=None, # .json
                            seeds=[1] # model seeds; copied, never stored by reference
                            ):
    """Get json for protein-ligand docking task

    The ligand is given id "L". The protein entry is taken from
    `protein_json["sequences"][0]["protein"]` and is shared by reference,
    not copied. Returns the json data as a dict. When `save_path` is given,
    its parent directory is created if needed and the dict is also written
    there.
    """

    json_data = {
        "name": smi_id,
        # Copy the seeds so the returned dict aliases neither the caller's list
        # nor the shared default `[1]`; otherwise mutating
        # `result["modelSeeds"]` would leak into later calls.
        "modelSeeds": list(seeds),
        "sequences": [
            {
                "ligand": {
                    "id": "L",
                    "smiles": SMILES,
                }
            }, 
            {
                "protein": protein_json["sequences"][0]["protein"]
            },
        ],
        "bondedAtomPairs": [],
        "dialect": "alphafold3",
        "version": 2
    }
    if save_path:
        Path(save_path).parent.mkdir(parents=True, exist_ok=True)
        dump_json(json_data,save_path)
    return json_data

In [None]:
# Combine the ligand 'CCC' with the loaded protein entry; the dict is also saved to data/protein_smi.json
out = get_protein_smiles_json('smi_name','CCC',protein_json,'data/protein_smi.json',seeds=[1,2,3])

Let's take a look at the JSON:

In [None]:
# Show only the first 100 characters of the dict's string form
str(out)[:100]

"{'name': 'smi_name', 'modelSeeds': [1, 2, 3], 'sequences': [{'ligand': {'id': 'L', 'smiles': 'CCC'}}"

In [None]:
# Toy table with one ligand per row: an identifier (`idx`) and its SMILES string (`smi`)
df = pd.DataFrame({'idx':['a','b'],'smi':['CCC','OCO']})
df

Unnamed: 0,idx,smi
0,a,CCC
1,b,OCO


In [None]:
# Write one protein-ligand json per row of `df` to af_input/<project_name>/<idx>.json
project_name='sdf'
# Each row of `df.values` unpacks into (idx, smi), following the column order of `df`
for idx, smi in df.values:
    _ = get_protein_smiles_json(idx,smi,protein_json,f'af_input/{project_name}/{idx}.json',seeds=[1,2,3])

## Split the files into subfolders for multi-GPU runs

In [None]:
#| export
def split_nfolder(folder_dir, # folder containing the .json files
                  n=4 # number of subfolders, must be >= 1
                  ):
    """Move json files from a folder into subfolders (folder_0, folder_1, ..., folder_{n-1}).

    Files matching `*.json` directly inside `folder_dir` are sorted by path and
    assigned to the subfolders round-robin. All `n` subfolders are created,
    even when there are fewer files than subfolders.

    Raises `ValueError` if `n` is smaller than 1.
    """
    # Fail before touching the filesystem: n == 0 would otherwise surface as a
    # ZeroDivisionError and n < 0 as an IndexError inside the move loop.
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")

    folder_dir = Path(folder_dir)

    # Sorted so the file-to-subfolder assignment is deterministic
    files = sorted(folder_dir.glob("*.json"))
    subfolders = [folder_dir / f"folder_{i}" for i in range(n)]
    for folder in subfolders:
        folder.mkdir(exist_ok=True)

    for idx, file in enumerate(files):
        target_folder = subfolders[idx % n]
        shutil.move(str(file), target_folder / file.name)

    print(f"Distributed {len(files)} files into {n} folders.")

In [None]:
# Spread the json files generated for this project across subfolders (default n=4)
split_nfolder(f'af_input/{project_name}')

Distributed 2 files into 4 folders.


## End

In [None]:
#| hide
# Run nbdev's export step (presumably writes the `#| export` cells to the library module; confirm against nbdev docs)
import nbdev; nbdev.nbdev_export()