# Prepare input json

## Setup

In [None]:
#| default_exp core

In [None]:
#| export
import os, json, pandas as pd
from pathlib import Path
from tqdm import tqdm
from itertools import combinations

## Instructions

- Create an `af_input` folder outside of the `alphafold3` folder
  - Make sure the A3M directory is inside this folder
- Prepare the `af_model`, `af_output`, and `af_database` folders in the same parent folder
- When mounting Docker volumes, mount each of the directories above
- Add an `af_report` folder for saving report files

```sh
cd alphafold3
docker build -t alphafold3 -f docker/Dockerfile .
```

```sh
docker run --rm \
    --volume $HOME/af_input:/root/af_input \
    --volume $HOME/af_output:/root/af_output \
    --volume $HOME/af_model:/root/models \
    --volume $HOME/af_database:/root/public_databases \
    --gpus "device=0" \
    alphafold3 \
    python run_alphafold.py \
    --input_dir=/root/af_input/{path_to_json_folder}/ \
    --output_dir=/root/af_output/{path_to_output} \
    --model_dir=/root/models
```

Use `--json_path` instead of `--input_dir` to run a single, specific JSON file

## Sequence only input
> default pipeline

In [None]:
#| export
def get_AF_input_seq(name, seq):
    "Build the AF input dict for a job named `name` holding one protein chain (id 'A') with sequence `seq`"

    # The only entity in the job: a single protein chain
    chain_entry = {"protein": {"id": "A", "sequence": seq}}

    return {
        "name": name,
        "modelSeeds": [1],
        "sequences": [chain_entry],
        "bondedAtomPairs": [],
        "dialect": "alphafold3",
        "version": 2,
    }

In [None]:
data = get_AF_input_seq('proteinA','AAA')
data

{'name': 'proteinA',
 'modelSeeds': [1],
 'sequences': [{'protein': {'id': 'A', 'sequence': 'AAA'}}],
 'bondedAtomPairs': [],
 'dialect': 'alphafold3',
 'version': 2}

In [None]:
#| export
def write_json(data, save_path):
    """Write `data` to `save_path` as JSON indented by 4 spaces

    `save_path` may be a str or Path; an existing file is overwritten.
    Raises TypeError (from `json.dumps`) if `data` is not JSON serializable,
    in which case the file at `save_path` is left untouched.
    """
    # Serialize first: opening with 'w' truncates the file, so a serialization
    # error after opening would leave an empty or partial file behind
    text = json.dumps(data, indent=4)
    with open(save_path, 'w', encoding='utf-8') as f:
        f.write(text)

In [None]:
write_json(data,'data/proteinA_seq_only.json')

In [None]:
#| export
def read_json(file_path):
    "Load the JSON file at `file_path` (str or Path) and return the parsed object"
    # JSON text is UTF-8; decode explicitly rather than with the platform's locale encoding
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

In [None]:
read_json('data/proteinA_seq_only.json')

{'name': 'proteinA',
 'modelSeeds': [1],
 'sequences': [{'protein': {'id': 'A', 'sequence': 'AAA'}}],
 'bondedAtomPairs': [],
 'dialect': 'alphafold3',
 'version': 2}

## Protein-protein input

In [None]:
#| export
def a3m_to_seq(file_path:Path):
    """Get protein sequence from a3m file

    Returns the sequence of the first record in the file: every line between
    the first `>` header and the next `>` header, joined together. Blank lines
    and lines starting with `#` are ignored, so the file may or may not begin
    with a `#` line before the first header. `file_path` may be a str or Path.
    Raises ValueError if the file contains no such sequence.
    """
    seq_parts = []
    in_first_record = False
    for raw_line in Path(file_path).read_text().splitlines():
        # NOTE(review): NUL bytes are stripped too; presumably some a3m writers
        # use them to separate MSA blocks -- confirm against your files
        line = raw_line.strip().strip('\x00')
        if not line or line.startswith('#'):
            continue
        if line.startswith('>'):
            if in_first_record:
                break  # second header reached: the first record is complete
            in_first_record = True
            continue
        if in_first_record:
            seq_parts.append(line)

    if not seq_parts:
        raise ValueError(f"No sequence found in a3m file: {file_path}")
    return ''.join(seq_parts)

In [None]:
a3m_to_seq(Path('data/CD8A.a3m'))

'SQFRVSPLDRTWNLGETVELKCQVLLSNPTSGCSWLFQPRGAAASPTFLLYLSQNKPKAAEGLDTQRFSGKRLGDTFVLTLSDFRRENEGYYFCSALSNSIMYFSHFVPVFLPAKPTTTPAPRPPTPAPTIASQPLSLRPEACRPAAGGAVHTRGLDFACD'

In [None]:
#| export
def get_protein_json(gene_name, a3m_dir=".",idx = 'A',run_template=True, docker_root="/root/af_input"):
    """Get alphafold format protein json from a3m file; make sure a3m_dir is under af_input

    gene_name: stem of the a3m file, read from `<a3m_dir>/<gene_name>.a3m`
    a3m_dir: folder holding the a3m files, given relative to the af_input folder
    idx: chain id stored under 'id'
    run_template: 'templates' is None when True and [] when False
        (NOTE(review): per the AlphaFold 3 input format, null is understood to
        request template search and an empty list to disable it -- confirm)
    docker_root: folder inside the container where af_input is mounted
    """
    file_path = Path(a3m_dir)/f"{gene_name}.a3m"
    protein_sequence = a3m_to_seq(file_path)

    # Path as seen from inside the container: always '/'-separated, even when
    # this function runs on a host whose native separator differs
    msa_path = f"{docker_root.rstrip('/')}/{file_path.as_posix()}"

    json_data = {
        'id': idx,
        'sequence': protein_sequence,
        'modifications': [],
        'unpairedMsaPath': msa_path,
        'pairedMsa': '',
        'templates': None if run_template else []
    }

    return json_data

In [None]:
protein_json = get_protein_json('CD8A',a3m_dir='data',idx='A')

In [None]:
protein_json.keys()

dict_keys(['id', 'sequence', 'modifications', 'unpairedMsaPath', 'pairedMsa', 'templates'])

In [None]:
#| export
def _chain_id(index):
    "Spreadsheet-style chain id for a 0-based index: A..Z, then AA, AB, ..."
    letters = ''
    index += 1
    while index:
        index, rem = divmod(index - 1, 26)
        letters = chr(ord('A') + rem) + letters
    return letters

def get_AF_input(gene_list,a3m_dir,run_template=True):
    """Get AF3 input json data, allows multiple genes/proteins

    gene_list: gene names, one protein chain per entry (repeats allowed)
    a3m_dir: folder with `<gene>.a3m` files, passed on to `get_protein_json`
    run_template: passed on to `get_protein_json`
    The job name is the gene names joined with '_'. Chains are labelled A..Z
    in input order and continue with AA, AB, ... beyond 26 chains.
    """
    # Materialize once: the genes are needed both for the chains and for the name
    gene_list = list(gene_list)
    sequences = []
    for index, gene in enumerate(gene_list):
        protein_json = get_protein_json(gene,a3m_dir,idx=_chain_id(index),run_template=run_template)
        sequences.append({'protein':protein_json})
    name = '_'.join(gene_list)
    json_data = {
            "name": name,
            "modelSeeds": [1],
            "sequences": sequences,
            "bondedAtomPairs": [],
            "dialect": "alphafold3",
            "version": 2
        }
    return json_data

In [None]:
AF_input = get_AF_input(['CD8A','CD8A'],a3m_dir='data')

In [None]:
AF_input.keys(), len(AF_input['sequences'])

(dict_keys(['name', 'modelSeeds', 'sequences', 'bondedAtomPairs', 'dialect', 'version']),
 2)

In [None]:
#| export
def save_json(json_data, folder):
    "Dump `json_data` into `folder` as a json file named after its 'name' entry"
    target = Path(folder).joinpath(f"{json_data['name']}.json")
    with open(target, 'w') as handle:
        json.dump(json_data, handle, indent=4)

In [None]:
save_json(AF_input,'data')

In [None]:
#| export
def generate_pair_df(gene_list,self_pair=True):
    """Unique pair genes in a gene list

    gene_list: iterable of gene names; repeated names are collapsed (first
        occurrence kept) so that no pair is emitted twice
    self_pair: also append one (gene, gene) row per gene after the cross pairs
    Returns a DataFrame with columns Gene1 and Gene2 and a 0..n-1 index.
    """
    # De-duplicate while keeping input order; this also materializes one-shot
    # iterables, which are needed twice below
    genes = list(dict.fromkeys(gene_list))

    pairs = list(combinations(genes, 2))
    if self_pair:
        pairs.extend((gene, gene) for gene in genes)

    # Build the frame in one go rather than concatenating with a possibly empty frame
    return pd.DataFrame(pairs, columns=["Gene1", "Gene2"])

In [None]:
generate_pair_df(list('ABC'))

Unnamed: 0,Gene1,Gene2
0,A,B
1,A,C
2,B,C
3,A,A
4,B,B
5,C,C


In [None]:
df = generate_pair_df(['CD8A'])
df

Unnamed: 0,Gene1,Gene2
0,CD8A,CD8A


In [None]:
#| export
def save_input(pair_df, a3m_dir, save_dir, nfolder=4):
    """Save Alphafold input file in a directory with separate folder for parallel calculating

    pair_df: DataFrame with `Gene1` and `Gene2` columns, one job per row
    a3m_dir: folder with the a3m files, passed on to `get_AF_input`
    save_dir: output folder; `folder_0` .. `folder_{nfolder-1}` are created in it
    nfolder: number of sub-folders the jobs are spread over (round-robin by row position)
    Each job is written as `<Gene1>_<Gene2>.json`; an existing file of that name is overwritten.
    Raises ValueError if `nfolder` is smaller than 1.
    """
    if nfolder < 1:
        raise ValueError(f"nfolder must be at least 1, got {nfolder}")

    save_dir = Path(save_dir)

    # Create the output folders once up front instead of on every row
    folders = [save_dir / f'folder_{n}' for n in range(nfolder)]
    for folder_path in folders:
        folder_path.mkdir(parents=True, exist_ok=True)

    # Spread by row position, not index label, so a filtered or non-integer
    # index still distributes the jobs evenly
    gene_pairs = zip(pair_df['Gene1'], pair_df['Gene2'])
    for pos, (gene1, gene2) in tqdm(enumerate(gene_pairs), total=len(pair_df)):
        json_data = get_AF_input([gene1, gene2], a3m_dir=a3m_dir)
        save_fname = folders[pos % nfolder] / f'{gene1}_{gene2}.json'
        write_json(json_data, save_fname)

In [None]:
save_input(df,a3m_dir='data',save_dir='data')

100%|██████████| 1/1 [00:00<00:00, 276.92it/s]


In the `data` folder, this generates `folder_0` to `folder_{nfolder-1}` (`folder_0` to `folder_3` with the default `nfolder=4`)

### Example for multiple genes:

```python
check_genes = ['CD8A','geneB']
print(len(df.id.unique()))

for gene in check_genes:
    gene_df = pd.DataFrame({'Gene1':gene,'Gene2':df.id.unique()})
    save_input(gene_df,a3m_dir='a3m_dir',save_dir=f'af_input/{gene}')
    print('done for', gene)
    break
```

## Protein-SMILES

- First run the normal `sequence only` pipeline for the protein
- Read the resulting output `data.json` file with `read_json`; its `["sequences"][0]["protein"]` entry is reused as the protein input

In [None]:
#| export
def get_AF_input_smi(smi_idx, smiles, protein_json):
    """Generate AF input file for smiles protein docking task

    smi_idx: identifier of the ligand, used as the job name (converted to str)
    smiles: SMILES string of the ligand, stored as chain 'L'
    protein_json: dict whose `["sequences"][0]["protein"]` entry is reused
        as the protein chain (the same object is referenced, not copied)
    """
    json_data = {
        # The job name must be text; ids taken from a DataFrame may be integers
        "name": str(smi_idx),
        "modelSeeds": [1],
        "sequences": [
            {
                "ligand": {
                    "id": "L",
                    "smiles": smiles,
                }
            },
            {
                "protein": protein_json["sequences"][0]["protein"]
            },
        ],
        "bondedAtomPairs": [],
        "dialect": "alphafold3",
        "version": 2
    }
    return json_data

In [None]:
fname = 'data/seq_only_data.json'
protein_json = read_json(fname)

In [None]:
str(get_AF_input_smi('smi_name','CCC',protein_json))[:100]

"{'name': 'smi_name', 'modelSeeds': [1], 'sequences': [{'ligand': {'id': 'L', 'smiles': 'CCC'}}, {'pr"

In [None]:
#| export
def save_input_smi(df, id_col, smi_col, protein_json,save_dir):
    """Save one AF input json per ligand row of `df` into `save_dir`

    df: DataFrame with one ligand per row
    id_col: column holding the ligand id, used as job name and file name
    smi_col: column holding the SMILES string
    protein_json: passed on to `get_AF_input_smi` for every row
    save_dir: output folder, created if missing; files are named `<id>.json`
    """
    save_dir = Path(save_dir)
    save_dir.mkdir(parents=True, exist_ok=True)

    for _, r in tqdm(df.iterrows(),total=len(df)):
        json_data = get_AF_input_smi(r[id_col], r[smi_col],protein_json)
        file_name = save_dir/f"{r[id_col]}.json"
        write_json(json_data,file_name)

In [None]:
df = pd.DataFrame({'idx':['a','b'],'smi':['CCC','OCO']})
df

Unnamed: 0,idx,smi
0,a,CCC
1,b,OCO


In [None]:
save_input_smi(df,'idx','smi',protein_json,save_dir='data')

100%|██████████| 2/2 [00:00<00:00, 93.69it/s]


## End

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()