# Protein pairs & ColabFold pipeline

- order: 3

We use the ColabFold MSA pipeline for protein pairs because it takes much less time than the default AlphaFold 3 MSA pipeline.

## Setup

In [None]:
#| default_exp af3.protein_pairs

In [1]:
#| export
import os, json, shutil, pandas as pd
from tqdm import tqdm
from itertools import combinations
from pathlib import Path
from kdock.af3.json import *
from kdock.af3.docker import *

## Docker image

```sh
docker pull sky1ove/alphafold3
```

## Protein pairs

Protein-protein screening involves many proteins, and the default AF3 MSA pipeline takes a long time on that many inputs, so we use the ColabFold MSA pipeline instead.

In [None]:
#| export
def get_colabfold_cmd(csv_path,project_name):
    "Print the `colabfold_batch --msa-only` command to run in a terminal for `csv_path`"
    # The output directory is named after the project: msa_<project_name>
    command = f'colabfold_batch {csv_path} msa_{project_name} --msa-only'
    # Header line, a blank line, then the command indented by one space
    print('Run below in terminal:', f'\n {command}', sep='\n')

In [None]:
project_name='sdf'

In [None]:
get_colabfold_cmd('sdf.csv',project_name)

Run below in terminal:

 colabfold_batch sdf.csv msa_sdf --msa-only


## MSA

In [None]:
#| export
def copy_a3m(a3m_dir: str, # Path to the source directory containing .a3m files.
             dest_dir: str, # Path to the destination directory where files will be copied
             ):
    "Copy every `.a3m` file found directly inside `a3m_dir` into `dest_dir`, creating `dest_dir` if needed."
    src_root = Path(a3m_dir)
    dst_root = Path(dest_dir)

    # Create the destination (and any missing parents) before copying anything.
    dst_root.mkdir(parents=True, exist_ok=True)

    # Non-recursive match; materialised so the count is known for the progress bar and the summary.
    a3m_files = [*src_root.glob('*.a3m')]

    for src in tqdm(a3m_files, desc="Copying files", unit="file"):
        # Keep the original file name in the destination folder.
        shutil.copy(src, dst_root / src.name)

    print(f"Copied {len(a3m_files)} a3m files from {src_root} to {dst_root}")

In [None]:
copy_a3m(a3m_dir='data',dest_dir='af_input')

Copying files: 100%|██████████| 1/1 [00:00<00:00, 637.53file/s]

Copied 1 a3m files from data to af_input





## Protein-protein input

:::{.callout-important}
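Make sure the a3m files are under `af_input`; otherwise they will not be found, because the generated JSON references them by their path inside the Docker container (`/root/af_input/...`).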
:::

In [None]:
#| export
def a3m_to_seq(file_path:Path):
    """Get the query protein sequence (the first record) from an a3m file.

    The sequence is located by structure rather than by a fixed line number:
    blank lines and `#` comment lines are skipped, the first `>` header opens
    the query record, and every following line up to the next `>` header is
    part of the sequence. This handles both files that carry a leading `#`
    line before the first header and plain a3m/FASTA files that do not, and
    joins a sequence that is wrapped over several lines.

    `file_path` may be a `Path` or a string path.
    Raises `ValueError` if the file contains no sequence for the first record.
    """
    seq_parts, in_query = [], False
    for line in Path(file_path).read_text().splitlines():
        line = line.strip()
        if not line or line.startswith('#'): continue  # comment / padding lines carry no sequence
        if line.startswith('>'):
            if in_query: break  # second header: the query record is complete
            in_query = True
        elif in_query:
            seq_parts.append(line)
    if not seq_parts:
        raise ValueError(f"No protein sequence found in a3m file: {file_path}")
    return ''.join(seq_parts)

In [None]:
a3m_to_seq(Path(f'af_input/{project_name}/a3m/CD8A.a3m'))

'SQFRVSPLDRTWNLGETVELKCQVLLSNPTSGCSWLFQPRGAAASPTFLLYLSQNKPKAAEGLDTQRFSGKRLGDTFVLTLSDFRRENEGYYFCSALSNSIMYFSHFVPVFLPAKPTTTPAPRPPTPAPTIASQPLSLRPEACRPAAGGAVHTRGLDFACD'

In [None]:
#| export
def get_protein_subjson(gene_name, a3m_dir=".",idx = 'A',run_template=True):
    "Build the protein sub-json for `gene_name`, with `unpairedMsaPath` pointing at its colabfold .a3m file"
    a3m_path = Path(a3m_dir)/f"{gene_name}.a3m"

    # Path as written into the json: the a3m path re-rooted under /root (the docker-side location)
    docker_msa_path = Path("/root")/a3m_path

    # None when run_template is set, otherwise an empty list
    # NOTE(review): presumably None lets AF3 run its own template search and [] disables templates - confirm against the AF3 input docs
    if run_template:
        templates = None
    else:
        templates = []

    return {
        'id': idx,
        'sequence': a3m_to_seq(a3m_path),
        'modifications': [],
        'unpairedMsaPath': str(docker_msa_path),
        'pairedMsa': '',
        'templates': templates,
    }

In [None]:
sub_json = get_protein_subjson('CD8A',a3m_dir=f'af_input/{project_name}/a3m')

In [None]:
sub_json

{'id': 'A',
 'sequence': 'SQFRVSPLDRTWNLGETVELKCQVLLSNPTSGCSWLFQPRGAAASPTFLLYLSQNKPKAAEGLDTQRFSGKRLGDTFVLTLSDFRRENEGYYFCSALSNSIMYFSHFVPVFLPAKPTTTPAPRPPTPAPTIASQPLSLRPEACRPAAGGAVHTRGLDFACD',
 'modifications': [],
 'unpairedMsaPath': '/root/af_input/sdf/a3m/CD8A.a3m',
 'pairedMsa': '',
 'templates': None}

In [None]:
#| export
def dump_json_folder(json_data, folder):
    "Save `json_data` as `<folder>/<json_data['name']>.json`, creating `folder` if it does not exist"
    # Resolve the file name first so a missing 'name' key fails before anything is created on disk
    file_name = f"{json_data['name']}.json"
    folder = Path(folder)
    # Create the folder (and parents) so dumping into a fresh project directory does not fail
    folder.mkdir(parents=True, exist_ok=True)
    file_path = folder/file_name
    with open(file_path,'w') as f: json.dump(json_data,f,indent=4)

In [None]:
#| export
def get_multi_protein_json(gene_list,a3m_dir,run_template=True,save_folder=None):
    'Assemble the json for several proteins, each pointing at its unpaired MSA file (from colabfold MSA)'
    chain_ids = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'

    # One protein entry per gene; ids are handed out as A, B, C, ... in list order
    protein_entries = [
        {'protein': get_protein_subjson(gene, a3m_dir, idx=chain_ids[pos], run_template=run_template)}
        for pos, gene in enumerate(gene_list)
    ]

    json_data = {
        "name": '_'.join(gene_list),  # also used as the file name when saving
        "modelSeeds": [1],
        "sequences": protein_entries,
        "bondedAtomPairs": [],
        "dialect": "alphafold3",
        "version": 2,
    }

    # Saving is optional; the dict is returned either way
    if not save_folder:
        return json_data
    dump_json_folder(json_data, save_folder)
    return json_data

In [None]:
AF_input = get_multi_protein_json(['CD8A','CD8A'],
                        a3m_dir=f'af_input/{project_name}/a3m',
                        save_folder=f'af_input/{project_name}')

You can generate a list of json files under a folder.

In [None]:
AF_input.keys(), len(AF_input['sequences'])

(dict_keys(['name', 'modelSeeds', 'sequences', 'bondedAtomPairs', 'dialect', 'version']),
 2)

In [None]:
#| export
def generate_pair_df(gene_list,self_pair=True):
    """Unique pair genes in a gene list

    Returns a DataFrame with columns `Gene1` and `Gene2`: first every unordered
    pair of distinct genes (in input order), then, if `self_pair` is True, one
    (gene, gene) row per gene. Repeated genes in `gene_list` are collapsed to
    their first occurrence so no pair appears twice. `gene_list` may be any
    iterable of hashable gene names.
    """
    # dict.fromkeys drops repeats while keeping first-seen order, and materialises
    # iterators so the genes can be walked twice (pairs, then self-pairs)
    genes = list(dict.fromkeys(gene_list))
    pairs = list(combinations(genes, 2))
    if self_pair:
        pairs.extend((gene, gene) for gene in genes)
    # Build the frame once; the default RangeIndex is already 0..n-1
    return pd.DataFrame(pairs, columns=["Gene1", "Gene2"])

In [None]:
generate_pair_df(list('ABC'))

Unnamed: 0,Gene1,Gene2
0,A,B
1,A,C
2,B,C
3,A,A
4,B,B
5,C,C


In [None]:
df = generate_pair_df(['CD8A'])
df

Unnamed: 0,Gene1,Gene2
0,CD8A,CD8A


Generate json files first:

In [None]:
# Write one input json per gene pair into the project folder (save_folder is set, so each is dumped to disk);
# the returned dict is only kept in `json_data` and overwritten on every iteration.
for idx, row in tqdm(df.iterrows(),total=len(df)):
    json_data = get_multi_protein_json([row['Gene1'], row['Gene2']], 
                             a3m_dir=f'af_input/{project_name}/a3m', 
                             save_folder=f'af_input/{project_name}') 

100%|██████████| 1/1 [00:00<00:00, 147.81it/s]


Split them to subfolder:

In [None]:
split_nfolder(f'af_input/{project_name}')

Distributed 1 files into 4 folders.


## Docker

Todo: Pair proteins

```python
for i in range(4):
    get_docker_command(input_dir=f"af_input/{project_name}/folder_{i}",
                       output_dir=f"af_output/{project_name}",
                       gpus=i)
```

## End

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()

## Utils

In [None]:
# #| export
# def split_files_into_subfolders(input_folder: str, nfolder: int = 4):
    
#     "Splits `.a3m` files in a folder into subfolders (folder_0, folder_1, ..., folder_N)."
    
#     input_path = Path(input_folder)
#     if not input_path.is_dir():
#         raise ValueError(f"Input folder {input_folder} does not exist or is not a directory.")

#     # List all `.a3m` files
#     a3m_files = sorted(input_path.glob("*.a3m"))
#     if not a3m_files:
#         print("No `.a3m` files found in the input folder.")
#         return

#     # Create the subfolders
#     subfolders = [input_path / f"folder_{i}" for i in range(nfolder)]
#     for folder in subfolders:
#         folder.mkdir(exist_ok=True)

#     # Distribute the files into the subfolders
#     for idx, file in enumerate(a3m_files):
#         target_folder = subfolders[idx % nfolder]
#         shutil.move(str(file), target_folder / file.name)

#     print(f"Distributed {len(a3m_files)} files into {nfolder} folders.")
