# Protenix

Helpers for building input JSON files for protein-ligand prediction tasks with the Protenix GitHub repository.

## Install

```bash
git clone https://github.com/bytedance/Protenix.git
cd Protenix
pip install .
```

## Setup

In [None]:
#| default_exp px.core

In [None]:
#| export
import json
from pathlib import Path

## Single job json

Run the protein sequence on the server to get an MSA folder that contains `pairing.a3m` and `unpairing.a3m`.

Use the folder as the `msa_dir`

In [None]:
#| export
def get_single_job(job_name, protein_seq, msa_dir, SMILES=None, CCD=None):
    """Build one Protenix job dict for a protein chain plus a single ligand.

    Args:
        job_name: Value stored under the job's "name" key.
        protein_seq: Protein sequence stored under "proteinChain".
        msa_dir: Precomputed MSA directory. A `pathlib.Path` is converted to
            `str` so the returned dict can be handed straight to `json.dump`.
        SMILES: Ligand given as a SMILES string; stored unchanged.
        CCD: Ligand given as a CCD code; stored with a "CCD_" prefix.

    Returns:
        A dict with "name" and a two-item "sequences" list
        (the protein chain entry followed by the ligand entry).

    Raises:
        ValueError: If both, or neither, of `SMILES` and `CCD` are provided.
    """
    if SMILES and CCD:
        raise ValueError("Please provide only one of SMILES or CCD, not both.")
    if not SMILES and not CCD:
        raise ValueError("You must provide either SMILES or CCD.")

    ligand_value = SMILES if SMILES else f"CCD_{CCD}"

    # Path objects are not JSON serializable; store the plain string form.
    if isinstance(msa_dir, Path):
        msa_dir = str(msa_dir)

    return {
        "name": job_name,
        "sequences": [
            {
                "proteinChain": {
                    "count": 1,
                    "sequence": protein_seq,
                    "msa": {
                        "precomputed_msa_dir": msa_dir,
                        "pairing_db": "uniref100",
                    },
                }
            },
            {
                "ligand": {
                    "count": 1,
                    "ligand": ligand_value,
                }
            },
        ],
    }

In [None]:
# Example: build a job for a ligand given as SMILES (pass CCD instead for a CCD code).
get_single_job('job_name', 'AAA', './msa', SMILES='CCC',CCD=None)

{'name': 'job_name',
 'sequences': [{'proteinChain': {'count': 1,
    'sequence': 'AAA',
    'msa': {'precomputed_msa_dir': './msa', 'pairing_db': 'uniref100'}}},
  {'ligand': {'count': 1, 'ligand': 'CCC'}}]}

In [None]:
#| export
def get_single_protein_ligand_json(job_name,
                                   protein_seq,
                                   msa_dir,
                                   SMILES=None,
                                   CCD=None,
                                   json_path=None):
    """Build the Protenix input (a one-job list) for a single protein-ligand pair.

    The job itself comes from `get_single_job`. When `json_path` is given,
    the list is also written there as indented JSON, creating parent
    directories as needed, and the saved location is printed.

    Returns the one-element list of job dicts.
    """
    jobs = [
        get_single_job(job_name, protein_seq, msa_dir, SMILES=SMILES, CCD=CCD),
    ]

    # Nothing to write: hand the in-memory structure back directly.
    if not json_path:
        return jobs

    out_file = Path(json_path)
    out_file.parent.mkdir(parents=True, exist_ok=True)
    with out_file.open("w") as handle:
        json.dump(jobs, handle, indent=4)
    print(f"JSON saved to {out_file}")
    return jobs

In [None]:
# _ = get_single_protein_ligand_json('kras_g12d_mrtx',
#                                  g12d,
#                                  msa_dir='kras_g12d_msa',
#                                  SMILES="C#CC1=C(C=CC2=CC(=CC(=C21)C3=NC=C4C(=C3F)N=C(N=C4N5CC6CCC(C5)N6)OC[C@@]78CCCN7C[C@@H](C8)F)O)F",
#                                  json_path='g12d_mrtx.json'
#                                 )

Use the JSON as the input file for Protenix:

```bash
protenix predict --input input.json --out_dir  ./output --seeds 101
```

## Different protein-ligand pairs in df

In [None]:
#| export
def get_protein_ligand_df_json(df,
                               id_col,
                               seq_col,
                               msa_col,
                               smi_col=None,
                               ccd_col=None,
                               save_json=None):
    """Build Protenix jobs for the protein-ligand pair in each row of `df`.

    Args:
        df: DataFrame with one protein-ligand pair per row.
        id_col: Column holding the job name.
        seq_col: Column holding the protein sequence.
        msa_col: Column holding the precomputed MSA directory.
        smi_col: Column holding ligand SMILES (mutually exclusive with `ccd_col`).
        ccd_col: Column holding ligand CCD codes (mutually exclusive with `smi_col`).
        save_json: Optional output path; when given, the jobs are also written
            there as indented JSON (parent directories are created).

    Returns:
        A list with one job dict per row, in row order; empty for an empty `df`.

    Raises:
        ValueError: If both, or neither, of `smi_col` and `ccd_col` are provided.
    """
    if smi_col and ccd_col:
        raise ValueError("Provide only one of smi_col or ccd_col, not both.")
    if not smi_col and not ccd_col:
        raise ValueError("You must provide either smi_col or ccd_col.")

    # Use the same truthiness test as the validation above, so an empty-string
    # smi_col combined with a valid ccd_col selects the CCD column.
    use_smiles = bool(smi_col)

    def build_job(row):
        job_name = row[id_col]
        protein_seq = row[seq_col]
        msa_dir = row[msa_col]
        SMILES = row[smi_col] if use_smiles else None
        CCD = None if use_smiles else row[ccd_col]
        return get_single_job(job_name, protein_seq, msa_dir, SMILES=SMILES, CCD=CCD)

    # Iterate rows explicitly instead of df.apply(axis=1).tolist(): on an empty
    # frame, apply may return a DataFrame (no .tolist()) or probe the callback
    # with a placeholder row, depending on the pandas version.
    all_jobs = [build_job(row) for _, row in df.iterrows()]

    if save_json:
        save_path = Path(save_json)
        save_path.parent.mkdir(parents=True, exist_ok=True)
        with save_path.open("w") as f:
            json.dump(all_jobs, f, indent=4)
        print(f"JSON saved to {save_path}")

    return all_jobs


In [None]:
# _ = get_protein_ligand_df_json(df,
                               # id_col='ID',
                               # seq_col='sequence', 
                               # msa_col='msa_dir', 
                               # smi_col="SMILES", 
                               # ccd_col=None, 
                               # save_json="input.json")

## Virtual screening
> single protein against multiple ligands

In [None]:
#| export
def get_virtual_screening_json(df,
                               protein_seq,
                               msa_dir,
                               id_col,
                               smi_col=None,
                               ccd_col=None,
                               save_json=None):
    """Build Protenix jobs pairing one protein with the ligand in each row of `df`.

    Args:
        df: DataFrame with one ligand per row.
        protein_seq: Protein sequence shared by every job.
        msa_dir: Precomputed MSA directory shared by every job.
        id_col: Column holding the job name.
        smi_col: Column holding ligand SMILES (mutually exclusive with `ccd_col`).
        ccd_col: Column holding ligand CCD codes (mutually exclusive with `smi_col`).
        save_json: Optional output path; when given, the jobs are also written
            there as indented JSON (parent directories are created).

    Returns:
        A list with one job dict per row, in row order; empty for an empty `df`.

    Raises:
        ValueError: If both, or neither, of `smi_col` and `ccd_col` are provided.
    """
    if smi_col and ccd_col:
        raise ValueError("Provide only one of smi_col or ccd_col, not both.")
    if not smi_col and not ccd_col:
        raise ValueError("You must provide either smi_col or ccd_col.")

    # Use the same truthiness test as the validation above, so an empty-string
    # smi_col combined with a valid ccd_col selects the CCD column.
    use_smiles = bool(smi_col)

    def build_job(row):
        job_name = row[id_col]
        SMILES = row[smi_col] if use_smiles else None
        CCD = None if use_smiles else row[ccd_col]
        return get_single_job(job_name, protein_seq, msa_dir, SMILES=SMILES, CCD=CCD)

    # Iterate rows explicitly instead of df.apply(axis=1).tolist(): on an empty
    # frame, apply may return a DataFrame (no .tolist()) or probe the callback
    # with a placeholder row, depending on the pandas version.
    all_jobs = [build_job(row) for _, row in df.iterrows()]

    if save_json:
        save_path = Path(save_json)
        save_path.parent.mkdir(parents=True, exist_ok=True)
        with save_path.open("w") as f:
            json.dump(all_jobs, f, indent=4)
        print(f"JSON saved to {save_path}")

    return all_jobs

In [None]:
# _ = get_virtual_screening_json(df,
#                                g12d_seq,
#                                'kras_g12d_msa',
#                                id_col='ID',
#                                smi_col='SMILES',
#                                save_json='kras_g12d_input.json')

## End

In [None]:
#| hide
# Run nbdev's export step (presumably writes the `#| export` cells to the `px.core` module named by `default_exp` — confirm against project settings).
import nbdev; nbdev.nbdev_export()