# Batch prediction example

This notebook performs batch prediction for protein-ligand pairs with LigPose.

In [1]:
import os
import sys
sys.path.append('../')
import pandas as pd
import torch

from utils.prediction import predict

### Settings: input files

In [2]:
# CSV file listing every task to predict (one protein-ligand pair per row)
input_file = 'example_files/example_input.csv'

# Preview the expected layout of the file.
# Only the first rows are shown so that a large task list does not flood the notebook.
pd.read_csv(input_file).head()

Unnamed: 0,protein,ligand,ref_pocket_center
0,example_files/3g0g/3g0g_protein.pdb,example_files/3g0g/3g0g_ligand.mol2,example_files/3g0g/3g0g_ligand.mol2
1,example_files/4r6e/4r6e_protein.pdb,example_files/4r6e/4r6e_ligand.mol2,example_files/4r6e/4r6e_ligand.mol2
2,example_files/5a7b/5a7b_protein.pdb,example_files/5a7b/5a7b_ligand.mol2,example_files/5a7b/5a7b_ligand.mol2
3,example_files/6qlt/6qlt_protein.pdb,example_files/6qlt/6qlt_ligand.mol2,example_files/6qlt/6qlt_ligand.mol2


### Settings: model configuration

In [3]:
# Device used for inference: the first GPU when CUDA is available (recommended),
# otherwise fall back to the CPU so the notebook still runs on machines without a GPU.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
param_path = '../suppl/LigPose_param.chk'     # path to LigPose parameters
ens = 3                                       # ensemble number (a larger ens gives more accurate and stable predictions)
batch_size = 4                                # batch size, only used when ens = 1
task_list = ['structure']                     # tasks: ['structure'] for structure prediction, ['screening'] for virtual screening, ['structure', 'screening'] for both tasks

### Settings: data preprocessing and output

In [4]:
# Temporary location where the processed input files are saved
cache_path = './cache'
# When True, the input files are processed with multiple CPUs
prepare_data_with_multi_cpu = True

# Where the output structures are saved
output_structure_path = './output_structure'
# Where the output records are saved (CSV format)
output_result_path = './output_result.csv'

### Run LigPose

In [5]:
# Run LigPose on every task of the input CSV, using the settings defined in the cells above.
predict(
    param_path,
    # --- input tasks and preprocessing ---
    batch_csv=input_file,
    cache_path=cache_path,
    prepare_data_with_multi_cpu=prepare_data_with_multi_cpu,
    # --- model / inference options ---
    device=device,
    ens=ens,
    batch_size=batch_size,
    pred_type=task_list,
    # --- outputs ---
    output_structure_path=output_structure_path,
    output_result_path=output_result_path,
)

Preparing data...
Prepared data: 4/4, 100.00%


Predicting:   0%|          | 0/4 [00:00<?, ?it/s]

DONE


### Check outputs

The output CSV file records the input files of each task. The `index` column gives the index number of the corresponding output structure file, e.g. index = 1 for `1.pdb`.

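The `screening_score` column shows the potential binding strength between the protein and the ligand: the more positive the value, the stronger the binding. It does not appear in the example output below, which was produced with `task_list = ['structure']`.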

In [6]:
# Load the records written by the prediction run and display them
df_result = pd.read_csv(filepath_or_buffer=output_result_path)
df_result

Unnamed: 0,index,protein,ligand
0,0,example_files/3g0g/3g0g_protein.pdb,example_files/3g0g/3g0g_ligand.mol2
1,1,example_files/4r6e/4r6e_protein.pdb,example_files/4r6e/4r6e_ligand.mol2
2,2,example_files/5a7b/5a7b_protein.pdb,example_files/5a7b/5a7b_ligand.mol2
3,3,example_files/6qlt/6qlt_protein.pdb,example_files/6qlt/6qlt_ligand.mol2
