# Run workflow using API

In [1]:
from pathlib import Path

from rich import print as pprint

In [2]:
import logging

# Pick the log level for this notebook:
#   WARNING - see only warnings
#   INFO    - also see the sparql queries
#   DEBUG   - also see the raw results
# force=True (Python 3.8+) replaces any handlers already installed on the root
# logger. Without it basicConfig does nothing once the root logger has handlers,
# so editing the level and re-running this cell would be silently ignored until
# the kernel is restarted.
logging.basicConfig(level=logging.WARNING, force=True)

## Setup session directory

This directory stores files such as PDB files, as well as a DuckDB database holding the metadata.

In [3]:
# Session directory that is passed to every workflow step in this notebook.
# The path is relative, so it is resolved against the current working directory.
session_dir = Path("session1")
session_dir

PosixPath('session1')

## Search Uniprot for structures

In [4]:
from protein_detective.uniprot import Query
from protein_detective.workflow import search_structures_in_uniprot

In [5]:
# Search criteria that are handed to search_structures_in_uniprot.
query = Query(
    taxon_id="9606",  # presumably the NCBI taxonomy ID of Homo sapiens - confirm
    reviewed=True,
    subcellular_location_uniprot="nucleus",
    subcellular_location_go="GO:0005634",  # Cellular component - Nucleus
    molecular_function_go="GO:0003677",  # Molecular function - DNA binding
)

In [6]:
# Run the search for the query against the session directory.
# Three counts come back; going by the variable names they are the numbers of
# Uniprot entries, PDBe structures and AlphaFold structures found - TODO confirm
# (including what exactly `limit` caps) against the function's documentation.
nr_uniprot, nr_pdbe, nr_af = search_structures_in_uniprot(query, session_dir, limit=100)
nr_uniprot, nr_pdbe, nr_af



(84, 100, 84)

Use [database queries](#query-session-database) to see what was found.

## Fetch structures from PDBe and Alphafold found in Uniprot


In [7]:
from protein_detective.workflow import retrieve_structures

In [8]:
# Fetch the structure files for the session.
# Returns the download directory plus two counts; going by the names and the
# progress bars these are the PDBe and AlphaFold files retrieved - TODO confirm.
download_path, nr_pdbs, nr_alphafolds = retrieve_structures(session_dir)
download_path, nr_pdbs, nr_alphafolds

Downloading PDBe mmCIF files: 100%|██████████| 100/100 [00:12<00:00,  8.12it/s]
Fetching Alphafold summaries: 100%|██████████| 84/84 [00:03<00:00, 21.60it/s]
Downloading AlphaFold files: 100%|██████████| 84/84 [00:00<00:00, 137.21it/s]


(PosixPath('session1/downloads'), 100, 84)

## Filter out AlphafoldDB structures with low confidence

Write PDB files without those low-confidence residues to a new directory.

In [9]:
from protein_detective.alphafold.density import DensityFilterQuery
from protein_detective.workflow import density_filter

In [10]:
# Settings for the density filter that is applied with density_filter.
# NOTE(review): the meaning of the fields is defined by DensityFilterQuery, which
# is not visible here. Presumably `confidence` is the per-residue confidence
# cutoff and the two thresholds bound the number of residues that must remain
# after filtering - confirm.
dquery = DensityFilterQuery(
    confidence=70.0,
    min_threshold=100,
    max_threshold=500,
)

In [11]:
# Apply the density filter to the session and pretty-print whatever it returns
# (pprint is rich's print function).
result = density_filter(session_dir, dquery)
pprint(result)

## Prune PDBe files
Prune the PDB files to only keep the first chain of the found Uniprot entries and rename that chain to A.

In [12]:
from protein_detective.workflow import SingleChainQuery, prune_pdbs

Use the number of residues in the chain as a proxy for the volume of the unknown density.

In [13]:
# Prune the PDBe files of the session, keeping chains of 100 to 500 residues.
# The call returns the output directory and a count, which are shown as the
# cell result.
single_chain_dir, nr_passed = prune_pdbs(
    session_dir,
    SingleChainQuery(min_residues=100, max_residues=500),
)
single_chain_dir, nr_passed

Saving single chain PDB files from PDBe: 100%|██████████| 100/100 [01:03<00:00,  1.59it/s]


(PosixPath('session1/single_chain'), 79)

## Powerfit


In [14]:
from protein_detective.powerfit.options import PowerfitOptions
from protein_detective.powerfit.workflow import powerfit_commands

To run this, you must have cloned the https://github.com/haddocking/powerfit-tutorial repository into '../../powerfit-tutorial'.

In [15]:
# Powerfit settings. `target` is the density map to fit into; the relative path
# points into a powerfit-tutorial checkout two directories above the working
# directory.
# NOTE(review): the units of `resolution` and `angle` are not visible here
# (presumably angstrom and degrees) - confirm against PowerfitOptions.
options = PowerfitOptions(
    target=Path("../../powerfit-tutorial/ribosome-KsgA.map"),
    resolution=13,
    angle=20,
    laplace=True,
)

### Run

Here we use the printed commands to run powerfit. See [powerfit.ipynb](powerfit.ipynb) for running powerfit using the API.

In [16]:
# Generate the powerfit commands (a collection of command strings) for the
# session, together with the ID of this powerfit run.
commands, powerfit_run_id = powerfit_commands(session_dir, options)

In [17]:
# Show the run ID; in the recorded outputs it appears as a directory name in the
# powerfit paths (session1/powerfit/1/).
powerfit_run_id

1

In [18]:
# Shorten the commands for display: every occurrence of the absolute working
# directory inside a command string is replaced by ".", then the resulting list
# is pretty-printed with rich.
cwd = str(Path.cwd())
rel_commands = [c.replace(cwd, ".") for c in commands]
pprint(rel_commands)

These commands should be run on a cluster. Here we will just run a couple of them to show how it works.

In [19]:
# Run one generated command in a shell through IPython's `!` syntax.
# In the recorded run, index 6 used the single-chain PDBe structure
# O00268_7egf_d2A.pdb as template; the index depends on the session contents.
!{rel_commands[6]}

Target file read from:                                                          
[35m/home/verhoes/git/protein-detective/protein-detective/docs/session1/powerfit/1/[0m[95mr[0m
[95mibosome-KsgA.map[0m                                                                
Target resolution: [1;36m13.00[0m                                                        
Initial shape of density: [1;36m128[0m [1;36m128[0m [1;36m128[0m                                           
Shape after trimming: [1;36m60[0m [1;36m73[0m [1;36m67[0m                                                  
Shape after extending: [1;36m60[0m [1;36m75[0m [1;36m70[0m                                                 
Template file read from:                                                        
[35m/home/verhoes/git/protein-detective/protein-detective/docs/session1/single_chain[0m
[35m/[0m[95mO00268_7egf_d2A.pdb[0m                                                            
Reading in rotations.     

In [20]:
# Run the first generated command in a shell through IPython's `!` syntax.
# In the recorded run it used the single-chain PDBe structure
# O00268_7edx_D2A.pdb as template.
!{rel_commands[0]}

Target file read from:                                                          
[35m/home/verhoes/git/protein-detective/protein-detective/docs/session1/powerfit/1/[0m[95mr[0m
[95mibosome-KsgA.map[0m                                                                
Target resolution: [1;36m13.00[0m                                                        
Initial shape of density: [1;36m128[0m [1;36m128[0m [1;36m128[0m                                           
Shape after trimming: [1;36m60[0m [1;36m73[0m [1;36m67[0m                                                  
Shape after extending: [1;36m60[0m [1;36m75[0m [1;36m70[0m                                                 
Template file read from:                                                        
[35m/home/verhoes/git/protein-detective/protein-detective/docs/session1/single_chain[0m
[35m/[0m[95mO00268_7edx_D2A.pdb[0m                                                            
Reading in rotations.     

In [21]:
# Run the last generated command in a shell through IPython's `!` syntax.
# In the recorded run it used the density-filtered AlphaFold model
# AF-O00327-F1-model_v4.pdb as template.
!{rel_commands[-1]}

Target file read from:                                                          
[35m/home/verhoes/git/protein-detective/protein-detective/docs/session1/powerfit/1/[0m[95mr[0m
[95mibosome-KsgA.map[0m                                                                
Target resolution: [1;36m13.00[0m                                                        
Initial shape of density: [1;36m128[0m [1;36m128[0m [1;36m128[0m                                           
Shape after trimming: [1;36m60[0m [1;36m73[0m [1;36m67[0m                                                  
Shape after extending: [1;36m60[0m [1;36m75[0m [1;36m70[0m                                                 
Template file read from:                                                        
[35m/home/verhoes/git/protein-detective/protein-detective/docs/session1/density_filt[0m
[35mered/[0m[95mAF-O00327-F1-model_v4.pdb[0m                                                  
Reading in rotations.     

See [powerfit.ipynb](powerfit.ipynb) for running all powerfits using the API.

### Report

Once all powerfit jobs are done, the results can be parsed and reported.


In [22]:
from protein_detective.powerfit.workflow import powerfit_report

In [23]:
# Collect the powerfit results of the session into a single table of solutions.
solutions = powerfit_report(session_dir)

In [24]:
# Number of solutions in the report.
len(solutions)

4420

In [25]:
# Display the solutions table; the recorded output shows it truncated to the
# first and last rows.
solutions

Unnamed: 0,powerfit_run_id,structure,rank,cc,fishz,relz,translation,rotation,af_id,pdb_id,pdb_file,uniprot_acc
0,1,O00268_7edx_D2A,1,0.282,0.289,9.595,"[150.43, 150.43, 181.13]","[0.184, 0.816, -0.548, -0.816, -0.184, -0.548,...",,7EDX,session1/single_chain/O00268_7edx_D2A.pdb,O00268
1,1,O00268_7egf_d2A,1,0.275,0.282,9.141,"[205.69, 217.97, 187.27]","[0.0, 0.604, 0.797, -0.0, -0.797, 0.604, 1.0, ...",,7EGF,session1/single_chain/O00268_7egf_d2A.pdb,O00268
2,1,O00268_7edx_D2A,2,0.274,0.281,9.329,"[239.46, 153.5, 187.27]","[0.548, -0.184, 0.816, -0.632, 0.548, 0.548, -...",,7EDX,session1/single_chain/O00268_7edx_D2A.pdb,O00268
3,1,O00268_7edx_D2A,3,0.272,0.279,9.238,"[254.81, 267.09, 178.06]","[0.797, 0.0, -0.604, -0.604, -0.0, -0.797, 0.0...",,7EDX,session1/single_chain/O00268_7edx_D2A.pdb,O00268
4,1,O00268_7egf_d2A,2,0.269,0.276,8.958,"[196.48, 184.2, 156.57]","[0.0, -0.797, -0.604, 0.0, -0.604, 0.797, -1.0...",,7EGF,session1/single_chain/O00268_7egf_d2A.pdb,O00268
...,...,...,...,...,...,...,...,...,...,...,...,...
4415,1,AF-O00327-F1-model_v4,1505,0.107,0.108,4.857,"[257.88, 260.95, 168.85]","[-0.362, 0.896, -0.258, 0.896, 0.258, -0.362, ...",O00327,,session1/density_filtered/AF-O00327-F1-model_v...,O00327
4416,1,AF-O00327-F1-model_v4,1507,0.106,0.106,4.792,"[150.43, 98.24, 193.41]","[0.797, -0.604, 0.0, 0.0, -0.0, 1.0, -0.604, -...",O00327,,session1/density_filtered/AF-O00327-F1-model_v...,O00327
4417,1,AF-O00327-F1-model_v4,1506,0.106,0.107,4.815,"[214.9, 89.03, 116.66]","[0.0, -0.797, -0.604, 0.0, -0.604, 0.797, -1.0...",O00327,,session1/density_filtered/AF-O00327-F1-model_v...,O00327
4418,1,AF-O00327-F1-model_v4,1509,0.105,0.106,4.768,"[251.74, 273.23, 122.8]","[-0.258, 0.362, -0.896, -0.362, -0.896, -0.258...",O00327,,session1/density_filtered/AF-O00327-F1-model_v...,O00327


### Fit model to solution

Rotate/translate the input model PDB files to the top 5 powerfit solutions.

In [26]:
from protein_detective.powerfit.workflow import powerfit_fit_models

In [27]:
# Write fitted model PDB files for the top 5 solutions. The recorded output
# lists each fitted model file next to the unfitted model file it belongs to.
fitted = powerfit_fit_models(session_dir, top=5)
fitted

Writing fitted model PDB files: 100%|██████████| 5/5 [00:00<00:00, 128.84it/s]


Unnamed: 0_level_0,powerfit_run_id,structure,rank,fitted_model_file,unfitted_model_file
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1,O00268_7edx_D2A,1,session1/powerfit/1/O00268_7edx_D2A/fit_1.pdb,session1/single_chain/O00268_7edx_D2A.pdb
1,1,O00268_7egf_d2A,1,session1/powerfit/1/O00268_7egf_d2A/fit_1.pdb,session1/single_chain/O00268_7egf_d2A.pdb
2,1,O00268_7edx_D2A,2,session1/powerfit/1/O00268_7edx_D2A/fit_2.pdb,session1/single_chain/O00268_7edx_D2A.pdb
3,1,O00268_7edx_D2A,3,session1/powerfit/1/O00268_7edx_D2A/fit_3.pdb,session1/single_chain/O00268_7edx_D2A.pdb
4,1,O00268_7egf_d2A,2,session1/powerfit/1/O00268_7egf_d2A/fit_2.pdb,session1/single_chain/O00268_7egf_d2A.pdb
