# Run workflow using API

In [1]:
from pathlib import Path

from rich import print as pprint

In [2]:
import logging

# Log verbosity used for the rest of the notebook:
#   WARNING - see only warnings
#   INFO    - see sparql queries
#   DEBUG   - see raw results
log_level = logging.WARNING
logging.basicConfig(level=log_level)

## Setup session directory

Files such as PDB files are stored in this directory, along with a DuckDB database for metadata.

In [3]:
# Directory handed to every workflow step below (relative to the current working directory).
session_name = "session1"
session_dir = Path(session_name)
session_dir

PosixPath('session1')

## Search Uniprot for structures

In [4]:
from protein_detective.uniprot import Query
from protein_detective.workflow import search_structures_in_uniprot

In [5]:
# GO terms used to narrow the search.
GO_NUCLEUS = "GO:0005634"  # Cellular component - Nucleus
GO_DNA_BINDING = "GO:0003677"  # Molecular function - DNA binding

query = Query(
    taxon_id="9606",
    reviewed=True,
    subcellular_location_uniprot="nucleus",
    subcellular_location_go=GO_NUCLEUS,
    molecular_function_go=GO_DNA_BINDING,
)

In [6]:
# Cap on the search; handed to the workflow as the `limit` keyword.
search_limit = 80
nr_uniprot, nr_pdbe, nr_af = search_structures_in_uniprot(
    query,
    session_dir,
    limit=search_limit,
)
(nr_uniprot, nr_pdbe, nr_af)



(71, 14, 71)

Use [database queries](#query-session-database) to see what was found.

## Fetch structures from PDBe and Alphafold found in Uniprot


In [7]:
from protein_detective.workflow import retrieve_structures

In [8]:
# Download the structures found by the search into the session directory.
# The call returns the download directory followed by two counts; in the
# recorded run these match the number of PDBe mmCIF files and AlphaFold
# files reported by the progress bars.
download_path, nr_pdbs, nr_alphafolds = retrieve_structures(session_dir)
download_path, nr_pdbs, nr_alphafolds

Downloading PDBe mmCIF files: 100%|██████████| 14/14 [00:00<00:00, 14.65it/s]
Fetching Alphafold summaries: 100%|██████████| 71/71 [00:02<00:00, 30.69it/s]
Downloading AlphaFold files: 100%|██████████| 71/71 [00:00<00:00, 153.18it/s]


(PosixPath('session1/downloads'), 14, 71)

## Filter out AlphafoldDB structures with low confidence

Then write PDB files without those low-confidence residues to a new directory.

In [9]:
from protein_detective.alphafold.density import DensityFilterQuery
from protein_detective.workflow import density_filter

In [10]:
# Settings for the density filter applied in the next cell.
# NOTE(review): `confidence` presumably is the cutoff below which residues count
# as low confidence, and the two thresholds presumably bound how many residues
# must remain -- confirm against DensityFilterQuery.
dquery = DensityFilterQuery(
    confidence=70.0,
    min_threshold=100,
    max_threshold=500,
)

In [11]:
# Apply the density filter to the session using the query defined above,
# then pretty-print whatever it returns (pprint is rich's print).
result = density_filter(session_dir, dquery)
pprint(result)

## Prune PDBe files
Prune the PDB files so that only the first chain belonging to each found Uniprot entry is kept, and rename that chain to A.

In [12]:
from protein_detective.workflow import prune_pdbs

In [13]:
# Prune the PDBe files of the session; the call returns the output directory
# and a count, which are then displayed as a tuple.
single_chain_dir, nr_passed = prune_pdbs(session_dir)
single_chain_dir, nr_passed

Saving single chain PDB files from PDBe: 100%|██████████| 14/14 [00:03<00:00,  3.95it/s]


(PosixPath('session1/single_chain'), 14)

## Powerfit


In [None]:
from protein_detective.powerfit.options import PowerfitOptions
from protein_detective.powerfit.workflow import powerfit_commands

To run this, you must have cloned the https://github.com/haddocking/powerfit-tutorial repository into '../../powerfit-tutorial'.

In [15]:
# Density map from the cloned powerfit-tutorial repository.
target_map = Path("../../powerfit-tutorial/ribosome-KsgA.map")

# Options shared by every powerfit command generated for this run.
options = PowerfitOptions(target=target_map, resolution=13, angle=20, laplace=True)

### Run

Here we run powerfit using the printed commands; see [powerfit.ipynb](powerfit.ipynb) for running powerfits using the API.

In [16]:
# Build the powerfit commands for the session with the options defined above;
# the call also returns the id of this powerfit run.
commands, powerfit_run_id = powerfit_commands(session_dir, options)

In [17]:
# Show the run id returned by powerfit_commands.
powerfit_run_id

1

In [18]:
# Shorten the commands for display by replacing every occurrence of the
# current working directory with ".".
cwd = str(Path.cwd())
rel_commands = []
for command in commands:
    rel_commands.append(command.replace(cwd, "."))
pprint(rel_commands)

These commands should be run on a cluster. Here we will just run a couple of them to show how it works.

In [19]:
# Run the command at index 6 through the shell (IPython `!` with `{}` interpolation).
!{rel_commands[6]}

Target file read from:                                                          
[35m/home/stefanv/git/protein-detective/protein-detective/docs/session1/powerfit/1/[0m[95mr[0m
[95mibosome-KsgA.map[0m                                                                
Target resolution: [1;36m13.00[0m                                                        
Initial shape of density: [1;36m128[0m [1;36m128[0m [1;36m128[0m                                           
Shape after trimming: [1;36m60[0m [1;36m73[0m [1;36m67[0m                                                  
Shape after extending: [1;36m60[0m [1;36m75[0m [1;36m70[0m                                                 
Template file read from:                                                        
[35m/home/stefanv/git/protein-detective/protein-detective/docs/session1/single_chain[0m
[35m/[0m[95mA8MT69_7xho_X2A.pdb[0m                                                            
Reading in rotations.     

In [20]:
# Run the first generated command through the shell (IPython `!` with `{}` interpolation).
!{rel_commands[0]}

Target file read from:                                                          
[35m/home/stefanv/git/protein-detective/protein-detective/docs/session1/powerfit/1/[0m[95mr[0m
[95mibosome-KsgA.map[0m                                                                
Target resolution: [1;36m13.00[0m                                                        
Initial shape of density: [1;36m128[0m [1;36m128[0m [1;36m128[0m                                           
Shape after trimming: [1;36m60[0m [1;36m73[0m [1;36m67[0m                                                  
Shape after extending: [1;36m60[0m [1;36m75[0m [1;36m70[0m                                                 
Template file read from:                                                        
[35m/home/stefanv/git/protein-detective/protein-detective/docs/session1/single_chain[0m
[35m/[0m[95mA8MT69_4dra_E2A.pdb[0m                                                            
Reading in rotations.     

In [21]:
# Run the last generated command through the shell (IPython `!` with `{}` interpolation).
!{rel_commands[-1]}

Target file read from:                                                          
[35m/home/stefanv/git/protein-detective/protein-detective/docs/session1/powerfit/1/[0m[95mr[0m
[95mibosome-KsgA.map[0m                                                                
Target resolution: [1;36m13.00[0m                                                        
Initial shape of density: [1;36m128[0m [1;36m128[0m [1;36m128[0m                                           
Shape after trimming: [1;36m60[0m [1;36m73[0m [1;36m67[0m                                                  
Shape after extending: [1;36m60[0m [1;36m75[0m [1;36m70[0m                                                 
Template file read from:                                                        
[35m/home/stefanv/git/protein-detective/protein-detective/docs/session1/density_filt[0m
[35mered/[0m[95mAF-B4DX44-F1-model_v4.pdb[0m                                                  
Reading in rotations.     

See [powerfit.ipynb](powerfit.ipynb) for running all powerfits using the API.

### Report

Once all powerfit jobs are done, the results can be parsed and reported.


In [None]:
from protein_detective.powerfit.workflow import powerfit_report

In [5]:
# Collect the powerfit results of the session into `solutions`.
solutions = powerfit_report(session_dir)

In [6]:
# Total number of solutions in the report.
len(solutions)

62757

In [7]:
# Display the report; the notebook shows only the first and last rows of the table.
solutions

Unnamed: 0,powerfit_run_id,structure,rank,cc,fishz,relz,translation,rotation,density_filter_id,af_id,pdb_id,pdb_file,uniprot_acc
0,1,A8MT69_4ne6_B2A,1,0.456,0.492,11.071,"[227.18, 242.53, 211.83]","[0.0, -0.0, -1.0, 0.604, -0.797, 0.0, -0.797, ...",,,4NE6,session1/single_chain/A8MT69_4ne6_B2A.pdb,A8MT69
1,1,A8MT69_4drb_J2A,1,0.444,0.477,10.588,"[227.18, 242.53, 214.9]","[0.797, -0.604, 0.0, 0.604, 0.797, 0.0, 0.0, 0...",,,4DRB,session1/single_chain/A8MT69_4drb_J2A.pdb,A8MT69
2,1,A8MT69_4dra_E2A,1,0.443,0.476,10.402,"[214.9, 187.27, 214.9]","[1.0, -0.0, 0.0, 0.0, -0.0, 1.0, -0.0, -1.0, -...",,,4DRA,session1/single_chain/A8MT69_4dra_E2A.pdb,A8MT69
3,1,A8MT69_4e44_B2A,1,0.440,0.472,10.099,"[224.11, 236.39, 227.18]","[1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0]",,,4E44,session1/single_chain/A8MT69_4e44_B2A.pdb,A8MT69
4,1,A8MT69_7xhn_X2A,1,0.439,0.471,10.136,"[230.25, 242.53, 217.97]","[-1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, 1.0]",,,7XHN,session1/single_chain/A8MT69_7xhn_X2A.pdb,A8MT69
...,...,...,...,...,...,...,...,...,...,...,...,...,...
62752,1,AF-A8MTY0-F1-model_v4,1463,0.095,0.095,5.424,"[168.85, 202.62, 236.39]","[0.0, 1.0, 0.0, 0.797, 0.0, -0.604, -0.604, 0....",1,A8MTY0,,session1/density_filtered/AF-A8MTY0-F1-model_v...,A8MTY0
62753,1,AF-A8MTY0-F1-model_v4,1462,0.095,0.095,5.425,"[122.8, 174.99, 147.36]","[0.797, 0.604, 0.0, 0.604, -0.797, 0.0, 0.0, 0...",1,A8MTY0,,session1/density_filtered/AF-A8MTY0-F1-model_v...,A8MTY0
62754,1,AF-A8MTY0-F1-model_v4,1461,0.095,0.095,5.429,"[150.43, 211.83, 104.38]","[0.548, 0.548, 0.632, 0.184, -0.816, 0.548, 0....",1,A8MTY0,,session1/density_filtered/AF-A8MTY0-F1-model_v...,A8MTY0
62755,1,AF-A8MTY0-F1-model_v4,1460,0.095,0.096,5.437,"[224.11, 282.44, 150.43]","[0.0, 0.797, 0.604, -1.0, 0.0, -0.0, 0.0, -0.6...",1,A8MTY0,,session1/density_filtered/AF-A8MTY0-F1-model_v...,A8MTY0


### Fit model to solution

Rotate/translate the input model PDB files to the top 5 powerfit solutions.

In [None]:
from protein_detective.powerfit.workflow import powerfit_fit_models

In [6]:
# Write fitted model PDB files for the top 5 solutions, then display the
# returned table, which pairs each fitted model file with its unfitted input.
fitted = powerfit_fit_models(session_dir, top=5)
fitted

Writing fitted model PDB files: 100%|██████████| 5/5 [00:00<00:00, 351.19it/s]


Unnamed: 0_level_0,powerfit_run_id,structure,rank,fitted_model_file,unfitted_model_file
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1,A8MT69_4ne6_B2A,1,session1/powerfit/1/A8MT69_4ne6_B2A/fit_1.pdb,session1/single_chain/A8MT69_4ne6_B2A.pdb
1,1,A8MT69_4drb_J2A,1,session1/powerfit/1/A8MT69_4drb_J2A/fit_1.pdb,session1/single_chain/A8MT69_4drb_J2A.pdb
2,1,A8MT69_4dra_E2A,1,session1/powerfit/1/A8MT69_4dra_E2A/fit_1.pdb,session1/single_chain/A8MT69_4dra_E2A.pdb
3,1,A8MT69_4e44_B2A,1,session1/powerfit/1/A8MT69_4e44_B2A/fit_1.pdb,session1/single_chain/A8MT69_4e44_B2A.pdb
4,1,A8MT69_7xhn_X2A,1,session1/powerfit/1/A8MT69_7xhn_X2A/fit_1.pdb,session1/single_chain/A8MT69_7xhn_X2A.pdb
