# Run workflow using API

In [4]:
from pathlib import Path

from rich import print as pprint

In [5]:
import logging

# Configure the root logger; the level controls how much the workflow reports:
#   WARNING - only warnings (and more severe messages)
#   INFO    - additionally shows the SPARQL queries
#   DEBUG   - additionally shows the raw results
logging.basicConfig(level=logging.WARNING)

In [6]:
# Silence dask logging: use the dask CLI to set the `logging.distributed`
# config key to `warning`.
# NOTE(review): this presumably persists in the user's dask config file,
# i.e. beyond this notebook session - confirm.
!dask config set logging.distributed warning



## Setup session directory

Files such as structure files are stored in this directory, along with a DuckDB database for metadata.

In [7]:
# Session directory used by every workflow step below; the path is relative
# to the notebook's working directory.
session_dir = Path("session1")
session_dir

PosixPath('session1')

## Search Uniprot for structures

In [4]:
from protein_quest.uniprot import Query

from protein_detective.workflow import search_structures_in_uniprot

In [5]:
# Search criteria for the UniProt search below.
query = Query(
    taxon_id="9606",  # NCBI taxonomy id of Homo sapiens (human)
    reviewed=True,  # presumably restricts to reviewed (Swiss-Prot) entries - TODO confirm
    subcellular_location_uniprot="nucleus",
    subcellular_location_go=["GO:0005634"],  # Cellular component - Nucleus
    molecular_function_go=["GO:0003677"],  # Molecular function - DNA binding
)

In [6]:
# Search UniProt with the query, using the session directory for the results.
# NOTE(review): `limit=100` presumably caps the number of results per search,
# and judging by their names the four returned counts cover UniProt entries,
# PDBe structures, PDB-to-UniProt links and AlphaFold entries - confirm
# against the function's documentation.
nr_uniprot, nr_pdbe, nr_pdb_uniprot, nr_af = search_structures_in_uniprot(query, session_dir, limit=100)
nr_uniprot, nr_pdbe, nr_pdb_uniprot, nr_af



(100, 100, 100, 100)

Use [database queries](#query-session-database) to see what was found.

## Fetch structures from PDBe and Alphafold found in Uniprot


In [7]:
from protein_detective.workflow import async_retrieve_structures

In [8]:
# Jupyter supports top-level `await`, so the coroutine is awaited directly
# in the cell; no nested event loop setup is needed.
download_path, nr_pdbs, nr_alphafolds = await async_retrieve_structures(session_dir)
download_path, nr_pdbs, nr_alphafolds

Downloading PDBe mmCIF files: 100%|██████████| 100/100 [00:01<00:00, 62.86it/s]
Fetching Alphafold summaries: 100%|██████████| 100/100 [00:03<00:00, 30.30it/s]
Downloading AlphaFold files: 100%|██████████| 100/100 [00:00<00:00, 284.82it/s]


(PosixPath('session1/downloads'), 100, 100)

## Filter structures

Prepare structures for powerfitting by filtering them based on confidence and number of residues.

In [8]:
from protein_detective.filter import ConfidenceFilterQuery, FilterOptions, SecondaryStructureFilterQuery
from protein_detective.workflow import filter_structures

In [9]:
# Criteria used to filter the retrieved structures.
options = FilterOptions(
    # NOTE(review): `confidence=70` is presumably a per-residue confidence
    # threshold, with min/max_residues bounding how many residues must
    # remain - confirm against ConfidenceFilterQuery's documentation.
    confidence=ConfidenceFilterQuery(confidence=70, min_residues=100, max_residues=1000),
    # Created without arguments, so the secondary structure filter uses its defaults.
    secondary_structure=SecondaryStructureFilterQuery(),
)

In [10]:
# Apply the filter options to the structures of this session.
filtered_dir, filtered_results = filter_structures(session_dir, options)
# Peek at the first two and the last two results instead of printing them all.
pprint(filtered_results[:2])
pprint(filtered_results[-2:])
filtered_dir

  0%|          | 0/100 [00:00<?, ?file/s]

PosixPath('session1/filtered')

In [12]:
# Summarise the filter outcome: how many results passed and how many were discarded.
total_nr = len(filtered_results)
total_nr_passed = len([result for result in filtered_results if result.passed])
total_nr_discarded = total_nr - total_nr_passed
pprint(f"Total entries: {total_nr}, passed: {total_nr_passed}, discarded: {total_nr_discarded}")

## Powerfit


In [13]:
from protein_detective.powerfit.options import PowerfitOptions
from protein_detective.powerfit.workflow import powerfit_commands

To run this, you must have cloned the https://github.com/haddocking/powerfit-tutorial repository into `../../powerfit-tutorial`.

In [14]:
# Options for the powerfit run.
# NOTE: this rebinds `options`, which held the filter options in the
# "Filter structures" section; re-running that section afterwards requires
# re-running its options cell first.
options = PowerfitOptions(
    target=Path("../../powerfit-tutorial/ribosome-KsgA.map"),  # density map from the powerfit-tutorial repository
    resolution=13,  # presumably in angstrom - TODO confirm
    angle=20,  # presumably the rotational sampling interval in degrees - TODO confirm
    laplace=True,
)

### Run

Here we run powerfit using the printed commands; see [powerfit.ipynb](powerfit.ipynb) for running powerfits using the API.

In [15]:
# Get the powerfit commands together with the id of this powerfit run;
# the commands themselves are executed in the cells further below.
commands, powerfit_run_id = powerfit_commands(session_dir, options)

In [16]:
# Show the id of the powerfit run the commands belong to.
powerfit_run_id

1

In [17]:
# Replace the absolute path of the current working directory with "." so the
# commands are shorter to read and refer to paths relative to this notebook.
cwd = str(Path.cwd())
rel_commands = [command.replace(cwd, ".") for command in commands]
# Show only the first ten commands.
pprint(rel_commands[:10])

These commands should be run on a cluster. Here we will just run a couple of them to show how it works.

In [18]:
# Run one example command in a shell (index 6 is an arbitrary pick; in the
# recorded output it fits AF-A1YPR0-F1-model_v4.cif).
!{rel_commands[6]}

Target file read from:                                                          
[35m/home/stefanv/git/protein-detective/protein-detective/docs/session1/powerfit/1/[0m[95mr[0m
[95mibosome-KsgA.map[0m                                                                
Target resolution: [1;36m13.00[0m                                                        
Initial shape of density: [1;36m128[0m [1;36m128[0m [1;36m128[0m                                           
Shape after trimming: [1;36m60[0m [1;36m73[0m [1;36m67[0m                                                  
Shape after extending: [1;36m60[0m [1;36m75[0m [1;36m70[0m                                                 
Template file read from:                                                        
[35m/home/stefanv/git/protein-detective/protein-detective/docs/session1/filtered/[0m[95mAF-[0m
[95mA1YPR0-F1-model_v4.cif[0m                                                          
Reading in rotations.     

In [19]:
# Run the first command in a shell (in the recorded output it fits
# AF-A0A087WUV0-F1-model_v4.cif).
!{rel_commands[0]}

Target file read from:                                                          
[35m/home/stefanv/git/protein-detective/protein-detective/docs/session1/powerfit/1/[0m[95mr[0m
[95mibosome-KsgA.map[0m                                                                
Target resolution: [1;36m13.00[0m                                                        
Initial shape of density: [1;36m128[0m [1;36m128[0m [1;36m128[0m                                           
Shape after trimming: [1;36m60[0m [1;36m73[0m [1;36m67[0m                                                  
Shape after extending: [1;36m60[0m [1;36m75[0m [1;36m70[0m                                                 
Template file read from:                                                        
[35m/home/stefanv/git/protein-detective/protein-detective/docs/session1/filtered/[0m[95mAF-[0m
[95mA0A087WUV0-F1-model_v4.cif[0m                                                      
Reading in rotations.     

In [20]:
# Run the last command in a shell (in the recorded output it fits
# 5db2_A2A.cif).
!{rel_commands[-1]}

Target file read from:                                                          
[35m/home/stefanv/git/protein-detective/protein-detective/docs/session1/powerfit/1/[0m[95mr[0m
[95mibosome-KsgA.map[0m                                                                
Target resolution: [1;36m13.00[0m                                                        
Initial shape of density: [1;36m128[0m [1;36m128[0m [1;36m128[0m                                           
Shape after trimming: [1;36m60[0m [1;36m73[0m [1;36m67[0m                                                  
Shape after extending: [1;36m60[0m [1;36m75[0m [1;36m70[0m                                                 
Template file read from:                                                        
[35m/home/stefanv/git/protein-detective/protein-detective/docs/session1/filtered/[0m[95m5db[0m
[95m2_A2A.cif[0m                                                                       
Reading in rotations.     

See [powerfit.ipynb](powerfit.ipynb) for running all powerfits using the API.

### Report

Once all powerfit jobs are done the results can be parsed and reported.


In [21]:
from protein_detective.powerfit.workflow import powerfit_report

In [22]:
# Collect the solutions of the powerfit runs in this session into one table.
solutions = powerfit_report(session_dir)

In [23]:
# Total number of solutions over all structures that were fitted.
len(solutions)

4812

In [24]:
# Display the solutions table; the display is truncated to the first and
# last rows because the table has thousands of rows.
solutions

Unnamed: 0,powerfit_run_id,structure,rank,cc,fishz,relz,translation,rotation,pdb_file,uniprot_acc,pdb_id
0,1,AF-A1YPR0-F1-model_v4,1,0.280,0.288,11.087,"[162.71, 132.01, 181.13]","[-1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, -1.0]",session1/filtered/AF-A1YPR0-F1-model_v4.cif,A1YPR0,
1,1,AF-A1YPR0-F1-model_v4,2,0.279,0.287,11.049,"[156.57, 156.57, 135.08]","[-0.548, 0.184, 0.816, -0.548, -0.816, -0.184,...",session1/filtered/AF-A1YPR0-F1-model_v4.cif,A1YPR0,
2,1,AF-A1YPR0-F1-model_v4,3,0.275,0.282,10.870,"[165.78, 153.5, 141.22]","[-0.632, 0.548, 0.548, -0.548, -0.816, 0.184, ...",session1/filtered/AF-A1YPR0-F1-model_v4.cif,A1YPR0,
3,1,AF-A1YPR0-F1-model_v4,4,0.266,0.273,10.518,"[257.88, 270.16, 178.06]","[0.0, -0.797, -0.604, 0.0, -0.604, 0.797, -1.0...",session1/filtered/AF-A1YPR0-F1-model_v4.cif,A1YPR0,
4,1,AF-A1YPR0-F1-model_v4,5,0.265,0.272,10.484,"[141.22, 159.64, 205.69]","[0.548, -0.816, -0.184, -0.548, -0.184, -0.816...",session1/filtered/AF-A1YPR0-F1-model_v4.cif,A1YPR0,
...,...,...,...,...,...,...,...,...,...,...,...
4807,1,5db2_A2A,1814,0.081,0.081,4.472,"[233.32, 202.62, 159.64]","[-0.548, -0.548, -0.632, 0.184, -0.816, 0.548,...",session1/filtered/5db2_A2A.cif,O00255,5DB2
4808,1,5db2_A2A,1813,0.081,0.081,4.475,"[150.43, 168.85, 141.22]","[0.258, -0.362, 0.896, 0.362, 0.896, 0.258, -0...",session1/filtered/5db2_A2A.cif,O00255,5DB2
4809,1,5db2_A2A,1812,0.081,0.081,4.475,"[125.87, 168.85, 214.9]","[-0.362, 0.258, 0.896, -0.258, 0.896, -0.362, ...",session1/filtered/5db2_A2A.cif,O00255,5DB2
4810,1,5db2_A2A,1811,0.081,0.081,4.488,"[125.87, 162.71, 217.97]","[-0.632, 0.548, 0.548, 0.548, -0.184, 0.816, 0...",session1/filtered/5db2_A2A.cif,O00255,5DB2


### Fit model to solution

Rotate/translate the input model PDB files to the top 5 powerfit solutions.

In [14]:
from protein_detective.powerfit.workflow import powerfit_fit_models

In [15]:
# Write fitted model files for the top 5 powerfit solutions and show
# which fitted file belongs to which unfitted input file.
fitted = powerfit_fit_models(session_dir, top=5)
fitted

Writing fitted model PDB files: 100%|██████████| 5/5 [00:00<00:00, 90.13it/s]


Unnamed: 0_level_0,powerfit_run_id,structure,rank,fitted_model_file,unfitted_model_file
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1,AF-A1YPR0-F1-model_v4,1,session1/powerfit/1/AF-A1YPR0-F1-model_v4/fit_...,session1/filtered/AF-A1YPR0-F1-model_v4.cif
1,1,AF-A1YPR0-F1-model_v4,1,session1/powerfit/1/AF-A1YPR0-F1-model_v4/fit_...,session1/filtered/AF-A1YPR0-F1-model_v4.cif
2,1,AF-A1YPR0-F1-model_v4,1,session1/powerfit/1/AF-A1YPR0-F1-model_v4/fit_...,session1/filtered/AF-A1YPR0-F1-model_v4.cif
3,1,AF-A1YPR0-F1-model_v4,1,session1/powerfit/1/AF-A1YPR0-F1-model_v4/fit_...,session1/filtered/AF-A1YPR0-F1-model_v4.cif
4,1,AF-A1YPR0-F1-model_v4,1,session1/powerfit/1/AF-A1YPR0-F1-model_v4/fit_...,session1/filtered/AF-A1YPR0-F1-model_v4.cif
