# Run workflow using API

In [1]:
from pathlib import Path
from pprint import pprint

In [2]:
import logging

logging.basicConfig(level=logging.WARNING)
# Set to WARNING to see only warnings
# Set to INFO to see sparql queries
# Set to DEBUG to see raw results

## Setup session directory

In this directory files like PDB files are stored and a DuckDB database for meta data.

In [3]:
session_dir = Path("session1")
session_dir

PosixPath('session1')

## Search Uniprot for structures

In [4]:
from protein_detective.uniprot import Query
from protein_detective.workflow import search_structures_in_uniprot

In [5]:
query = Query(
    taxon_id="9606",
    reviewed=True,
    subcellular_location_uniprot="nucleus",
    subcellular_location_go="GO:0005634",  # Cellular component - Nucleus
    molecular_function_go="GO:0003677",  # Molecular function - DNA binding
)

In [6]:
nr_uniprot, nr_pdbe, nr_af = search_structures_in_uniprot(query, session_dir, limit=80)
nr_uniprot, nr_pdbe, nr_af



(55, 13, 55)

Use [database queries](#query-session-database) to see what was found.

## Fetch structures from PDBe and Alphafold found in Uniprot


In [4]:
from protein_detective.workflow import retrieve_structures

In [5]:
download_path, nr_pdbs, nr_alphafolds = retrieve_structures(session_dir)
download_path, nr_pdbs, nr_alphafolds

Downloading PDBe files: 100%|██████████| 13/13 [00:00<00:00, 54635.22it/s]
Fetching Alphafold summaries: 100%|██████████| 55/55 [00:02<00:00, 25.34it/s]
Downloading AlphaFold files: 100%|██████████| 110/110 [00:00<00:00, 66672.46it/s]


(PosixPath('session1/downloads'), 13, 55)

## Filter out AlphafoldDB structures with low confidence

And write PDB files without those low confidence residues to new directory.

In [12]:
from protein_detective.alphafold.density import DensityFilterQuery
from protein_detective.workflow import density_filter

In [13]:
dquery = DensityFilterQuery(
    confidence=70.0,
    min_threshold=100,
    max_threshold=500,
)

In [14]:
result = density_filter(session_dir, dquery)
pprint(result)

DensityFilterSessionResult(density_filtered_dir=PosixPath('session1/density_filtered'),
                           nr_kept=24,
                           nr_discarded=31)


## Prune PDBe files
Prune the PDB files to only keep the first chain of the found Uniprot entries and rename that chain to A.

In [6]:
from protein_detective.workflow import prune_pdbs

In [7]:
(single_chain_dir, nr_passed) = prune_pdbs(session_dir)
(single_chain_dir, nr_passed)

Saving single chain PDB files from PDBe: 100%|██████████| 13/13 [00:02<00:00,  5.76it/s]


(PosixPath('session1/single_chain'), 13)

### Query session database

In [8]:
import duckdb

from protein_detective.db import db_path

In [9]:
database = db_path(session_dir)
%load_ext sql
conn = duckdb.connect(database)
%sql conn --alias duckdb

Config,value
displaylimit,100


In [10]:
%sql SELECT * FROM proteins

uniprot_acc
A6NMT0
A8MQ14
A0A087WUV0
A6NFD8
A8MT65
A8K830
A6NDZ8
A0A2R8Y619
A6NFQ7
A6NHT5


In [11]:
%sql SELECT * FROM pdbs

pdb_id,method,resolution,pdb_file
4NE5,X-Ray_Crystallography,2.5,downloads/pdb4ne5.ent.gz
7XHO,Electron_Microscopy,3.289999961853028,downloads/pdb7xho.ent.gz
7R5S,Electron_Microscopy,2.8299999237060547,downloads/pdb7r5s.ent.gz
4NDY,X-Ray_Crystallography,7.0,downloads/pdb4ndy.ent.gz
4NE3,X-Ray_Crystallography,1.7999999523162842,downloads/pdb4ne3.ent.gz
7YWX,Electron_Microscopy,12.0,downloads/pdb7ywx.ent.gz
4E45,X-Ray_Crystallography,2.0,downloads/pdb4e45.ent.gz
7XHN,Electron_Microscopy,3.7100000381469727,downloads/pdb7xhn.ent.gz
4DRB,X-Ray_Crystallography,2.630000114440918,downloads/pdb4drb.ent.gz
4NE1,X-Ray_Crystallography,6.5,downloads/pdb4ne1.ent.gz


In [12]:
%sql SELECT * FROM proteins_pdbs

uniprot_acc,pdb_id,uniprot_chains,single_chain_pdb_file
A8MT69,4NE5,B/D/F/H=8-81,single_chain/A8MT69_pdb4ne5.ent_B2A.pdb
A8MT69,7XHO,X=1-81,single_chain/A8MT69_pdb7xho.ent_X2A.pdb
A8MT69,7R5S,X=1-81,single_chain/A8MT69_pdb7r5s.ent_X2A.pdb
A8MT69,4NDY,B/D/H/L/M/N/U/V/W/X=8-81,single_chain/A8MT69_pdb4ndy.ent_B2A.pdb
A8MT69,4NE3,B=8-81,single_chain/A8MT69_pdb4ne3.ent_B2A.pdb
A8MT69,7YWX,X=1-81,single_chain/A8MT69_pdb7ywx.ent_X2A.pdb
A8MT69,4E45,B/D/G/I/L/N=1-81,single_chain/A8MT69_pdb4e45.ent_B2A.pdb
A8MT69,7XHN,X=1-81,single_chain/A8MT69_pdb7xhn.ent_X2A.pdb
A8MT69,4DRB,J/K/L/M/N/O=1-81,single_chain/A8MT69_pdb4drb.ent_J2A.pdb
A8MT69,4NE1,B/D/H/L/M/N/U/V/W/X/Z/b/d/h/i/j/o/p/q/r=8-81,single_chain/A8MT69_pdb4ne1.ent_B2A.pdb


In [13]:
%sql SELECT * FROM alphafolds LIMIT 1

uniprot_acc,summary,pdb_file,pae_file
A0A087WUV0,"{""entryId"": ""AF-A0A087WUV0-F1"", ""gene"": ""Unknown"", ""sequenceChecksum"": ""5DE83E4BE25B68BD"", ""sequenceVersionDate"": ""2014-10-29"", ""uniprotAccession"": ""A0A087WUV0"", ""uniprotId"": ""A0A087WUV0_HUMAN"", ""uniprotDescription"": ""Uncharacterized protein"", ""taxId"": 9606, ""organismScientificName"": ""Homo sapiens"", ""uniprotStart"": 1, ""uniprotEnd"": 522, ""uniprotSequence"": ""MEPEGRGSLFEDSDLLHAGNPKENDVTAVLLTPGSQELMIRDMAEALTQWRQLNSPQGDVPEKPRNLVLLGLPISTPDVISQLEHEEELEREVSKAASQKHWETIPESKELTPEKDISEEESAPGVLIVRFSKESSSECEDSLESQQENHEKHLIQEAVTEKSSRERSYQSDEFRRNCTQRSLLVQQQGERLHHCDSFKNNLKQNSDIIRHERICAGKKPWKCNECEKAFSYYSAFVLHQRIHTGEKPYECNECGKAFSQSIHLTLHQRIHTGEKPYECHECGKAFSHRSALIRHHIIHTGEKPYECNECGKAFNQSSYLTQHQRIHTGEKPYECNECGKAFSQSTFLTQHQVIHTGEKPYKCNECGKAFSDRSGLIQHQRTHTGERPYECNECGKAFGYCSALTQHQRTHTGEKPYKCNDCAKAFSDRSALIRHQRTHTGEKPYKCKDCGKAFSQSSSLTKHQKTHTGEKPYKCKECGKAFSQSSSLSQHQKTHAGVKTKKYVQALSEHLTFGQHKRIHTG"", ""modelCreatedDate"": ""2022-06-01"", ""latestVersion"": 4, ""allVersions"": [1, 2, 3, 4], ""bcifUrl"": ""https://alphafold.ebi.ac.uk/files/AF-A0A087WUV0-F1-model_v4.bcif"", ""cifUrl"": ""https://alphafold.ebi.ac.uk/files/AF-A0A087WUV0-F1-model_v4.cif"", ""pdbUrl"": ""https://alphafold.ebi.ac.uk/files/AF-A0A087WUV0-F1-model_v4.pdb"", ""paeImageUrl"": ""https://alphafold.ebi.ac.uk/files/AF-A0A087WUV0-F1-predicted_aligned_error_v4.png"", ""paeDocUrl"": ""https://alphafold.ebi.ac.uk/files/AF-A0A087WUV0-F1-predicted_aligned_error_v4.json"", ""amAnnotationsUrl"": ""https://alphafold.ebi.ac.uk/files/AF-A0A087WUV0-F1-aa-substitutions.csv"", ""amAnnotationsHg19Url"": null, ""amAnnotationsHg38Url"": ""https://alphafold.ebi.ac.uk/files/AF-A0A087WUV0-F1-hg38.csv"", ""isReviewed"": false, ""isReferenceProteome"": true}",downloads/AF-A0A087WUV0-F1-model_v4.pdb,downloads/AF-A0A087WUV0-F1-predicted_aligned_error_v4.json


In [14]:
%sql SELECT count(*) FROM alphafolds

count_star()
55


In [15]:
# Fetch fields from inside summary
%sql SELECT uniprot_acc, summary.taxId, summary.uniprotStart, summary.uniprotEnd, summary.gene FROM alphafolds

uniprot_acc,taxId,uniprotStart,uniprotEnd,gene
A0A087WUV0,9606,1,522,"""Unknown"""
A0A0C5B5G6,9606,1,16,"""MT-RNR1"""
A0A0U1RQI7,9606,1,1052,"""KLF18"""
A0A1B0GTS1,9606,1,333,"""HSFX4"""
A0A1B0GVZ6,9606,1,204,"""MBD3L2B"""
A0A1B0GWH4,9606,1,333,"""HSFX3"""
A0A1W2PPF3,9606,1,345,"""DUXB"""
A0A1W2PPK0,9606,1,400,"""Unknown"""
A0A1W2PPM1,9606,1,405,"""CPHXL"""
A0A1W2PQ73,9606,1,354,"""ERFL"""


In [16]:

%%sql
SELECT 
f.confidence, f.min_threshold, f.max_threshold,
density_filtered_alphafolds.*, 
alphafolds.summary.uniprotStart, 
alphafolds.summary.uniprotEnd, 
length(alphafolds.summary.uniprotSequence) AS uniprot_length
FROM density_filtered_alphafolds
JOIN density_filters  AS f USING (density_filter_id) 
JOIN alphafolds USING (uniprot_acc)
LIMIT 100;

confidence,min_threshold,max_threshold,density_filter_id,uniprot_acc,nr_residues_above_confidence,keep,pdb_file,uniprotStart,uniprotEnd,uniprot_length
70.0,100,500,1,A0A087WUV0,283,True,density_filtered/AF-A0A087WUV0-F1-model_v4.pdb,1,522,524
70.0,100,500,1,A0A0C5B5G6,10,False,,1,16,18
70.0,100,500,1,A0A0U1RQI7,192,True,density_filtered/AF-A0A0U1RQI7-F1-model_v4.pdb,1,1052,1054
70.0,100,500,1,A0A1B0GTS1,116,True,density_filtered/AF-A0A1B0GTS1-F1-model_v4.pdb,1,333,335
70.0,100,500,1,A0A1B0GVZ6,54,False,,1,204,206
70.0,100,500,1,A0A1B0GWH4,117,True,density_filtered/AF-A0A1B0GWH4-F1-model_v4.pdb,1,333,335
70.0,100,500,1,A0A1W2PPF3,124,True,density_filtered/AF-A0A1W2PPF3-F1-model_v4.pdb,1,345,347
70.0,100,500,1,A0A1W2PPK0,71,False,,1,400,402
70.0,100,500,1,A0A1W2PPM1,68,False,,1,405,407
70.0,100,500,1,A0A1W2PQ73,86,False,,1,354,356
