In [26]:
## Imports and settings
import pandas as pd
import numpy as np
import math
import seaborn as sns
import urllib
import glob
import os
from urllib.error import HTTPError

import matplotlib.pyplot as plt
from scipy.spatial import ConvexHull
#%matplotlib inline
sns.set_style("darkgrid")

import ipywidgets as widgets
from IPython.display import display, Markdown, clear_output


# from tqdm.auto import tqdm
from tqdm.notebook import tnrange, tqdm

tqdm.pandas()  # activate tqdm progressbar for pandas apply

pd.options.mode.chained_assignment = (
    None  # default='warn', remove pandas warning when adding a new column
)
pd.set_option("display.max_columns", None)

from IPython.core.interactiveshell import InteractiveShell


InteractiveShell.ast_node_interactivity = "all"
#%config InlineBackend.figure_format ='svg' #better quality figure figure
np.seterr(divide='ignore', invalid='ignore')

import matplotlib.gridspec as gridspec


import MDAnalysis as mda
import nglview as nv

{'divide': 'ignore', 'over': 'warn', 'under': 'ignore', 'invalid': 'ignore'}

In [27]:
%run "./00-SETUP.ipynb"

In [28]:
DATASET = pd.read_pickle(f"{WORKDIR}/DATASET_peprmint_d25.pkl")


In [29]:
print(ALFAFOLDFOLDER)

/Home/ii/alexanderp/tubiana_etal_2022/notebooks/peprmint/databases/alfafold/


In [30]:
REBUILD = True
use_uniprot_boundaries = True
use_all_AFmodels = True

import requests
import json
from biopandas.pdb import PandasPdb
from Bio import AlignIO
from urllib.error import URLError
import re
REGEX = re.compile("^(\w+)\|(\w+)\/(\d+)-(\d+)")


EXCLUDE_LIST=["Q54C71","O94827",'Q54C71','Q22070','P39960','Q62077', #PH
             'Q06839', #PX
             ]
EXCLUDE_DOMAIN = ["FYVE"]
    
def fetch_pdb_alfafold(uniprotids, domain):
    print(domain)
    nomodels=[]
    withmodels=[]
    outfolder = f"{ALFAFOLDFOLDER}/{domain}/raw"
    if not os.path.exists(outfolder):
        print("Path does not exist")
        os.makedirs(outfolder)
        
    extractedfolder = f"{ALFAFOLDFOLDER}/{domain}/extracted"
    if not os.path.exists(extractedfolder):
        os.makedirs(extractedfolder)
    else:
        if REBUILD == True: #delete extracted files
            files = glob.glob(f"{extractedfolder}/*.pdb")
            for f in files:
                os.remove(f)
    
    jsonfolder = f"{ALFAFOLDFOLDER}/{domain}/json"
    if not os.path.exists(jsonfolder):
        os.makedirs(jsonfolder)

    for uniprot_id in tqdm(uniprotids, desc="Downloading "):
        url = f"https://alphafold.ebi.ac.uk/files/AF-{uniprot_id}-F1-model_v1.pdb"
        destination = f"{outfolder}/{uniprot_id}.pdb"
        if not os.path.isfile(destination): 
            try:
                urllib.request.urlretrieve(url, destination)
            except urllib.error.HTTPError as err:
                nomodels.append(uniprot_id)
                continue
        withmodels.append(uniprot_id)
        

    
    print(f"{len(nomodels)} out of {len(uniprotids)} without alfafold2 models ({len(nomodels)/len(uniprotids)*100:.2f}%)")
    return withmodels,nomodels



def get_prosite_boundaries_dict(domain):
    boundaries = {}
    prosite_ids = DOMAIN_PROSITE[domain]
    if type(prosite_ids) != type([]):
        prosite_ids = [prosite_ids]
    for msafile in prosite_ids:
        msafilepath = f"{PROSITEFOLDER}/msa/{msafile}.msa"
        msa = AlignIO.read(msafilepath,'fasta')
        for record in msa:
            seqid = record.id
            match = REGEX.match(seqid)
            if match:
                uniprot_id = match.group(2)
                start = match.group(3)
                end = match.group(4)
                boundaries[uniprot_id] = (int(start),int(end))
    return boundaries

def get_json(uniprot_acc, domain, source='ssf'):
    def request_URL(link, trial=1):
        try:
            response = requests.get(link).text
            return response
        except URLError as e:
            print(e, link)
            if trial >3 :
                print('3rd fail, skipping this one.')
                return None
            else:
                print(f"Trial {trial}, waiting 10s and trying again")
                sleep(10)
                return request_URL(link, trial=trial+1)
            
            
    jsonfolder = f"{ALFAFOLDFOLDER}/{domain}/json"
    if not os.path.exists(jsonfolder):
        os.makedirs(jsonfolder)
        
    jsonfile = f"{jsonfolder}/{uniprot_acc}.json"
    if os.path.isfile(jsonfile):
        f = open(jsonfile)
        interpro = json.load(f)
    else:
        #make the query on ebi/interpro
        response = request_URL(f"https://www.ebi.ac.uk/interpro/api/entry/{source}/protein/reviewed/{uniprot_acc}/?page_size=200")
        if response == None:
            return None
        try:
            interpro = json.loads(response)
        except:
            print(f"no data for {uniprot_acc}.")
            return None
        with open(jsonfile,'w') as out:
            json.dump(interpro, out, indent=2)
            
    return(interpro)
        
    
        

def get_domain_fragment_query(uniprot_acc, domain, boundaries_prosite):
    start_PS,end_PS = boundaries_prosite[uniprot_acc]
    starts_ends = [boundaries_prosite[uniprot_acc]]

    if DOMAIN_INTERPRO_REFINE[domain] == True:
        if domain == "PLA":
            source = 'cathgene3d'
        else:
            source = 'ssf'
        interpro = get_json(uniprot_acc, domain, source)
        if interpro == None:
            return None
        QueryString = None
        
        for result in interpro["results"]:
            if result["metadata"]["accession"] == DOMAIN_INTERPRO[domain]:
                entry_protein_locations = result["proteins"][0]["entry_protein_locations"]
                for entry in entry_protein_locations: #Get the number of truncation in the domain.
                    nfrag = len(entry['fragments'])
                    
                    if domain == 'PLA': #Special case for PLA, we will ignore PROSITE annotation that are actually wrong.
                        frag = entry['fragments'][0] #Get first monomer only
                        s = entry['fragments'][0].get('start')
                        e = entry['fragments'][0].get('end')
                        starts_ends = [[s,e]]
                    else:
                        if nfrag >= 2 and ( entry['fragments'][0].get('start') - 50 <= start_PS <= entry['fragments'][0].get('start')+50) : #if truncated domain AND correspond to the prosite domain
                            print(f"splitting {domain}-{uniprot_acc}")
                            queries = []
                            starts_ends = []
                            for frag in entry['fragments']:
                                s=int(frag.get("start"))
                                e=int(frag.get("end"))
                                starts_ends.append([s,e])
                            if use_uniprot_boundaries == True:
                                starts_ends[0][0] = start_PS
                                starts_ends[-1][-1] = end_PS

                        else: #use prosite fragment
                            starts_ends = [[start_PS, end_PS]]
                    

                QueryString = " or ".join([f"({x} <= residue_number <= {y})" for x,y in starts_ends])
        
    else:
        QueryString = " or ".join([f"({x} <= residue_number <= {y})" for x,y in starts_ends])
    
    return QueryString





## ------- MAIN
domains = DATASET.domain.unique()
#domains = ['PLA']

for domain in domains:
    #if domain in EXCLUDE_DOMAIN:
    #    continue
    group = DATASET.query("domain == @domain")
    uniprot_acc_cathpdb = group.query("data_type == 'cathpdb'").uniprot_acc.unique()
    print(f"----- PROCESSING DOMAIN {domain} -----")

    seqs_no_pdb = group[group["pdb"].isnull()].uniprot_acc.unique()
    boundaries_prosite = get_prosite_boundaries_dict(domain)


    if use_all_AFmodels:
        prosite_uniprot_acc = list(boundaries_prosite.keys()) 
        uniprot_acc_cathpdb = [acc for acc in uniprot_acc_cathpdb if acc in prosite_uniprot_acc]

        uniprot_acc_list = prosite_uniprot_acc + uniprot_acc_cathpdb

        seqs_with_model, seqs_without_model=fetch_pdb_alfafold(uniprot_acc_list, 
                                                               domain,
                                                              )
    else:
        seqs_with_model, seqs_without_model=fetch_pdb_alfafold(seqs_no_pdb, 
                                                               domain,
                                                              )


    for uniprot_id in tqdm(seqs_with_model, desc = "processing"):
        if uniprot_id in EXCLUDE_LIST:
            continue
        try:
            pdbfile =  f"{ALFAFOLDFOLDER}{domain}/raw/{uniprot_id}.pdb"
        except Exception as e:
            print(e)
            


        # structure = PDBParser().get_structure('uniprot_id',)    

        if os.path.isfile(pdbfile) and REBUILD == False:
            #skip the file if already exist
            continue


        query = get_domain_fragment_query(uniprot_id, domain, boundaries_prosite)
        if query == None:
            continue
        try:
            ppdb = PandasPdb().read_pdb(pdbfile)
        except Exception as e:
            print(e)
        
        ppdb.df["ATOM"] = ppdb.df["ATOM"].query(f"{query}")
        ppdb.to_pdb(f"{ALFAFOLDFOLDER}/{domain}/extracted/{uniprot_id}.pdb")

----- PROCESSING DOMAIN ANNEXIN -----
ANNEXIN


Downloading :   0%|          | 0/116 [00:00<?, ?it/s]

59 out of 116 without alfafold2 models (50.86%)


processing:   0%|          | 0/57 [00:00<?, ?it/s]

----- PROCESSING DOMAIN BAR -----
BAR


Downloading :   0%|          | 0/82 [00:00<?, ?it/s]

18 out of 82 without alfafold2 models (21.95%)


processing:   0%|          | 0/64 [00:00<?, ?it/s]

----- PROCESSING DOMAIN C1 -----
C1


Downloading :   0%|          | 0/301 [00:00<?, ?it/s]

72 out of 301 without alfafold2 models (23.92%)


processing:   0%|          | 0/229 [00:00<?, ?it/s]

----- PROCESSING DOMAIN C2 -----
C2


Downloading :   0%|          | 0/745 [00:00<?, ?it/s]

149 out of 745 without alfafold2 models (20.00%)


processing:   0%|          | 0/596 [00:00<?, ?it/s]

----- PROCESSING DOMAIN C2DIS -----
C2DIS


Downloading :   0%|          | 0/112 [00:00<?, ?it/s]

42 out of 112 without alfafold2 models (37.50%)


processing:   0%|          | 0/70 [00:00<?, ?it/s]

----- PROCESSING DOMAIN ENTH -----
ENTH


Downloading :   0%|          | 0/69 [00:00<?, ?it/s]

1 out of 69 without alfafold2 models (1.45%)


processing:   0%|          | 0/68 [00:00<?, ?it/s]

----- PROCESSING DOMAIN FYVE -----
FYVE


Downloading :   0%|          | 0/2461 [00:00<?, ?it/s]

665 out of 2461 without alfafold2 models (27.02%)


processing:   0%|          | 0/1796 [00:00<?, ?it/s]

----- PROCESSING DOMAIN GLA -----
GLA


Downloading :   0%|          | 0/124 [00:00<?, ?it/s]

85 out of 124 without alfafold2 models (68.55%)


processing:   0%|          | 0/39 [00:00<?, ?it/s]

----- PROCESSING DOMAIN PH -----
PH


Downloading :   0%|          | 0/1025 [00:00<?, ?it/s]

215 out of 1025 without alfafold2 models (20.98%)


processing:   0%|          | 0/810 [00:00<?, ?it/s]

splitting PH-Q99490
splitting PH-Q8L751
splitting PH-Q06412
splitting PH-Q63644
splitting PH-P11433
splitting PH-Q3UHD9
splitting PH-Q9UPQ3
splitting PH-Q6PAJ1
splitting PH-Q5VTM2
splitting PH-P11274
splitting PH-Q5VW22
splitting PH-Q12979
splitting PH-Q9NGC3
splitting PH-A6NIR3
splitting PH-Q55E26
splitting PH-Q9FIT8
splitting PH-Q8BXK8
splitting PH-Q9SAF0
splitting PH-Q8VHH5
splitting PH-Q8TF27
splitting PH-Q62868
splitting PH-A0A0G2JTR4
splitting PH-Q9XX14
splitting PH-O13992
splitting PH-Q13464
splitting PH-P70335
splitting PH-Q5W7F2
splitting PH-Q96P47
no data for Q06315.
splitting PH-Q5A950
splitting PH-Q9SU36
splitting PH-Q9SMX5
splitting PH-O75116
splitting PH-P08630
splitting PH-Q96P64
splitting PH-Q940Y1
splitting PH-P70336
splitting PH-O80866
splitting PH-Q5VUJ5
splitting PH-Q5SSL4
splitting PH-O13817
splitting PH-Q8CGU4
splitting PH-Q552C1
splitting PH-Q9Y7U5
splitting PH-F1LXF1
splitting PH-Q99490
splitting PH-Q62868
----- PROCESSING DOMAIN PLA -----
PLA


Downloading :   0%|          | 0/585 [00:00<?, ?it/s]

526 out of 585 without alfafold2 models (89.91%)


processing:   0%|          | 0/59 [00:00<?, ?it/s]

no data for Q8N271.
no data for Q9U256.
no data for Q8WXA2.
no data for Q6ZRS4.
no data for Q5TA77.
no data for Q9H295.
no data for Q71RC9.
no data for Q8BT42.
no data for Q5TA76.
----- PROCESSING DOMAIN PLD -----
PLD


Downloading :   0%|          | 0/228 [00:00<?, ?it/s]

180 out of 228 without alfafold2 models (78.95%)


processing:   0%|          | 0/48 [00:00<?, ?it/s]

----- PROCESSING DOMAIN PX -----
PX


Downloading :   0%|          | 0/271 [00:00<?, ?it/s]

90 out of 271 without alfafold2 models (33.21%)


processing:   0%|          | 0/181 [00:00<?, ?it/s]

splitting PX-Q59TN9
splitting PX-Q9Y7N9
splitting PX-Q07528
----- PROCESSING DOMAIN SEC14 -----
SEC14


Downloading :   0%|          | 0/170 [00:00<?, ?it/s]

65 out of 170 without alfafold2 models (38.24%)


processing:   0%|          | 0/105 [00:00<?, ?it/s]

----- PROCESSING DOMAIN SH2 -----
SH2


Downloading :   0%|          | 0/509 [00:00<?, ?it/s]

129 out of 509 without alfafold2 models (25.34%)


processing:   0%|          | 0/380 [00:00<?, ?it/s]

----- PROCESSING DOMAIN START -----
START


Downloading :   0%|          | 0/118 [00:00<?, ?it/s]

32 out of 118 without alfafold2 models (27.12%)


processing:   0%|          | 0/86 [00:00<?, ?it/s]

splitting START-Q9W145


In [8]:
get_domain_fragment_query("Q9Z0L3", 'PLA', boundaries_prosite)

KeyError: 'Q9Z0L3'

In [9]:
from Bio import AlignIO
import re
REGEX = re.compile("^(\w+)\|(\w+)\/(\d+)-(\d+)")         


all_uniprot_acc = []
for domain in DATASET.domain.unique():
    prosite_ids = DOMAIN_PROSITE[domain]
    if type(prosite_ids) != type([]):
        prosite_ids = [prosite_ids]
    for msafile in prosite_ids:
        msafilepath = f"{PROSITEFOLDER}/msa/{msafile}.msa"
        msa = AlignIO.read(msafilepath,'fasta')
        print(dir(msa))


['__add__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_append', '_get_per_column_annotations', '_per_col_annotations', '_records', '_set_per_column_annotations', '_str_line', 'annotations', 'append', 'column_annotations', 'extend', 'get_alignment_length', 'sort', 'substitutions']
['__add__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__',

In [10]:
def get_domain_fragment_query(uniprot_acc, domain, boundaries_prosite):
    start_PS,end_PS = boundaries_prosite[uniprot_acc]
    starts_ends = [boundaries_prosite[uniprot_acc]]

    if DOMAIN_INTERPRO_REFINE[domain] == True:
        interpro = get_json(uniprot_acc, domain)
        
        for result in interpro["results"]:
            if result["metadata"]["accession"] == DOMAIN_INTERPRO[domain]:
                entry_protein_locations = result["proteins"][0]["entry_protein_locations"]
                

                for entry in entry_protein_locations: #Get the number of truncation in the domain.
                    nfrag = len(entry['fragments'])
                        
                    if nfrag >= 2 and ( entry['fragments'][0].get('start') - 50 <= start_PS <= entry['fragments'][0].get('start')+50) : #if truncated domain AND correspond to the prosite domain
                        queries = []
                        starts_ends = []
                        for frag in entry['fragments']:
                            print(frag)
                            s=int(frag.get("start"))
                            e=int(frag.get("end"))
                            starts_ends.append([s,e])
                        if use_uniprot_boundaries == True:
                            starts_ends[0][0] = start_PS
                            starts_ends[-1][-1] = end_PS
                            
                    else: #use prosite fragment
                        starts_ends = [[start_PS, end_PS]]
                    

                QueryString = " or ".join([f"({x} <= residue_number <= {y})" for x,y in starts_ends])
        
    else:
        QueryString = " or ".join([f"({x} <= residue_number <= {y})" for x,y in starts_ends])
    return QueryString



boundaries = get_prosite_boundaries_dict("PH")
query = get_domain_fragment_query('Q55E26', "PH", boundaries)
query = get_domain_fragment_query('F1LXF1', "PH", boundaries)
query

{'start': 853, 'end': 942, 'dc-status': 'C_TERMINAL_DISC'}
{'start': 994, 'end': 1026, 'dc-status': 'N_TERMINAL_DISC'}
{'start': 705, 'end': 774, 'dc-status': 'C_TERMINAL_DISC'}
{'start': 832, 'end': 867, 'dc-status': 'N_TERMINAL_DISC'}


'(707 <= residue_number <= 774) or (832 <= residue_number <= 865)'

In [11]:
print(query)
query[0]["fragments"][0]['start']

(707 <= residue_number <= 774) or (832 <= residue_number <= 865)


TypeError: string indices must be integers

read the MSA and get the position.

In [12]:
from Bio import AlignIO
import re
regex = re.compile("^(\w+)\|(\w+)\/(\d+)-(\d+)")
boundaries = {}

prosite_ids = DOMAIN_PROSITE[domain]
if type(prosite_ids) != type([]):
    prosite_ids = [prosite_ids]
for msafile in prosite_ids:
    msafilepath = f"{PROSITEFOLDER}/msa/{msafile}.msa"
    msa = AlignIO.read(msafilepath,'fasta')
    for record in msa:
        seqid = record.id
        match = regex.match(seqid)
        if match:
            uniprot_id = match.group(2)
            start = match.group(3)
            end = match.group(4)
            boundaries[uniprot_id] = (int(start),int(end))

In [13]:
seqs_no_pdb

['Q9P2P6', 'Q80TF6', 'Q63744', 'Q96QB1', 'Q92502', 'Q9Y3M8', 'A7E300', 'Q8K031', 'P58864']
Categories (9, object): ['Q9P2P6', 'Q80TF6', 'Q63744', 'Q96QB1', ..., 'Q9Y3M8', 'A7E300', 'Q8K031', 'P58864']

In [14]:
cnt = 0
for ids in tqdm(seqs_with_model):
    print(ids)
    cnt += 1
print(cnt)

  0%|          | 0/86 [00:00<?, ?it/s]

Q9SE43
Q9Y365
Q9S9Z0
Q8RWU4
Q6AST1
Q92502
Q9Y3M8
Q9EPQ7
Q9DFS4
Q9DBK0
Q2QM96
P53809
Q0WV12
Q0J9X2
Q6ZAR0
Q99NB7
Q96DR4
Q8VZF6
Q8L7H4
Q99JV5
Q39123
Q7Y0V9
Q9M9P4
Q93V99
Q9NQZ5
Q9UKL6
Q9Y5P4
Q9FVI6
P49675
A3BPF2
Q7Y0V7
Q9JMD3
Q6TAQ6
Q9ZU11
Q9ZV65
Q5ZAY0
Q8WYK0
Q69T58
P59096
Q9M2E8
Q9FJS2
Q9R0Z9
O04292
Q96QB1
Q63744
Q923Q2
Q5QMZ9
Q14849
Q9NSY2
P46607
Q9Y0T2
Q9DG10
P97826
P53808
O17883
Q5JMF3
Q8R1R3
Q336P2
Q8K031
Q9AV49
Q61542
Q9W145
Q9EQG9
P51557
Q8VHQ9
Q94C37
Q6EPF0
F4JSE7
Q9LMT8
Q9FX31
P59095
Q9LTK3
Q54N86
Q5M7Y0
O04291
Q8WXI4
Q9FFI0
Q9Y5P4
Q9NSY2
Q8WXI4
Q99JV5
Q14849
P49675
Q9Y3M8
P59095
Q9UKL6
86


In [15]:
len(boundaries_prosite)

109

Reading alfafold pdbs and saving only PH domains.

In [16]:
from Bio.PDB import PDBParser


for uniprot_id in tqdm(seqs_with_model):
    start,end = boundaries_prosite[uniprot_id]
    pdbfile =  f"{ALFAFOLDFOLDER}/{domain}/raw/{uniprot_id}.pdb"
    # structure = PDBParser().get_structure('uniprot_id',)    

    from biopandas.pdb import PandasPdb
    ppdb = PandasPdb().read_pdb(pdbfile)

    ppdb.df["ATOM"] = ppdb.df["ATOM"].query("@start <= residue_number <= @end")
    ppdb.to_pdb(f"{ALFAFOLDFOLDER}/{domain}/{uniprot_id}.pdb")

  0%|          | 0/86 [00:00<?, ?it/s]

In [17]:
def fetch_json_information(uniprot_acc):
    response = requests.get(f"https://www.ebi.ac.uk/interpro/api/entry/ssf/protein/reviewed/{uniprot_acc}/?page_size=200").text
    interpro = json.loads(response)

In [18]:
%%time
import requests
import json
domain='PH'
uniprot_acc = "F1LXF1"
response = requests.get(f"https://www.ebi.ac.uk/interpro/api/entry/ssf/protein/reviewed/{uniprot_acc}/?page_size=200").text
interpro = json.loads(response)

CPU times: user 10 ms, sys: 1.04 ms, total: 11.1 ms
Wall time: 231 ms


In [19]:
interpro

{'count': 5,
 'next': None,
 'previous': None,
 'results': [{'metadata': {'accession': 'SSF48065',
    'name': 'DBL homology domain (DH-domain)',
    'source_database': 'ssf',
    'type': 'homologous_superfamily',
    'integrated': 'IPR035899',
    'member_databases': None,
    'go_terms': None},
   'proteins': [{'accession': 'f1lxf1',
     'protein_length': 1270,
     'source_database': 'reviewed',
     'organism': '10116',
     'entry_protein_locations': [{'fragments': [{'start': 490,
         'end': 691,
         'dc-status': 'CONTINUOUS'}],
       'model': '0053817',
       'score': 6.67e-45}]}]},
  {'metadata': {'accession': 'SSF48350',
    'name': 'GTPase activation domain, GAP',
    'source_database': 'ssf',
    'type': 'homologous_superfamily',
    'integrated': 'IPR008936',
    'member_databases': None,
    'go_terms': None},
   'proteins': [{'accession': 'f1lxf1',
     'protein_length': 1270,
     'source_database': 'reviewed',
     'organism': '10116',
     'entry_protein_lo

In [20]:
%%time

use_uniprot_boundaries = False
try:
    start_PS,end_PS = boundaries[uniprot_acc]
except:
    print("Uniprot acc does not exist")

for result in interpro["results"]:
    if result["metadata"]["accession"] == DOMAIN_INTERPRO[domain]:
        fragments = result["proteins"][0]["entry_protein_locations"][0]["fragments"]
        if len(fragments) >= 2:
            starts_ends = []
            queries = []
            for frag in fragments:
                s=int(frag.get("start"))
                e=int(frag.get("end"))
                starts_ends.append([s,e])
                queries.append(f"({s} <= residue_number <= {e})")
            if use_uniprot_boundaries == True:
                starts_ends[0][0] = start_PS
                starts_ends[-1][-1] = end_PS
        else: #use prosite fragment
            starts_ends = [start_PS, end_PS]
            
        QueryString = " or ".join([f"({x} <= residue_number <= {y})" for x,y in starts_ends])
        
            
            
print(QueryString)

Uniprot acc does not exist
(705 <= residue_number <= 774) or (832 <= residue_number <= 867)
CPU times: user 65 µs, sys: 0 ns, total: 65 µs
Wall time: 67.7 µs


In [21]:
boundaries

{'Q9SE43': (151, 379),
 'Q9GKI7': (415, 618),
 'Q9Y365': (42, 224),
 'Q9S9Z0': (218, 456),
 'Q8RWU4': (253, 484),
 'P70114': (66, 279),
 'F7B909': (248, 443),
 'Q6AST1': (171, 371),
 'Q9P2P6': (4565, 4700),
 'Q92502': (822, 992),
 'Q28996': (67, 280),
 'Q9Y3M8': (914, 1082),
 'Q9EPQ7': (1, 213),
 'Q9DFS4': (232, 445),
 'Q9DBK0': (364, 513),
 'A2ZMN9': (168, 368),
 'Q2QM96': (168, 368),
 'P53809': (30, 212),
 'Q0WV12': (315, 546),
 'Q0J9X2': (286, 523),
 'Q6ZAR0': (290, 522),
 'Q99NB7': (364, 513),
 'Q96DR4': (1, 205),
 'Q8VZF6': (200, 372),
 'Q8L7H4': (229, 466),
 'Q99JV5': (34, 224),
 'Q39123': (150, 378),
 'P02720': (30, 212),
 'Q7Y0V9': (306, 559),
 'A2WLR5': (162, 381),
 'Q9M9P4': (204, 438),
 'Q93V99': (244, 476),
 'Q9NQZ5': (137, 327),
 'P79245': (67, 280),
 'Q9DE06': (91, 279),
 'Q9UKL6': (1, 212),
 'P58864': (1, 65),
 'Q9Y5P4': (415, 618),
 'Q9FVI6': (206, 439),
 'P49675': (67, 280),
 'A3BPF2': (256, 494),
 'Q7Y0V7': (340, 583),
 'Q9JMD3': (42, 224),
 'A2ZAI7': (340, 584),
 'Q6

In [22]:
get_domain_fragment_query("Q54C71", domain, boundaries)

KeyError: 'Q54C71'

In [23]:
def get_json(uniprot_acc, domain):
    jsonfolder = f"{ALFAFOLDFOLDER}/{domain}/json"
    if not os.path.exists(jsonfolder):
        os.makedirs(jsonfolder)
        
    jsonfile = f"{jsonfolder}/{uniprot_acc}.json"
    if os.path.isfile(jsonfile):
        f = open(jsonfile)
        interpro = json.load(f)
    else:
        #make the query on ebi/interpro
        response = requests.get(f"https://www.ebi.ac.uk/interpro/api/entry/ssf/protein/reviewed/{uniprot_acc}/?page_size=200").text
        interpro = json.loads(response)
        with open(jsonfile,'w') as out:
            json.dump(interpro, out, indent=2)
            
    return(interpro)
                    

In [24]:
DATASET.groupby("cathpdb").head()

Unnamed: 0,atom_number,atom_name,residue_name,chain_id,residue_number,x_coord,y_coord,z_coord,occupancy,b_factor,sec_struc,sec_struc_full,prot_block,sasa_rel_dssp,ASA_res_freesasa_florian,RSA_freesasa_florian,ASA_total_freesasa,ASA_mainchain_freesasa,ASA_sidechain_freesasa,RSA_sidechain_freesasa,RSA_total_freesasa_tien,RSA_sidechain_freesasa_tien,sec_struc_segment,pdb,domain,cathpdb,chain,uniprot_acc,data_type,Experimental Method,convhull_vertex,co_insertable_neighbors,density,is_co_insertable,is_hydrophobic_protrusion,neighboursID,neighboursList,protrusion,LDCI,S35,S60,S95,S100,S100Count,resolution,uniprot_id,origin,residue_index,alignment_position,prositeName,prositeID,ali_range,location,CR:prositeID,taxon,CR:prositeName,uniref50,uniref90,uniref100,shannon,shannonH10,type
1,76,CA,GLY,A,12,-28.926,73.250,-11.735,1.0,37.62,C,-,Z,100.00,133.681833,128.540224,133.681833,133.681833,0.000000,0.000000,128.540224,0.000000,C1,1A8A,ANNEXIN,1a8aA01,A,P14668,cathpdb,X-ray diffraction,True,[],0,False,False,,,False,False,6,6.1,6.1.2,6.1.2.1,4,1.9,ANXA5_RAT,RAT,0,-18.0,ANNEXIN_1,PS00223,,[unkown],"[PS00223, PS51897]",Eukaryota/Metazoa,"[ANNEXIN_1, ANNEXIN_2]",P14668,P14668,P14668,,,Non-polar
5,76,CA,GLY,A,12,-28.778,73.218,-11.873,1.0,37.06,C,-,Z,100.00,133.857284,128.708927,133.857284,133.857284,0.000000,0.000000,128.708927,0.000000,C1,1A8B,ANNEXIN,1a8bA01,A,P14668,cathpdb,X-ray diffraction,True,[],0,False,False,,,False,False,6,6.1,6.1.2,6.1.2.1,5,1.9,ANXA5_RAT,RAT,0,-18.0,ANNEXIN_1,PS00223,,[unkown],"[PS00223, PS51897]",Eukaryota/Metazoa,"[ANNEXIN_1, ANNEXIN_2]",P14668,P14668,P14668,,,Non-polar
9,49,CA,GLY,A,12,5.752,9.820,-4.437,1.0,24.59,C,-,Z,100.00,134.155867,128.996026,134.155867,134.155867,0.000000,0.000000,128.996026,0.000000,C1,1ANN,ANNEXIN,1annA01,A,P13214,cathpdb,X-ray diffraction,True,[],0,False,False,,,False,False,4,4.1,4.1.2,4.1.2.1,2,2.3,ANXA4_BOVIN,BOVIN,0,-18.0,ANNEXIN_1,PS00223,,[Zymogen granule membrane],"[PS00223, PS51897]",Eukaryota/Metazoa,"[ANNEXIN_1, ANNEXIN_2]",P09525,P09525,P13214,,,Non-polar
13,93,CA,GLY,A,14,13.215,20.580,9.017,1.0,25.40,C,-,Z,100.00,138.375982,133.053829,138.375982,138.375982,0.000000,0.000000,133.053829,0.000000,C1,1ANW,ANNEXIN,1anwA01,A,P08758,cathpdb,X-ray diffraction,True,[],0,False,False,,,False,False,6,6.1,6.1.3,6.1.3.1,7,2.4,ANXA5_HUMAN,HUMAN,0,-18.0,ANNEXIN_1,PS00223,,[unkown],"[PS00223, PS51897]",Eukaryota/Metazoa,"[ANNEXIN_1, ANNEXIN_2]",P08758,P08758,P08758,,,Non-polar
17,2612,CA,GLY,B,14,12.657,-20.420,19.061,1.0,21.86,C,-,Z,100.00,135.061921,129.867232,135.061921,135.061921,0.000000,0.000000,129.867232,0.000000,C1,1ANW,ANNEXIN,1anwB01,B,P08758,cathpdb,X-ray diffraction,True,[],0,False,False,,,False,False,6,6.1,6.1.3,6.1.3.1,8,2.4,ANXA5_HUMAN,HUMAN,0,-18.0,ANNEXIN_1,PS00223,,[unkown],"[PS00223, PS51897]",Eukaryota/Metazoa,"[ANNEXIN_1, ANNEXIN_2]",P08758,P08758,P08758,,,Non-polar
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6569315,50,CA,LEU,A,8,83.720,7.896,-42.949,1.0,39.79,C,-,Z,95.12,157.345098,78.281143,157.345098,22.336922,135.008176,96.586190,78.281143,67.168247,C1,Q6UY11.pdb,FYVE,Q6UY11,A,Q6UY11,alfafold,AFmodel,True,[],0,False,False,,,False,False,,,,,0,,DLK2_HUMAN,HUMAN,1,1.0,ZF_RING_1,PS00518,,[Membrane],"[PS00010, PS00022, PS01186, PS50026, PS01187]",Eukaryota/Metazoa,"[ASX_HYDROXYL, EGF_1, EGF_2, EGF_3, EGF_CA]",Q6UY11,Q6UY11,Q6UY11,,,"Hydrophobic,H-non-aromatic"
6569317,52,CB,LEU,A,8,82.526,8.210,-43.884,1.0,39.79,C,-,Z,95.12,157.345098,78.281143,157.345098,22.336922,135.008176,96.586190,78.281143,67.168247,C1,Q6UY11.pdb,FYVE,Q6UY11,A,Q6UY11,alfafold,AFmodel,True,"[10, 14]",11,True,True,7;9;10;11;12,CYS-7;HIS-9;LEU-10;VAL-11;CYS-12,True,False,,,,,0,,DLK2_HUMAN,HUMAN,1,1.0,ZF_RING_1,PS00518,,[Membrane],"[PS00010, PS00022, PS01186, PS50026, PS01187]",Eukaryota/Metazoa,"[ASX_HYDROXYL, EGF_1, EGF_2, EGF_3, EGF_CA]",Q6UY11,Q6UY11,Q6UY11,,,"Hydrophobic,H-non-aromatic"
6569425,68,CA,LEU,A,10,86.072,12.577,-44.664,1.0,45.36,C,T,l,82.93,136.670583,67.995315,136.670583,3.128219,133.542364,95.537533,67.995315,66.438987,C1,Q6UY11.pdb,FYVE,Q6UY11,A,Q6UY11,alfafold,AFmodel,False,[],0,False,False,,,False,False,,,,,0,,DLK2_HUMAN,HUMAN,3,3.0,ZF_RING_1,PS00518,,[Membrane],"[PS00010, PS00022, PS01186, PS50026, PS01187]",Eukaryota/Metazoa,"[ASX_HYDROXYL, EGF_1, EGF_2, EGF_3, EGF_CA]",Q6UY11,Q6UY11,Q6UY11,,,"Hydrophobic,H-non-aromatic"
6569514,307,CA,LEU,A,38,8.772,0.561,-12.476,1.0,90.25,H,H,m,65.85,108.794608,54.126671,108.794608,0.174339,108.620269,77.708019,54.126671,54.039935,H1,Q6Q5X2.pdb,FYVE,Q6Q5X2,A,Q6Q5X2,alfafold,AFmodel,False,[],0,False,False,,,False,False,,,,,0,,YD34B_YEAST,YEAST,4,4.0,ZF_RING_1,PS00518,,"[Cytoplasm, Cell cortex, Membrane]",[],Eukaryota/Fungi,[],Q6Q5X2,Q6Q5X2,Q6Q5X2,,,"Hydrophobic,H-non-aromatic"


In [25]:
DATASET

Unnamed: 0,atom_number,atom_name,residue_name,chain_id,residue_number,x_coord,y_coord,z_coord,occupancy,b_factor,sec_struc,sec_struc_full,prot_block,sasa_rel_dssp,ASA_res_freesasa_florian,RSA_freesasa_florian,ASA_total_freesasa,ASA_mainchain_freesasa,ASA_sidechain_freesasa,RSA_sidechain_freesasa,RSA_total_freesasa_tien,RSA_sidechain_freesasa_tien,sec_struc_segment,pdb,domain,cathpdb,chain,uniprot_acc,data_type,Experimental Method,convhull_vertex,co_insertable_neighbors,density,is_co_insertable,is_hydrophobic_protrusion,neighboursID,neighboursList,protrusion,LDCI,S35,S60,S95,S100,S100Count,resolution,uniprot_id,origin,residue_index,alignment_position,prositeName,prositeID,ali_range,location,CR:prositeID,taxon,CR:prositeName,uniref50,uniref90,uniref100,shannon,shannonH10,type
1,76,CA,GLY,A,12,-28.926,73.250,-11.735,1.0,37.62,C,-,Z,100.0,133.681833,128.540224,133.681833,133.681833,0.0,0.0,128.540224,0.0,C1,1A8A,ANNEXIN,1a8aA01,A,P14668,cathpdb,X-ray diffraction,True,[],0,False,False,,,False,False,6,6.1,6.1.2,6.1.2.1,4,1.9,ANXA5_RAT,RAT,0,-18.0,ANNEXIN_1,PS00223,,[unkown],"[PS00223, PS51897]",Eukaryota/Metazoa,"[ANNEXIN_1, ANNEXIN_2]",P14668,P14668,P14668,,,Non-polar
5,76,CA,GLY,A,12,-28.778,73.218,-11.873,1.0,37.06,C,-,Z,100.0,133.857284,128.708927,133.857284,133.857284,0.0,0.0,128.708927,0.0,C1,1A8B,ANNEXIN,1a8bA01,A,P14668,cathpdb,X-ray diffraction,True,[],0,False,False,,,False,False,6,6.1,6.1.2,6.1.2.1,5,1.9,ANXA5_RAT,RAT,0,-18.0,ANNEXIN_1,PS00223,,[unkown],"[PS00223, PS51897]",Eukaryota/Metazoa,"[ANNEXIN_1, ANNEXIN_2]",P14668,P14668,P14668,,,Non-polar
9,49,CA,GLY,A,12,5.752,9.820,-4.437,1.0,24.59,C,-,Z,100.0,134.155867,128.996026,134.155867,134.155867,0.0,0.0,128.996026,0.0,C1,1ANN,ANNEXIN,1annA01,A,P13214,cathpdb,X-ray diffraction,True,[],0,False,False,,,False,False,4,4.1,4.1.2,4.1.2.1,2,2.3,ANXA4_BOVIN,BOVIN,0,-18.0,ANNEXIN_1,PS00223,,[Zymogen granule membrane],"[PS00223, PS51897]",Eukaryota/Metazoa,"[ANNEXIN_1, ANNEXIN_2]",P09525,P09525,P13214,,,Non-polar
13,93,CA,GLY,A,14,13.215,20.580,9.017,1.0,25.40,C,-,Z,100.0,138.375982,133.053829,138.375982,138.375982,0.0,0.0,133.053829,0.0,C1,1ANW,ANNEXIN,1anwA01,A,P08758,cathpdb,X-ray diffraction,True,[],0,False,False,,,False,False,6,6.1,6.1.3,6.1.3.1,7,2.4,ANXA5_HUMAN,HUMAN,0,-18.0,ANNEXIN_1,PS00223,,[unkown],"[PS00223, PS51897]",Eukaryota/Metazoa,"[ANNEXIN_1, ANNEXIN_2]",P08758,P08758,P08758,,,Non-polar
17,2612,CA,GLY,B,14,12.657,-20.420,19.061,1.0,21.86,C,-,Z,100.0,135.061921,129.867232,135.061921,135.061921,0.0,0.0,129.867232,0.0,C1,1ANW,ANNEXIN,1anwB01,B,P08758,cathpdb,X-ray diffraction,True,[],0,False,False,,,False,False,6,6.1,6.1.3,6.1.3.1,8,2.4,ANXA5_HUMAN,HUMAN,0,-18.0,ANNEXIN_1,PS00223,,[unkown],"[PS00223, PS51897]",Eukaryota/Metazoa,"[ANNEXIN_1, ANNEXIN_2]",P08758,P08758,P08758,,,Non-polar
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10038483,0,CA,UNK,,33,,,,,,,,,,,,,,,,,,,,GLA,,,P83238,prosite,,False,,0,False,False,,,False,,,,,,0,,OSTCN_DANRE,DANRE,35,35.0,GLA_2,PS50998,1-44,[Secreted],"[PS00011, PS50998]",Eukaryota/Metazoa,"[GLA_1, GLA_2]",P83238,P83238,P83238,0.322130,0.253026,none
10038484,0,CA,UNK,,40,,,,,,,,,,,,,,,,,,,,GLA,,,P86863,prosite,,False,,0,False,False,,,False,,,,,,0,,OSTC2_SOLSE,SOLSE,43,43.0,GLA_2,PS50998,1-43,[Secreted],"[PS00011, PS50998]",Eukaryota/Metazoa,"[GLA_1, GLA_2]",P83238,P86863,P86863,0.328352,0.349075,none
10038485,0,CA,UNK,,5,,,,,,,,,,,,,,,,,,,,GLA,,,P83238,prosite,,False,,0,False,False,,,False,,,,,,0,,OSTCN_DANRE,DANRE,7,7.0,GLA_2,PS50998,1-44,[Secreted],"[PS00011, PS50998]",Eukaryota/Metazoa,"[GLA_1, GLA_2]",P83238,P83238,P83238,0.619195,0.553519,none
10038486,0,CA,UNK,,45,,,,,,,,,,,,,,,,,,,,PLA,,,P82893,prosite,,False,,0,False,False,,,False,,,,,,0,,PA2B2_TRIST,TRIST,2,2.0,PA2_HIS,PS00118,43-50,[Secreted],[PS00118],Eukaryota/Metazoa,[PA2_HIS],B3EWP6,P82893,P82893,0.300149,0.294352,none
