In [13]:
import pandas as pd
import numpy as np
import os
import sys
import gzip
import shutil
import json
import re
import argparse
import Bio.PDB as bpdb
from Bio import Align
import requests

In [18]:
# Fetch the InterPro entries matching a single UniProt protein.
# Alternative queries kept for reference:
# response = requests.get("https://www.ebi.ac.uk:443/interpro/api/entry/all/protein/UniProt/P05067/")
# response = requests.get("https://www.ebi.ac.uk/interpro/api/entry/interpro/protein/UniProt/P01031/")
response = requests.get(
    "https://www.ebi.ac.uk/interpro/api/entry/interpro/protein/UniProt/P13671/",
    timeout=30,  # without a timeout, requests can wait indefinitely on a stalled connection
)
# Fail here with a clear HTTP error instead of later, when the body is parsed as JSON.
response.raise_for_status()

In [19]:
# Decode the response body and keep the list stored under its "results" key.
payload = response.json()
json_file = payload["results"]

# Peek at the metadata of the first returned entry.
first_entry = json_file[0]
print(first_entry["metadata"])

{'accession': 'IPR000436', 'name': 'Sushi/SCR/CCP domain', 'source_database': 'interpro', 'type': 'domain', 'integrated': None, 'member_databases': {'profile': {'PS50923': 'Sushi/CCP/SCR domain profile'}, 'cdd': {'cd00033': 'Complement control protein (CCP) modules (aka short consensus repeats SCRs or SUSHI repeats) have been identified in several proteins of the complement system'}, 'smart': {'SM00032': 'Domain abundant in complement control proteins; SUSHI repeat; short complement-like repeat (SCR)'}, 'pfam': {'PF00084': 'Sushi repeat (SCR repeat)'}}, 'go_terms': None}


In [20]:
# Pretty-print every returned entry (keys sorted, 4-space indent) for inspection.
pretty_entries = json.dumps(json_file, sort_keys=True, indent=4)
print(pretty_entries)

[
    {
        "metadata": {
            "accession": "IPR000436",
            "go_terms": null,
            "integrated": null,
            "member_databases": {
                "cdd": {
                    "cd00033": "Complement control protein (CCP) modules (aka short consensus repeats SCRs or SUSHI repeats) have been identified in several proteins of the complement system"
                },
                "pfam": {
                    "PF00084": "Sushi repeat (SCR repeat)"
                },
                "profile": {
                    "PS50923": "Sushi/CCP/SCR domain profile"
                },
                "smart": {
                    "SM00032": "Domain abundant in complement control proteins; SUSHI repeat; short complement-like repeat (SCR)"
                }
            },
            "name": "Sushi/SCR/CCP domain",
            "source_database": "interpro",
            "type": "domain"
        },
        "proteins": [
            {
                "accession": "p13

In [21]:
# Keep only the entries whose metadata marks them as type "domain".
domain_stuff = []
for entry in json_file:
    if entry["metadata"]["type"] == "domain":
        domain_stuff.append(entry)

# Map each entry accession to the first protein record listed for that entry.
domain_boundaries = {}
for entry in domain_stuff:
    domain_boundaries[entry["metadata"]["accession"]] = entry["proteins"][0]

print(domain_boundaries)

{'IPR000436': {'accession': 'p13671', 'protein_length': 934, 'source_database': 'reviewed', 'organism': '9606', 'entry_protein_locations': [{'fragments': [{'start': 642, 'end': 701, 'dc-status': 'CONTINUOUS', 'representative': False}], 'model': None, 'score': None}, {'fragments': [{'start': 702, 'end': 763, 'dc-status': 'CONTINUOUS', 'representative': False}], 'model': None, 'score': None}]}, 'IPR002350': {'accession': 'p13671', 'protein_length': 934, 'source_database': 'reviewed', 'organism': '9606', 'entry_protein_locations': [{'fragments': [{'start': 785, 'end': 839, 'dc-status': 'CONTINUOUS', 'representative': False}], 'model': None, 'score': None}]}, 'IPR003884': {'accession': 'p13671', 'protein_length': 934, 'source_database': 'reviewed', 'organism': '9606', 'entry_protein_locations': [{'fragments': [{'start': 767, 'end': 839, 'dc-status': 'CONTINUOUS', 'representative': False}], 'model': None, 'score': None}, {'fragments': [{'start': 861, 'end': 934, 'dc-status': 'CONTINUOUS', '

In [9]:
# Pretty-print the accession -> protein-record mapping (keys sorted, 4-space indent).
pretty_boundaries = json.dumps(domain_boundaries, sort_keys=True, indent=4)
print(pretty_boundaries)

{
    "IPR002223": {
        "accession": "p05067",
        "entry_protein_locations": [
            {
                "fragments": [
                    {
                        "dc-status": "CONTINUOUS",
                        "end": 342,
                        "representative": false,
                        "start": 288
                    }
                ],
                "model": null,
                "score": null
            }
        ],
        "organism": "9606",
        "protein_length": 770,
        "source_database": "reviewed"
    },
    "IPR003599": {
        "accession": "p05067",
        "entry_protein_locations": null,
        "organism": "9606",
        "protein_length": 770,
        "source_database": "reviewed"
    },
    "IPR007110": {
        "accession": "p05067",
        "entry_protein_locations": null,
        "organism": "9606",
        "protein_length": 770,
        "source_database": "reviewed"
    },
    "IPR008154": {
        "accession": "p05067",


In [12]:
# Print, for every entry that has location data, the overall span it covers
# on the protein: accession, smallest fragment start, largest fragment end.
for k, v in domain_boundaries.items():
    locations = v["entry_protein_locations"]
    # Entries may carry no locations at all (None); an empty list is skipped too.
    if not locations:
        continue
    # Consider every fragment of every location: a location can hold more than
    # one fragment, and the list order is not guaranteed to follow sequence
    # position, so reading fragments[0] of the first/last location can
    # misreport the span.
    fragments = [fragment for location in locations for fragment in location["fragments"]]
    if not fragments:
        continue
    span_start = min(fragment["start"] for fragment in fragments)
    span_end = max(fragment["end"] for fragment in fragments)
    print(k, span_start, span_end)

cd22607 294 341
IPR002223 288 342
IPR008154 24 189
IPR011178 132 188


In [6]:
# Dump the two sections of every domain entry, metadata first, then proteins.
for entry in domain_stuff:
    for section in ("metadata", "proteins"):
        print(entry[section])

{'accession': 'IPR000436', 'name': 'Sushi/SCR/CCP domain', 'source_database': 'interpro', 'type': 'domain', 'integrated': None, 'member_databases': {'profile': {'PS50923': 'Sushi/CCP/SCR domain profile'}, 'cdd': {'cd00033': 'Complement control protein (CCP) modules (aka short consensus repeats SCRs or SUSHI repeats) have been identified in several proteins of the complement system'}, 'smart': {'SM00032': 'Domain abundant in complement control proteins; SUSHI repeat; short complement-like repeat (SCR)'}, 'pfam': {'PF00084': 'Sushi repeat (SCR repeat)'}}, 'go_terms': None}
[{'accession': 'p13671', 'protein_length': 934, 'source_database': 'reviewed', 'organism': '9606', 'entry_protein_locations': [{'fragments': [{'start': 642, 'end': 701, 'dc-status': 'CONTINUOUS', 'representative': False}], 'model': None, 'score': None}, {'fragments': [{'start': 702, 'end': 763, 'dc-status': 'CONTINUOUS', 'representative': False}], 'model': None, 'score': None}]}]
{'accession': 'IPR002350', 'name': 'Kaz

In [71]:
# Inspect the locations of one domain entry (index 3 is an arbitrary pick
# made while exploring; it depends on the order the entries were returned in).
locations = domain_stuff[3]["proteins"][0]["entry_protein_locations"]
print(locations)
# "entry_protein_locations" can be None for some entries; iterate over an
# empty list in that case instead of raising a TypeError.
for x in locations or []:
    print(x["fragments"])

[{'fragments': [{'start': 176, 'end': 522, 'dc-status': 'CONTINUOUS', 'representative': False}], 'model': None, 'score': None}]
[{'start': 176, 'end': 522, 'dc-status': 'CONTINUOUS', 'representative': False}]


In [40]:
# One row per domain entry, built from each entry's metadata dict.
metadata_records = [entry["metadata"] for entry in domain_stuff]
interpro_data = pd.DataFrame(metadata_records)

# One row per domain entry, built from the first protein record of each entry.
protein_records = [entry["proteins"][0] for entry in domain_stuff]
other_data = pd.DataFrame(protein_records)

print(other_data)

  accession  protein_length source_database organism  \
0    p05067             770        reviewed     9606   
1    p05067             770        reviewed     9606   
2    p05067             770        reviewed     9606   
3    p05067             770        reviewed     9606   
4    p05067             770        reviewed     9606   
5    p05067             770        reviewed     9606   
6    p05067             770        reviewed     9606   
7    p05067             770        reviewed     9606   

                             entry_protein_locations  
0                                               None  
1                                               None  
2  [{'fragments': [{'start': 294, 'end': 341, 'dc...  
3  [{'fragments': [{'start': 288, 'end': 342, 'dc...  
4                                               None  
5                                               None  
6  [{'fragments': [{'start': 24, 'end': 189, 'dc-...  
7  [{'fragments': [{'start': 132, 'end': 188, 'dc...  
