In [1]:
from helpers.setup import setup_django

In [2]:
# Run the project's Django bootstrap helper before any model import below.
# NOTE(review): presumably this configures settings / calls django.setup() — confirm in helpers.setup.
setup_django()

In [62]:
import time
import datetime
import os
import json
from collections import Counter

import pandas as pd

from django.conf import settings
from django.db import connection
from django.db.models import F, Q, Prefetch
from django.contrib.postgres.aggregates import ArrayAgg
from django.core.exceptions import ObjectDoesNotExist

from structure.models import Structure
from residue.models import Residue
from contactnetwork.models import InteractingResiduePair, Interaction
from signprot.models import SignprotStructure, SignprotBarcode, SignprotInteractions, SignprotComplex
from signprot.views import SequenceSignature, SignatureMatch
from protein.models import Protein, ProteinConformation, ProteinSegment
from residue.models import ResidueGenericNumberEquivalent

from signprot.interactions import get_class_slug, get_generic_numbers, get_signature_features, group_signature_features, prepare_signature_match

In [4]:
# Pairs whose two residues do NOT share the same protein conformation.
cross_conformation_pairs = InteractingResiduePair.objects.exclude(
    res1__protein_conformation=F('res2__protein_conformation')
)
print(cross_conformation_pairs.count())

# Pairs left after removing every pair in which res1 or res2 sits in a
# segment whose proteinfamily is 'GPCR'.
pairs_without_gpcr_segment = InteractingResiduePair.objects.exclude(
    Q(res1__protein_segment__proteinfamily='GPCR') | Q(res2__protein_segment__proteinfamily='GPCR')
)
print(pairs_without_gpcr_segment.count())

532
0


In [5]:
# Approach suggested by Albert: start from the alpha-subunit proteins and
# walk down to their residues.

# Distinct protein ids referenced by the signalling-protein complexes.
alpha_prot = SignprotComplex.objects.values_list('protein', flat=True).distinct()

# Ids of every protein conformation that belongs to one of those proteins.
alpha_prot_conf = (
    ProteinConformation.objects
    .filter(protein__in=alpha_prot)
    .values_list('id', flat=True)
)

# Ids of every residue that belongs to one of those conformations.
alpha_prot_residues = (
    Residue.objects
    .filter(protein_conformation__in=alpha_prot_conf)
    .values_list('id', flat=True)
)

# Number of residues found for the alpha subunits.
print(alpha_prot_residues.count())

# Number of interacting residue pairs that contain one of these residues,
# either as res1 or as res2.
InteractingResiduePair.objects.filter(
    Q(res1__in=alpha_prot_residues) | Q(res2__in=alpha_prot_residues)
).count()

2210


0

In [6]:
# "Incorrect" variant: entry names taken straight from the complex structures,
# i.e. without the '_a'-style suffix.
complex_names = SignprotComplex.objects.values_list(
    'structure__protein_conformation__protein__entry_name',
    flat=True,
)

# Ids of the protein conformations whose protein carries one of those names.
prot_conf = (
    ProteinConformation.objects
    .filter(protein__entry_name__in=complex_names)
    .values_list('id', flat=True)
)

# Ids of all residues that belong to those conformations.
prot_residues = (
    Residue.objects
    .filter(protein_conformation__in=prot_conf)
    .values_list('id', flat=True)
)

# Number of residues collected this way.
print(prot_residues.count())

# Number of interacting residue pairs that contain one of these residues,
# either as res1 or as res2.
InteractingResiduePair.objects.filter(
    Q(res1__in=prot_residues) | Q(res2__in=prot_residues)
).count()

7162


5035

In [7]:
# How many pairs have exactly one residue in prot_residues?
# Keep pairs where res1 or res2 is in the set, then drop those where both are.
res1_hit = Q(res1__in=prot_residues)
res2_hit = Q(res2__in=prot_residues)
InteractingResiduePair.objects.filter(res1_hit | res2_hit).exclude(res1_hit & res2_hit).count()

532

In [8]:
# "Correct" variant: entry names with '_<alpha, lower-cased>' appended
# (the '_a'-style suffix).
complex_objs = SignprotComplex.objects.prefetch_related('structure__protein_conformation__protein')
complex_names = [
    co.structure.protein_conformation.protein.entry_name + '_' + co.alpha.lower()
    for co in complex_objs
]

# Ids of the protein conformations whose protein carries one of those names.
prot_conf = (
    ProteinConformation.objects
    .filter(protein__entry_name__in=complex_names)
    .values_list('id', flat=True)
)

# Ids of all residues that belong to those conformations.
prot_residues = (
    Residue.objects
    .filter(protein_conformation__in=prot_conf)
    .values_list('id', flat=True)
)

# Number of residues collected this way.
print(prot_residues.count())

# Number of interacting residue pairs that contain one of these residues,
# either as res1 or as res2.
InteractingResiduePair.objects.filter(
    Q(res1__in=prot_residues) | Q(res2__in=prot_residues)
).count()

4672


532

In [9]:
# Same question as before, now against the residues of the suffixed
# ("correct") entry names: how many pairs have exactly one residue in
# prot_residues? Keep pairs where res1 or res2 is in the set, then drop those
# where both are.
res1_hit = Q(res1__in=prot_residues)
res2_hit = Q(res2__in=prot_residues)
InteractingResiduePair.objects.filter(res1_hit | res2_hit).exclude(res1_hit & res2_hit).count()

532

In [10]:
# Number of SignprotComplex rows held in complex_objs.
complex_objs.count()

23

In [11]:
# Number of entry names derived from those complexes (one per complex).
len(complex_names)

23

In [12]:
# Number of protein conformations whose protein entry name is in complex_names.
prot_conf.count()

20

In [13]:
def sort_a_by_b(a, b, remove_invalid=False):
    '''Return the items of ``a`` as a new list, arranged by their position in ``b``.

    Idea taken from https://stackoverflow.com/q/12814667. Example:

        a = ['alpha_mock', 'van-der-waals', 'ionic']
        b = ['ionic', 'aromatic', 'hydrophobic', 'polar', 'van-der-waals', 'alpha_mock']
        sort_a_by_b(a, b) -> ['ionic', 'van-der-waals', 'alpha_mock']

    When ``remove_invalid`` is true, items of ``a`` that do not occur in ``b``
    are dropped before sorting. Otherwise such an item makes ``b.index`` fail
    (``ValueError`` for a list).
    '''
    def rank(item):
        # Position of the item in the reference ordering.
        return b.index(item)

    if remove_invalid:
        candidates = [item for item in a if item in b]
    else:
        candidates = list(a)
    candidates.sort(key=rank)
    return candidates

In [14]:
# Preview: interaction types collected into one array per result row, with the
# rows ordered by the display generic number labels of res1 and res2.
InteractingResiduePair.objects.order_by(
    'res1__display_generic_number__label',
    'res2__display_generic_number__label',
).values(
    int_ty=ArrayAgg('interaction__interaction_type'),
)

<QuerySet [{'int_ty': ['hydrophobic', 'hydrophobic']}, {'int_ty': ['hydrophobic', 'hydrophobic']}, {'int_ty': ['hydrophobic', 'hydrophobic']}, {'int_ty': ['hydrophobic', 'hydrophobic']}, {'int_ty': ['aromatic', 'aromatic', 'hydrophobic', 'hydrophobic', 'hydrophobic', 'hydrophobic', 'hydrophobic', 'hydrophobic', 'hydrophobic', 'hydrophobic', 'hydrophobic', 'hydrophobic', 'van-der-waals', 'van-der-waals', 'van-der-waals']}, {'int_ty': ['van-der-waals', 'hydrophobic', 'van-der-waals', 'van-der-waals', 'aromatic', 'hydrophobic', 'hydrophobic', 'hydrophobic', 'hydrophobic', 'hydrophobic', 'hydrophobic', 'hydrophobic', 'hydrophobic', 'hydrophobic', 'aromatic']}, {'int_ty': ['hydrophobic', 'aromatic', 'hydrophobic', 'hydrophobic', 'hydrophobic', 'hydrophobic', 'hydrophobic', 'aromatic', 'hydrophobic', 'hydrophobic', 'hydrophobic', 'hydrophobic']}, {'int_ty': ['hydrophobic']}, {'int_ty': ['van-der-waals', 'hydrophobic', 'hydrophobic', 'aromatic', 'aromatic', 'hydrophobic', 'hydrophobic', 'van-

In [15]:
def interface_dataset():
    '''Collect the residue-pair interactions at the interface of the signalling-protein complexes.

    The complex proteins are identified by entry name: the entry name of the
    structure's protein with ``'_' + alpha.lower()`` appended. An interacting
    residue pair is kept when it references one of the complex structures and
    exactly one of its residues (res1 or res2, not both) belongs to a
    conformation of those proteins.

    Returns:
        tuple: ``(conf_ids, interactions)``.
            ``conf_ids`` is a list of the distinct ``conf_id`` values found in
            the rows (order not defined).
            ``interactions`` is a list of dicts with the keys ``int_id``,
            ``int_ty``, ``pdb_id``, ``conf_id``, ``gprot``, ``entry_name``,
            ``rec_aa``, ``rec_pos``, ``rec_gn``, ``sig_aa``, ``sig_pos`` and
            ``sig_gn``. ``int_ty`` is the list of distinct interaction types
            of the pair, arranged according to ``interaction_sort_order``.

    Raises:
        ValueError: from ``sort_a_by_b`` when an aggregated interaction type is
            not listed in ``interaction_sort_order``.
    '''
    # Entry names with the '_<alpha, lower-cased>' suffix (the '_a'-style names).
    complex_objs = SignprotComplex.objects.prefetch_related('structure__protein_conformation__protein')
    complex_names = [
        complex_obj.structure.protein_conformation.protein.entry_name + '_' + complex_obj.alpha.lower()
        for complex_obj in complex_objs
    ]
    complex_struc_ids = [co.structure_id for co in complex_objs]
    # Protein conformations whose protein carries one of those entry names.
    prot_conf = ProteinConformation.objects.filter(protein__entry_name__in=complex_names).values_list('id', flat=True)

    # Display order for the interaction types of a single residue pair.
    interaction_sort_order = [
        "ionic",
        "aromatic",
        "hydrophobic",
        "polar",
        "van-der-waals",
    ]

    # Ids of all residues that belong to those protein conformations.
    prot_residues = Residue.objects.filter(
        protein_conformation__in=prot_conf
    ).values_list('id', flat=True)

    # No prefetch_related() here: values() yields plain dicts whose columns are
    # resolved through joins, so prefetch lookups would never be applied.
    interaction_rows = InteractingResiduePair.objects.filter(
        Q(res1__in=prot_residues) | Q(res2__in=prot_residues),
        referenced_structure__in=complex_struc_ids
    ).exclude(
        # Drop pairs where both residues are in the set, leaving only pairs
        # with exactly one residue in it.
        Q(res1__in=prot_residues) & Q(res2__in=prot_residues)
    ).order_by(
        'res1__generic_number__label',
        'res2__generic_number__label'
    ).values(
        int_id=F('id'),
        int_ty=ArrayAgg(
            'interaction__interaction_type',
            distinct=True,
        ),

        pdb_id=F('referenced_structure__pdb_code__index'),
        conf_id=F('referenced_structure__protein_conformation_id'),
        gprot=F('referenced_structure__signprot_complex__protein__entry_name'),
        entry_name=F('referenced_structure__protein_conformation__protein__parent__entry_name'),

        rec_aa=F('res1__amino_acid'),
        rec_pos=F('res1__sequence_number'),
        rec_gn=F('res1__generic_number__label'),

        sig_aa=F('res2__amino_acid'),
        sig_pos=F('res2__sequence_number'),
        # NOTE(review): sig_gn reads display_generic_number while rec_gn and the
        # ordering use generic_number — confirm this asymmetry is intended.
        sig_gn=F('res2__display_generic_number__label')
    )

    # Run the query exactly once and post-process the resulting dicts, instead
    # of mutating rows held in the QuerySet's internal result cache.
    interactions = list(interaction_rows)
    conf_ids = set()
    for interaction in interactions:
        interaction['int_ty'] = sort_a_by_b(interaction['int_ty'], interaction_sort_order)
        conf_ids.add(interaction['conf_id'])

    return list(conf_ids), interactions

In [16]:
# Build the interface dataset once; conf_ids and interactions are reused by later cells.
conf_ids, interactions = interface_dataset()

In [17]:
# Inspect the first interaction row to check the available keys.
interactions[0]

{'conf_id': 20804,
 'entry_name': 'ntr1_human',
 'gprot': 'gnai1_human',
 'int_id': 143537,
 'int_ty': ['hydrophobic', 'van-der-waals'],
 'pdb_id': '6OSA',
 'rec_aa': 'Q',
 'rec_gn': '2x36',
 'rec_pos': 98,
 'sig_aa': 'D',
 'sig_gn': 'G.H5.22',
 'sig_pos': 350}

In [18]:
%%timeit
# Time one complete interface_dataset() call.
interface_dataset()

178 ms ± 4.62 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [19]:
%%timeit
gprotein_order = ProteinSegment.objects.filter(proteinfamily='Alpha').values('id', 'slug')

292 µs ± 71.4 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [20]:
%%timeit
struc = SignprotComplex.objects.prefetch_related(
  'structure__stabilizing_agents',
    'structure__pdb_code',
    'structure__protein_conformation__protein__species',
).select_related(
    'structure__protein_conformation__protein__parent__parent__parent',
    'structure__signprot_complex__protein__family__parent__parent__parent__parent',
)
complex_info = []
for s in struc:
    r = {}
    s = s.structure
    r['pdb_id'] = s.pdb_code.index
    r['name'] = s.protein_conformation.protein.parent.short()
    r['entry_name'] = s.protein_conformation.protein.parent.entry_name
    r['class'] = s.protein_conformation.protein.get_protein_class()
    r['family'] = s.protein_conformation.protein.get_protein_family()
    r['conf_id'] = s.protein_conformation.id
    r['organism'] = s.protein_conformation.protein.species.common_name
    try:
        r['gprot'] = s.get_stab_agents_gproteins()
    except Exception:
        r['gprot'] = ''
    try:
        r['gprot_class'] = s.get_signprot_gprot_family()
    except Exception:
        r['gprot_class'] = ''
    complex_info.append(r)

208 ms ± 33.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [21]:
%%timeit
remaining_residues = Residue.objects.filter(
        protein_conformation_id__in=conf_ids,
        ).prefetch_related(
            "protein_conformation",
            "protein_conformation__protein",
            "protein_conformation__structure",
            "protein_conformation__protein__parent",
            "protein_conformation__structure__pdb_code",
        ).values(
            rec_id = F('protein_conformation__protein__id'),
            name = F('protein_conformation__protein__parent__name'),
            entry_name = F('protein_conformation__protein__parent__entry_name'),
            pdb_id = F('protein_conformation__structure__pdb_code__index'),
            rec_aa = F('amino_acid'),
            rec_gn = F('display_generic_number__label'),
        ).exclude(
            Q(rec_gn=None)
        )

1.37 ms ± 32.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [22]:
# Print every interface interaction whose pdb_id is 5G53.
for i in interactions:
    if i['pdb_id'] != '5G53':
        continue
    print(i)

In [23]:
# Repeat the entry-name lookup for a hand-picked set of PDB codes.
pdb_list = ['5G53', '6GDG', '6FUF']
complex_objs = SignprotComplex.objects.filter(structure__pdb_code__index__in=pdb_list)
# Entry names with '_<alpha, lower-cased>' appended.
complex_names = [
    co.structure.protein_conformation.protein.entry_name + '_' + co.alpha.lower()
    for co in complex_objs
]
# Ids of the protein conformations whose protein carries one of those names.
prot_conf = (
    ProteinConformation.objects
    .filter(protein__entry_name__in=complex_names)
    .values_list('id', flat=True)
)

In [24]:
# Entry names derived for the complexes selected through pdb_list.
complex_names

['6gdg_d', '6fuf_b', '5g53_c']

In [25]:
# Ids of the SignprotComplex rows selected through pdb_list (one 1-tuple per row).
complex_objs.values_list('id')

<QuerySet [(6,), (20,), (1,)]>