##### Setup

This boilerplate enables the use of django models in the notebook.

In [2]:
import os, sys
PWD = os.getenv('PWD')
os.chdir(PWD)
sys.path.insert(0, os.getenv('PWD'))
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "protwis.settings")
import django
django.setup()

  """)


In [3]:
from signprot.views import interface_dataset
dataset = interface_dataset()
from django.db.models import F
from django.db.models import Q
from structure.models import Structure
from protein.models import Protein, ProteinSegment
from residue.models import Residue

# generate complex info dataset
filt = [e.upper() for e in list(dataset)]
struc = Structure.objects.filter(pdb_code__index__in=filt).prefetch_related('protein_conformation__protein__parent')

complex_info = []
for s in struc:
    r = {}
    r['pdb_id'] = str.lower(s.pdb_code.index)
    r['name'] = s.protein_conformation.protein.parent.name
    r['entry_name'] = s.protein_conformation.protein.parent.entry_name
    r['class'] = s.protein_conformation.protein.get_protein_class()
    r['family'] = s.protein_conformation.protein.get_protein_family()
    r['conf_id'] = s.protein_conformation.id
    try:
        r['gprot'] = s.get_stab_agents_gproteins()
    except Exception:
        r['gprot'] = ''
    complex_info.append(r)

interactions_metadata = complex_info
gprotein_order = ProteinSegment.objects.filter(proteinfamily='Alpha').values('id', 'slug')
prot_conf_ids = [i['conf_id'] for i in complex_info]
remaining_residues = Residue.objects.filter(
        protein_conformation_id__in=prot_conf_ids,
        ).values(
            rec_id = F('protein_conformation__protein__id'),
            name = F('protein_conformation__protein__parent__name'),
            entry_name = F('protein_conformation__protein__parent__entry_name'),
            rec_aa = F('amino_acid'),
            rec_gn = F('display_generic_number__label'),
        ).exclude(
            Q(rec_gn=None)
        )

new_dataset = []
for pdb_key in dataset:
    for residue_list in dataset[pdb_key]:
        curr_meta = None
        while curr_meta is None:
            for meta in interactions_metadata:
                if meta['pdb_id'].upper() == pdb_key.upper():
                    curr_meta = meta
            break
        if curr_meta is not None:
            gprot = curr_meta['gprot']
            entry_name = curr_meta['entry_name']
            pdb_id = curr_meta['pdb_id']
            residue_list.extend([gprot, entry_name, pdb_id])
            new_dataset.append(residue_list)

In [15]:
import pandas as pd
pd.DataFrame(new_dataset).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,R,V,331,5.61x61,A,L,393,G.H5.25,[hydrophobic],Gs,glp1r_human,6b3j,Gs,glp1r_human,6b3j
1,R,S,352,6.41x41,A,L,393,G.H5.25,"[polar-sidechain-backbone, van-der-waals, hydr...",Gs,glp1r_human,6b3j,Gs,glp1r_human,6b3j
2,R,E,262,4.38x39,A,K,34,G.HN.51,"[ionic, polar-sidechain-sidechain]",Gs,glp1r_human,6b3j,Gs,glp1r_human,6b3j
3,R,V,331,5.61x61,A,L,388,G.H5.20,[hydrophobic],Gs,glp1r_human,6b3j,Gs,glp1r_human,6b3j
4,R,L,251,3.54x54,A,Y,391,G.H5.23,"[van-der-waals, hydrophobic]",Gs,glp1r_human,6b3j,Gs,glp1r_human,6b3j


In [2]:
from tqdm import tqdm
from contactnetwork.models import *
from protein.models import ProteinFamily


class InteractionStatistics:
    
    def __init__(self, prot_class=None, debug=False):
        self.debug = debug
        self.prot_class = prot_class
        self.interactions = None
        self.parsed_interactions = []
    
    def set_class(self, choice=-1):
        fam_objects = ProteinFamily.objects.filter(parent_id=1)
        prot_classes = [i.name for i in fam_objects]
        if not choice >= 0:
            for index, name in enumerate(prot_classes):
                print('[{}]: {}'.format(index, name))
            choice = input('Pick a class (number): ')
        self.prot_class = fam_objects[int(choice)]
        
    def get_interactions(self):
        x = self.prot_class.id
        self.interactions = Interaction.objects.filter(
                interacting_pair__referenced_structure__protein_conformation__protein__parent__family__parent__parent__parent__id=x,
                interacting_pair__referenced_structure__state__id=3  #active
            ).select_related('interacting_pair')
        self._parse_interactions()
        return 'Interactions queried and parsed'
    
    def _parse_interactions(self):
        for a in tqdm(list(self.interactions)):
            d = {}
            d['res1'] = {}
            d['res2'] = {}
            d['class'] = self.prot_class.name
            d['int_ty'] = a.interaction_type
            d['int_ty_spe'] = a.specific_type
            d['int_ty_lev'] = a.interaction_level
            d['res1']['aa'] = a.interacting_pair.res1.amino_acid
            d['res1']['gn'] = a.interacting_pair.res1.display_generic_number.label
            d['res2']['aa'] = a.interacting_pair.res1.amino_acid
            d['res2']['gn'] = a.interacting_pair.res2.display_generic_number.label
            self.parsed_interactions.append(d)
            if self.debug and len(self.parsed_interactions) > 1000:
                break

    def _check_parsed(self):
        if not self.parsed_interactions:
            print('Interactions have not been parsed yet, run _parse_interactions')
            return False
        else:
            return True
        
    
    def summarize_interaction_types(self):
        '''Print the total number of occurence for each interaction type'''
        d = {}
        if self._check_parsed():
            for i in self.parsed_interactions:
                d.setdefault(i['int_ty'], 0)
                d[i['int_ty']] += 1
            return d

    def count_interaction_types(self, which_res='res1'):
        '''Count the interaction types per each generic number'''
        d = {}
        if self._check_parsed():
            for i in self.parsed_interactions:
                int_ty = i['int_ty']
                gn = i[which_res]['gn']
                d.setdefault(gn, {})
                d[gn].setdefault(int_ty, 0)
                d[gn][int_ty] += 1
            return d

    def calc_frequencies(self, which_res='res1'):
        '''Calculate the frequency of each interaction type per generic number'''
        d = self.count_interaction_types(which_res)
        for key in d:
            num_int_ty = sum(d[key].values())
            for int_ty in d[key]:
                d[key][int_ty] = d[key][int_ty] / num_int_ty
        return d

### Accessing Interactions
## Class A

Proof of concept for when the signal protein residues are added to the database.
For now this only gets the interactions that are used in the contact network.

In [3]:
stats_a = InteractionStatistics(debug=True)
stats_a.set_class(0)

In [4]:
stats_a.get_interactions()
frq_a = stats_a.calc_frequencies()

  3%|▎         | 997/36909 [00:35<05:19, 112.29it/s]

In [5]:
import plotly.offline as py
import cufflinks as cf
import pandas as pd

py.offline.init_notebook_mode(connected=True)
cf.set_config_file(offline=True)

df = pd.DataFrame(frq_a).T
df.head()

Unnamed: 0,Aromatic,Hydrophobic,Polar,VanDerWaals,h-bond
1.30x30,,0.5,,0.5,
1.31x31,,0.5,,0.5,
1.32x32,,0.4,0.4,0.2,
1.35x35,,1.0,,,
1.36x36,,0.5,0.25,0.25,
