#### Setup

In [1]:
import os, sys
PWD = os.getenv('PWD')
print(PWD)

/protwis/sites/protwis


In [2]:
os.chdir(PWD)
sys.path.insert(0, os.getenv('PWD'))
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "protwis.settings")
import django
django.setup()

  """)


#### Imports

In [45]:
from itertools import combinations
import pickle

from tqdm import tqdm
import pandas as pd


from signprot.interactions import *
from signprot.views import interface_dataset
from protein.models import Protein, ProteinSegment, ProteinFamily, ProteinGProteinPair
from structure.models import Structure
from signprot.models import SignprotComplex
from seqsign.sequence_signature import SequenceSignature

## Defining the protein sets

### G Proteins

In [4]:
sets = [
    'Gi/o',
    'Gq/11',
    'Gs',
    'G12/13',
]

set_combinations = list(combinations(sets, 2))
segments = list(ProteinSegment.objects.filter(proteinfamily='GPCR'))

In [41]:
ProteinFamily.objects.filter(parent=533)

<QuerySet [<ProteinFamily: Gs>, <ProteinFamily: Gi/o>, <ProteinFamily: Gq/11>, <ProteinFamily: G12/13>]>

In [5]:
segments

[<ProteinSegment: N-term>,
 <ProteinSegment: TM1>,
 <ProteinSegment: ICL1>,
 <ProteinSegment: TM2>,
 <ProteinSegment: ECL1>,
 <ProteinSegment: TM3>,
 <ProteinSegment: ICL2>,
 <ProteinSegment: TM4>,
 <ProteinSegment: ECL2>,
 <ProteinSegment: TM5>,
 <ProteinSegment: ICL3>,
 <ProteinSegment: TM6>,
 <ProteinSegment: ECL3>,
 <ProteinSegment: TM7>,
 <ProteinSegment: ICL4>,
 <ProteinSegment: H8>,
 <ProteinSegment: C-term>]

### Querying all Complexes

In [65]:
rec_classes = ProteinFamily.objects.filter(parent=1).exclude(slug=100)

In [7]:
complexes = []
for pc in SignprotComplex.objects.all().prefetch_related('structure', 'protein'):
    t = {}
    
    t['gprot']  = pc.protein.family.name
    t['rec_pdb'] = pc.structure.protein_conformation.protein.entry_short()
    t['rec_obj'] = pc.structure.protein_conformation.protein
    t['rec_class'] = pc.structure.protein_conformation.protein.get_protein_class()
    t['rec_short'] = pc.structure.protein_conformation.protein.parent.entry_short()
    t['rec_name'] = pc.structure.protein_conformation.protein.parent.name
    
    complexes.append(t)

pd.DataFrame(complexes)

Unnamed: 0,gprot,rec_class,rec_name,rec_obj,rec_pdb,rec_short
0,Gs,Class B1 (Secretin),CT receptor,5uz7,5UZ7,CALCR
1,Gs,Class B1 (Secretin),CT receptor,6niy,6NIY,CALCR
2,Gs,Class B1 (Secretin),GLP-1 receptor,5vai,5VAI,G1SGD4
3,Gs,Class B1 (Secretin),GLP-1 receptor,6b3j,6B3J,GLP1R
4,Gs,Class A (Rhodopsin),A<sub>2A</sub> receptor,6gdg,6GDG,AA2AR
5,Gs,Class A (Rhodopsin),&beta;<sub>2</sub>-adrenoceptor,3sn6,3SN6,ADRB2
6,Gi/o,Class A (Rhodopsin),&mu; receptor,6dde,6DDE,OPRM
7,Gi/o,Class A (Rhodopsin),&mu; receptor,6ddf,6DDF,OPRM
8,Gi/o,Class A (Rhodopsin),Rhodopsin,6cmo,6CMO,OPSD
9,Gi/o,Class A (Rhodopsin),CB<sub>1</sub> receptor,6n4b,6N4B,CNR1


## Calculating a Feature Consensus via the Sequence Signature Tool

In [8]:
def extract_per_attr(data, attr, name):
    '''Return elements of data for which attr is equal to name'''
    return [elem for elem in data if elem[attr] == name]
    

signatures = []
for rec_class in rec_classes:
    class_name = rec_class.name
    data_per_class = None
    data_per_class = extract_per_attr(complexes, 'rec_class', class_name)

    if data_per_class:
        
        for gprot in sets:
            data_per_gprot = None
            data_per_gprot = extract_per_attr(data_per_class, 'gprot', gprot)
            
            if data_per_gprot:
        
                pos_set = [elem['rec_obj'] for elem in data_per_gprot]

                signature = SequenceSignature()
                signature.setup_alignments(segments, pos_set)
                signature.calculate_signature_onesided()

                signatures.append({
                    'rec_class': class_name,
                    'gprot': gprot,
                    'signature': signature,
                })


In [9]:
signatures

[{'gprot': 'Gi/o',
  'rec_class': 'Class A (Rhodopsin)',
  'signature': <seqsign.sequence_signature.SequenceSignature at 0x7f6fbb845908>},
 {'gprot': 'Gs',
  'rec_class': 'Class A (Rhodopsin)',
  'signature': <seqsign.sequence_signature.SequenceSignature at 0x7f6fbae0b9b0>},
 {'gprot': 'Gs',
  'rec_class': 'Class B1 (Secretin)',
  'signature': <seqsign.sequence_signature.SequenceSignature at 0x7f6fba4c57f0>}]

In [10]:
for signature_dict in signatures:
    signature = signature_dict['signature']

    sig_data = signature.prepare_display_data()
    gn = get_generic_numbers(sig_data)
    gn_flat = list(chain.from_iterable(gn))
            
    signature_dict['consensus'] = get_signature_consensus(sig_data, gn_flat)

In [11]:
{'{} - {}'.format(sig['rec_class'], sig['gprot']): len(sig['consensus']) for sig in signatures}

{'Class A (Rhodopsin) - Gi/o': 380,
 'Class A (Rhodopsin) - Gs': 292,
 'Class B1 (Secretin) - Gs': 438}

In [12]:
data = []
for entry in signatures:
    consensus = entry['consensus']
    rec_class = entry['rec_class']
    gprot = entry['gprot']
    while len(consensus) > 0:
        a = consensus.pop()
        a['gprot'] = gprot
        a['rec_class'] = rec_class
        a['origin'] = 'seqsig'
        data.append(a)

df_signatures = pd.DataFrame(data)
df_signatures

Unnamed: 0,code,cons,feature,gn,gprot,key,length,origin,rec_class,score
0,-,8,Gap (no amino acid),C.01-C-term-0009,Gi/o,379,,seqsig,Class A (Rhodopsin),67
1,-,8,Gap (no amino acid),C.01-C-term-0008,Gi/o,378,,seqsig,Class A (Rhodopsin),67
2,-,8,Gap (no amino acid),C.01-C-term-0007,Gi/o,377,,seqsig,Class A (Rhodopsin),67
3,-,8,Gap (no amino acid),C.01-C-term-0006,Gi/o,376,,seqsig,Class A (Rhodopsin),67
4,-,8,Gap (no amino acid),C.01-C-term-0005,Gi/o,375,,seqsig,Class A (Rhodopsin),67
5,-,8,Gap (no amino acid),C.01-C-term-0004,Gi/o,374,,seqsig,Class A (Rhodopsin),67
6,-,8,Gap (no amino acid),C.01-C-term-0003,Gi/o,373,,seqsig,Class A (Rhodopsin),67
7,-,8,Gap (no amino acid),C.01-C-term-0002,Gi/o,372,,seqsig,Class A (Rhodopsin),67
8,-,7,Gap (no amino acid),C.01-C-term-0001,Gi/o,371,,seqsig,Class A (Rhodopsin),50
9,-,9,Gap (no amino acid),8.60x60,Gi/o,370,,seqsig,Class A (Rhodopsin),83


## Calculating a Feature Consensus via the Interaction Interface Matrix

In [13]:
from pathlib import Path
p = Path('signprot/notebooks/pickles').glob('**/*.p')
files = [x for x in p if x.is_file()]

interface_signatures = []

for file in files:
    if file.is_file():
        with file.open('rb') as f:
            name_raw = str(file)
            a = name_raw.split('/')
            a = a[-1].split('.')
            a = a[0].split('-')
            class_name = a[0].strip()
            gprot = 'Gi/o' if a[1].strip() == 'Gio' else a[1].strip()

            obj = pickle.load(f)            
            interface_signatures.append({
                    'rec_class': class_name,
                    'gprot': gprot,
                    'signature': obj['signature']
                })

In [14]:
interface_signatures

[{'gprot': 'Gs',
  'rec_class': 'Class B1 (Secretin)',
  'signature': <seqsign.sequence_signature.SequenceSignature at 0x7f6fba850a20>},
 {'gprot': 'Gi/o',
  'rec_class': 'Class A (Rhodopsin)',
  'signature': <seqsign.sequence_signature.SequenceSignature at 0x7f6fba015b70>},
 {'gprot': 'Gs',
  'rec_class': 'Class A (Rhodopsin)',
  'signature': <seqsign.sequence_signature.SequenceSignature at 0x7f6fb9f5c5f8>}]

In [52]:
for signature_dict in interface_signatures:
    signature = signature_dict['signature']

    sig_data = signature.prepare_display_data()
    gn = get_generic_numbers(sig_data)
    gn_flat = list(chain.from_iterable(gn))
            
    signature_dict['consensus'] = get_signature_consensus(sig_data, gn_flat)

In [53]:
{'{} - {}'.format(sig['rec_class'], sig['gprot']): len(sig['consensus']) for sig in interface_signatures}

{'Class A (Rhodopsin) - Gi/o': 36,
 'Class A (Rhodopsin) - Gs': 27,
 'Class B1 (Secretin) - Gs': 21}

In [55]:
data = []
for entry in interface_signatures:
    consensus = entry['consensus']
    rec_class = entry['rec_class']
    gprot = entry['gprot']
    while len(consensus) > 0:
        a = consensus.pop()
        a['gprot'] = gprot
        a['rec_class'] = rec_class
        a['origin'] = 'matrix'
        data.append(a)

df_interface_signatures = pd.DataFrame(data)
df_interface_signatures

Unnamed: 0,code,cons,feature,gn,gprot,key,length,origin,rec_class,score
0,E,6,Charged negative [E],8.49x49,Gs,20,4,matrix,Class B1 (Secretin),33
1,N,10,Hydrogen bonding [N],8.48x48,Gs,19,,matrix,Class B1 (Secretin),100
2,N,6,Hydrogen bonding [N],8.47x47,Gs,18,,matrix,Class B1 (Secretin),33
3,HY,8,Hydrophobic,7.60x60,Gs,17,any,matrix,Class B1 (Secretin),67
4,E,8,Charged negative [E],6.53x53,Gs,16,4,matrix,Class B1 (Secretin),67
5,G,10,α-Helix flexibility [G],6.50x50,Gs,15,0,matrix,Class B1 (Secretin),100
6,P,10,α-Helix kink [P],6.47x47,Gs,14,2,matrix,Class B1 (Secretin),100
7,HA,10,Hydrophobic aliphatic,6.46x46,Gs,13,2-3,matrix,Class B1 (Secretin),100
8,T,8,Hydrogen bonding [T],6.42x42,Gs,12,,matrix,Class B1 (Secretin),67
9,HA,10,Hydrophobic aliphatic,5.61x61,Gs,11,2-3,matrix,Class B1 (Secretin),100


### Evaluating which consensus features co-occur in both datasets
#### SeqSig and Interaction Interface derived

In [18]:
drop_list = [
    'origin',
    'key',
    'score',
    'cons',
]
v_ds1 = df_signatures.drop(drop_list, 1)
v_ds2 = df_interface_signatures.drop(drop_list, 1)
colnames = v_ds1.columns

ds1 = set([tuple(line) for line in v_ds1.values])
ds2 = set([tuple(line) for line in v_ds2.values])

In [19]:
from IPython.display import display
def summarize_df(df):
    print('Dataframe description:')
    print(df.describe())
    print('\n')
    
    print('Dataframe size:')
    print(df.shape)
    print('\n')
    
    display(df.head())

The Intersection of the sets will show me which consensus features at what position appear in both datasets.

In [20]:
inters = pd.DataFrame(list(ds1.intersection(ds2)))
inters.columns = colnames
summarize_df(inters)

Dataframe description:
       code                feature       gn gprot length            rec_class
count    55                     55       55    55     55                   55
unique   22                     23       43     2     13                    2
top      HA  Hydrophobic aliphatic  5.61x61    Gs    any  Class A (Rhodopsin)
freq     10                     10        3    39     14                   38


Dataframe size:
(55, 6)




Unnamed: 0,code,feature,gn,gprot,length,rec_class
0,E,Charged negative [E],8.49x49,Gs,4,Class B1 (Secretin)
1,HY,Hydrophobic,34.51x51,Gs,any,Class A (Rhodopsin)
2,Hb,Hydrogen bonding (polar),6.26x26,Gi/o,any,Class A (Rhodopsin)
3,HR,Hydrophobic aromatic,2.57x57,Gs,5,Class B1 (Secretin)
4,A,Hydrophob al / α-H prop - very high [A],5.65x65,Gs,Max,Class A (Rhodopsin)


#### Intersection
- Intersection from SeqSig to Interface
- "Which features from SeqSig also appear in Interface?"
- There are 55 features in common among all tested GPCR and G-Protein combinations.
- Most conserved feature is HA (freq: 10)
- Most conserved position is in TM5 (freq: 3)

In [21]:
differ = pd.DataFrame(list(ds2.difference(ds1)))
differ.columns = colnames
summarize_df(differ)

Dataframe description:
       code               feature       gn gprot length            rec_class
count    29                    29       29    29     29                   29
unique   16                    16       27     2     10                    2
top      Hb  Charged positive [R]  5.71x71  Gi/o    any  Class A (Rhodopsin)
freq      3                     3        2    20      5                   25


Dataframe size:
(29, 6)




Unnamed: 0,code,feature,gn,gprot,length,rec_class
0,HA,Hydrophobic aliphatic,5.72x72,Gs,3-4,Class A (Rhodopsin)
1,-,Gap (no amino acid),6.23x23,Gi/o,,Class A (Rhodopsin)
2,F,Hydrophobic aromatic [F],2.40x40,Gi/o,,Class A (Rhodopsin)
3,K,Charged positive [K],5.71x71,Gi/o,5,Class A (Rhodopsin)
4,Hd,Hydrogen bond donor,5.71x71,Gs,4-5,Class A (Rhodopsin)


#### Difference
- Difference from SeqSig to Interface
- "Which features from Interface can not be found via the SeqSig Tool?"
- 29 features were only found via the interaction interface matrix
- Most frequent feature is HA (freq: 3)
- Most frequent position is in TM5 (freq: 2)

#### Sanity Check
Are there features at position 3.56x56 in the sequence signature dataset?
The previous test says that there are none - lets check that.

In [22]:
df = df_signatures
df.loc[df['gn'] == '3.56x56']

Unnamed: 0,code,cons,feature,gn,gprot,key,length,origin,rec_class,score
226,+-,9,Charged,3.56x56,Gi/o,153,any,seqsig,Class A (Rhodopsin),83
564,I,7,Hydrophobic aliphatic [I],3.56x56,Gs,107,,seqsig,Class A (Rhodopsin),50
873,T,10,Hydrogen bonding [T],3.56x56,Gs,236,,seqsig,Class B1 (Secretin),100


Yes! Three in total.
How about for the Gi/o g-protein class?

In [23]:
df.loc[
    (df['gn'] == '3.56x56') &
    (df['gprot'] == 'Gi/o')
]

Unnamed: 0,code,cons,feature,gn,gprot,key,length,origin,rec_class,score
226,+-,9,Charged,3.56x56,Gi/o,153,any,seqsig,Class A (Rhodopsin),83


Only one remaining now – and the feature is not of length 5.

In [24]:
df.loc[
    (df['gn'] == '3.56x56') &
    (df['gprot'] == 'Gi/o') &
    (df['length'] == '5')
]

Unnamed: 0,code,cons,feature,gn,gprot,key,length,origin,rec_class,score


### How are features represented in different combinations of receptor and g-protein classes?

For this I will use the interaction interface dataset.

In [25]:
from collections import Counter

In [26]:
df = df_interface_signatures
rec_classes = Counter(df['rec_class'].values)
gprot_classes = Counter(df['gprot'].values)

print('Receptor Cl.: {}'.format(rec_classes))
print('G-Prote. Cl.: {} \n'.format(gprot_classes))
rec_classes = sorted(list(rec_classes))
gprot_classes = sorted(list(gprot_classes))

summarize_df(df.drop(drop_list, 1))

Receptor Cl.: Counter({'Class A (Rhodopsin)': 63, 'Class B1 (Secretin)': 21})
G-Prote. Cl.: Counter({'Gs': 48, 'Gi/o': 36}) 

Dataframe description:
       code                feature       gn gprot length            rec_class
count    84                     84       84    84     84                   84
unique   29                     30       57     2     14                    2
top      HA  Hydrophobic aliphatic  8.48x48    Gs    any  Class A (Rhodopsin)
freq     13                     13        3    48     19                   63


Dataframe size:
(84, 6)




Unnamed: 0,code,feature,gn,gprot,length,rec_class
0,E,Charged negative [E],8.49x49,Gs,4,Class B1 (Secretin)
1,N,Hydrogen bonding [N],8.48x48,Gs,,Class B1 (Secretin)
2,N,Hydrogen bonding [N],8.47x47,Gs,,Class B1 (Secretin)
3,HY,Hydrophobic,7.60x60,Gs,any,Class B1 (Secretin)
4,E,Charged negative [E],6.53x53,Gs,4,Class B1 (Secretin)


#### Function to compare sets of positions

In [27]:
def compare_sets(df1, df2, method=set.intersection, drop_list=['origin','key','score','cons']):
    v_ds1 = df1.drop(drop_list, 1)
    v_ds2 = df2.drop(drop_list, 1)
    colnames = v_ds1.columns
    
    summarize_df(v_ds1)
    summarize_df(v_ds2)
    
    ds1 = set([tuple(line) for line in v_ds1.values])
    ds2 = set([tuple(line) for line in v_ds2.values])

    comp = pd.DataFrame(list(method(ds2, ds1)))
    try:
        comp.columns = colnames
        return comp
    except ValueError as e:
        print('Value Error\n{}:\nNo entries overlap between the two sets.'.format(e))
    

### Class A vs. G-Protein Classes

In [28]:
df1 = df.loc[
    (df['rec_class'] == rec_classes[0]) &
    (df['gprot'] == gprot_classes[0])
]
df2 = df.loc[
    (df['rec_class'] == rec_classes[0]) &
    (df['gprot'] == gprot_classes[1])
]

drop_list_strict = [
    'origin',
    'key',
    'score',
    'cons',
    'gprot',
    'rec_class',
]

#### Intersection
Which entries do these sets have in common?
In other words: "Which entries are not specific to one receptor + g-protein interaction?"

In [29]:
res = compare_sets(df1, df2, set.intersection, drop_list_strict)
res.sort_values('gn')

Dataframe description:
       code                feature       gn length
count    36                     36       36     36
unique   15                     15       36      9
top      HA  Hydrophobic aliphatic  6.36x36    any
freq      6                      6        1     13


Dataframe size:
(36, 4)




Unnamed: 0,code,feature,gn,length
21,Hb,Hydrogen bonding (polar),8.49x49,any
22,+-,Charged,8.48x48,4-5
23,Ha,Hydrogen bond acceptor,8.47x47,3
24,αH,α-Helix propensity - high,7.56x56,Hig
25,L,Hydrophobic aliphatic [L],6.37x37,


Dataframe description:
       code               feature       gn length
count    27                    27       27     27
unique   17                    18       27     11
top       A  Charged positive [R]  6.36x36       
freq      4                     4        1      5


Dataframe size:
(27, 4)




Unnamed: 0,code,feature,gn,length
57,E,Charged negative [E],8.49x49,4.0
58,R,Charged positive [R],8.48x48,6.0
59,R,Charged positive [R],7.56x56,6.0
60,L,Hydrophobic aliphatic [L],6.37x37,
61,Hb,Hydrogen bonding,6.36x36,2.0


Unnamed: 0,code,feature,gn,length
2,D,Charged negative [D],3.49x49,3.0
1,R,Charged positive [R],3.50x50,6.0
0,L,Hydrophobic aliphatic [L],6.37x37,


#### Results
- Intersection from SeqSig to Interface
- "Which features from in Class A + Gs can also be found in Class A + Gi/o?"
- The sets have three features for three positions in common.

#### Difference
Which entries are unique to each of these sets?
In other words: "Which entries are a unique type of interaction for that recptor + signal protein combination?"

In [30]:
res = compare_sets(df1, df2, set.difference, drop_list_strict)
res.sort_values('gn')

Dataframe description:
       code                feature       gn length
count    36                     36       36     36
unique   15                     15       36      9
top      HA  Hydrophobic aliphatic  6.36x36    any
freq      6                      6        1     13


Dataframe size:
(36, 4)




Unnamed: 0,code,feature,gn,length
21,Hb,Hydrogen bonding (polar),8.49x49,any
22,+-,Charged,8.48x48,4-5
23,Ha,Hydrogen bond acceptor,8.47x47,3
24,αH,α-Helix propensity - high,7.56x56,Hig
25,L,Hydrophobic aliphatic [L],6.37x37,


Dataframe description:
       code               feature       gn length
count    27                    27       27     27
unique   17                    18       27     11
top       A  Charged positive [R]  6.36x36       
freq      4                     4        1      5


Dataframe size:
(27, 4)




Unnamed: 0,code,feature,gn,length
57,E,Charged negative [E],8.49x49,4.0
58,R,Charged positive [R],8.48x48,6.0
59,R,Charged positive [R],7.56x56,6.0
60,L,Hydrophobic aliphatic [L],6.37x37,
61,Hb,Hydrogen bonding,6.36x36,2.0


Unnamed: 0,code,feature,gn,length
23,A,Hydrophob al / α-H prop - very high [A],3.53x53,Max
12,I,Hydrophobic aliphatic [I],3.54x54,
21,Hb,Hydrogen bonding (polar),3.55x55,any
13,P,α-Helix kink [P],34.50x50,2
2,HY,Hydrophobic,34.51x51,any
7,+,Charged positive,34.52x52,5-6
20,Y,Hydropob ar / H-bonding [Y],34.53x53,
6,Hu,Hydrogen bonding uncharged,34.54x54,3-4
8,Sm,Small,34.55x55,any
19,HA,Hydrophobic aliphatic,5.61x61,2-3


### Class B vs. G-Protein Classes

In [31]:
df1 = df.loc[
    (df['rec_class'] == rec_classes[1]) &
    (df['gprot'] == gprot_classes[0])
]
df2 = df.loc[
    (df['rec_class'] == rec_classes[1]) &
    (df['gprot'] == gprot_classes[1])
]

drop_list_strict = [
    'origin',
    'key',
    'score',
    'cons',
    'gprot',
    'rec_class',
]

#### Intersection
Which entries do these sets have in common?
In other words: "Which entries are not specific to one receptor + g-protein interaction?"

In [32]:
res = compare_sets(df1, df2, set.intersection, drop_list_strict)

Dataframe description:
        code  feature  gn  length
count      0        0   0       0
unique     0        0   0       0


Dataframe size:
(0, 4)




Unnamed: 0,code,feature,gn,length


Dataframe description:
       code                feature       gn length
count    21                     21       21     21
unique   13                     13       21     10
top      HA  Hydrophobic aliphatic  3.58x58       
freq      5                      5        1      7


Dataframe size:
(21, 4)




Unnamed: 0,code,feature,gn,length
0,E,Charged negative [E],8.49x49,4
1,N,Hydrogen bonding [N],8.48x48,
2,N,Hydrogen bonding [N],8.47x47,
3,HY,Hydrophobic,7.60x60,any
4,E,Charged negative [E],6.53x53,4


Value Error
Length mismatch: Expected axis has 0 elements, new values have 4 elements:
No entries overlap between the two sets.


As of now there are only Class B recptors interacting with Gs in the database. There exist no Class B interacting with any other signal protein class.

### Class A & B vs. Gs

In [33]:
df1 = df.loc[
    (df['rec_class'] == rec_classes[0]) &
    (df['gprot'] == gprot_classes[1])
]
df2 = df.loc[
    (df['rec_class'] == rec_classes[1]) &
    (df['gprot'] == gprot_classes[1])
]

drop_list_strict = [
    'origin',
    'key',
    'score',
    'cons',
    'gprot',
    'rec_class',
]

#### Intersection
Which entries do these sets have in common?
In other words: "Which entries are not specific to the general receptor (A or B) to Gs interaction?"

In [34]:
res = compare_sets(df1, df2, set.intersection, drop_list_strict)
res.sort_values('gn')

Dataframe description:
       code               feature       gn length
count    27                    27       27     27
unique   17                    18       27     11
top       A  Charged positive [R]  6.36x36       
freq      4                     4        1      5


Dataframe size:
(27, 4)




Unnamed: 0,code,feature,gn,length
57,E,Charged negative [E],8.49x49,4.0
58,R,Charged positive [R],8.48x48,6.0
59,R,Charged positive [R],7.56x56,6.0
60,L,Hydrophobic aliphatic [L],6.37x37,
61,Hb,Hydrogen bonding,6.36x36,2.0


Dataframe description:
       code                feature       gn length
count    21                     21       21     21
unique   13                     13       21     10
top      HA  Hydrophobic aliphatic  3.58x58       
freq      5                      5        1      7


Dataframe size:
(21, 4)




Unnamed: 0,code,feature,gn,length
0,E,Charged negative [E],8.49x49,4
1,N,Hydrogen bonding [N],8.48x48,
2,N,Hydrogen bonding [N],8.47x47,
3,HY,Hydrophobic,7.60x60,any
4,E,Charged negative [E],6.53x53,4


Unnamed: 0,code,feature,gn,length
1,HA,Hydrophobic aliphatic,5.61x61,2-3
0,E,Charged negative [E],8.49x49,4


## Sequence Signature Dataset
### How are features represented in different combinations of receptor and g-protein classes?
This time in the sequence signature dataset.

In [35]:
df = df_signatures
rec_classes = Counter(df['rec_class'].values)
gprot_classes = Counter(df['gprot'].values)

print('Receptor Cl.: {}'.format(rec_classes))
print('G-Prote. Cl.: {} \n'.format(gprot_classes))
rec_classes = sorted(list(rec_classes))
gprot_classes = sorted(list(gprot_classes))

summarize_df(df.drop(drop_list, 1))

Receptor Cl.: Counter({'Class A (Rhodopsin)': 672, 'Class B1 (Secretin)': 438})
G-Prote. Cl.: Counter({'Gs': 730, 'Gi/o': 380}) 

Dataframe description:
        code              feature       gn gprot length            rec_class
count   1110                 1110     1110  1110   1110                 1110
unique    32                   35      582     2     17                    2
top        -  Gap (no amino acid)  6.42x42    Gs         Class A (Rhodopsin)
freq     269                  265        3   730    484                  672


Dataframe size:
(1110, 6)




Unnamed: 0,code,feature,gn,gprot,length,rec_class
0,-,Gap (no amino acid),C.01-C-term-0009,Gi/o,,Class A (Rhodopsin)
1,-,Gap (no amino acid),C.01-C-term-0008,Gi/o,,Class A (Rhodopsin)
2,-,Gap (no amino acid),C.01-C-term-0007,Gi/o,,Class A (Rhodopsin)
3,-,Gap (no amino acid),C.01-C-term-0006,Gi/o,,Class A (Rhodopsin)
4,-,Gap (no amino acid),C.01-C-term-0005,Gi/o,,Class A (Rhodopsin)


Most frequent feature is Gap, and that is not of much help.
Therefore, dropping that feature.

In [36]:
df = df.loc[df['code'] != '-']
summarize_df(df.drop(drop_list, 1))

Dataframe description:
       code      feature        gn gprot length            rec_class
count   841          841       841   841    841                  841
unique   31           33       409     2     17                    2
top      HY  Hydrophobic  45.50x50    Gs    any  Class A (Rhodopsin)
freq    158          158         3   565    275                  567


Dataframe size:
(841, 6)




Unnamed: 0,code,feature,gn,gprot,length,rec_class
13,Hb,Hydrogen bonding (polar),8.56x56,Gi/o,any,Class A (Rhodopsin)
14,αH,α-Helix propensity - high,8.55x55,Gi/o,Hig,Class A (Rhodopsin)
15,HY,Hydrophobic,8.54x54,Gi/o,4-5,Class A (Rhodopsin)
16,Sm,Small,8.53x53,Gi/o,1-2,Class A (Rhodopsin)
17,Hu,Hydrogen bonding uncharged,8.52x52,Gi/o,3-4,Class A (Rhodopsin)


This removed about a quarter of unnecessary interactions and improves the summary statistics by making them more relevant.

#### Class A / Gs & Gi/o

In [37]:
df1 = df.loc[
    (df['rec_class'] == rec_classes[0]) &
    (df['gprot'] == gprot_classes[0])
]
df2 = df.loc[
    (df['rec_class'] == rec_classes[0]) &
    (df['gprot'] == gprot_classes[1])
]

drop_list_strict = [
    'origin',
    'key',
    'score',
    'cons',
    'gprot',
    'rec_class',
]
res = compare_sets(df1, df2, set.intersection, drop_list_strict)
res.sort_values('gn')

Dataframe description:
       code      feature          gn length
count   276          276         276    276
unique   25           26         276     14
top      HY  Hydrophobic  5.38-40x40    any
freq     95           95           1    169


Dataframe size:
(276, 4)




Unnamed: 0,code,feature,gn,length
13,Hb,Hydrogen bonding (polar),8.56x56,any
14,αH,α-Helix propensity - high,8.55x55,Hig
15,HY,Hydrophobic,8.54x54,4-5
16,Sm,Small,8.53x53,1-2
17,Hu,Hydrogen bonding uncharged,8.52x52,3-4


Dataframe description:
       code                feature       gn length
count   291                    291      291    291
unique   30                     31      291     16
top      HA  Hydrophobic aliphatic  2.49x49       
freq     43                     43        1    110


Dataframe size:
(291, 4)




Unnamed: 0,code,feature,gn,length
380,C,Disulfide-forming (S-S) [C],8.59x59,2.0
381,L,Hydrophobic aliphatic [L],8.58x58,
382,L,Hydrophobic aliphatic [L],8.57x57,
383,E,Charged negative [E],8.56x56,4.0
384,Q,Hydrogen bonding [Q],8.55x55,


Unnamed: 0,code,feature,gn,length
20,HA,Hydrophobic aliphatic,1.44x44,2-3
38,HY,Hydrophobic,1.48x48,any
14,N,Hydrogen bonding [N],1.50x50,
3,V,Hydrophobic aliphatic [V],1.53x53,
41,HA,Hydrophobic aliphatic,1.57x57,2-3
40,HY,Hydrophobic,1.58x58,any
24,L,Hydrophobic aliphatic [L],12.50x50,
16,T,Hydrogen bonding [T],2.39x39,
18,L,Hydrophobic aliphatic [L],2.46x46,
31,A,Hydrophob al / α-H prop - very high [A],2.47x47,Max


#### Class B / Gs & Gi/o

In [38]:
df1 = df.loc[
    (df['rec_class'] == rec_classes[1]) &
    (df['gprot'] == gprot_classes[0])
]
df2 = df.loc[
    (df['rec_class'] == rec_classes[1]) &
    (df['gprot'] == gprot_classes[1])
]

drop_list_strict = [
    'origin',
    'key',
    'score',
    'cons',
    'gprot',
    'rec_class',
]
res = compare_sets(df1, df2, set.intersection, drop_list_strict)

Dataframe description:
        code  feature  gn  length
count      0        0   0       0
unique     0        0   0       0


Dataframe size:
(0, 4)




Unnamed: 0,code,feature,gn,length


Dataframe description:
       code                feature       gn length
count   274                    274      274    274
unique   31                     33      274     17
top      HA  Hydrophobic aliphatic  2.49x49       
freq     38                     38        1     93


Dataframe size:
(274, 4)




Unnamed: 0,code,feature,gn,length
680,Ha,Hydrogen bond acceptor,8.64x64,4
681,HA,Hydrophobic aliphatic,8.63x63,3
682,+,Charged positive,8.62x62,5-6
683,HR,Hydrophobic aromatic,8.61x61,5-6
684,Hb,Hydrogen bonding (polar),8.60x60,any


Value Error
Length mismatch: Expected axis has 0 elements, new values have 4 elements:
No entries overlap between the two sets.


#### Class A & Class B vs Gs

In [39]:
df1 = df.loc[
    (df['rec_class'] == rec_classes[0]) &
    (df['gprot'] == gprot_classes[1])
]
df2 = df.loc[
    (df['rec_class'] == rec_classes[1]) &
    (df['gprot'] == gprot_classes[1])
]

drop_list_strict = [
    'origin',
    'key',
    'score',
    'cons',
    'gprot',
    'rec_class',
]
res = compare_sets(df1, df2, set.intersection, drop_list_strict)
res.sort_values('gn')

Dataframe description:
       code                feature       gn length
count   291                    291      291    291
unique   30                     31      291     16
top      HA  Hydrophobic aliphatic  2.49x49       
freq     43                     43        1    110


Dataframe size:
(291, 4)




Unnamed: 0,code,feature,gn,length
380,C,Disulfide-forming (S-S) [C],8.59x59,2.0
381,L,Hydrophobic aliphatic [L],8.58x58,
382,L,Hydrophobic aliphatic [L],8.57x57,
383,E,Charged negative [E],8.56x56,4.0
384,Q,Hydrogen bonding [Q],8.55x55,


Dataframe description:
       code                feature       gn length
count   274                    274      274    274
unique   31                     33      274     17
top      HA  Hydrophobic aliphatic  2.49x49       
freq     38                     38        1     93


Dataframe size:
(274, 4)




Unnamed: 0,code,feature,gn,length
680,Ha,Hydrogen bond acceptor,8.64x64,4
681,HA,Hydrophobic aliphatic,8.63x63,3
682,+,Charged positive,8.62x62,5-6
683,HR,Hydrophobic aromatic,8.61x61,5-6
684,Hb,Hydrogen bonding (polar),8.60x60,any


Unnamed: 0,code,feature,gn,length
5,Hb,Hydrogen bonding (polar),12.49x49,any
4,L,Hydrophobic aliphatic [L],12.50x50,
9,HY,Hydrophobic,3.35x35,any
11,W,Hydrophobic aromatic [W],4.50x50,6
3,C,Disulfide-forming (S-S) [C],45.50x50,2
1,V,Hydrophobic aliphatic [V],5.49x49,
0,HA,Hydrophobic aliphatic,5.55x55,2-3
6,HA,Hydrophobic aliphatic,5.61x61,2-3
8,+,Charged positive,5.66x66,5-6
10,L,Hydrophobic aliphatic [L],6.49x49,
