In [1]:
from helpers.setup import setup_django

In [2]:
setup_django()

  """)


In [3]:
import pickle
from pathlib import Path
from collections import Counter
from helpers.utility import *

## Calculating a Feature Consensus via the Interaction Interface Matrix

In [4]:
from pathlib import Path
p = Path('signprot/notebooks/interface_pickles').glob('**/*.p')
files = [x for x in p if x.is_file()]

interface_signatures = []

for file in files:
    if file.is_file():
        with file.open('rb') as f:
            name_raw = str(file)
            a = name_raw.split('/')
            a = a[-1].split('.')
            a = a[0].split('-')
            class_name = a[0].strip()
            gprot = 'Gi/o' if a[1].strip() == 'Gio' else a[1].strip()

            obj = pickle.load(f)            
            interface_signatures.append({
                    'rec_class': class_name,
                    'gprot': gprot,
                    'signature': obj['signature']
                })

In [5]:
pd.DataFrame(interface_signatures)

Unnamed: 0,gprot,rec_class,signature
0,Gs,Class B1 (Secretin),<seqsign.sequence_signature.SequenceSignature ...
1,Gi/o,Class A (Rhodopsin),<seqsign.sequence_signature.SequenceSignature ...
2,Gs,Class A (Rhodopsin),<seqsign.sequence_signature.SequenceSignature ...


In [10]:
for signature_dict in interface_signatures:
    signature = signature_dict['signature']

    sig_data = signature.prepare_display_data()
    gn = get_generic_numbers(sig_data)
    gn_flat = list(chain.from_iterable(gn))
            
    signature_dict['consensus'] = get_signature_consensus(sig_data, gn_flat)

In [11]:
{'{} - {}'.format(sig['rec_class'], sig['gprot']): len(sig['consensus']) for sig in interface_signatures}

{'Class A (Rhodopsin) - Gi/o': 36,
 'Class A (Rhodopsin) - Gs': 27,
 'Class B1 (Secretin) - Gs': 21}

In [12]:
data = []
for entry in interface_signatures:
    tmp = aggregate_consensus_data(entry)
    data.extend(tmp)

df_interface_signatures = pd.DataFrame(data)
display(df_interface_signatures.head())
print('Shape of the dataframe: {}'.format(df_interface_signatures.shape))

Unnamed: 0,code,cons,feature,gn,gprot,key,length,origin,rec_class,score
0,E,6,Charged negative [E],8.49x49,Gs,20,4,,Class B1 (Secretin),33
1,N,10,Hydrogen bonding [N],8.48x48,Gs,19,,,Class B1 (Secretin),100
2,N,6,Hydrogen bonding [N],8.47x47,Gs,18,,,Class B1 (Secretin),33
3,HY,8,Hydrophobic,7.60x60,Gs,17,any,,Class B1 (Secretin),67
4,E,8,Charged negative [E],6.53x53,Gs,16,4,,Class B1 (Secretin),67


Shape of the dataframe: (84, 10)


### How are features represented in different combinations of receptor and g-protein classes?

For this I will use the interaction interface dataset.

In [16]:
df = df_interface_signatures
rec_classes = Counter(df['rec_class'].values)
gprot_classes = Counter(df['gprot'].values)

print('Receptor Cl.: {}'.format(rec_classes))
print('G-Prote. Cl.: {} \n'.format(gprot_classes))
rec_classes = sorted(list(rec_classes))
gprot_classes = sorted(list(gprot_classes))

Receptor Cl.: Counter({'Class A (Rhodopsin)': 63, 'Class B1 (Secretin)': 21})
G-Prote. Cl.: Counter({'Gs': 48, 'Gi/o': 36}) 



### Class A vs. G-Protein Classes

In [18]:
df1 = df.loc[
    (df['rec_class'] == rec_classes[0]) &
    (df['gprot'] == gprot_classes[0])
]
df2 = df.loc[
    (df['rec_class'] == rec_classes[0]) &
    (df['gprot'] != gprot_classes[0])
]

drop_list_strict = [
    'origin',
    'key',
    'score',
    'cons',
    'gprot',
    'rec_class',
]

#### Intersection
Which entries do these sets have in common?
In other words: "Which entries are not specific to one receptor + g-protein interaction?"

In [19]:
res = compare_sets(df1, df2, set.intersection, drop_list_strict)
res.sort_values('gn')

Dataframe description:


Unnamed: 0,code,feature,gn,length
count,36,36,36,36
unique,15,15,36,9
top,HA,Hydrophobic aliphatic,8.49x49,any
freq,6,6,1,13




Dataframe size:
(36, 4)




Unnamed: 0,code,feature,gn,length
21,Hb,Hydrogen bonding (polar),8.49x49,any
22,+-,Charged,8.48x48,4-5
23,Ha,Hydrogen bond acceptor,8.47x47,3
24,αH,α-Helix propensity - high,7.56x56,Hig
25,L,Hydrophobic aliphatic [L],6.37x37,


Dataframe description:


Unnamed: 0,code,feature,gn,length
count,27,27,27,27.0
unique,17,18,27,11.0
top,A,Charged positive [R],8.49x49,
freq,4,4,1,5.0




Dataframe size:
(27, 4)




Unnamed: 0,code,feature,gn,length
57,E,Charged negative [E],8.49x49,4.0
58,R,Charged positive [R],8.48x48,6.0
59,R,Charged positive [R],7.56x56,6.0
60,L,Hydrophobic aliphatic [L],6.37x37,
61,Hb,Hydrogen bonding,6.36x36,2.0


Unnamed: 0,code,feature,gn,length
2,D,Charged negative [D],3.49x49,3.0
1,R,Charged positive [R],3.50x50,6.0
0,L,Hydrophobic aliphatic [L],6.37x37,


#### Results
- Intersection from SeqSig to Interface
- "Which features from Class A + Gs can also be found in Class A without Gs?"
- The sets have three features for three positions in common.

#### Difference
Which entries are unique to each of these sets?
In other words: "Which entries are a unique type of interaction for that recptor + signal protein combination?"

In [20]:
res = compare_sets(df1, df2, set.difference, drop_list_strict)
res.sort_values('gn')

Dataframe description:


Unnamed: 0,code,feature,gn,length
count,36,36,36,36
unique,15,15,36,9
top,HA,Hydrophobic aliphatic,8.49x49,any
freq,6,6,1,13




Dataframe size:
(36, 4)




Unnamed: 0,code,feature,gn,length
21,Hb,Hydrogen bonding (polar),8.49x49,any
22,+-,Charged,8.48x48,4-5
23,Ha,Hydrogen bond acceptor,8.47x47,3
24,αH,α-Helix propensity - high,7.56x56,Hig
25,L,Hydrophobic aliphatic [L],6.37x37,


Dataframe description:


Unnamed: 0,code,feature,gn,length
count,27,27,27,27.0
unique,17,18,27,11.0
top,A,Charged positive [R],8.49x49,
freq,4,4,1,5.0




Dataframe size:
(27, 4)




Unnamed: 0,code,feature,gn,length
57,E,Charged negative [E],8.49x49,4.0
58,R,Charged positive [R],8.48x48,6.0
59,R,Charged positive [R],7.56x56,6.0
60,L,Hydrophobic aliphatic [L],6.37x37,
61,Hb,Hydrogen bonding,6.36x36,2.0


Unnamed: 0,code,feature,gn,length
10,A,Hydrophob al / α-H prop - very high [A],3.53x53,Max
19,I,Hydrophobic aliphatic [I],3.54x54,
3,Hb,Hydrogen bonding (polar),3.55x55,any
9,P,α-Helix kink [P],34.50x50,2
1,HY,Hydrophobic,34.51x51,any
8,+,Charged positive,34.52x52,5-6
21,Y,Hydropob ar / H-bonding [Y],34.53x53,
0,Hu,Hydrogen bonding uncharged,34.54x54,3-4
16,Sm,Small,34.55x55,any
22,HA,Hydrophobic aliphatic,5.61x61,2-3


### Class B vs. G-Protein Classes

In [21]:
df1 = df.loc[
    (df['rec_class'] == rec_classes[1]) &
    (df['gprot'] == gprot_classes[0])
]
df2 = df.loc[
    (df['rec_class'] == rec_classes[1]) &
    (df['gprot'] != gprot_classes[0])
]

drop_list_strict = [
    'origin',
    'key',
    'score',
    'cons',
    'gprot',
    'rec_class',
]

#### Intersection
Which entries do these sets have in common?
In other words: "Which entries are not specific to one receptor + g-protein interaction?"

In [22]:
res = compare_sets(df1, df2, set.intersection, drop_list_strict)

Dataframe description:


Unnamed: 0,code,feature,gn,length
count,0,0,0,0
unique,0,0,0,0




Dataframe size:
(0, 4)




Unnamed: 0,code,feature,gn,length


Dataframe description:


Unnamed: 0,code,feature,gn,length
count,21,21,21,21.0
unique,13,13,21,10.0
top,HA,Hydrophobic aliphatic,8.47x47,
freq,5,5,1,7.0




Dataframe size:
(21, 4)




Unnamed: 0,code,feature,gn,length
0,E,Charged negative [E],8.49x49,4
1,N,Hydrogen bonding [N],8.48x48,
2,N,Hydrogen bonding [N],8.47x47,
3,HY,Hydrophobic,7.60x60,any
4,E,Charged negative [E],6.53x53,4


Value Error
Length mismatch: Expected axis has 0 elements, new values have 4 elements:
No entries overlap between the two sets.


As of now there are only Class B recptors interacting with Gs in the database. There exist no Class B interacting with any other signal protein class.