In [2]:
from helpers.setup import setup_django

In [3]:
# NOTE(review): presumably configures Django (settings / app registry) so that
# the project imports in the following cells work - confirm in helpers.setup.
setup_django()

  """)


In [4]:
import pickle
from collections import Counter
from pathlib import Path

# NOTE(review): the star import is kept last, as in the original ordering.
# Later cells use pd, calc_consensus_from_signature, aggregate_consensus_data
# and compare_sets without importing them, so they presumably come from here -
# confirm, and prefer explicit imports once the exported names are known.
from helpers.utility import *

## Calculating a Feature Consensus via the Interaction Interface Matrix

In [5]:
from pathlib import Path
p = Path('signprot/notebooks/interface_pickles').glob('**/*.p')
files = [x for x in p if x.is_file()]

interface_signatures = []

for file in files:
    if file.is_file():
        with file.open('rb') as f:
            name_raw = str(file)
            a = name_raw.split('/')
            a = a[-1].split('.')
            a = a[0].split('-')
            class_name = a[0].strip()
            gprot = 'Gi/o' if a[1].strip() == 'Gio' else a[1].strip()

            obj = pickle.load(f)            
            interface_signatures.append({
                    'rec_class': class_name,
                    'gprot': gprot,
                    'signature': obj['signature']
                })

In [6]:
# Tabular preview of the loaded records (one row per pickle file).
pd.DataFrame(data=interface_signatures)

Unnamed: 0,gprot,rec_class,signature
0,Gi,Class A (Rhodopsin),<seqsign.sequence_signature.SequenceSignature ...
1,Gs,Class B1 (Secretin),<seqsign.sequence_signature.SequenceSignature ...
2,Gs,Class A (Rhodopsin),<seqsign.sequence_signature.SequenceSignature ...


In [8]:
# Hand every record to the helper; it is expected to store the result on the
# record itself (the next cell reads sig['consensus'] from each record).
for signature_dict in interface_signatures:
    calc_consensus_from_signature(signature_dict)



In [9]:
# Number of consensus entries per "<receptor class> - <G-protein>" pairing.
{
    '{} - {}'.format(sig['rec_class'], sig['gprot']): len(sig['consensus'])
    for sig in interface_signatures
}

{'Class A (Rhodopsin) - Gi': 30,
 'Class A (Rhodopsin) - Gs': 27,
 'Class B1 (Secretin) - Gs': 22}

In [10]:
# Flatten the per-record output of aggregate_consensus_data into one list of
# rows, keeping the record order.
data = [
    row
    for entry in interface_signatures
    for row in aggregate_consensus_data(entry)
]

df_interface_signatures = pd.DataFrame(data)
display(df_interface_signatures.head())
print('Shape of the dataframe: {}'.format(df_interface_signatures.shape))

Unnamed: 0,code,cons,feature,gn,gprot,key,length,origin,rec_class,score
0,+-,10,Charged,8.48x48,Gi,29,4-5,,Class A (Rhodopsin),100
1,Ha,10,Hydrogen bond acceptor,8.47x47,Gi,28,3,,Class A (Rhodopsin),100
2,HA,10,Hydrophobic aliphatic,7.56x56,Gi,27,3-4,,Class A (Rhodopsin),100
3,M,10,Hydrophobic aliphatic [M],6.36x36,Gi,26,4,,Class A (Rhodopsin),100
4,HA,10,Hydrophobic aliphatic,6.33x33,Gi,25,2-3,,Class A (Rhodopsin),100


Shape of the dataframe: (79, 10)


### How are features represented in different combinations of receptor and g-protein classes?

For this I will use the interaction interface dataset.

In [11]:
df = df_interface_signatures

# Row counts per receptor class and per G-protein class.
rec_class_counts = Counter(df['rec_class'].values)
gprot_class_counts = Counter(df['gprot'].values)

print('Receptor Cl.: {}'.format(rec_class_counts))
print('G-Prote. Cl.: {} \n'.format(gprot_class_counts))

# Sorted class labels; the following cells pick classes by list position.
rec_classes = sorted(rec_class_counts)
gprot_classes = sorted(gprot_class_counts)

Receptor Cl.: Counter({'Class A (Rhodopsin)': 57, 'Class B1 (Secretin)': 22})
G-Prote. Cl.: Counter({'Gs': 49, 'Gi': 30}) 



### Class A vs. G-Protein Classes

In [12]:
# With the class counts printed above, rec_classes[0] is 'Class A (Rhodopsin)'
# and gprot_classes[0] is 'Gi' (both lists are sorted alphabetically).
in_first_rec_class = df['rec_class'] == rec_classes[0]
in_first_gprot = df['gprot'] == gprot_classes[0]

# df1: first receptor class with the first G-protein class.
df1 = df.loc[in_first_rec_class & in_first_gprot]
# df2: first receptor class with every other G-protein class.
df2 = df.loc[in_first_rec_class & ~in_first_gprot]

# Columns passed to compare_sets as its drop list.
drop_list_strict = ['origin', 'key', 'score', 'cons', 'gprot', 'rec_class']

#### Intersection
Which entries do these sets have in common?
In other words: "Which entries are not specific to one receptor + g-protein interaction?"

In [13]:
# Apply set.intersection to the two Class A subsets and list the result by
# generic residue number.
res = compare_sets(df1, df2, set.intersection, drop_list_strict)
res.sort_values(by='gn')

Dataframe description:


Unnamed: 0,code,feature,gn,length
count,30,30,30,30
unique,16,16,30,11
top,R,Charged positive [R],8.48x48,any
freq,4,4,1,7




Dataframe size:
(30, 4)




Unnamed: 0,code,feature,gn,length
0,+-,Charged,8.48x48,4-5
1,Ha,Hydrogen bond acceptor,8.47x47,3
2,HA,Hydrophobic aliphatic,7.56x56,3-4
3,M,Hydrophobic aliphatic [M],6.36x36,4
4,HA,Hydrophobic aliphatic,6.33x33,2-3


Dataframe description:


Unnamed: 0,code,feature,gn,length
count,27,27,27,27.0
unique,17,18,27,11.0
top,R,Hydrophob al / α-H prop - very high [A],8.48x48,
freq,4,4,1,5.0




Dataframe size:
(27, 4)




Unnamed: 0,code,feature,gn,length
52,E,Charged negative [E],8.49x49,4.0
53,R,Charged positive [R],8.48x48,6.0
54,R,Charged positive [R],7.56x56,6.0
55,L,Hydrophobic aliphatic [L],6.37x37,
56,Hb,Hydrogen bonding,6.36x36,2.0


Unnamed: 0,code,feature,gn,length
2,D,Charged negative [D],3.49x49,3
3,R,Charged positive [R],3.50x50,6
0,A,Hydrophob al / α-H prop - very high [A],3.53x53,Max
1,P,α-Helix kink [P],34.50x50,2


#### Results
- Intersection from SeqSig to Interface
- "Which features from Class A + Gs can also be found in Class A without Gs?"
- The sets have four features at four positions in common (3.49x49, 3.50x50, 3.53x53 and 34.50x50).

#### Difference
#### Gs vs Gi/Go
Which entries are unique to each of these sets?
In other words: "Which entries are a unique type of interaction for that receptor + signal protein combination?"

In [14]:
# Apply set.difference with df1 as the first and df2 as the second operand,
# then list the result by generic residue number.
res = compare_sets(df1, df2, set.difference, drop_list_strict)
res.sort_values(by='gn')

Dataframe description:


Unnamed: 0,code,feature,gn,length
count,30,30,30,30
unique,16,16,30,11
top,R,Charged positive [R],8.48x48,any
freq,4,4,1,7




Dataframe size:
(30, 4)




Unnamed: 0,code,feature,gn,length
0,+-,Charged,8.48x48,4-5
1,Ha,Hydrogen bond acceptor,8.47x47,3
2,HA,Hydrophobic aliphatic,7.56x56,3-4
3,M,Hydrophobic aliphatic [M],6.36x36,4
4,HA,Hydrophobic aliphatic,6.33x33,2-3


Dataframe description:


Unnamed: 0,code,feature,gn,length
count,27,27,27,27.0
unique,17,18,27,11.0
top,R,Hydrophob al / α-H prop - very high [A],8.48x48,
freq,4,4,1,5.0




Dataframe size:
(27, 4)




Unnamed: 0,code,feature,gn,length
52,E,Charged negative [E],8.49x49,4.0
53,R,Charged positive [R],8.48x48,6.0
54,R,Charged positive [R],7.56x56,6.0
55,L,Hydrophobic aliphatic [L],6.37x37,
56,Hb,Hydrogen bonding,6.36x36,2.0


Unnamed: 0,code,feature,gn,length
17,I,Hydrophobic aliphatic [I],3.54x54,
15,Hb,Hydrogen bonding (polar),3.55x55,any
13,HY,Hydrophobic,34.51x51,any
1,+,Charged positive,34.52x52,5-6
19,Y,Hydropob ar / H-bonding [Y],34.53x53,
5,Hu,Hydrogen bonding uncharged,34.54x54,3-4
10,Sm,Small,34.55x55,any
4,HA,Hydrophobic aliphatic,5.61x61,2-3
8,A,Hydrophob al / α-H prop - very high [A],5.64x64,Max
18,A,Hydrophob al / α-H prop - very high [A],5.65x65,Max


#### Difference
#### Gi/Go vs Gs

In [15]:
# Same difference query with the operands swapped (df2 first, df1 second),
# listed by generic residue number.
res = compare_sets(df2, df1, set.difference, drop_list_strict)
res.sort_values(by='gn')

Dataframe description:


Unnamed: 0,code,feature,gn,length
count,27,27,27,27.0
unique,17,18,27,11.0
top,R,Hydrophob al / α-H prop - very high [A],8.48x48,
freq,4,4,1,5.0




Dataframe size:
(27, 4)




Unnamed: 0,code,feature,gn,length
52,E,Charged negative [E],8.49x49,4.0
53,R,Charged positive [R],8.48x48,6.0
54,R,Charged positive [R],7.56x56,6.0
55,L,Hydrophobic aliphatic [L],6.37x37,
56,Hb,Hydrogen bonding,6.36x36,2.0


Dataframe description:


Unnamed: 0,code,feature,gn,length
count,30,30,30,30
unique,16,16,30,11
top,R,Charged positive [R],8.48x48,any
freq,4,4,1,7




Dataframe size:
(30, 4)




Unnamed: 0,code,feature,gn,length
0,+-,Charged,8.48x48,4-5
1,Ha,Hydrogen bond acceptor,8.47x47,3
2,HA,Hydrophobic aliphatic,7.56x56,3-4
3,M,Hydrophobic aliphatic [M],6.36x36,4
4,HA,Hydrophobic aliphatic,6.33x33,2-3


Unnamed: 0,code,feature,gn,length
10,T,Hydrogen bonding [T],2.39x39,
1,V,Hydrophobic aliphatic [V],3.54x54,
17,K,Charged positive [K],3.56x56,5
24,HA,Hydrophobic aliphatic,34.51x51,any
9,Hb,Hydrogen bonding (polar),34.52x52,any
4,HY,Hydrophobic,34.54x54,any
18,+-,Charged,34.55x55,any
8,R,Charged positive [R],34.57x57,6
23,R,Charged positive [R],4.40x40,6
13,M,Hydrophobic aliphatic [M],5.61x61,4


### Class B vs. G-Protein Classes

In [21]:
# With the class counts printed earlier, rec_classes[1] is
# 'Class B1 (Secretin)' and gprot_classes[0] is 'Gi'.
in_second_rec_class = df['rec_class'] == rec_classes[1]
in_first_gprot = df['gprot'] == gprot_classes[0]

# df1: second receptor class with the first G-protein class.
df1 = df.loc[in_second_rec_class & in_first_gprot]
# df2: second receptor class with every other G-protein class.
df2 = df.loc[in_second_rec_class & ~in_first_gprot]

# Columns passed to compare_sets as its drop list.
drop_list_strict = ['origin', 'key', 'score', 'cons', 'gprot', 'rec_class']

#### Intersection
Which entries do these sets have in common?
In other words: "Which entries are not specific to one receptor + g-protein interaction?"

In [22]:
# Apply set.intersection to the two Class B subsets. The result is only
# stored in `res`; this cell does not display it.
res = compare_sets(df1, df2, set.intersection, drop_list_strict)

Dataframe description:


Unnamed: 0,code,feature,gn,length
count,0,0,0,0
unique,0,0,0,0




Dataframe size:
(0, 4)




Unnamed: 0,code,feature,gn,length


Dataframe description:


Unnamed: 0,code,feature,gn,length
count,21,21,21,21.0
unique,13,13,21,10.0
top,HA,Hydrophobic aliphatic,8.47x47,
freq,5,5,1,7.0




Dataframe size:
(21, 4)




Unnamed: 0,code,feature,gn,length
0,E,Charged negative [E],8.49x49,4
1,N,Hydrogen bonding [N],8.48x48,
2,N,Hydrogen bonding [N],8.47x47,
3,HY,Hydrophobic,7.60x60,any
4,E,Charged negative [E],6.53x53,4


Value Error
Length mismatch: Expected axis has 0 elements, new values have 4 elements:
No entries overlap between the two sets.


As of now, the database contains only Class B receptors interacting with Gs; there are no Class B receptors interacting with any other signal protein class.