## Setup (Django)

In [45]:
from helpers.setup import setup_django

In [46]:
# Run the project's Django bootstrap helper (helpers.setup) before the model
# imports in the next section; presumably it configures settings so that
# `protein.models` can be imported -- TODO confirm against helpers/setup.py.
setup_django()

## Setup

In [47]:
from pathlib import Path
import json
from helpers.utility import *
from signprot.interactions import *
from protein.models import ProteinGProtein
from seqsign.sequence_signature import SequenceSignature

In [48]:
def export_features_for_snake(curr_obj, df,
                              json_path='/protwis/sites/protwis/signprot/thesis_steven/'):
    """Write the `code` and `gn` columns of `df` to a JSON records file.

    Parameters
    ----------
    curr_obj : mapping
        Must provide the string entries ``'rec_class'`` and ``'gprot'``; they
        are used only to build the output file name.
    df : pandas.DataFrame
        Frame containing at least the columns ``'code'`` and ``'gn'``.
    json_path : str, optional
        Directory the file is written to. Defaults to the location that was
        previously hard-coded, so existing calls behave exactly as before.

    Returns
    -------
    None
        The function is called for its side effect (one file written).

    Raises
    ------
    KeyError
        If `curr_obj` lacks one of the two keys or `df` lacks one of the two
        columns.
    """
    import os  # local import: keeps the cell self-contained

    name1 = curr_obj['rec_class'].replace(' ', '_')
    name2 = curr_obj['gprot'].replace('/', '_')
    # The extension is joined like any other part, which yields a file name
    # ending in "_.json". This is kept unchanged on purpose: files already
    # written (and anything reading them) use that naming.
    file_name = '_'.join([name1, name2, '.json'])
    # os.path.join copes with `json_path` given with or without a trailing
    # separator; for the default value the result equals the old concatenation.
    full_path = os.path.join(json_path, file_name)
    df[['code', 'gn']].to_json(full_path, orient='records')

## Supporting Datastores

In [49]:
# Shared lookup data fetched through project helpers (their return types are
# not visible in this notebook). Only `rec_segs` is used by the active cells
# below, where it is passed to SequenceSignature.setup_alignments_signprot.
rec_clas = get_receptor_classes()
rec_segs = get_receptor_segments()
gpr_clas = get_gprot_classes()
gpr_segs = get_gprot_segments()

In [50]:
# Keep only the first whitespace-separated token of every G protein name.
gprotein_names = ProteinGProtein.objects.all().values_list('name', flat=True)
gpr_clas2 = [full_name.split(' ')[0] for full_name in gprotein_names]

## Generating a table of all receptors and their coupling data (Guide To Pharmacology)

In [51]:
# Step 1 of 3: create the coupling-data container via the project helper.
coupling_data = prepare_coupling_data_container()

In [52]:
# Step 2 of 3: populate the container. The name is rebound to the helper's
# result, so re-running this cell on its own passes already-filled data back
# in -- NOTE(review): restart from step 1 when re-running.
coupling_data = fill_coupling_data_container(coupling_data)

In [53]:
# Step 3 of 3: post-process the filled container; the result is what gets
# loaded into a DataFrame afterwards. Like step 2, this rebinds its own input.
coupling_data = process_coupling_data(coupling_data)

In [54]:
# Expand the dict-valued 'Merged' column into one column per key and put those
# next to the remaining columns; the three raw source columns are dropped.
coupling_frame = pd.DataFrame(coupling_data)
merged_columns = coupling_frame['Merged'].apply(pd.Series)
df = pd.concat(
    [coupling_frame.drop(columns=['Merged', 'Aska', 'GuideToPharma']), merged_columns],
    axis=1,
)
df.sort_values(by='key').head()

Unnamed: 0,coupling,key,rec_class,rec_obj,G12/G13,Gi/Go,Gq/G11,Gs,gprot
3,"{'GuideToPharma': {'Gi/Go': 'primary'}, 'Merge...",5HT1A,Class A (Rhodopsin),5ht1a_human,True,True,True,True,"[Gi/Go, Gq/G11]"
26,"{'GuideToPharma': {'Gi/Go': 'primary'}, 'Merge...",5HT1B,Class A (Rhodopsin),5ht1b_human,True,True,True,True,[Gi/Go]
183,"{'GuideToPharma': {'Gi/Go': 'primary'}, 'Merge...",5HT1D,Class A (Rhodopsin),5ht1d_human,True,True,True,True,[Gi/Go]
40,"{'GuideToPharma': {'Gi/Go': 'primary'}, 'Merge...",5HT1E,Class A (Rhodopsin),5ht1e_human,True,True,True,True,[Gi/Go]
242,"{'GuideToPharma': {'Gi/Go': 'primary'}, 'Merge...",5HT1F,Class A (Rhodopsin),5ht1f_human,True,True,True,True,[Gi/Go]


## Calculating Sequence Signatures for all receptors and their coupling partners

In [55]:
# Directory holding the pickled signatures and their index file.
path = '/protwis/sites/protwis/signprot/thesis_steven/pickles/'

# Empty bookkeeping table: one row per (receptor class, G protein) pair.
columns = ['rec_class', 'gprot', 'with', 'wo', 'file_with', 'file_wo']
result_files = pd.DataFrame(columns=columns)

In [56]:
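# NOTE: this cell is disabled on purpose. When enabled it writes the pickled
# signatures and `index.csv` into `path`; the "Comparing Receptor Sets" cells
# further down read those files back, so uncomment and run it once if they
# are missing.
# for rc in rec_clas: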
#     rc = str(rc)

#     for gp in gpr_clas2:
#         print(rc, '+', gp)
        
#         data_with = df[
#             (df[gp].astype(bool)) &
#             (df['rec_class'] == rc)
#         ]
#         data_without = df[
#             (df[gp].astype(bool) == False) &
#             (df['rec_class'] == rc)
#         ]
        
#         with_set = data_with['rec_obj']
#         wo_set = data_without['rec_obj']

#         signature = SequenceSignature()
#         signature.setup_alignments_signprot(rec_segs, with_set.tolist())
#         signature.calculate_signature_onesided()

#         file_with = Path(rc+'_'+gp+'_with.p')
#         file_with = str(file_with).replace('/', '_').replace(' ', '_')
#         pickle_signature({
#             'type': 'with',
#             'rec_class': rc,
#             'gprot': gp,
#             'signature': signature,
#         }, path, file_with)

        
#         signature = SequenceSignature()
#         signature.setup_alignments_signprot(rec_segs, wo_set.tolist())
#         signature.calculate_signature_onesided()

#         file_wo = Path(rc+'_'+gp+'_wo.p')
#         file_wo = str(file_wo).replace('/', '_').replace(' ', '_')
#         pickle_signature({
#             'type': 'without',
#             'rec_class': rc,
#             'gprot': gp,
#             'signature': signature,
#         }, path, file_wo)


#         result_files.loc[len(result_files)] = [
#             rc,
#             gp,
#             data_with.shape[0],
#             data_without.shape[0],
#             file_with,
#             file_wo,
#         ]

# result_files.to_csv(path+'index.csv')

# Data Analysis

## Comparison of uniquely Gi/Go coupling receptors to uniquely Gx coupling ones

These receptors all couple exclusively, primarily with Gi/Go.

In [86]:
# Class A receptors flagged for Gi/Go and for none of the other three families.
only_gio_mask = (
    df['rec_class'].eq('Class A (Rhodopsin)')
    & df['Gi/Go'].eq(True)
    & df['G12/G13'].eq(False)
    & df['Gs'].eq(False)
    & df['Gq/G11'].eq(False)
)
df_agio = df.loc[only_gio_mask]

These receptors all couple exclusively, primarily with Gs.

In [87]:
# Class A receptors flagged for Gs and for none of the other three families.
only_gs_mask = (
    df['rec_class'].eq('Class A (Rhodopsin)')
    & df['Gs'].eq(True)
    & df['Gi/Go'].eq(False)
    & df['G12/G13'].eq(False)
    & df['Gq/G11'].eq(False)
)
df_ags = df.loc[only_gs_mask]

These receptors all couple exclusively, primarily with Gq/G11.

In [88]:
# Class A receptors flagged for Gq/G11 and for none of the other three families.
only_gq_mask = (
    df['rec_class'].eq('Class A (Rhodopsin)')
    & df['Gq/G11'].eq(True)
    & df['Gi/Go'].eq(False)
    & df['G12/G13'].eq(False)
    & df['Gs'].eq(False)
)
df_agq = df.loc[only_gq_mask]

These Class A receptors do not couple to Gi/Go but couple to at least one of the other G protein families (G12/G13, Gs or Gq/G11).

In [100]:
# Class A receptors without the Gi/Go flag that carry at least one other flag.
is_class_a = df['rec_class'].eq('Class A (Rhodopsin)')
lacks_gio = df['Gi/Go'].eq(False)
has_other_gprot = (
    df['G12/G13'].eq(True)
    | df['Gs'].eq(True)
    | df['Gq/G11'].eq(True)
)
df_a_not_gio = df.loc[is_class_a & lacks_gio & has_other_gprot]

In [102]:
# Number of receptors in the "not Gi/Go" set.
df_a_not_gio.shape[0]

21

In [90]:
def _onesided_signature(receptor_frame):
    """Build a SequenceSignature from the `rec_obj` column of `receptor_frame`
    (aligned on `rec_segs`) and run its one-sided calculation."""
    signature = SequenceSignature()
    signature.setup_alignments_signprot(rec_segs, receptor_frame['rec_obj'].tolist())
    signature.calculate_signature_onesided()
    return signature


# One signature per receptor set selected above, built in the same order.
signature_agio = _onesided_signature(df_agio)
signature_ags = _onesided_signature(df_ags)
signature_agq = _onesided_signature(df_agq)
signature_a_not_gio = _onesided_signature(df_a_not_gio)

In [97]:
def _class_a_consensus(sig_type, gprot, signature):
    """Return ``(consensus, aggregated)`` for one Class A signature."""
    consensus = calc_consensus_from_signature({
        'type': sig_type,
        'rec_class': 'Class A (Rhodopsin)',
        'gprot': gprot,
        'signature': signature,
    })
    return consensus, aggregate_consensus_data(consensus)


# NOTE(review): this cell rebinds df_agio / df_ags / df_agq / df_a_not_gio from
# receptor tables to consensus tables. The signature cell reads `rec_obj` from
# the receptor tables, so the filter cells must be re-run before it can be
# executed again.
cons, df1 = _class_a_consensus('with', 'Gi/Go', signature_agio)
df_agio = pd.DataFrame(df1)

cons, df2 = _class_a_consensus('with', 'Gs', signature_ags)
df_ags = pd.DataFrame(df2)

cons, df3 = _class_a_consensus('with', 'Gq/G11', signature_agq)
df_agq = pd.DataFrame(df3)

cons, df4 = _class_a_consensus('without', 'Gi/Go', signature_a_not_gio)
df_a_not_gio = pd.DataFrame(df4)

# Discard every row whose consensus code is '-'.
df_a_not_gio = df_a_not_gio.loc[df_a_not_gio['code'].ne('-')]
df_agio = df_agio.loc[df_agio['code'].ne('-')]
df_ags = df_ags.loc[df_ags['code'].ne('-')]
df_agq = df_agq.loc[df_agq['code'].ne('-')]

# Columns handed to compare_sets as `drop_list`.
# 'code' and 'length' are deliberately left out of this list.
drop_list = ['key', 'score', 'cons', 'gprot', 'rec_class', 'origin', 'feature']

### Comparing Signatures Gi/Go vs. Gs

In [62]:
# Gi/Go vs. Gs, combined with set.difference.
snake_meta = {'rec_class': 'Class A (Rhodopsin)', 'gprot': 'GiGo_VS_Gs_difference'}
comp = compare_sets(df_agio, df_ags, method=set.difference, drop_list=drop_list)
summarize_df(comp)
export_features_for_snake(snake_meta, comp)
# show_group_top_n(comp, 'feature', 10)
show_group_top_n(comp, 'code', 10)

Dataframe description:


Unnamed: 0,code,gn,length
count,342,342,342
unique,11,342,8
top,HY,7x41,any
freq,158,1,315




Dataframe size:
(342, 3)




Unnamed: 0,code,gn,length
43,Hb,01-C-term-0034,any
44,Hb,01-C-term-0033,any
45,Hb,01-C-term-0032,any
46,HY,01-C-term-0031,any
47,Hb,01-C-term-0030,any


Dataframe description:


Unnamed: 0,code,gn,length
count,313,313,313
unique,17,313,16
top,HY,7x41,any
freq,110,1,225




Dataframe size:
(313, 3)




Unnamed: 0,code,gn,length
26,HA,01-C-term-0027,3
27,Hb,01-C-term-0026,any
29,Hb,01-C-term-0024,any
30,HY,01-C-term-0023,any
32,Hb,01-C-term-0021,any


Dataframe description:


Unnamed: 0,code,gn,length
count,210,210,210
unique,9,210,7
top,Hb,zz-N-term-9997,any
freq,86,1,193




Dataframe size:
(210, 3)




Unnamed: 0,code,gn,length
0,HY,01-C-term-0003,any
1,Hb,00-ECL2-0004,any
2,Hb,00-ECL2-0001,any
3,Hb,01-C-term-0033,any
4,HY,6x37,any


In [63]:
# Gi/Go vs. Gs, combined with set.intersection.
snake_meta = {'rec_class': 'Class A (Rhodopsin)', 'gprot': 'GiGo_VS_Gs_intersection'}
comp = compare_sets(df_agio, df_ags, method=set.intersection, drop_list=drop_list)
summarize_df(comp)
export_features_for_snake(snake_meta, comp)
# show_group_top_n(comp, 'feature', 10)
show_group_top_n(comp, 'code', 10)

Dataframe description:


Unnamed: 0,code,gn,length
count,342,342,342
unique,11,342,8
top,HY,7x41,any
freq,158,1,315




Dataframe size:
(342, 3)




Unnamed: 0,code,gn,length
43,Hb,01-C-term-0034,any
44,Hb,01-C-term-0033,any
45,Hb,01-C-term-0032,any
46,HY,01-C-term-0031,any
47,Hb,01-C-term-0030,any


Dataframe description:


Unnamed: 0,code,gn,length
count,313,313,313
unique,17,313,16
top,HY,7x41,any
freq,110,1,225




Dataframe size:
(313, 3)




Unnamed: 0,code,gn,length
26,HA,01-C-term-0027,3
27,Hb,01-C-term-0026,any
29,Hb,01-C-term-0024,any
30,HY,01-C-term-0023,any
32,Hb,01-C-term-0021,any


Dataframe description:


Unnamed: 0,code,gn,length
count,132,132,132
unique,10,132,7
top,HY,1x54,any
freq,76,1,122




Dataframe size:
(132, 3)




Unnamed: 0,code,gn,length
0,Hb,8x49,any
1,HY,5x56,any
2,HY,1x30,any
3,Hb,00-ECL2-0005,any
4,W,4x50,6


### Comparing Signatures Gi/Go vs. Gq/G11

In [64]:
# Gi/Go vs. Gq/G11, combined with set.difference.
snake_meta = {'rec_class': 'Class A (Rhodopsin)', 'gprot': 'GiGo_VS_GqG11_difference'}
comp = compare_sets(df_agio, df_agq, method=set.difference, drop_list=drop_list)
summarize_df(comp)
export_features_for_snake(snake_meta, comp)
# show_group_top_n(comp, 'feature', 10)
show_group_top_n(comp, 'code', 10)

Dataframe description:


Unnamed: 0,code,gn,length
count,342,342,342
unique,11,342,8
top,HY,7x41,any
freq,158,1,315




Dataframe size:
(342, 3)




Unnamed: 0,code,gn,length
43,Hb,01-C-term-0034,any
44,Hb,01-C-term-0033,any
45,Hb,01-C-term-0032,any
46,HY,01-C-term-0031,any
47,Hb,01-C-term-0030,any


Dataframe description:


Unnamed: 0,code,gn,length
count,327,327,327
unique,19,327,14
top,HY,7x41,any
freq,139,1,253




Dataframe size:
(327, 3)




Unnamed: 0,code,gn,length
114,+-,01-C-term-0034,any
115,Hb,01-C-term-0033,any
116,+-,01-C-term-0032,any
117,Hb,01-C-term-0031,any
118,Hb,01-C-term-0030,any


Dataframe description:


Unnamed: 0,code,gn,length
count,187,187,187
unique,8,187,7
top,Hb,1x49,any
freq,78,1,169




Dataframe size:
(187, 3)




Unnamed: 0,code,gn,length
0,HY,01-C-term-0003,any
1,HY,1x30,any
2,Hb,7x28,any
3,Hb,01-C-term-0034,any
4,W,4x50,6


In [66]:
# Gi/Go vs. Gq/G11, combined with set.intersection.
snake_meta = {'rec_class': 'Class A (Rhodopsin)', 'gprot': 'GiGo_VS_GqG11_intersection'}
comp = compare_sets(df_agio, df_agq, method=set.intersection, drop_list=drop_list)
summarize_df(comp)
export_features_for_snake(snake_meta, comp)
# show_group_top_n(comp, 'feature', 10)
show_group_top_n(comp, 'code', 10)

Dataframe description:


Unnamed: 0,code,gn,length
count,342,342,342
unique,11,342,8
top,HY,7x41,any
freq,158,1,315




Dataframe size:
(342, 3)




Unnamed: 0,code,gn,length
43,Hb,01-C-term-0034,any
44,Hb,01-C-term-0033,any
45,Hb,01-C-term-0032,any
46,HY,01-C-term-0031,any
47,Hb,01-C-term-0030,any


Dataframe description:


Unnamed: 0,code,gn,length
count,327,327,327
unique,19,327,14
top,HY,7x41,any
freq,139,1,253




Dataframe size:
(327, 3)




Unnamed: 0,code,gn,length
114,+-,01-C-term-0034,any
115,Hb,01-C-term-0033,any
116,+-,01-C-term-0032,any
117,Hb,01-C-term-0031,any
118,Hb,01-C-term-0030,any


Dataframe description:


Unnamed: 0,code,gn,length
count,155,155,155
unique,9,155,7
top,HY,zz-N-term-9978,any
freq,92,1,146




Dataframe size:
(155, 3)




Unnamed: 0,code,gn,length
0,Hb,00-ECL2-0004,any
1,Hb,8x49,any
2,HY,5x56,any
3,Hb,00-ECL2-0001,any
4,Hb,01-C-term-0033,any


### Comparing Signatures Gi/Go vs. not Gi/Go

In [98]:
# Gi/Go vs. not Gi/Go, combined with set.difference.
snake_meta = {'rec_class': 'Class A (Rhodopsin)', 'gprot': 'GiGo_VS_not_GiGo_difference'}
comp = compare_sets(df_agio, df_a_not_gio, method=set.difference, drop_list=drop_list)
summarize_df(comp)
export_features_for_snake(snake_meta, comp)
# show_group_top_n(comp, 'feature', 10)
show_group_top_n(comp, 'code', 10)

Dataframe description:


Unnamed: 0,code,gn,length
count,342,342,342
unique,11,342,8
top,HY,7x41,any
freq,158,1,315




Dataframe size:
(342, 3)




Unnamed: 0,code,gn,length
43,Hb,01-C-term-0034,any
44,Hb,01-C-term-0033,any
45,Hb,01-C-term-0032,any
46,HY,01-C-term-0031,any
47,Hb,01-C-term-0030,any


Dataframe description:


Unnamed: 0,code,gn,length
count,331,331,331
unique,12,331,9
top,HY,7x41,any
freq,183,1,310




Dataframe size:
(331, 3)




Unnamed: 0,code,gn,length
115,Hb,01-C-term-0033,any
117,Hb,01-C-term-0031,any
118,Hb,01-C-term-0030,any
119,Hb,01-C-term-0029,any
120,HY,01-C-term-0028,any


Dataframe description:


Unnamed: 0,code,gn,length
count,138,138,138
unique,9,138,6
top,Hb,1x54,any
freq,61,1,119




Dataframe size:
(138, 3)




Unnamed: 0,code,gn,length
0,HY,01-C-term-0003,any
1,Hb,00-ECL2-0004,any
2,Hb,7x28,any
3,Hb,01-C-term-0034,any
4,W,4x50,6


code_count


is deprecated and will be removed in a future version
  {count_col: len}).sort_values(


Unnamed: 0,code,code_count
0,Hb,61
1,HY,33
2,Sm,18
3,HA,13
4,αH,4
5,Hu,3
6,P,3
7,W,2
8,R,1


In [99]:
# Gi/Go vs. not Gi/Go, combined with set.intersection.
snake_meta = {'rec_class': 'Class A (Rhodopsin)', 'gprot': 'GiGo_VS_not_GiGo_intersection'}
comp = compare_sets(df_agio, df_a_not_gio, method=set.intersection, drop_list=drop_list)
summarize_df(comp)
export_features_for_snake(snake_meta, comp)
# show_group_top_n(comp, 'feature', 10)
show_group_top_n(comp, 'code', 10)

Dataframe description:


Unnamed: 0,code,gn,length
count,342,342,342
unique,11,342,8
top,HY,7x41,any
freq,158,1,315




Dataframe size:
(342, 3)




Unnamed: 0,code,gn,length
43,Hb,01-C-term-0034,any
44,Hb,01-C-term-0033,any
45,Hb,01-C-term-0032,any
46,HY,01-C-term-0031,any
47,Hb,01-C-term-0030,any


Dataframe description:


Unnamed: 0,code,gn,length
count,331,331,331
unique,12,331,9
top,HY,7x41,any
freq,183,1,310




Dataframe size:
(331, 3)




Unnamed: 0,code,gn,length
115,Hb,01-C-term-0033,any
117,Hb,01-C-term-0031,any
118,Hb,01-C-term-0030,any
119,Hb,01-C-term-0029,any
120,HY,01-C-term-0028,any


Dataframe description:


Unnamed: 0,code,gn,length
count,204,204,204
unique,9,204,7
top,HY,5x63,any
freq,125,1,196




Dataframe size:
(204, 3)




Unnamed: 0,code,gn,length
0,Hb,8x49,any
1,HY,1x30,any
2,HY,5x56,any
3,Hb,00-ECL2-0001,any
4,Hb,01-C-term-0033,any


code_count


is deprecated and will be removed in a future version
  {count_col: len}).sort_values(


Unnamed: 0,code,code_count
0,HY,125
1,Hb,65
2,Sm,5
3,HA,3
4,αH,2
5,C,1
6,Hu,1
7,N,1
8,P,1


## Comparing Receptor Sets

In [12]:
import itertools
# Load the index of pickled signatures stored next to the pickles in `path`.
result_file = pd.read_csv(Path(path + 'index.csv'))
result_file

Unnamed: 0.1,Unnamed: 0,rec_class,gprot,with,wo,file_with,file_wo
0,0,Class A (Rhodopsin),Gi/Go,203,83,Class_A_(Rhodopsin)_Gi_Go_with.p,Class_A_(Rhodopsin)_Gi_Go_wo.p
1,1,Class A (Rhodopsin),Gq/G11,174,112,Class_A_(Rhodopsin)_Gq_G11_with.p,Class_A_(Rhodopsin)_Gq_G11_wo.p
2,2,Class A (Rhodopsin),Gs,166,120,Class_A_(Rhodopsin)_Gs_with.p,Class_A_(Rhodopsin)_Gs_wo.p
3,3,Class A (Rhodopsin),G12/G13,147,139,Class_A_(Rhodopsin)_G12_G13_with.p,Class_A_(Rhodopsin)_G12_G13_wo.p
4,4,Class B1 (Secretin),Gi/Go,5,10,Class_B1_(Secretin)_Gi_Go_with.p,Class_B1_(Secretin)_Gi_Go_wo.p
5,5,Class B1 (Secretin),Gq/G11,8,7,Class_B1_(Secretin)_Gq_G11_with.p,Class_B1_(Secretin)_Gq_G11_wo.p
6,6,Class B1 (Secretin),Gs,12,3,Class_B1_(Secretin)_Gs_with.p,Class_B1_(Secretin)_Gs_wo.p
7,7,Class B1 (Secretin),G12/G13,4,11,Class_B1_(Secretin)_G12_G13_with.p,Class_B1_(Secretin)_G12_G13_wo.p
8,8,Class B2 (Adhesion),Gi/Go,1,32,Class_B2_(Adhesion)_Gi_Go_with.p,Class_B2_(Adhesion)_Gi_Go_wo.p
9,9,Class B2 (Adhesion),Gq/G11,4,29,Class_B2_(Adhesion)_Gq_G11_with.p,Class_B2_(Adhesion)_Gq_G11_wo.p


### Class A Gi/Go

#### Difference

Which entries are unique to each of these sets?
In other words: "Which entries are a unique type of interaction for this receptor class + signal protein combination, compared with the same receptor class and all other possible signal proteins?"

In [13]:
# Row of `result_file` to analyse (row 0 is Class A / Gi/Go in the index).
data_row = 0

# Signature loaded with selector 0 -- presumably the "with" pickle; TODO confirm.
obj = load_pickle_signature(path, result_file, data_row, 0)
cons = calc_consensus_from_signature(obj)
df1 = pd.DataFrame(aggregate_consensus_data(cons))

# Signature loaded with selector 1 -- presumably the "without" pickle; TODO confirm.
obj = load_pickle_signature(path, result_file, data_row, 1)
cons = calc_consensus_from_signature(obj)
df2 = pd.DataFrame(aggregate_consensus_data(cons))

# Show which receptor class / G protein the loaded object belongs to.
for shown_key in ('rec_class', 'gprot'):
    display(obj[shown_key])

drop_list = ['key', 'score', 'cons', 'gprot', 'rec_class']

# Discard rows whose consensus code is '-' before comparing.
df1 = df1.loc[df1['code'].ne('-')]
df2 = df2.loc[df2['code'].ne('-')]
comp = compare_sets(df1, df2, method=set.difference, drop_list=drop_list)

'Class A (Rhodopsin)'

'Gi/Go'

Dataframe description:


Unnamed: 0,code,feature,gn,length,origin
count,337,337,337,337,0.0
unique,10,10,337,6,0.0
top,HY,Hydrophobic,4.42x42,any,
freq,182,182,1,330,




Dataframe size:
(337, 5)




Unnamed: 0,code,feature,gn,length,origin
109,Hb,Hydrogen bonding (polar),C.01-C-term-0033,any,
112,Hb,Hydrogen bonding (polar),C.01-C-term-0030,any,
114,Hb,Hydrogen bonding (polar),C.01-C-term-0028,any,
115,Hb,Hydrogen bonding (polar),C.01-C-term-0027,any,
116,Hb,Hydrogen bonding (polar),C.01-C-term-0026,any,


Dataframe description:


Unnamed: 0,code,feature,gn,length,origin
count,332,332,332,332,0.0
unique,7,8,332,6,0.0
top,HY,Hydrophobic,4.42x42,any,
freq,190,190,1,322,




Dataframe size:
(332, 5)




Unnamed: 0,code,feature,gn,length,origin
324,Hb,Hydrogen bonding (polar),C.01-C-term-0029,any,
325,Hb,Hydrogen bonding (polar),C.01-C-term-0028,any,
326,Hb,Hydrogen bonding (polar),C.01-C-term-0027,any,
327,Hb,Hydrogen bonding (polar),C.01-C-term-0026,any,
328,Hb,Hydrogen bonding (polar),C.01-C-term-0025,any,


In [14]:
# Print the description, size and first rows of the difference set `comp`
# (see the rendered output of this cell).
summarize_df(comp)

Dataframe description:


Unnamed: 0,code,feature,gn,length,origin
count,60,60,60,60,0.0
unique,6,7,60,5,0.0
top,HY,Hydrophobic,4.42x42,any,
freq,34,34,1,52,




Dataframe size:
(60, 5)




Unnamed: 0,code,feature,gn,length,origin
0,HY,Hydrophobic,4.43x43,any,
1,Hb,Hydrogen bonding (polar),3.50x50,any,
2,Hb,Hydrogen bonding (polar),45.00-ECL2-0010,any,
3,Sm,Small,3.35x35,any,
4,HY,Hydrophobic,7.32x31,any,


In [16]:
# Write the `code`/`gn` columns of `comp` to JSON; the file name is built from
# obj['rec_class'] and obj['gprot'].
export_features_for_snake(obj, comp)
# Tabulate how often each `feature` value occurs in `comp` (limit argument 10).
show_group_top_n(comp, 'feature', 10)

feature_count


is deprecated and will be removed in a future version
  {count_col: len}).sort_values(


Unnamed: 0,feature,feature_count
0,Hydrophobic,34
1,Hydrogen bonding (polar),15
2,α-Helix propensity - high,5
3,Small,3
4,Hydrogen bonding uncharged,1
5,Hydrophobic aromatic,1
6,α-Helix propensity - low,1


### Class B1 Gs

In [29]:
# Row of `result_file` to analyse (row 6 is Class B1 / Gs in the index).
data_row = 6

# Signature loaded with selector 0 -- presumably the "with" pickle; TODO confirm.
obj = load_pickle_signature(path, result_file, data_row, 0)
cons = calc_consensus_from_signature(obj)
df1 = pd.DataFrame(aggregate_consensus_data(cons))

# Signature loaded with selector 1 -- presumably the "without" pickle; TODO confirm.
obj = load_pickle_signature(path, result_file, data_row, 1)
cons = calc_consensus_from_signature(obj)
df2 = pd.DataFrame(aggregate_consensus_data(cons))

# Show which receptor class / G protein the loaded object belongs to.
for shown_key in ('rec_class', 'gprot'):
    display(obj[shown_key])

drop_list = ['key', 'score', 'cons', 'gprot', 'rec_class']

# Discard rows whose consensus code is '-' before comparing.
df1 = df1.loc[df1['code'].ne('-')]
df2 = df2.loc[df2['code'].ne('-')]
comp = compare_sets(df1, df2, method=set.difference, drop_list=drop_list)

'Class B1 (Secretin)'

'Gs'

Dataframe description:


Unnamed: 0,code,feature,gn,length,origin
count,437,437,437,437,0.0
unique,26,28,437,16,0.0
top,HY,Hydrophobic,N.zz-N-term-9930,any,
freq,144,144,1,318,




Dataframe size:
(437, 5)




Unnamed: 0,code,feature,gn,length,origin
73,Sm,Small,C.01-C-term-0032,any,
74,Sm,Small,C.01-C-term-0031,any,
75,Hb,Hydrogen bonding (polar),C.01-C-term-0030,any,
76,Hb,Hydrogen bonding (polar),C.01-C-term-0029,any,
77,Hb,Hydrogen bonding (polar),C.01-C-term-0028,any,


Dataframe description:


Unnamed: 0,code,feature,gn,length,origin
count,462,462,462,462,0.0
unique,30,32,462,17,0.0
top,HY,Hydrophobic,2.56x56,any,
freq,93,93,1,214,




Dataframe size:
(462, 5)




Unnamed: 0,code,feature,gn,length,origin
23,E,Charged negative [E],C.01-C-term-0085,4.0,
25,L,Hydrophobic aliphatic [L],C.01-C-term-0083,,
26,I,Hydrophobic aliphatic [I],C.01-C-term-0082,,
38,E,Charged negative [E],C.01-C-term-0070,4.0,
42,S,Hydrogen bonding [S],C.01-C-term-0066,,


In [30]:
# Print the description, size and first rows of the difference set `comp`
# (see the rendered output of this cell).
summarize_df(comp)

Dataframe description:


Unnamed: 0,code,feature,gn,length,origin
count,290,290,290,290,0.0
unique,27,29,290,17,0.0
top,HA,Hydrophobic aliphatic,N.zz-N-term-9930,any,
freq,53,53,1,100,




Dataframe size:
(290, 5)




Unnamed: 0,code,feature,gn,length,origin
0,HA,Hydrophobic aliphatic,6.48x48,2-3,
1,HA,Hydrophobic aliphatic,C.01-C-term-0036,any,
2,HA,Hydrophobic aliphatic,8.52x52,any,
3,HR,Hydrophobic aromatic,4.41x42,4-5,
4,Sm,Small,45.zz-ECL2-9998,0-1,


In [31]:
# Write the `code`/`gn` columns of `comp` to JSON; the file name is built from
# obj['rec_class'] and obj['gprot'].
export_features_for_snake(obj, comp)
# Tabulate how often each `feature` value occurs in `comp` (limit argument 10).
show_group_top_n(comp, 'feature', 10)

feature_count


is deprecated and will be removed in a future version
  {count_col: len}).sort_values(


Unnamed: 0,feature,feature_count
0,Hydrophobic aliphatic,53
1,Small,34
2,Hydrophobic,34
3,Hydrogen bonding (polar),31
4,Hydrophobic aliphatic [L],15
5,α-Helix propensity - high,14
6,Hydrogen bond acceptor,14
7,Hydrophobic aromatic,12
8,Hydrogen bonding uncharged,11
9,Charged positive,9


In [32]:
# Show the difference set ordered by the `gn` column.
comp.sort_values(by='gn')

Unnamed: 0,code,feature,gn,length,origin
159,+-,Charged,1.31x31,any,
175,+,Charged positive,1.32x32,5-6,
156,L,Hydrophobic aliphatic [L],1.37x37,,
279,L,Hydrophobic aliphatic [L],1.39x39,,
122,HA,Hydrophobic aliphatic,1.42x42,3-4,
107,T,Hydrogen bonding [T],1.44x44,,
196,HY,Hydrophobic,1.49x49,any,
125,HA,Hydrophobic aliphatic,1.52x52,any,
128,S,Hydrogen bonding [S],1.53x53,,
63,HA,Hydrophobic aliphatic,1.56x56,2-3,
