In [1]:
import pandas as pd
import csv
import ipywidgets as widgets
from ipywidgets import interact

In [2]:
# Switch the working directory to the SemMedDB output folder so that the CSV
# in the next cell can be opened by its bare file name.
# NOTE(review): the path is relative to the current working directory, so
# running this cell a second time in the same kernel resolves it from the
# already-changed location — confirm the notebook is only run top to bottom
# on a fresh kernel.
%cd ../../data/semmeddb/output
# Debug aid (disabled): list the files in the directory.
# !ls

/usr/local/bin/notebooks/data/semmeddb/output


## The Raw Data

In [3]:
# Read the triples export. The first CSV column becomes the index and the
# first row supplies the column names; every column is then cast to text.
# NOTE(review): casting to 'str' presumably turns missing cells into the
# literal text 'nan' — confirm that is acceptable for the PMID counts below.
triples_final = pd.read_csv(
    'SEMMEDDB_TRIPLES_FINAL.csv',
    encoding='utf-8',
    sep=',',
    index_col=0,
    header=0,
).astype('str')

print('Total number of triples in Raw data: {}'.format(triples_final.shape[0]))

# Preview the first rows as the cell output.
triples_final.head()

Total number of triples in Raw data: 267786


Unnamed: 0,SUBJECT,RELATION,OBJECT,ENTREZ_ID,UNIPROT_Protein names,UNIPROT_Gene name_primary,UNIPROT_Gene name_synonym,Organism,IS_MASTER_GENE,PMID
0,DUX4,AFFECTS,Allelic exclusion,100288687,DUX4 (Double homeodomain protein DUX4-fl),DUX4,DUX4L,Homo sapiens (Human),subject,12055243
1,DUX4,AFFECTS,Antibody Diversity,100288687,DUX4 (Double homeodomain protein DUX4-fl),DUX4,DUX4L,Homo sapiens (Human),subject,6767191
2,DUX4,AFFECTS,Antibody Formation,100288687,DUX4 (Double homeodomain protein DUX4-fl),DUX4,DUX4L,Homo sapiens (Human),subject,3091695
3,DUX4,AFFECTS,B-Cell Development,100288687,DUX4 (Double homeodomain protein DUX4-fl),DUX4,DUX4L,Homo sapiens (Human),subject,790935721349430
4,DUX4,AFFECTS,Cell Line,100288687,DUX4 (Double homeodomain protein DUX4-fl),DUX4,DUX4L,Homo sapiens (Human),subject,10087940


## The SUBJECT View

In [8]:
# Rows whose IS_MASTER_GENE value is 'subject', reduced to the four columns
# this view works with. The copy keeps triples_final untouched.
df1 = triples_final.loc[
    triples_final['IS_MASTER_GENE'] == 'subject',
    ['SUBJECT', 'RELATION', 'OBJECT', 'PMID'],
].copy()

# PMID arrives as one comma-separated string: split it into a list and
# store the number of entries next to it.
df1['PMID'] = df1['PMID'].map(lambda pmids: pmids.split(','))
df1['PMID_Count'] = df1['PMID'].map(len)

# Options for the SUBJECT dropdown: a placeholder followed by every subject
# in sorted order.
items_sub = ['Select from List'] + sorted(df1['SUBJECT'].unique().tolist())

# Widgets get names that are unique to this cell. The generic x_widget /
# y_widget names are assigned again by the other widget cells of this
# notebook, and update() resolves global names at the moment it fires, so
# going through the shared names would make this view read and write another
# cell's widgets once those cells have run.
subject_dropdown = widgets.Dropdown(options=items_sub, description='SUBJECT')
subject_relation_dropdown = widgets.Dropdown(description='RELATION')

# Generic aliases kept for the interact() call at the end of this cell.
x_widget = subject_dropdown
y_widget = subject_relation_dropdown


# update function
def update(*args):
    """Refill the RELATION dropdown with the relations of the chosen subject.

    Registered as an observer on the SUBJECT dropdown.

    Args:
        *args: Change notification passed by the observer machinery. It is
            not inspected; the current selection is read from the dropdown.

    Returns:
        None. The RELATION dropdown's ``options`` are replaced in place with
        a 'Select from List' placeholder followed by the relations found in
        ``df1`` for the selected subject (in order of first appearance).
    """
    relations = df1.loc[df1['SUBJECT'] == subject_dropdown.value, 'RELATION']
    subject_relation_dropdown.options = ['Select from List'] + relations.unique().tolist()


# Listen only for changes of the selected value. Without `names`, the handler
# is called for every trait of the dropdown that changes.
subject_dropdown.observe(update, names='value')

# show data function
def show_data(x, y):
    """Return the rows of ``df1`` selected by a subject and a relation.

    Args:
        x: SUBJECT value to match, or 'All' to leave subjects unfiltered.
        y: RELATION value to match, or 'All' to leave relations unfiltered.

    Returns:
        * ``df1`` itself when both arguments are 'All';
        * the rows matching the one concrete argument when the other is
          'All';
        * otherwise the rows matching both, ordered by descending
          PMID_Count, with the columns SUBJECT, RELATION, OBJECT,
          PMID_Count, PMID.
        A value that occurs in neither column (for example a placeholder
        entry) simply produces an empty frame.
    """
    any_subject = x == 'All'
    # Must use the same 'All' spelling as the other branches; a different
    # casing here makes the wildcard cases fall through to the exact-match
    # branch and come back empty.
    any_relation = y == 'All'

    if any_subject and any_relation:
        return df1
    if any_relation:
        return df1[df1['SUBJECT'] == x]
    if any_subject:
        return df1[df1['RELATION'] == y]

    matches = df1[(df1['SUBJECT'] == x) & (df1['RELATION'] == y)]
    ranked = matches.sort_values(by='PMID_Count', ascending=False)
    return ranked[['SUBJECT', 'RELATION', 'OBJECT', 'PMID_Count', 'PMID']]

# Render both dropdowns and call show_data with their current selections.
widgets.interact(show_data, x=x_widget, y=y_widget);

interactive(children=(Dropdown(description='SUBJECT', options=('Select from List', 'ABL1', 'ACSL1', 'AFF4', 'A…

### Most Common RELATION by OBJECT type

* Frequency is computed for each RELATION-OBJECT combination and is measured in terms of
  * the count of SUBJECTs
  * the count of PubMed articles

In [7]:
# Rows whose IS_MASTER_GENE value is 'subject', with PMID replaced by the
# number of comma-separated ids it contains.
df2 = triples_final.loc[
    triples_final['IS_MASTER_GENE'] == 'subject',
    ['SUBJECT', 'RELATION', 'OBJECT', 'PMID'],
].copy()
df2['PMID'] = df2['PMID'].map(lambda pmids: len(pmids.split(',')))

# For each RELATION-OBJECT pair: the number of SUBJECT entries and the sum
# of the per-row id counts.
grouped = df2.groupby(['RELATION', 'OBJECT']).agg({'SUBJECT': ['count'], 'PMID': 'sum'})

# Highest subject count first.
df2 = grouped.reset_index().sort_values(by=('SUBJECT', 'count'), ascending=False)

# Replace the two-level header produced by agg() with flat column names.
df2.columns = ['RELATION', 'OBJECT', 'Subject_Count', 'PMID_Count']


###########
# widgets #
###########

# Options for the RELATION dropdown: a placeholder, then each relation in
# sorted order.
items_rels = ['Select from List'] + sorted(df2['RELATION'].unique())

# RELATION dropdown.
x_widget = widgets.Dropdown(description='RELATION', options=items_rels)

# Slider for the minimum subject count (0-100, starting at 10). The style
# entry sets the description width to 'initial'.
style = {'description_width': 'initial'}
slider = widgets.IntSlider(
    value=10,
    min=0,
    max=100,
    step=1,
    description='Min. Subject Count',
    style=style,
)

# update function
def update(*args):
    """Observer callback for the RELATION dropdown; intentionally a no-op.

    This view has no dependent widget to refresh, and the table itself is
    filtered inside show_data, so there is nothing to recompute when the
    selection changes. The function is kept because it is registered as an
    observer on the dropdown.

    Args:
        *args: Change notification passed by the observer machinery; ignored.

    Returns:
        None.
    """
    return None
    
# Listen only for changes of the selected value. Without `names`, the handler
# is called for every trait of the dropdown that changes.
x_widget.observe(update, names='value')

# show data function
def show_data(x, y):
    """Return the RELATION-OBJECT rows of ``df2`` that pass both filters.

    Args:
        x: RELATION value to keep, or 'All' to keep every relation.
        y: Minimum Subject_Count a row needs in order to be kept.

    Returns:
        The rows of ``df2`` whose Subject_Count (read as int) is at least
        ``y``, further restricted to RELATION == ``x`` unless ``x`` is 'All'.
    """
    meets_minimum = df2['Subject_Count'].astype(int) >= y
    frequent = df2[meets_minimum]
    if x == 'All':
        return frequent
    return frequent[frequent['RELATION'] == x]

# Render the dropdown and the slider and call show_data with their current values.
widgets.interact(show_data, x=x_widget, y=slider);



interactive(children=(Dropdown(description='RELATION', options=('Select from List', 'ADMINISTERED_TO', 'AFFECT…

### SUBJECTS for a given RELATION-OBJECT combination

In [6]:
# Rows whose IS_MASTER_GENE value is 'subject', reduced to the four columns
# this view works with. The copy keeps triples_final untouched.
df3 = triples_final.loc[
    triples_final['IS_MASTER_GENE'] == 'subject',
    ['RELATION', 'OBJECT', 'SUBJECT', 'PMID'],
].copy()

# PMID arrives as one comma-separated string: split it into a list and
# store the number of entries next to it.
df3['PMID'] = df3['PMID'].map(lambda pmids: pmids.split(','))
df3['PMID_Count'] = df3['PMID'].map(len)

# Options for the RELATION dropdown: a placeholder followed by every relation
# in sorted order.
items_rels = ['Select from List'] + sorted(df3['RELATION'].unique().tolist())

# Complete OBJECT list with an 'All' entry. Nothing in this cell reads it
# (the OBJECT dropdown is filled by update() instead); the name is kept in
# case other code relies on it.
items_obj = ['All'] + sorted(df3['OBJECT'].unique().tolist())

# Widgets get names that are unique to this cell. The generic x_widget /
# y_widget names are also assigned by the other widget cells of this
# notebook, and update() resolves global names at the moment it fires, so
# going through the shared names would tie this view to whichever cell ran
# last.
relation_dropdown = widgets.Dropdown(options=items_rels, description='RELATION')
relation_object_dropdown = widgets.Dropdown(description='OBJECT')

# Generic aliases kept for the interact() call at the end of this cell.
x_widget = relation_dropdown
y_widget = relation_object_dropdown

# Slider for the minimum article count (0-200, starting at 10). The style
# entry sets the description width to 'initial'.
style = {'description_width': 'initial'}
slider = widgets.IntSlider(min=0, max=200, step=1, value=10,
                           description='Min. Article Count',
                           style=style)


# update function
def update(*args):
    """Refill the OBJECT dropdown with the objects of the chosen relation.

    Registered as an observer on the RELATION dropdown.

    Args:
        *args: Change notification passed by the observer machinery. It is
            not inspected; the current selection is read from the dropdown.

    Returns:
        None. The OBJECT dropdown's ``options`` are replaced in place with a
        'Select from List' placeholder followed by the objects found in
        ``df3`` for the selected relation (in order of first appearance).
    """
    objects = df3.loc[df3['RELATION'] == relation_dropdown.value, 'OBJECT']
    relation_object_dropdown.options = ['Select from List'] + objects.unique().tolist()


# Listen only for changes of the selected value. Without `names`, the handler
# is called for every trait of the dropdown that changes.
relation_dropdown.observe(update, names='value')

# show data function
def show_data(x, y, z):
    """Return the subjects of ``df3`` for a RELATION-OBJECT pair.

    Args:
        x: RELATION value to match, or 'All' to skip the relation/object
            filter entirely.
        y: OBJECT value to match (ignored when ``x`` is 'All').
        z: Minimum PMID_Count a row needs in order to be kept.

    Returns:
        When ``x`` is 'All': every row of ``df3`` whose PMID_Count (read as
        int) is at least ``z``, with all columns. Otherwise: the SUBJECT and
        PMID_Count columns of the rows that also match ``x`` and ``y``,
        ordered by descending PMID_Count.
    """
    enough_articles = df3[df3['PMID_Count'].astype(int) >= z]
    if x == 'All':
        return enough_articles

    same_relation = enough_articles['RELATION'] == x
    same_object = enough_articles['OBJECT'] == y
    ranked = enough_articles[same_relation & same_object].sort_values(
        by='PMID_Count', ascending=False
    )
    return ranked[['SUBJECT', 'PMID_Count']]
    
# Render both dropdowns and the slider and call show_data with their current values.
widgets.interact(show_data, x=x_widget, y=y_widget, z=slider);

interactive(children=(Dropdown(description='RELATION', options=('Select from List', 'ADMINISTERED_TO', 'AFFECT…