In [1]:
# https://blog.ouseful.info/2016/12/29/simple-view-controls-for-pandas-dataframes-using-ipython-widgets/
# https://towardsdatascience.com/the-next-level-of-data-visualization-in-python-dd6e99039d5e

import pandas as pd
import numpy as np
import csv

# %matplotlib widget
# import matplotlib.pyplot as plt

# import widgets
from ipywidgets import interact, interact_manual, interactive
from ipywidgets import HBox, Label
import ipywidgets as widgets

In [2]:
# Change the kernel's working directory so that the relative CSV file name
# passed to pd.read_csv in the loading cell resolves against this folder.
%cd ../../data/semmeddb/output
# !ls

/usr/local/bin/data/semmeddb/output


## The Raw Data

In [3]:
# Read the triples table: the first CSV row is the header and the first CSV
# column becomes the index. Every column is then cast to str.
triples_final = pd.read_csv(
    'SEMMEDDB_TRIPLES_FINAL.csv',
    sep=',',
    header=0,
    index_col=0,
    encoding='utf-8',
).astype('str')

# Row count, followed by a preview of the first rows
print(len(triples_final))
triples_final.head()

267786


Unnamed: 0,SUBJECT,RELATION,OBJECT,ENTREZ_ID,UNIPROT_Protein names,UNIPROT_Gene name_primary,UNIPROT_Gene name_synonym,Organism,IS_MASTER_GENE,PMID
0,DUX4,AFFECTS,Allelic exclusion,100288687,DUX4 (Double homeodomain protein DUX4-fl),DUX4,DUX4L,Homo sapiens (Human),subject,12055243
1,DUX4,AFFECTS,Antibody Diversity,100288687,DUX4 (Double homeodomain protein DUX4-fl),DUX4,DUX4L,Homo sapiens (Human),subject,6767191
2,DUX4,AFFECTS,Antibody Formation,100288687,DUX4 (Double homeodomain protein DUX4-fl),DUX4,DUX4L,Homo sapiens (Human),subject,3091695
3,DUX4,AFFECTS,B-Cell Development,100288687,DUX4 (Double homeodomain protein DUX4-fl),DUX4,DUX4L,Homo sapiens (Human),subject,790935721349430
4,DUX4,AFFECTS,Cell Line,100288687,DUX4 (Double homeodomain protein DUX4-fl),DUX4,DUX4L,Homo sapiens (Human),subject,10087940


In [4]:
# test
# df = triples_final.copy()
# df = df[df['SUBJECT']=='LATS1']
# df['RELATION'].unique().tolist()

## The SUBJECT View

In [5]:
# https://stackoverflow.com/questions/45754356/jupyter-notebook-widgets-create-dependent-dropdowns

In [6]:
# Rows whose IS_MASTER_GENE is 'subject', reduced to the four columns shown
# in this view (taken as an independent copy of triples_final).
df1 = triples_final.loc[
    triples_final['IS_MASTER_GENE'] == 'subject',
    ['SUBJECT', 'RELATION', 'OBJECT', 'PMID'],
].copy()

# SUBJECT choices: a placeholder entry followed by the sorted unique subjects
items_sub = ['Select from List'] + sorted(df1['SUBJECT'].unique().tolist())

# The SUBJECT dropdown is filled up front; the RELATION dropdown starts
# without options and is populated by the observer defined below.
x_widget = widgets.Dropdown(description='SUBJECT', options=items_sub)
y_widget = widgets.Dropdown(description='RELATION')

# Define update function that updates the content of y based on what we select for x
def update(*args, _subject_dropdown=x_widget, _relation_dropdown=y_widget):
    """Refresh the RELATION dropdown for the currently selected SUBJECT.

    Registered as a change observer on the SUBJECT dropdown. The positional
    arguments passed by the observer machinery are accepted but not inspected.

    The two dropdowns are bound as keyword-only defaults at definition time.
    The global names ``x_widget`` / ``y_widget`` are reassigned by later cells
    of this notebook, so looking them up at call time would make this observer
    read from and write to the wrong widgets once those cells have run.
    """
    selected_rows = df1[df1['SUBJECT'] == _subject_dropdown.value]
    # 'All' stays first; the relations follow in alphabetical order, matching
    # how the SUBJECT list is built.
    _relation_dropdown.options = ['All'] + sorted(selected_rows['RELATION'].unique().tolist())

# React only to changes of the selected value instead of every widget trait.
x_widget.observe(update, names='value')

# function to be executed
def show_data(x, y):
    """Return the rows of ``df1`` matching the selected SUBJECT and RELATION.

    Parameters
    ----------
    x : str
        Value of the SUBJECT dropdown; 'All' disables the SUBJECT filter.
    y : str
        Value of the RELATION dropdown; 'All' disables the RELATION filter.

    Returns
    -------
    pandas.DataFrame
        ``df1`` itself when both values are 'All', otherwise the rows that
        satisfy every active filter (possibly none).
    """
    # Both sentinels are spelled 'All', the spelling used for the RELATION
    # dropdown option; comparing against 'ALL' would never match it.
    any_subject = x == 'All'
    any_relation = y == 'All'

    if any_subject and any_relation:
        return df1
    if any_relation:
        return df1[df1['SUBJECT'] == x]
    if any_subject:
        return df1[df1['RELATION'] == y]
    return df1[(df1['SUBJECT'] == x) & (df1['RELATION'] == y)]

interact(show_data, x = x_widget, y = y_widget);

interactive(children=(Dropdown(description='SUBJECT', options=('Select from List', 'ABL1', 'ACSL1', 'AFF4', 'A…

In [7]:
# df = triples_final.copy()
# df = df[df['IS_MASTER_GENE']=='subject']
# df = df[['SUBJECT', 'RELATION', 'OBJECT', 'PMID']].copy()

# rels = df.RELATION.value_counts()
# rels = rels[rels > 1000]
# rels

### Most Common RELATION by OBJECT type

* Frequency is computed per RELATION-OBJECT combination and is measured in terms of
  * the count of SUBJECTs
  * the count of PubMed articles

In [8]:
# Rows whose IS_MASTER_GENE is 'subject', reduced to the four columns needed
# for the frequency table (taken as an independent copy of triples_final).
df2 = triples_final.loc[
    triples_final['IS_MASTER_GENE'] == 'subject',
    ['SUBJECT', 'RELATION', 'OBJECT', 'PMID'],
].copy()

# Replace each comma-separated PMID string by the number of entries it holds
# (number of commas plus one).
df2['PMID'] = df2['PMID'].str.count(',') + 1

# Per (RELATION, OBJECT) pair: number of SUBJECT rows and summed PMID counts
grouped = df2.groupby(['RELATION', 'OBJECT']).agg({'SUBJECT': ['count'], 'PMID': 'sum'})

# Bring the group keys back as columns and order by the SUBJECT count,
# largest first.
df2 = grouped.reset_index().sort_values(by=('SUBJECT', 'count'), ascending=False)

# Replace the two-level header with flat, descriptive column names
df2.columns = ['RELATION', 'OBJECT', 'Subject_Count', 'PMID_Count']

# filter min 10 occurrences
# df = df[df['Subject_Count'].astype(int)>10]


###########
# widgets #
###########

# RELATION choices: 'All' first, then the unique relations in sorted order
items_rels = ['All'] + sorted(df2['RELATION'].unique())

# Dropdown used to pick the RELATION
x_widget = widgets.Dropdown(description='RELATION', options=items_rels)

# Slider (0-100, step 1, starting at 0) for the minimum Subject_Count.
# NOTE(review): 'initial' description width is presumably there so the long
# label is not truncated - confirm against the ipywidgets styling docs.
style = {'description_width': 'initial'}
slider = widgets.IntSlider(
    value=0,
    min=0,
    max=100,
    step=1,
    description='Min. Subject Count',
    style=style,
)

# Change hook for the RELATION dropdown
def update(*args):
    """Do nothing when the RELATION dropdown changes.

    This hook used to filter ``df2`` by the selected relation into a local
    variable that was discarded on return, so it only cost time. The table is
    refreshed by the ``interact`` call at the end of this cell, which re-runs
    ``show_data`` whenever the dropdown or the slider changes.

    The positional arguments passed by the observer machinery are accepted
    and ignored.
    """
    return None

# React only to changes of the selected value instead of every widget trait.
x_widget.observe(update, names='value')

# function to be executed
def show_data(x, y):
    """Return the RELATION-OBJECT frequency rows for the current selection.

    Parameters
    ----------
    x : str
        Value of the RELATION dropdown; 'All' keeps every relation.
    y : int
        Slider value; rows need a Subject_Count of at least this much.

    Returns
    -------
    pandas.DataFrame
        Rows of ``df2`` passing the count threshold, further restricted to
        relation ``x`` unless ``x`` is 'All'.

    Side effect: the threshold-filtered frame is stored in the module-level
    name ``df2_update`` on every call.
    """
    global df2_update
    meets_minimum = df2['Subject_Count'].astype(int) >= y
    df2_update = df2[meets_minimum]

    if x != 'All':
        return df2_update[df2_update['RELATION'] == x]
    return df2_update

interact(show_data, x = x_widget, y=slider);



interactive(children=(Dropdown(description='RELATION', options=('All', 'ADMINISTERED_TO', 'AFFECTS', 'ASSOCIAT…

### SUBJECTS for a given RELATION-OBJECT combination

In [44]:
# Rows whose IS_MASTER_GENE is 'subject', reduced to the columns used in this
# view (taken as an independent copy of triples_final).
df3 = triples_final.loc[
    triples_final['IS_MASTER_GENE'] == 'subject',
    ['RELATION', 'OBJECT', 'SUBJECT', 'PMID'],
].copy()

# Turn the comma-separated PMID string into a list and record its length
df3['PMID'] = df3['PMID'].str.split(',')
df3['PMID_Count'] = df3['PMID'].map(len)

# RELATION choices: a placeholder entry followed by the sorted unique relations
items_rels = ['Select from List'] + sorted(df3['RELATION'].unique().tolist())

# OBJECT choices: 'All' followed by the sorted unique objects
# (built here but not passed to any widget in this cell)
items_obj = ['All'] + sorted(df3['OBJECT'].unique().tolist())

# The RELATION dropdown is filled up front; the OBJECT dropdown starts
# without options and is populated by the observer defined below.
x_widget = widgets.Dropdown(description='RELATION', options=items_rels)
y_widget = widgets.Dropdown(description='OBJECT')

# Define update function that updates the content of y based on what we select for x
def update(*args, _relation_dropdown=x_widget, _object_dropdown=y_widget):
    """Refresh the OBJECT dropdown for the currently selected RELATION.

    Registered as a change observer on the RELATION dropdown. The positional
    arguments passed by the observer machinery are accepted but not inspected.

    The two dropdowns are bound as keyword-only defaults at definition time.
    ``x_widget`` / ``y_widget`` are global names that several cells of this
    notebook assign, so looking them up at call time would tie this observer
    to whichever cell ran last rather than to the widgets created here.
    """
    selected_rows = df3[df3['RELATION'] == _relation_dropdown.value]
    # The placeholder stays first; the objects follow in alphabetical order,
    # matching how the RELATION list is built.
    _object_dropdown.options = ['Select from List'] + sorted(selected_rows['OBJECT'].unique().tolist())

# React only to changes of the selected value instead of every widget trait.
x_widget.observe(update, names='value')

# function to be executed
def show_data(x, y):
    """Return the ``df3`` rows for the selected RELATION and OBJECT.

    Parameters
    ----------
    x : str
        Value of the RELATION dropdown; 'All' disables the RELATION filter.
    y : str
        Value of the OBJECT dropdown; 'All' disables the OBJECT filter.

    Returns
    -------
    pandas.DataFrame
        * both 'All': ``df3`` itself;
        * only one of them 'All': all columns, filtered by the other value;
        * otherwise: the SUBJECT and PMID_Count columns of the rows matching
          both values, sorted by PMID_Count in descending order.
    """
    # Both sentinels are spelled 'All'; comparing y against 'ALL' made the
    # "return everything" branch unreachable for the value 'All'.
    any_relation = x == 'All'
    any_object = y == 'All'

    if any_relation and any_object:
        return df3
    if any_object:
        return df3[df3['RELATION'] == x]
    if any_relation:
        return df3[df3['OBJECT'] == y]

    matches = df3[(df3['RELATION'] == x) & (df3['OBJECT'] == y)]
    return matches.sort_values(by='PMID_Count', ascending=False)[['SUBJECT', 'PMID_Count']]

interact(show_data, x = x_widget, y = y_widget);

interactive(children=(Dropdown(description='RELATION', options=('Select from List', 'ADMINISTERED_TO', 'AFFECT…

In [10]:
# # import plotly.plotly as py
# import plotly.graph_objs as go
# import numpy as np
# from ipywidgets import *

# df = triples_final[['SUBJECT', 'RELATION', 'OBJECT']].copy()

# y = df.RELATION.value_counts()
# # y = y[y > 1000]

# # print(y.tail())
# # print(y.index.tolist())

# layout = go.Layout(
#     xaxis=dict(
#         autorange=True,
#         showgrid=False,
#         zeroline=False,
#         showline=False,
#         ticks='',
#         showticklabels=False
#     ),
#     yaxis=dict(
#         autorange=True,
#         showgrid=False,
#         zeroline=False,
#         showline=False,
#         ticks='',
#         showticklabels=True
#     )
# )


# items_sub = ['Select from List']+sorted(df['SUBJECT'].unique().tolist())
# x_widget = widgets.Dropdown(options = items_sub, description='SUBJECT')

# @interact(a=x_widget)
# def update(a=''):
#     with fig.batch_update():
#         df2 = df[df['SUBJECT']==a]
#         y = df2.RELATION.value_counts()
#         bar = fig.data[0]
#         bar.x = y.index.tolist()
        
        
# fig = go.FigureWidget(layout=layout)
# fig.add_bar(x=y.index.tolist(), y=y)
# fig.layout.title = '\nRelation Count'
# fig