### Can we cluster drugs together based on the similarity of their reactions?

Here, we'll create a new data frame containing information about drugs with at least 200 adverse events reported in our sample of data.

In [21]:
import requests
import os
import pandas as pd
from pandas.io.json import json_normalize
import numpy as np
import time
from joblib import delayed, Parallel
import datetime
# helper functions:
import helpers
import missingno as msno
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx

from bokeh.plotting import *
from bokeh.models import HoverTool, BoxSelectTool, TapTool
from bokeh.io import output_notebook, save
from bokeh.io import show
from bokeh.plotting import figure
from bokeh.models.graphs import from_networkx
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.events import Tap
from bokeh.models.graphs import from_networkx, NodesAndLinkedEdges, EdgesAndLinkedNodes

# Raise pandas' display limits to 100 columns and 100 rows for notebook output.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 100)

%matplotlib inline

def create_graph(reaction_df, min_similarity=0.3):
    """Build an undirected graph linking drugs with similar reactions.

    Every row of ``reaction_df`` becomes a node keyed by its
    ``medicinalproduct`` value. Two different drugs are joined by an edge
    when the similarity score read from the frame is strictly greater than
    ``min_similarity``. Nodes left without any edge are dropped before the
    graph is returned.

    Parameters
    ----------
    reaction_df : pandas.DataFrame
        Must contain the columns ``medicinalproduct``,
        ``medicinalproduct_count``, ``drug_duration_disc`` and
        ``top5_reaction``. Rows are read by position, so the frame's index
        does not matter.
    min_similarity : float, optional
        Exclusive lower bound a similarity score must exceed for an edge to
        be created. Defaults to 0.3.

    Returns
    -------
    networkx.Graph
        Nodes carry the attributes ``count``, ``duration_bracket`` and
        ``top5_reactions``; edges carry ``weight`` (the similarity score)
        and ``length`` (ten times the score).
    """
    g = nx.Graph()
    # NOTE(review): inspection_policy is a Bokeh renderer setting; here it is
    # only stored as a plain attribute on the networkx graph. Kept so that
    # existing readers of g.inspection_policy keep working.
    g.inspection_policy = NodesAndLinkedEdges()

    # Positional views of the per-drug columns (independent of the index).
    drugs = reaction_df['medicinalproduct'].values
    counts = reaction_df['medicinalproduct_count'].values
    durations = reaction_df['drug_duration_disc'].values
    top5_reactions = reaction_df['top5_reaction'].values
    n_drugs = len(drugs)

    for i in range(n_drugs):
        drug = drugs[i]

        # add_node creates the node or, if an earlier edge already created
        # it, simply updates its attributes.
        g.add_node(
            drug,
            count=counts[i],
            duration_bracket=durations[i],
            top5_reactions=top5_reactions[i],
        )

        # NOTE(review): column j of the frame is paired with the drug in
        # row j, and j starts at 1. This presumably relies on the CSV
        # layout (a leading non-score column followed by one similarity
        # column per drug) -- confirm the alignment against the code that
        # writes the file.
        for j in range(1, n_drugs):
            other = drugs[j]
            if other == drug:
                continue

            weight = float(reaction_df.iat[i, j])
            if weight > min_similarity and not g.has_edge(drug, other):
                # Pass attributes as keywords: networkx 2.x no longer
                # unpacks an ``attr_dict`` argument, so the score would
                # otherwise never be stored under the "weight" key.
                g.add_edge(drug, other, weight=weight, length=10 * weight)

    # Drop drugs that ended up with no neighbours at all.
    g.remove_nodes_from(list(nx.isolates(g)))

    return g


def hex_generator(palette):
    """Convert a palette of RGB triples in the 0-1 range to hex colour codes.

    Parameters
    ----------
    palette : iterable of sequences
        Each entry must provide at least three numeric components
        (red, green, blue); components are scaled by 255 and truncated.

    Returns
    -------
    list of str
        One ``'#rrggbb'`` string per palette entry, or an empty list when
        the palette cannot be converted (not iterable, entries too short,
        or non-numeric components).
    """
    try:
        return [
            '#%02x%02x%02x' % (int(255 * val[0]), int(255 * val[1]), int(255 * val[2]))
            for val in palette
        ]
    except (TypeError, ValueError, IndexError, KeyError, OverflowError):
        # Best-effort conversion: malformed palettes yield an empty list,
        # but interrupts and unrelated errors are no longer swallowed.
        return []
    

def network_viz(g, view_in_notebook = True, output_html = 'drug_netwrok.html', title = "Network visualization of drugs based on similarity of reactions and median duration until adverse events"):
    """Draw the drug graph as a Bokeh scatter-and-line plot.

    Nodes are placed with a Fruchterman-Reingold layout, coloured by their
    ``duration_bracket`` attribute and sized by their ``count`` attribute
    (0.05 * count, capped at 30). Hovering a node shows the drug name, its
    ``top5_reactions`` attribute and its count. A row of coloured circles
    with rotated labels near (-1, -1) acts as the colour legend.

    Parameters
    ----------
    g : networkx.Graph
        Every node must carry the attributes ``duration_bracket``,
        ``top5_reactions`` and ``count``.
    view_in_notebook : bool, optional
        When true the figure is shown inline via ``output_notebook`` and
        ``show``; otherwise it is written to ``output_html`` with ``save``.
    output_html : str, optional
        Target file used when ``view_in_notebook`` is false.
    title : str, optional
        Text of the figure title, also passed to ``save`` as the document
        title.

    Returns
    -------
    bokeh.plotting.Figure
        The figure that was shown or saved.
    """
    # NOTE(review): inspection_policy is a Bokeh renderer setting; here it is
    # only stored as a plain attribute on the networkx graph.
    g.inspection_policy = NodesAndLinkedEdges()

    # Distinct duration brackets in first-seen node order, so the colour
    # given to each bracket (and the legend order) is the same on every run.
    duration_brackets = list(
        dict.fromkeys(g.nodes[node]['duration_bracket'] for node in g.nodes)
    )

    # seaborn returns RGB triples in the 0-1 range; the plot uses hex codes.
    palette = sns.color_palette("BuGn_r", len(duration_brackets))
    hex_palette = hex_generator(palette)
    palette_map = {
        bracket: hex_palette[i] for i, bracket in enumerate(duration_brackets)
    }

    pos = nx.layout.fruchterman_reingold_layout(g, iterations = 1000)

    # ------------------------------------------------
    # Per-node columns: position, colour, size and hover labels.
    xs, ys, colours, sizes = [], [], [], []
    drug_labels, reaction_labels, count_labels = [], [], []

    for node_name, coords in pos.items():
        attrs = g.nodes[node_name]
        count = attrs["count"]

        xs.append(coords[0])
        ys.append(coords[1])
        colours.append(palette_map[attrs['duration_bracket']])
        sizes.append(min(0.05*count, 30))

        drug_labels.append(node_name)
        reaction_labels.append(attrs['top5_reactions'])
        count_labels.append(str(count))

    node_source = ColumnDataSource(data=dict(x=xs, y=ys, color=colours, size=sizes,drug = drug_labels, top5_reactions = reaction_labels, count_labels = count_labels))

    # All edges go into a single line glyph; a NaN after each pair of end
    # points breaks the line so separate edges are not joined together.
    nan = float("NaN")
    xlist, ylist = [], []

    for node_a, node_b in g.edges():
        x1, y1 = pos[node_a]
        x2, y2 = pos[node_b]
        xlist += [x1, x2, nan]
        ylist += [y1, y2, nan]

    line_source = ColumnDataSource(data=dict(xs=xlist, ys=ylist))

    # ---------------------------
    if view_in_notebook:
        output_notebook(hide_banner=True)

    f1 = figure(plot_width=1200, plot_height=1000, tools="pan,wheel_zoom,box_zoom,reset,hover")

    f1.grid.grid_line_width = 0
    f1.axis.visible = False

    # Draw the lines between nodes
    f1.line(x="xs", y="ys", line_width=0.5, source=line_source, color="#000000", line_alpha=0.35)

    # Draw the nodes
    f1.circle("x", "y", source=node_source, name="node", size="size", color="color", line_width=0.5, line_alpha=0.75, line_color="#000000")

    # Configure the hover tool and restrict it to the renderer named "node",
    # so the edge lines and legend glyphs do not trigger tooltips.
    hover = f1.select(dict(type=HoverTool))
    hover.tooltips = [("drug","@drug"),("top reactions","@top5_reactions"),("adverse event count","@count_labels")]
    hover.point_policy = "snap_to_data"
    hover.names = ["node"]

    # Title: use the caller-supplied text rather than a hard-coded string.
    f1.title.text = title
    f1.title.align = "center"
    f1.title.text_color = "grey"
    f1.title.text_font_size = "14px"

    # Legend: one circle per duration bracket, laid out in a row.
    text = list(palette_map)
    colors = list(palette_map.values())
    n_entries = len(colors)

    x_pos = [-1+0.05*i for i in range(n_entries)]
    y_pos = [-1] * n_entries
    size_legend = [18] * n_entries

    f1.circle(x_pos, y_pos, size=size_legend, line_width=0.5, line_alpha=0.75, line_color="#000000", color=colors,)
    f1.text(x_pos, y_pos, text=text, text_align="center", text_font_size="8pt",angle = 45)

    # Background colour
    f1.background_fill_color = "beige"
    f1.background_fill_alpha = 0.2

    if view_in_notebook:
        show(f1)
    else:
        save(f1, filename=output_html, title=title)

    return f1


    
#network_viz(g)    
    
    

In [22]:
# Load the drug table from CSV, build the similarity graph from it and
# render the graph (network_viz shows it inline by default).
reaction_df = pd.read_csv('openFDA_data/drug_df.csv')
g = create_graph(reaction_df)
network_viz(g)


