# Triplet(BAC-REL-DIS) Visualization(3-dim)
## 171027
* baseline code

## 171115 update
* Only connected edge BAC-REL and REL-DIS
* use vector value generated by word2vec

In [1]:
import os, sys
sys.path.append(os.path.split(os.getcwd())[0])

from WordVector import WordVector
from DimensionReduction import DimensionReduction
import pandas as pd

In [2]:
import data_helper
bac,dis,rel = data_helper.get_triplet()
bac.sort()
dis.sort()
rel.sort()
nodes = bac + dis + rel
print("Bacteria List({0}): {1}".format(len(bac), bac[:5]))
print("Disease List({0}): {1}".format(len(dis), dis[:5]))
print("Relation List({0}): {1}".format(len(rel), rel[:5]))

Bacteria List(276): ['BAC00actinmey', 'BAC00actinne', 'BAC00aerohydro', 'BAC00bacianth', 'BAC00bacianth']
Disease List(276): ['DIS00actinomycosis', 'DIS00actinomycosis', 'DIS00bactpneu', 'DIS00bactpneu', 'DIS00diarrhea']
Relation List(276): ['abdominal', 'accompany', 'acquire', 'agent', 'agent']


#### nodes : Bacteria + Disease + Relation words

In [3]:
print("All Nodes({0}): {1}".format(len(nodes), nodes[:5]))

All Nodes(828): ['BAC00actinmey', 'BAC00actinne', 'BAC00aerohydro', 'BAC00bacianth', 'BAC00bacianth']


#### unique nodes

In [4]:
unique_nodes = list(set(nodes))
unique_nodes.sort()

In [5]:
print("Unique Nodes({0}): {1}".format(len(unique_nodes), unique_nodes[:5]))

Unique Nodes(141): ['BAC00actinmey', 'BAC00actinne', 'BAC00aerohydro', 'BAC00bacianth', 'BAC00bifidobbif']


#### word2vec

In [6]:
wv = WordVector(unique_nodes, dim=10)

Loading a pre-trained model...
Load success!


In [7]:
vec_3dim = DimensionReduction().PCA(vector=wv.vector, dim=3)

Explained variation per principal component: [ 0.25880969  0.19610742  0.14437555]
Average of Explained variations: 0.5992926359176636


In [8]:
df = pd.DataFrame()
df['words'] = wv.word
df['x'] = [v[0] for v in wv.vector]
df['y'] = [v[1] for v in wv.vector] 
df['z'] = [v[2] for v in wv.vector]

In [9]:
group = []
for w in wv.word:
    if w[:3] == "BAC":
        group.append(1)
    elif w[:3] == "DIS":
        group.append(2)
    else:
        group.append(3)
df['group'] = group

In [10]:
df['index'] = df.index

In [11]:
df = df[['index','words','x','y','z', 'group']]

In [12]:
df.head()

Unnamed: 0,index,words,x,y,z,group
0,0,BAC00actinmey,0.225603,-0.212015,-0.203851,1
1,1,BAC00actinne,0.382572,-0.149741,-0.001161,1
2,2,BAC00aerohydro,1.366007,-0.150797,-0.088442,1
3,3,BAC00bacianth,1.158517,-0.055536,-0.320448,1
4,4,BAC00bifidobbif,0.405588,-0.0301,-0.065202,1


### Numbering to Entities

In [13]:
nodes_num = {}
for i in range(len(unique_nodes)):
    nodes_num[unique_nodes[i]] = i

In [14]:
nodes_num

{'BAC00actinmey': 0,
 'BAC00actinne': 1,
 'BAC00aerohydro': 2,
 'BAC00bacianth': 3,
 'BAC00bifidobbif': 4,
 'BAC00bifidoblon': 5,
 'BAC00burkhocenoc': 6,
 'BAC00burkhocep': 7,
 'BAC00burkhomal': 8,
 'BAC00burkhopseudo': 9,
 'BAC00campyljej': 10,
 'BAC00citrokos': 11,
 'BAC00citrorode': 12,
 'BAC00clostperfr': 13,
 'BAC00corynebamyco': 14,
 'BAC00corynebjeik': 15,
 'BAC00corynebxer': 16,
 'BAC00empedobre': 17,
 'BAC00enteroaero': 18,
 'BAC00enteroclo': 19,
 'BAC00enterofaec': 20,
 'BAC00erysiperhusio': 21,
 'BAC00escheco': 22,
 'BAC00fusobanecro': 23,
 'BAC00haemoinflu': 24,
 'BAC00klebspneum': 25,
 'BAC00lactobplan': 26,
 'BAC00lactoplan': 27,
 'BAC00listmonocy': 28,
 'BAC00neismening': 29,
 'BAC00pandisp': 30,
 'BAC00prevobuc': 31,
 'BAC00promira': 32,
 'BAC00pseudaerug': 33,
 'BAC00raoulornithi': 34,
 'BAC00salmoente': 35,
 'BAC00sebaltermi': 36,
 'BAC00staphylaur': 37,
 'BAC00staphylcarn': 38,
 'BAC00staphylepide': 39,
 'BAC00stenotromalto': 40,
 'BAC00streptagala': 41,
 'BAC00strep

### make Edges based on numbering value

In [15]:
Edges = []
for i in range(len(bac)):
    Edges.append((nodes_num[bac[i]], nodes_num[rel[i]]))
    Edges.append((nodes_num[rel[i]], nodes_num[dis[i]]))

print("Edges List({0}): {1}".format(len(Edges),Edges[:5]))

Edges List(552): [(0, 69), (69, 53), (1, 70), (70, 53), (2, 71)]


In [16]:
Xe=[]
Ye=[]
Ze=[]
for e in Edges:
    Xe+=[df.loc[e[0]].x,df.loc[e[1]].x, None]# x-coordinates of edge ends
    Ye+=[df.loc[e[0]].y,df.loc[e[1]].y, None]
    Ze+=[df.loc[e[0]].z,df.loc[e[1]].z, None]

## 3D Visualization using plotly

In [17]:
import plotly.plotly as py
from plotly.graph_objs import *

In [18]:
# Edge Viz
trace1=Scatter3d(x=Xe,
                 y=Ye,
                 z=Ze,
                 mode='lines',
                 line=Line(color='rgb(125,125,125)', width=1),
                 hoverinfo='none'
                )
# Nodes Viz
trace2=Scatter3d(x=df['x'],
                 y=df['y'],
                 z=df['z'],
                 mode='markers',
                 name='actors',
                 marker=Marker(symbol='dot',
                               size=4,
                               color=df['group'],
                               colorscale='Viridis',
                               line=Line(color='rgb(50,50,50)', width=0.5)
                              ),
                 text=df['words'],
                 hoverinfo='text'
                )

In [19]:
axis=dict(showbackground=False,
          showline=False,
          zeroline=False,
          showgrid=False,
          showticklabels=False,
          title=''
          )


In [20]:
layout = Layout(
         title="Bio Triplet Visualization",
         width=1000,
         height=1000,
         showlegend=False,
         scene=Scene(
         xaxis=XAxis(axis),
         yaxis=YAxis(axis),
         zaxis=ZAxis(axis),
        ),
     margin=Margin(
        t=100
    ),
    hovermode='closest',
    annotations=Annotations([
           Annotation(
           showarrow=False,
            text="",
            xref='paper',
            yref='paper',
            x=0,
            y=0.1,
            xanchor='left',
            yanchor='bottom',
            font=Font(
            size=14
            )
            )
        ]),    )

In [21]:
data_=Data([trace1, trace2])
fig=Figure(data=data_, layout=layout)

py.iplot(fig, filename='Bio Triplet Visualization')