In [1]:
import plotly.plotly as py
import plotly.graph_objs as go
from plotly import tools

from __future__ import print_function

from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_samples, silhouette_score

import matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np

In [2]:
import os, sys
sys.path.append(os.path.split(os.getcwd())[0])

from WordVector import WordVector

In [3]:
import data_helper
_,_,relation = data_helper.get_triplet()
relation[:5]

['cause', 'cause', 'cause', 'cause', 'cause']

In [4]:
relation = list(set(relation))
relation[:5]

['experimental', 'organism', 'airway', 'mouse', 'due']

In [5]:
wv = WordVector(relation)

Loading a pre-trained model...
Load success!


In [6]:
wv.word[:5]

['experimental', 'organism', 'airway', 'mouse', 'due']

In [7]:
wv.vector[0]

array([  2.74403191e+00,   3.78401518e+00,  -4.27681804e-01,
         1.57612205e+00,   8.17368507e-01,  -4.30196822e-01,
        -2.69360900e-01,   1.30753100e+00,  -1.66119266e+00,
         9.68618572e-01,   2.33553982e+00,   4.77984369e-01,
        -1.94371402e+00,   1.70754075e-01,  -9.06704128e-01,
         4.95371222e-01,   1.02233243e+00,  -2.45659328e+00,
        -7.90223420e-01,   8.02290797e-01,   1.06625223e+00,
        -8.11640620e-01,   2.81526327e-01,  -2.80640781e-01,
         1.43135905e+00,   9.33593035e-01,  -3.18225503e+00,
        -2.26155472e+00,  -1.35019207e+00,  -2.62796164e-01,
         7.58577228e-01,   2.44068414e-01,   1.09568827e-01,
         1.96046495e+00,  -1.95055830e+00,  -2.44854164e+00,
        -1.03914291e-01,  -7.89862633e-01,   1.35568738e+00,
         2.98522925e+00,   6.09386384e-01,   1.18249631e+00,
         1.00210652e-01,  -1.42211258e+00,   6.99579716e-01,
        -1.74916112e+00,   5.18589497e+00,  -1.13520229e+00,
         1.60812068e+00,

# Silhouette Coefficient Analysis + PCA for Viz

In [8]:
range_n_clusters = [2, 3, 4, 5, 6, 7, 8, 9, 10]
colors=["rgb(255,0,0)", "rgb(0,255,0)", "rgb(0,0,255)", "rgb(255,255,0)", "rgb(255,0,255)", 
        "rgb(0,255,255)", "rgb(127,0,255)", "rgb(255,128,0)","rgb(255,0,127)", "rgb(128,128,128)"]

In [9]:
from DimensionReduction import DimensionReduction
dr = DimensionReduction()

vec_3d = dr.PCA(wv.vector, dim=3)

Explained variation per principal component: [ 0.11155304  0.07719806  0.06918815]
Average of Explained variations: 0.25793924927711487


In [10]:
figures = []

for n_clusters in range_n_clusters:
    # Create a subplot with 1 row and 2 columns
    fig = tools.make_subplots(rows=1, cols=2,
                              print_grid=False,
                              specs=[[{'is_3d': False}, {'is_3d': True}]])

    # The 1st subplot is the silhouette plot
    # The silhouette coefficient can range from -1, 1 but in this example all
    # lie within [-0.1, 1]
    fig['layout']['xaxis1'].update(title='The silhouette coefficient values', range=[-0.1, 1])
   
    # The (n_clusters+1)*10 is for inserting blank space between silhouette
    # plots of individual clusters, to demarcate them clearly.
    fig['layout']['yaxis1'].update(title='Cluster label',
                                   showticklabels=False,
                                   range=[0, len(wv.vector) + (n_clusters + 1) * 10])

    # Initialize the clusterer with n_clusters value and a random generator
    # seed of 10 for reproducibility.
    clusterer = KMeans(n_clusters=n_clusters)
    cluster_labels = clusterer.fit_predict(wv.vector)

    # The silhouette_score gives the average value for all the samples.
    # This gives a perspective into the density and separation of the formed
    # clusters
    silhouette_avg = silhouette_score(wv.vector, cluster_labels)
    print("For n_clusters =", n_clusters,
          "The average silhouette_score is :", silhouette_avg)

    # Compute the silhouette scores for each sample
    sample_silhouette_values = silhouette_samples(wv.vector, cluster_labels)
    y_lower = 10
    
    for i in range(n_clusters):
        # Aggregate the silhouette scores for samples belonging to
        # cluster i, and sort them
        ith_cluster_silhouette_values = \
            sample_silhouette_values[cluster_labels == i]

        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        # colors = cm.spectral(cluster_labels.astype(float) / n_clusters)

        filled_area = go.Scatter(y=np.arange(y_lower, y_upper),
                                 x=ith_cluster_silhouette_values,
                                 mode='lines',
                                 showlegend=False,
                                 line=dict(width=0.5,
                                          color=colors[i]),
                                 fill='tozerox')
        fig.append_trace(filled_area, 1, 1)
        
        # Compute the new y_lower for next plot
        y_lower = y_upper + 10  # 10 for the 0 samples
        

    # The vertical line for average silhouette score of all the values
    axis_line = go.Scatter(x=[silhouette_avg],
                           y=[0, len(wv.vector) + (n_clusters + 1) * 10],
                           showlegend=False,
                           mode='lines',
                           line=dict(color="red", dash='dash',
                                     width =1) )

    fig.append_trace(axis_line, 1, 1)
    
    
    
    
    
    
    # 2nd Plot showing the actual clusters formed
    
    labeled_vec = list(zip(vec_3d, cluster_labels))
    
    # Labeling the clusters
    for i in range(n_clusters):
        ith_cluster_vector = np.array([v for (v,c) in labeled_vec if c == i])
        # colors = matplotlib.colors.colorConverter.to_rgb(cm.spectral(float(i) / n_clusters))
        # colors = 'rgb'+str(colors)
        
        clusters = go.Scatter3d(x=ith_cluster_vector[:, 0], 
                                y=ith_cluster_vector[:, 1], 
                                z=ith_cluster_vector[:, 2],
                                showlegend=False,
                                mode='markers',
                                marker=dict(
                                    color=colors[i],
                                    size=3,
                                    line=dict(
                                        width=0.5
                                    )
                                )
                               )
        fig.append_trace(clusters, 1, 2)
        
    
                      
        
        
        

    fig['layout'].update(title="Silhouette analysis for KMeans clustering on sample data "
                         "with n_clusters = %d" % n_clusters)
    
    figures.append(fig)

For n_clusters = 2 The average silhouette_score is : 0.0680239
For n_clusters = 3 The average silhouette_score is : 0.0492983
For n_clusters = 4 The average silhouette_score is : 0.0570824
For n_clusters = 5 The average silhouette_score is : 0.0769951
For n_clusters = 6 The average silhouette_score is : 0.0711657
For n_clusters = 7 The average silhouette_score is : 0.07278
For n_clusters = 8 The average silhouette_score is : 0.0396569
For n_clusters = 9 The average silhouette_score is : 0.0422402
For n_clusters = 10 The average silhouette_score is : 0.0553703


In [11]:
py.iplot(figures[0])

In [12]:
py.iplot(figures[1])

In [13]:
py.iplot(figures[2])

# Silhouette Coefficient Analysis + t-SNE for Viz

In [14]:
range_n_clusters = [2, 3, 4, 5, 6, 7, 8, 9, 10]
colors=["rgb(255,0,0)", "rgb(0,255,0)", "rgb(0,0,255)", "rgb(255,255,0)", "rgb(255,0,255)", 
        "rgb(0,255,255)", "rgb(127,0,255)", "rgb(255,128,0)","rgb(255,0,127)", "rgb(128,128,128)"]

In [15]:
from DimensionReduction import DimensionReduction
dr = DimensionReduction()

vec_3d = dr.TSNE(wv.vector, dim=3)

[t-SNE] Computing 72 nearest neighbors...
[t-SNE] Indexed 73 samples in 0.000s...
[t-SNE] Computed neighbors for 73 samples in 0.002s...
[t-SNE] Computed conditional probabilities for sample 73 / 73
[t-SNE] Mean sigma: 6.408542
[t-SNE] KL divergence after 250 iterations with early exaggeration: 93.542938
[t-SNE] Error after 1000 iterations: 1.811770


In [16]:
figures = []

for n_clusters in range_n_clusters:
    # Create a subplot with 1 row and 2 columns
    fig = tools.make_subplots(rows=1, cols=2,
                              print_grid=False,
                              specs=[[{'is_3d': False}, {'is_3d': True}]])

    # The 1st subplot is the silhouette plot
    # The silhouette coefficient can range from -1, 1 but in this example all
    # lie within [-0.1, 1]
    fig['layout']['xaxis1'].update(title='The silhouette coefficient values', range=[-0.1, 1])
   
    # The (n_clusters+1)*10 is for inserting blank space between silhouette
    # plots of individual clusters, to demarcate them clearly.
    fig['layout']['yaxis1'].update(title='Cluster label',
                                   showticklabels=False,
                                   range=[0, len(wv.vector) + (n_clusters + 1) * 10])

    # Initialize the clusterer with n_clusters value and a random generator
    # seed of 10 for reproducibility.
    clusterer = KMeans(n_clusters=n_clusters)
    cluster_labels = clusterer.fit_predict(wv.vector)

    # The silhouette_score gives the average value for all the samples.
    # This gives a perspective into the density and separation of the formed
    # clusters
    silhouette_avg = silhouette_score(wv.vector, cluster_labels)
    print("For n_clusters =", n_clusters,
          "The average silhouette_score is :", silhouette_avg)

    # Compute the silhouette scores for each sample
    sample_silhouette_values = silhouette_samples(wv.vector, cluster_labels)
    y_lower = 10
    
    for i in range(n_clusters):
        # Aggregate the silhouette scores for samples belonging to
        # cluster i, and sort them
        ith_cluster_silhouette_values = \
            sample_silhouette_values[cluster_labels == i]

        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        # colors = cm.spectral(cluster_labels.astype(float) / n_clusters)

        filled_area = go.Scatter(y=np.arange(y_lower, y_upper),
                                 x=ith_cluster_silhouette_values,
                                 mode='lines',
                                 showlegend=False,
                                 line=dict(width=0.5,
                                          color=colors[i]),
                                 fill='tozerox')
        fig.append_trace(filled_area, 1, 1)
        
        # Compute the new y_lower for next plot
        y_lower = y_upper + 10  # 10 for the 0 samples
        

    # The vertical line for average silhouette score of all the values
    axis_line = go.Scatter(x=[silhouette_avg],
                           y=[0, len(wv.vector) + (n_clusters + 1) * 10],
                           showlegend=False,
                           mode='lines',
                           line=dict(color="red", dash='dash',
                                     width =1) )

    fig.append_trace(axis_line, 1, 1)
    
    
    
    
    
    
    # 2nd Plot showing the actual clusters formed    
    labeled_vec = list(zip(vec_3d, cluster_labels))
    
    # Labeling the clusters
    for i in range(n_clusters):
        ith_cluster_vector = np.array([v for (v,c) in labeled_vec if c == i])
        # colors = matplotlib.colors.colorConverter.to_rgb(cm.spectral(float(i) / n_clusters))
        # colors = 'rgb'+str(colors)
        
        clusters = go.Scatter3d(x=ith_cluster_vector[:, 0], 
                                y=ith_cluster_vector[:, 1], 
                                z=ith_cluster_vector[:, 2],
                                showlegend=False,
                                mode='markers',
                                marker=dict(
                                    color=colors[i],
                                    size=3,
                                    line=dict(
                                        width=0.5
                                    )
                                )
                               )
        fig.append_trace(clusters, 1, 2)    
                      
        
        
        

    fig['layout'].update(title="Silhouette analysis for KMeans clustering on sample data "
                         "with n_clusters = %d" % n_clusters)
    
    figures.append(fig)

For n_clusters = 2 The average silhouette_score is : 0.0783685
For n_clusters = 3 The average silhouette_score is : 0.0623605
For n_clusters = 4 The average silhouette_score is : 0.0668652
For n_clusters = 5 The average silhouette_score is : 0.0529201
For n_clusters = 6 The average silhouette_score is : 0.044938
For n_clusters = 7 The average silhouette_score is : 0.0572846
For n_clusters = 8 The average silhouette_score is : 0.0495784
For n_clusters = 9 The average silhouette_score is : 0.0685568
For n_clusters = 10 The average silhouette_score is : 0.0367635


In [17]:
py.iplot(figures[0])

In [18]:
py.iplot(figures[1])

In [19]:
py.iplot(figures[2])