In [1]:
import plotly.plotly as py
import plotly.graph_objs as go
from plotly import tools

from __future__ import print_function

from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_samples, silhouette_score

import matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np

In [2]:
import os, sys
sys.path.append(os.path.split(os.getcwd())[0])

from WordVector import WordVector

In [3]:
import data_helper
_,_,relation = data_helper.get_triplet()
relation[:5]

['cause', 'cause', 'cause', 'cause', 'cause']

In [4]:
relation = list(set(relation))
relation[:5]

['associate', 'isolate', 'tract', 'agent', 'protect']

In [5]:
wv = WordVector(relation)

Loading a pre-trained model...
Load success!


In [6]:
wv.word[:5]

['associate', 'isolate', 'tract', 'agent', 'protect']

In [7]:
wv.vector[0]

array([-7.46317625,  5.22262335,  0.71423316,  0.25281432, -1.88950515,
        2.81982327,  2.73439074, -4.62568474,  6.02962923, -0.94903088,
        1.83171713,  1.3681246 ,  0.25107786, -0.18608131,  5.71420813,
       -1.36870456, -0.7962479 ,  0.04593692,  0.8792268 , -7.22526407,
        0.80905265,  1.58117318,  1.0936867 ,  5.19975233, -3.17536306,
       -0.70044959,  1.0075146 ,  0.66012025, -1.50227594, -0.10630549,
        2.6462605 ,  2.83042741,  3.67946649,  3.47750616,  3.1894536 ,
        3.15186787,  2.79550552, -3.32184863, -3.00941992, -0.50261015,
        0.3727569 , -3.37824678,  0.15729892, -2.11493087, -2.6781621 ,
        0.56826591,  2.38367653,  2.03506947, -1.72758603,  1.0746913 ,
        1.08710182,  1.2823298 , -0.54097337,  0.20989586,  7.90480423,
        2.08192778,  2.01071239, -0.15677463, -0.11600718, -3.72518182,
        1.92703974, -2.60743594,  3.79480958, -2.81607437,  1.63305116,
        3.10046721, -3.40380216, -1.28256249,  0.90507215, -0.77

# Silhouette Coefficient Analysis

In [8]:
range_n_clusters = [2, 3, 4, 5, 6, 7, 8, 9, 10]

In [15]:
figures = []

for n_clusters in range_n_clusters:
    # Create a subplot with 1 row and 2 columns
    fig = tools.make_subplots(rows=1, cols=1,
                              print_grid=False,
                              specs=[[{'is_3d': False}]])

    # The 1st subplot is the silhouette plot
    # The silhouette coefficient can range from -1, 1 but in this example all
    # lie within [-0.1, 1]
    fig['layout']['xaxis1'].update(title='The silhouette coefficient values',
                                   range=[-0.1, 1])
   
    # The (n_clusters+1)*10 is for inserting blank space between silhouette
    # plots of individual clusters, to demarcate them clearly.
    fig['layout']['yaxis1'].update(title='Cluster label',
                                   showticklabels=False,
                                   range=[0, len(wv.vector) + (n_clusters + 1) * 10])

    # Initialize the clusterer with n_clusters value and a random generator
    # seed of 10 for reproducibility.
    clusterer = KMeans(n_clusters=n_clusters)
    cluster_labels = clusterer.fit_predict(wv.vector)

    # The silhouette_score gives the average value for all the samples.
    # This gives a perspective into the density and separation of the formed
    # clusters
    silhouette_avg = silhouette_score(wv.vector, cluster_labels)
    print("For n_clusters =", n_clusters,
          "The average silhouette_score is :", silhouette_avg)

    # Compute the silhouette scores for each sample
    sample_silhouette_values = silhouette_samples(wv.vector, cluster_labels)
    y_lower = 10
    
    for i in range(n_clusters):
        # Aggregate the silhouette scores for samples belonging to
        # cluster i, and sort them
        ith_cluster_silhouette_values = \
            sample_silhouette_values[cluster_labels == i]

        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        colors = cm.spectral(cluster_labels.astype(float) / n_clusters)

        filled_area = go.Scatter(y=np.arange(y_lower, y_upper),
                                 x=ith_cluster_silhouette_values,
                                 mode='lines',
                                 showlegend=False,
                                 line=dict(width=0.5,
                                          color=colors),
                                 fill='tozerox')
        fig.append_trace(filled_area, 1, 1)
        
        # Compute the new y_lower for next plot
        y_lower = y_upper + 10  # 10 for the 0 samples
        

    # The vertical line for average silhouette score of all the values
    axis_line = go.Scatter(x=[silhouette_avg],
                           y=[0, len(wv.vector) + (n_clusters + 1) * 10],
                           showlegend=False,
                           mode='lines',
                           line=dict(color="red", dash='dash', width =1) )

    fig.append_trace(axis_line, 1, 1)
    
    
    figures.append(fig)

For n_clusters = 2 The average silhouette_score is : 0.0871994
For n_clusters = 3 The average silhouette_score is : 0.0644643
For n_clusters = 4 The average silhouette_score is : 0.087233
For n_clusters = 5 The average silhouette_score is : 0.0702233
For n_clusters = 6 The average silhouette_score is : 0.069751
For n_clusters = 7 The average silhouette_score is : 0.0440013
For n_clusters = 8 The average silhouette_score is : 0.0452041
For n_clusters = 9 The average silhouette_score is : 0.0646735
For n_clusters = 10 The average silhouette_score is : 0.0568134


In [16]:
py.iplot(figures[0])

In [17]:
py.iplot(figures[1])

In [18]:
py.iplot(figures[2])