In [None]:
import numpy as np
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
# Enable Plotly's offline notebook rendering so the iplot() calls below draw inline.
# connected = True references plotly.js from a CDN instead of embedding the library
# in the notebook, so viewing the figures needs network access.
init_notebook_mode(connected = True)

In [None]:
import arff as ar

# Read the ARFF file inside a context manager so the handle is closed as soon as
# parsing finishes (previously the file object was never closed).
with open('data/defect.arff', 'r') as arff_file:
  dataset = ar.load(arff_file)

# One row per instance; the last column is the class label used further down.
data = np.array(dataset['data'])

# Shuffle the rows in place before they are split into training/test halves.
# A dedicated, seeded RandomState makes the split -- and therefore every number
# reported by the notebook -- reproducible across Restart & Run All, without
# touching NumPy's global random state.
np.random.RandomState(0).shuffle(data)

1. Clustering

In [None]:
from sklearn.cluster import KMeans

# Split the (already shuffled) rows into two halves: the first fits the
# clusterer, the second is held out to score it.
training_data, test_data = np.array_split(data, 2)

# Values of the last ARFF attribute (the class label); a row's target is the
# index of its label inside this array.
target_names = np.array(dataset['attributes'][-1][1])


def split_data(data):
  """Separate the feature columns from the class label.

  Parameters
  ----------
  data : sequence of rows
    The last item of each row must be a class name present in
    ``target_names``; every other item must be convertible to float.

  Returns
  -------
  tuple of (features, targets)
    ``features`` is a float array of all columns but the last; ``targets`` is
    an array holding, for each row, the index of its class name within
    ``target_names``.

  Raises
  ------
  IndexError
    If a row's class name does not occur in ``target_names``.
  ValueError
    If a feature value cannot be converted to float.
  """
  # The rows come from an array that also holds the class names, so the feature
  # values may be stored as strings/objects; convert them explicitly instead of
  # relying on the estimator to coerce them.
  features = np.array([row[:-1] for row in data]).astype(float)
  targets = np.array([np.where(target_names == row[-1])[0][0] for row in data])
  return features, targets


def map_clusters_to_classes(cluster_ids, targets, n_clusters):
  """Return an int array whose i-th entry is the majority class of cluster i.

  KMeans numbers its clusters arbitrarily, so cluster ``0`` is not necessarily
  class ``0``. Each cluster is assigned the class index that is most frequent
  among the training rows it contains; a cluster with no training rows falls
  back to the overall majority class.
  """
  n_classes = len(target_names)
  fallback = np.bincount(targets, minlength = n_classes).argmax()
  mapping = np.full(n_clusters, fallback, dtype = int)
  for cluster in range(n_clusters):
    members = targets[cluster_ids == cluster]
    if members.size:
      mapping[cluster] = np.bincount(members, minlength = n_classes).argmax()
  return mapping


training_data, training_target = split_data(training_data)

classifier = KMeans(init = 'k-means++', n_clusters = 2, random_state = 0).fit(training_data)

# Translate cluster ids into class indices using the labelled training half, so
# that predictions and targets live in the same label space before comparison.
cluster_to_class = map_clusters_to_classes(
  classifier.labels_,
  training_target,
  classifier.n_clusters,
)

test_data, test_target = split_data(test_data)

prediction = cluster_to_class[classifier.predict(test_data)]

total = len(test_data)
mislabeled = (test_target != prediction).sum()

print('Number of mislabeled points out of %d points: %d (%.1f%%)' % (
  total,
  mislabeled,
  # Multiply by the float first so the ratio is never truncated by integer division.
  100.0 * mislabeled / total,
))

2. Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix

# Pin the label order to [0, 1] so the matrix is always 2x2 and unpacks into
# exactly four counts, even when one class is missing from the targets or the
# predictions (without `labels`, a single-class result is 1x1 and the unpacking
# below raises). Index 1 is treated as the positive class.
tn, fp, fn, tp = confusion_matrix(test_target, prediction, labels = [0, 1]).ravel()

# Donut chart showing the share of each confusion-matrix cell.
iplot({
  'data': [go.Pie(
    values = [tn, fp, fn, tp],
    labels = ['True Negative', 'False Positive', 'False Negative', 'True Positive'],
    hole = .4,
  )],
  'layout': {
    'title': 'Confusion Matrix',
    # Text annotation naming the model whose predictions are shown.
    'annotations': [{
      'font': {'size': 15},
      'showarrow': False,
      'text': 'KMeans',
    }],
  },
})

3. Visualization

In [None]:
# One (x category, bar height, legend name) entry per confusion-matrix cell.
# Entries sharing an x category are stacked on top of each other by the
# 'stack' barmode set in the layout below.
bar_specs = [
  ('Actual Positive', fn, 'False Negative'),
  ('Actual Positive', tp, 'True Positive'),
  ('Actual Negative', tn, 'True Negative'),
  ('Actual Negative', fp, 'False Positive'),
]

# Build the four single-bar traces from the table above.
trace1, trace2, trace3, trace4 = [
  go.Bar(x = [category], y = [count], name = legend_name)
  for category, count, legend_name in bar_specs
]

figure = dict(
  data = [trace1, trace2, trace3, trace4],
  layout = dict(title = 'Prediction', barmode = 'stack'),
)

iplot(figure)