In [814]:
from scipy.io import arff
import pandas as pd
import numpy as np
import math
import random
from copy import deepcopy
from sklearn import preprocessing
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split

In [823]:
# Load the KC1 dataset; loadarff returns a (records, metadata) pair and only
# the records are needed for the DataFrame.
data = arff.loadarff('./Data/kc1.arff.txt')
df = pd.DataFrame(data[0])
# The class column is read as bytes (b'true' / b'false'). Decode it properly
# instead of slicing the repr of the bytes object, then map to real booleans.
df['defects'] = df['defects'].str.decode('utf-8').map({'true': True, 'false': False})
# Class balance (displayed as the cell output).
df.defects.value_counts()

False    1783
True      326
Name: defects, dtype: int64

In [824]:
# Rows without defects: the detector is fitted on these only.
clean_rows = df[df.defects == False]
# Take the label column BEFORE dropping it; previously Y_f was read after the
# drop and therefore held the last feature column instead of the label.
Y_f = clean_rows.iloc[:, -1]
data_set = clean_rows.iloc[:, :-1]  # features only (label is the last column)
X_train, X_test, labels_train, labels_test = train_test_split(
    data_set, Y_f, test_size=0.25, random_state=42
)
X_train = np.array(X_train.values)

# Defective rows keep their label column; they are only used for evaluation.
bugs = df[df.defects == True]
Y_t = bugs.iloc[:, -1]

# Re-attach the label to the held-out clean rows. assign() returns a new frame,
# so nothing is written into a slice (no SettingWithCopyWarning).
X_test = X_test.assign(defects=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [825]:
# Evaluation set: the held-out clean rows stacked with the defective rows.
dfs = pd.concat([X_test, bugs])
# Shuffle with a fixed seed (same value as the train/test split) so that a
# fresh "Restart & Run All" yields the same row order every time.
dfs = dfs.sample(frac=1, random_state=42)

In [826]:
def get_init_centers(k, n_instances):
    """Draw ``k`` distinct random indices from ``range(n_instances)``.

    Parameters
    ----------
    k : int
        Number of indices (initial medoids) to pick.
    n_instances : int
        Number of available samples; indices are drawn from ``[0, n_instances)``.

    Returns
    -------
    list of int
        ``k`` distinct indices in the order they were drawn.

    Raises
    ------
    ValueError
        If ``k`` exceeds ``n_instances``; the rejection loop below could never
        collect enough distinct indices and would spin forever.
    """
    if k > n_instances:
        raise ValueError(
            "cannot pick %d distinct centers from %d instances" % (k, n_instances)
        )
    init_ids = []
    # Rejection sampling: redraw whenever an index was already chosen.
    while len(init_ids) < k:
        index = np.random.randint(0, n_instances)
        if index not in init_ids:
            init_ids.append(index)
    return init_ids

def calc_distance(x, y):
    """Return the Euclidean distance between ``x`` and ``y``.

    The element-wise difference is squared, summed over all elements and the
    square root of that sum is returned.
    """
    delta = x - y
    sum_of_squares = np.sum(delta ** 2)
    return np.sqrt(sum_of_squares)

def get_cost(X, centers_id):
    """Assign every sample to its nearest medoid and total the distances.

    Parameters
    ----------
    X : array-like, first axis indexes samples
        Sample matrix; each row is flattened to a feature vector.
    centers_id : sequence of int
        Row indices of ``X`` that act as medoids.

    Returns
    -------
    members : ndarray of float, shape (n_samples,)
        For each sample, the position in ``centers_id`` of its nearest medoid
        (ties go to the lowest position, as ``np.argmin`` does).
    costs : ndarray, shape (n_centers,)
        Sum of sample-to-medoid distances within each cluster.
    total_cost : float
        Sum of ``costs``.
    dists : ndarray, shape (n_samples, n_centers)
        Euclidean distance from every sample to every medoid.
    """
    # Row-contiguous float copy so each medoid's distances are computed in one
    # vectorised pass instead of one Python call per (sample, medoid) pair.
    points = np.ascontiguousarray(X, dtype=float).reshape(len(X), -1)
    n_centers = len(centers_id)
    dists = np.zeros((len(points), n_centers))
    for j in range(n_centers):
        center_row = centers_id[j]
        diff = points - points[center_row]
        dists[:, j] = np.sqrt(np.sum(diff ** 2, axis=1))
        # A medoid is at distance zero from itself by definition.
        dists[center_row, j] = 0.0

    nearest = np.argmin(dists, axis=1)
    members = nearest.astype(float)
    costs = np.zeros(n_centers)
    for j in range(n_centers):
        # Boolean mask selects the members of cluster j directly.
        costs[j] = np.sum(dists[nearest == j, j])
    return members, costs, np.sum(costs), dists

In [827]:
def kmedoids(X, n_clusters, max_iter=1000):
    """Cluster ``X`` around ``n_clusters`` medoids using greedy swaps.

    Starting from randomly drawn medoids, every non-medoid sample is tried in
    place of every medoid; a swap is kept as soon as it strictly lowers the
    total cost reported by ``get_cost``. Sweeps over the data repeat until a
    full sweep makes no swap or ``max_iter`` sweeps have run.

    Parameters
    ----------
    X : ndarray, shape (n_instances, n_features)
        Sample matrix (must be two-dimensional).
    n_clusters : int
        Number of medoids.
    max_iter : int, optional
        Upper bound on the number of sweeps (default 1000, the previously
        hard-coded limit).

    Returns
    -------
    centers : list of int
        Row indices of the final medoids.
    members, costs, total_cost, dists
        The values returned by ``get_cost`` for the final medoids.
    """
    n_instances, _ = X.shape
    centers = get_init_centers(n_clusters, n_instances)
    members, costs, total_cost, dists = get_cost(X, centers)
    count, swapped = 0, True
    while count < max_iter and swapped:
        swapped = False
        for i in range(n_instances):
            if i in centers:
                continue
            for j in range(len(centers)):
                # Candidate medoid set: sample i replaces the medoid in slot j.
                candidate = list(centers)
                candidate[j] = i
                members_, costs_, total_cost_, dists_ = get_cost(X, candidate)
                if total_cost_ < total_cost:
                    members, costs, total_cost, dists = members_, costs_, total_cost_, dists_
                    centers = candidate
                    swapped = True
                    # i is now a medoid; trying it in the remaining slots would
                    # only build sets that contain the same medoid twice.
                    break
        count += 1
    return centers, members, costs, total_cost, dists

In [828]:
# Fit a single medoid on the clean training samples.
centers, members, costs, total_cost, dists = kmedoids(X_train, 1)

# Distances from the medoid to every training sample, read from the medoid's
# row of the full pairwise matrix.
D = pairwise_distances(X_train, metric='euclidean')
centers_distance = D[centers][0]
# Decision threshold: the median (50th percentile) of those distances.
threshold = np.percentile(centers_distance, 50)

# Split the evaluation frame into labels (last column) and features (the rest).
classes = np.array(dfs.iloc[:, -1].values)
dfs_np = np.array(dfs.iloc[:, :-1].values)

# A sample is predicted True when it lies farther from the medoid than the
# threshold.
medoid = X_train[centers][0]
predictions = []
for i, sample in enumerate(dfs_np):
    result = calc_distance(medoid, sample) > threshold
    predictions.append(result)

In [831]:
# Pin the label order so the matrix is always 2x2 (rows/columns ordered
# False, True) even when only one class shows up in the labels or predictions;
# without it the 4-way unpack below fails on a 1x1 matrix.
tn, fp, fn, tp = confusion_matrix(classes, predictions, labels=[False, True]).ravel()
(tn, fp, fn, tp)

(212, 234, 37, 289)

In [832]:
# Per-class precision, recall, F-score and support (average=None keeps one
# entry per class instead of a single averaged score).
precision, recall, fscore, support = precision_recall_fscore_support(
    classes, predictions, average=None
)
(precision, recall, fscore, support)

(array([0.85140562, 0.55258126]),
 array([0.47533632, 0.88650307]),
 array([0.61007194, 0.68080094]),
 array([446, 326]))