In [1]:
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering as AGC
from sklearn.cluster import MeanShift, estimate_bandwidth
from sklearn.cluster import SpectralClustering

from sklearn.metrics import *
from sklearn.preprocessing import normalize

import seaborn as sns
import pandas as pd
import pathlib

# Name of the dataset variant; it is also the root folder under which later cells save their plots.
str1='classifier_ranker'
#str1='total_dataset'
# Create the output folder up front (parents included, no error if it already exists).
pathlib.Path(str1).mkdir(parents=True, exist_ok=True) 
# Load the dataset; later cells split it into features and the 'Channel' column.
df=pd.read_csv('Wholesale+classifier_ranker.csv')

In [2]:
from scipy.spatial import distance
import pandas as pd 
from sklearn.cluster import k_means
# nc is the number of clusters.
# To be implemented from scratch, without relying on a library implementation.

def compute_s(i, x, labels, clusters):
    """Return the scatter S_i of cluster ``i`` for the Davies-Bouldin index.

    S_i is the mean Euclidean distance between the points assigned to
    cluster ``i`` and the centre of that cluster.

    :param i: index of the cluster (row of ``clusters`` and value in ``labels``)
    :param x: data points, one row per sample
    :param labels: cluster label of every row of ``x``
    :param clusters: cluster centres, one row per cluster
    :returns: mean point-to-centre distance as a float, 0.0 for an empty cluster
    """
    centre = clusters[i]
    # Only the members of cluster i contribute to its scatter.
    members = [point for point, label in zip(x, labels) if label == i]
    if not members:
        # An empty cluster has no spread; avoid averaging over nothing.
        return 0.0
    # cdist yields an (n_members, 1) column of distances to the centre.
    return float(distance.cdist(members, [centre]).mean())

def compute_Rij(i, j, x, labels, clusters, nc):
    """Return the Davies-Bouldin similarity R_ij of clusters ``i`` and ``j``.

    R_ij = (S_i + S_j) / d(c_i, c_j), where S is the value returned by
    ``compute_s`` and d is the Euclidean distance between the two centres.

    :param i: index of the first cluster
    :param j: index of the second cluster
    :param x: data points, one row per sample
    :param labels: cluster label of every row of ``x``
    :param clusters: cluster centres, one row per cluster
    :param nc: number of clusters (unused here; kept for interface compatibility)
    :returns: R_ij, or 0 when the two centres coincide
    """
    d = distance.euclidean(clusters[i], clusters[j])
    if d == 0:
        # Coinciding centres would divide by zero. numpy floats do not raise on
        # that (they yield inf/nan), so the case is handled explicitly instead
        # of through a blanket ``except`` that would also hide real errors.
        return 0
    return (compute_s(i, x, labels, clusters) + compute_s(j, x, labels, clusters)) / d

def compute_R(i, x, labels, clusters, nc):
    """Return R_i, the worst-case similarity of cluster ``i`` to any other cluster.

    R_i = max over j != i of ``compute_Rij(i, j, ...)``.

    :param i: index of the cluster being scored
    :param x: data points, one row per sample
    :param labels: cluster label of every row of ``x``
    :param clusters: cluster centres, one row per cluster
    :param nc: number of clusters
    :returns: the largest R_ij for this ``i``; 0.0 when there is no other cluster
    """
    # Keep ``i`` fixed and vary only ``j``: the maximum is taken per cluster,
    # not over every pair of clusters.
    candidates = [
        compute_Rij(i, j, x, labels, clusters, nc)
        for j in range(nc)
        if j != i
    ]
    if not candidates:
        # A single cluster has nothing to be compared against.
        return 0.0
    return max(candidates)

def compute_DB_index(x, labels, clusters, nc):
    """Return the Davies-Bouldin index: the mean of ``compute_R`` over all clusters.

    :param x: data points, one row per sample
    :param labels: cluster label of every row of ``x``
    :param clusters: cluster centres, one row per cluster
    :param nc: number of clusters
    :returns: sum of the per-cluster R values divided by ``nc``
    """
    # Accumulate the per-cluster R values in cluster order.
    total_r = sum(compute_R(k, x, labels, clusters, nc) for k in range(nc))
    return float(total_r) / float(nc)
    
    


In [3]:
import numpy as np


def normalize_to_smallest_integers(labels):
    """Relabel clusters with consecutive integers starting at 0, preserving label order.

    Each distinct label is replaced by its rank among the sorted distinct
    labels, so ``[5, 9, 5, 7]`` becomes ``[0, 2, 0, 1]``. A ``-1`` label is
    treated like any other value and therefore receives rank 0.

    :param labels: the list (or array) of integer labels to be normalized
    :returns: a 1-D numpy.array of dtype int32 with values in ``0 .. n_distinct - 1``
    """
    flat_labels = np.asarray(labels).ravel()
    # return_inverse gives, for every element, the index of its value in the
    # sorted array of distinct labels, which is exactly the rank we want.
    _, ranks = np.unique(flat_labels, return_inverse=True)
    return ranks.reshape(-1).astype(np.int32)


def dunn(labels, distances):
    """
    Dunn index for cluster validation (the bigger, the better)
    
    .. math:: D = \\min_{i = 1 \\ldots n_c; j = i + 1\\ldots n_c} \\left\\lbrace \\frac{d \\left( c_i,c_j \\right)}{\\max_{k = 1 \\ldots n_c} \\left(diam \\left(c_k \\right) \\right)} \\right\\rbrace
    
    where :math:`d(c_i,c_j)` represents the distance between
    clusters :math:`c_i` and :math:`c_j`, given by the distances between its
    two closest data points, and :math:`diam(c_k)` is the diameter of cluster
    :math:`c_k`, given by the distance between its two farthest data points.
    
    The bigger the value of the resulting Dunn index, the better the clustering
    result is considered, since higher values indicate that clusters are
    compact (small :math:`diam(c_k)`) and far apart.

    :param labels: a list containing cluster labels for each of the n elements
    :param distances: an n x n numpy.array containing the pairwise distances between elements
    :returns: the smallest inter-cluster distance divided by the largest cluster
        diameter; with a single cluster the numerator is taken as 0
    
    .. [Kovacs2005] Kovács, F., Legány, C., & Babos, A. (2005). Cluster validity measurement techniques. 6th International Symposium of Hungarian Researchers on Computational Intelligence.
    """

    labels = normalize_to_smallest_integers(labels)

    cluster_distances = min_cluster_distances(labels, distances)
    max_diameter = max(diameter(labels, distances))

    # Only pairs of *different* clusters matter, so drop the diagonal explicitly.
    # Skipping the smallest unique value instead would also discard a genuine
    # zero separation between two clusters.
    is_other_cluster = ~np.eye(len(cluster_distances), dtype=bool)
    separations = cluster_distances[is_other_cluster]

    if np.size(separations) > 0:
        min_separation = separations.min()
    else:
        # A single cluster has no inter-cluster distance.
        min_separation = 0.0

    # NOTE(review): max_diameter is 0 when every cluster is a single point;
    # the division then follows numpy semantics (inf/nan with a warning).
    return min_separation / max_diameter


def min_cluster_distances(labels, distances):
    """Calculates the distances between the two nearest points of each cluster.

    :param labels: a list containing cluster labels for each of the n elements
    :param distances: an n x n numpy.array containing the pairwise distances between elements
    :returns: a symmetric n_clusters x n_clusters numpy.array whose entry (a, b)
        is the smallest distance between a point of cluster a and a point of
        cluster b; the diagonal is 0
    """
    labels = normalize_to_smallest_integers(labels)
    n_unique_labels = len(np.unique(labels))

    # Start every pair of distinct clusters at +inf so that the first observed
    # distance is accepted and only smaller ones replace it afterwards.
    min_distances = np.full((n_unique_labels, n_unique_labels), np.inf)
    np.fill_diagonal(min_distances, 0.0)

    n_points = len(labels)
    for i in range(n_points - 1):
        for ii in range(i + 1, n_points):
            a, b = labels[i], labels[ii]
            # Keep the minimum (not the maximum) distance between the two clusters.
            if a != b and distances[i, ii] < min_distances[a, b]:
                min_distances[a, b] = min_distances[b, a] = distances[i, ii]
    return min_distances


def diameter(labels, distances):
    """Compute the diameter of every cluster (largest distance between two of its points).

    :param labels: a list containing cluster labels for each of the n elements
    :param distances: an n x n numpy.array containing the pairwise distances between elements
    :returns: a numpy.array with one diameter per cluster, indexed by the
        normalized cluster label; clusters with a single point keep 0
    """
    cluster_ids = normalize_to_smallest_integers(labels)
    n_points = len(cluster_ids)
    widest = np.zeros(len(np.unique(cluster_ids)))

    # Visit every unordered pair of points once (first < second).
    for first in range(n_points - 1):
        owner = cluster_ids[first]
        for second in range(first + 1, n_points):
            if cluster_ids[second] != owner:
                continue
            gap = distances[first, second]
            if gap > widest[owner]:
                widest[owner] = gap
    return widest




In [28]:
# Feature matrix: every column except the 'Channel' label.
# DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() is its replacement.
X=df.loc[:, df.columns != 'Channel'].to_numpy()
# Rescale the samples with sklearn.preprocessing.normalize (default settings).
X=normalize(X)
# Reference labels, kept as an (n_samples, 1) column array as before.
y=df[['Channel']].to_numpy()

In [4]:
import matplotlib.pyplot as plt

# K-means: sweep k = 2..14, collect internal and external validation scores,
# and save one plot per score under <str1>/<str2>/.
db_index=[]
dunn_index=[]
slhte_index=[]
rand_indlist=[]
acc_list=[]
precision_list=[]
recall_list=[]
fscore_list=[]
str2='Kmeans'
pathlib.Path(str1+'/'+str2).mkdir(parents=True, exist_ok=True)

# Loop-invariant inputs: the pairwise distance matrix does not depend on k, and
# the reference labels are flattened once instead of a hard-coded reshape(440,).
pairwise_dist = euclidean_distances(X)
y_true = y.ravel()

for i in range(2,15):
    clf=KMeans(n_clusters=i, random_state=0).fit(X)

    #internal_measures
    index_db_val = compute_DB_index(X,clf.labels_,clf.cluster_centers_, i)
    db_index.append(index_db_val)
    index_dunn = dunn(clf.labels_,pairwise_dist)
    dunn_index.append(index_dunn)
    index_slhte = silhouette_score(X,clf.labels_)
    slhte_index.append(index_slhte)

    #external_measures
    # NOTE(review): cluster ids are arbitrary, so precision/recall/accuracy against
    # the class labels depend on how ids happen to line up - confirm this is intended.
    precision,recall,fcore,support=precision_recall_fscore_support(y_true,clf.labels_,average='weighted')
    acuracy_=accuracy_score(y_true, clf.labels_)
    rand_score=adjusted_rand_score(y_true, clf.labels_)
    rand_indlist.append(rand_score)
    acc_list.append(acuracy_)
    precision_list.append(precision)
    recall_list.append(recall)
    fscore_list.append(fcore)

t=np.arange(2,15,1)
# (values, line style, y-axis label, output file) - same plots and order as before.
for values, fmt, ylabel, filename in (
    (dunn_index, 'r--', 'dunn_index', 'dunn_index.png'),
    (db_index, 'b^', 'db_index', 'dB_index.png'),
    (slhte_index, 'gs', 'silhouette_index', 'silhouette_index.png'),
    (precision_list, 'g+', 'precision_list', 'precision.png'),
    (acc_list, 'r--', 'acc_list', 'accuracy.png'),
    (recall_list, 'r+', 'recall_list', 'recall.png'),
    (fscore_list, 'r+', 'fscore_list', 'fscore.png'),
):
    plt.clf()
    plt.plot(t, values, fmt)
    plt.xlabel('k-value')
    plt.ylabel(ylabel)
    plt.savefig(str1+'/'+str2+'/'+filename)


NameError: name 'X' is not defined

In [None]:
import matplotlib.pyplot as plt

# Agglomerative clustering (Ward linkage): sweep k = 2..14, collect validation
# scores and save one plot per score under <str1>/<str2>/.
db_index=[]
dunn_index=[]
slhte_index=[]
rand_indlist=[]
acc_list=[]
precision_list=[]
recall_list=[]
fscore_list=[]
str2='AGC_euc_ward'
pathlib.Path(str1+'/'+str2).mkdir(parents=True, exist_ok=True)

# Loop-invariant inputs: the pairwise distance matrix does not depend on k, and
# the reference labels are flattened once instead of a hard-coded reshape(440,).
pairwise_dist = euclidean_distances(X)
y_true = y.ravel()

for i in range(2,15):
    # Ward linkage only supports Euclidean distances, which is also the default,
    # so the removed `affinity` keyword is simply omitted.
    clf = AGC(n_clusters=i, linkage='ward').fit(X)

    #internal_measures
    index_dunn = dunn(clf.labels_,pairwise_dist)
    dunn_index.append(index_dunn)
    index_slhte = silhouette_score(X,clf.labels_)
    slhte_index.append(index_slhte)

    #external_measures
    # NOTE(review): cluster ids are arbitrary, so precision/recall/accuracy against
    # the class labels depend on how ids happen to line up - confirm this is intended.
    precision,recall,fcore,support=precision_recall_fscore_support(y_true,clf.labels_,average='weighted')
    acuracy_=accuracy_score(y_true, clf.labels_)
    rand_score=adjusted_rand_score(y_true, clf.labels_)
    rand_indlist.append(rand_score)
    acc_list.append(acuracy_)
    precision_list.append(precision)
    recall_list.append(recall)
    fscore_list.append(fcore)

t=np.arange(2,15,1)
# (values, line style, y-axis label, output file) - same plots and order as before.
for values, fmt, ylabel, filename in (
    (dunn_index, 'r--', 'dunn_index', 'dunn_index.png'),
    (slhte_index, 'gs', 'silhouette_index', 'silhouette_index.png'),
    (precision_list, 'g+', 'precision_list', 'precision.png'),
    (acc_list, 'r--', 'acc_list', 'accuracy.png'),
    (recall_list, 'r+', 'recall_list', 'recall.png'),
    (fscore_list, 'r+', 'fscore_list', 'fscore.png'),
):
    plt.clf()
    plt.plot(t, values, fmt)
    plt.xlabel('k-value')
    plt.ylabel(ylabel)
    plt.savefig(str1+'/'+str2+'/'+filename)
    

In [5]:
import skfuzzy
import matplotlib.pyplot as plt

# Fuzzy C-means (fuzzifier m = 2): sweep c = 2..14, harden the memberships with
# argmax, collect validation scores and save one plot per score under <str1>/<str2>/.
db_index=[]
dunn_index=[]
slhte_index=[]
rand_indlist=[]
acc_list=[]
precision_list=[]
recall_list=[]
fscore_list=[]
str2='Fuzzy-C-means-2(m)'
pathlib.Path(str1+'/'+str2).mkdir(parents=True, exist_ok=True)

# Loop-invariant inputs: the pairwise distance matrix does not depend on c, and
# the reference labels are flattened once instead of a hard-coded reshape(440,).
pairwise_dist = euclidean_distances(X)
y_true = y.ravel()

for i in range(2,15):
    # cmeans expects the data as (n_features, n_samples). That layout is the
    # transpose X.T; a reshape to the same shape would scramble the samples.
    cntr, u, u0, d, jm, p, fpc = skfuzzy.cluster.cmeans(
        X.T, i, 2, error=0.005, maxiter=1000, init=None)
    # Hard assignment: each sample goes to the cluster with the highest membership.
    labels=np.argmax(u,axis=0)
    #internal_measures
    index_db_val = compute_DB_index(X,labels,cntr, i)
    db_index.append(index_db_val)
    index_dunn = dunn(labels,pairwise_dist)
    dunn_index.append(index_dunn)
    index_slhte = silhouette_score(X,labels)
    slhte_index.append(index_slhte)

    #external_measures
    # NOTE(review): cluster ids are arbitrary, so precision/recall/accuracy against
    # the class labels depend on how ids happen to line up - confirm this is intended.
    precision,recall,fcore,support=precision_recall_fscore_support(y_true,labels,average='weighted')
    acuracy_=accuracy_score(y_true, labels)
    rand_score=adjusted_rand_score(y_true, labels)
    rand_indlist.append(rand_score)
    acc_list.append(acuracy_)
    precision_list.append(precision)
    recall_list.append(recall)
    fscore_list.append(fcore)

t=np.arange(2,15,1)
# (values, line style, y-axis label, output file) - same plots and order as before.
for values, fmt, ylabel, filename in (
    (dunn_index, 'r--', 'dunn_index', 'dunn_index.png'),
    (db_index, 'b^', 'db_index', 'dB_index.png'),
    (slhte_index, 'gs', 'silhouette_index', 'silhouette_index.png'),
    (precision_list, 'g+', 'precision_list', 'precision.png'),
    (acc_list, 'r--', 'acc_list', 'accuracy.png'),
    (recall_list, 'r+', 'recall_list', 'recall.png'),
    (fscore_list, 'r+', 'fscore_list', 'fscore.png'),
):
    plt.clf()
    plt.plot(t, values, fmt)
    plt.xlabel('k-value')
    plt.ylabel(ylabel)
    plt.savefig(str1+'/'+str2+'/'+filename)


NameError: name 'X' is not defined

In [32]:
import matplotlib.pyplot as plt

# Mean shift: sweep the bandwidth, collect validation scores and save one plot
# per score under <str1>/<str2>/.
db_index=[]
dunn_index=[]
slhte_index=[]
rand_indlist=[]
acc_list=[]
precision_list=[]
recall_list=[]
fscore_list=[]
str2='Mean_shift'
pathlib.Path(str1+'/'+str2).mkdir(parents=True, exist_ok=True)

# Loop-invariant inputs: the pairwise distance matrix does not depend on the
# bandwidth, and the reference labels are flattened once instead of a
# hard-coded reshape(440,).
pairwise_dist = euclidean_distances(X)
y_true = y.ravel()

# Single bandwidth grid, used both for the sweep and as the x-axis of the plots.
t=np.arange(0.1,0.6,0.05)
for i in t:
    ms = MeanShift(bandwidth=i, bin_seeding=True)
    ms.fit(X)
    #internal_measures
    # Mean shift chooses the number of clusters itself, so count the labels it produced.
    index_db_val = compute_DB_index(X,ms.labels_,ms.cluster_centers_, len(np.unique(ms.labels_)))
    db_index.append(index_db_val)
    index_dunn = dunn(ms.labels_,pairwise_dist)
    dunn_index.append(index_dunn)
    index_slhte = silhouette_score(X,ms.labels_)
    slhte_index.append(index_slhte)

    #external_measures
    # NOTE(review): cluster ids are arbitrary, so precision/recall/accuracy against
    # the class labels depend on how ids happen to line up - confirm this is intended.
    precision,recall,fcore,support=precision_recall_fscore_support(y_true,ms.labels_,average='weighted')
    acuracy_=accuracy_score(y_true, ms.labels_)
    rand_score=adjusted_rand_score(y_true, ms.labels_)
    rand_indlist.append(rand_score)
    acc_list.append(acuracy_)
    precision_list.append(precision)
    recall_list.append(recall)
    fscore_list.append(fcore)

# (values, line style, y-axis label, output file) - same plots and order as before.
for values, fmt, ylabel, filename in (
    (dunn_index, 'r--', 'dunn_index', 'dunn_index.png'),
    (db_index, 'b^', 'db_index', 'dB_index.png'),
    (slhte_index, 'gs', 'silhouette_index', 'silhouette_index.png'),
    (precision_list, 'g+', 'precision_list', 'precision.png'),
    (acc_list, 'r--', 'acc_list', 'accuracy.png'),
    (recall_list, 'r+', 'recall_list', 'recall.png'),
    (fscore_list, 'r+', 'fscore_list', 'fscore.png'),
):
    plt.clf()
    plt.plot(t, values, fmt)
    plt.xlabel('bandwidth_value')
    plt.ylabel(ylabel)
    plt.savefig(str1+'/'+str2+'/'+filename)


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


0.5500000000000002