In [33]:
import pandas as pd
from os import listdir
from os.path import isfile, join
import sklearn.metrics as metrics
from genie import genie
from scipy.cluster.hierarchy import linkage, cut_tree
from sklearn.cluster import KMeans, SpectralClustering, Birch

In [34]:
# Directory holding the benchmark files; later cells read '<name>.lrn' and
# '<name>.cls' from here.
path = 'data/01FCPSdata'
# os.listdir returns entries in arbitrary order, so sort them to make the
# processing order (and the printed progress log) the same on every machine.
files = sorted(f for f in listdir(path) if isfile(join(path, f)))

In [35]:
# Dataset names are the '.lrn' file names with that extension removed. Only the
# trailing extension is stripped: splitting on the first '.' would truncate any
# name that itself contains a dot, and the later join(path, ds_name+'.lrn')
# lookup would then point at a file that does not exist.
lrn_suffix = '.lrn'
dataset_names = [f[:-len(lrn_suffix)] for f in files if f.endswith(lrn_suffix)]

In [None]:
def _score_partition(true_labels, predicted):
    """Score a clustering against reference labels.

    Returns the tuple ``(fowlkes_mallows, adjusted_mutual_info, adjusted_rand)``
    computed with ``sklearn.metrics``.
    """
    return (
        metrics.fowlkes_mallows_score(true_labels, predicted),
        metrics.adjusted_mutual_info_score(true_labels, predicted),
        metrics.adjusted_rand_score(true_labels, predicted),
    )


# Fixed seed so the stochastic estimators (KMeans, SpectralClustering) produce
# the same scores on every run.
RANDOM_STATE = 0

# Benchmark every dataset: run genie over a sweep of its second argument, the
# scipy hierarchical linkages, and three scikit-learn estimators, then write one
# CSV of scores per dataset.
for ds_name in dataset_names:
    print(ds_name)
    # One row per method:
    # (method, FM, AMI, ARI, best g for FM, best g for AMI, best g for ARI).
    results = []
    data = pd.read_csv(join(path, ds_name+'.lrn'), sep='\t', comment='%', header=None, index_col=0)
    data = data.values
    labels = pd.read_csv(join(path, ds_name+'.cls'), sep='\t', comment='%', header=None, index_col=0)
    # Number of clusters requested from every method = number of distinct
    # reference labels.
    k = len(labels.iloc[:, 0].unique())
    labels = labels.values.flatten()

    # Track, independently per metric, the best score and the g that achieved it.
    # Start from -inf / None rather than 0: adjusted Rand and AMI can be zero or
    # negative, and a 0 sentinel would then report a score and a g that were
    # never actually produced.
    best_scores = [float('-inf')] * 3
    best_gs = [None] * 3
    # g sweeps 0.1, 0.2, ..., 1.0 (presumably genie's Gini threshold -- confirm
    # against the genie package).
    for g in [x / 10 for x in range(1, 11)]:
        res = genie(data, g, k)
        for i, score in enumerate(_score_partition(labels, res)):
            if score > best_scores[i]:
                best_scores[i] = score
                best_gs[i] = g
    results.append(('genie',) + tuple(best_scores) + tuple(best_gs))

    # NOTE(review): centroid/median linkage can yield non-monotonic dendrograms;
    # confirm cut_tree gives the intended partition for those two methods.
    for method in ['single', 'complete', 'average', 'weighted', 'centroid', 'median', 'ward']:
        res = cut_tree(linkage(data, method), n_clusters=k).flatten()
        results.append((method,) + _score_partition(labels, res) + (None, None, None))

    # scikit-learn baselines, fitted in the original order.
    estimators = [
        ('kmeans', KMeans(n_clusters=k, random_state=RANDOM_STATE)),
        ('spectral', SpectralClustering(n_clusters=k, random_state=RANDOM_STATE)),
        ('birch', Birch(branching_factor=50, n_clusters=k, threshold=0.5, compute_labels=True)),
    ]
    for name, estimator in estimators:
        res = estimator.fit(data).labels_
        results.append((name,) + _score_partition(labels, res) + (None, None, None))

    # Columns are left positional (0..6) so the CSV layout matches earlier runs.
    df = pd.DataFrame(results)
    df.to_csv(ds_name+'.csv')

Hepta
Atom




WingNut
TwoDiamonds
Lsun
EngyTime
