In [41]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture
from sklearn.cluster import KMeans
from yellowbrick.cluster import SilhouetteVisualizer
from yellowbrick.cluster import InterclusterDistance
from yellowbrick.cluster import KElbowVisualizer
from sklearn.model_selection import train_test_split
from sklearn import metrics
import time
from scipy import linalg
import matplotlib as mpl
import itertools

from sklearn.metrics import v_measure_score, homogeneity_score, adjusted_mutual_info_score


import matplotlib.cm as cm

from sklearn.utils import shuffle
from sklearn.utils import check_random_state
from sklearn.cluster import MiniBatchKMeans

from sklearn.metrics import accuracy_score
from sklearn.model_selection import (GridSearchCV, train_test_split, validation_curve)   
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn import preprocessing
from yellowbrick.model_selection import LearningCurve, ValidationCurve

from sklearn.model_selection import cross_val_score
import time

np.random.seed(42)

In [34]:
def load_data(ds):
    df = pd.read_csv("data/" + ds)
    X, y = df.iloc[:, :-1], df.iloc[:, -1]
    X, y = X.to_numpy(), y.to_numpy()
    return X, y

In [35]:
# KMeans = 2
X, y = load_data('wdbc-modified.data')
pca = PCA(n_components= 2).fit(X)
X_pca = pca.transform(X)
start = time.time()
kmeans=KMeans(2).fit(X_pca)
train_time = time.time() - start
print("Train time: " + str(train_time))

start = time.time()
kmeans.predict(X_pca)
query_time = time.time() - start
print("Query time: " + str(query_time))
y = kmeans.predict(X_pca)

result = pd.concat([pd.DataFrame(X_pca), pd.DataFrame(y)], axis=1, sort=False)
result.columns = [0, 1, 2]

X, y = load_data('wdbc-modified.data')

clf = MLPClassifier(max_iter= 5000, hidden_layer_sizes=(5,2), activation='logistic', verbose=False, learning_rate_init=0.001)
X_train, X_test, y_train, y_test = train_test_split(result, y, test_size=0.2)

cv_score = cross_val_score(clf, X_train, y_train, cv=20).mean()
print("Cross validation score: " + str(cv_score))

start = time.time()
clf.fit(X_train, y_train)
train_time = time.time() - start
print("Train time: " + str(train_time))

start = time.time()
y_pred = clf.predict(X_test)
query_time = time.time() - start
print("Query time: " + str(query_time))

score = accuracy_score(y_test, y_pred)
print("Test Accuracy: " + str(score))

Train time: 0.040306806564331055
Query time: 0.0012507438659667969
Cross validation score: 0.9187747035573122
Train time: 1.4266328811645508
Query time: 0.0009500980377197266
Test Accuracy: 0.9385964912280702


In [36]:
# EM = 2
X, y = load_data('wdbc-modified.data')
pca = PCA(n_components= 2).fit(X)
X_pca = pca.transform(X)
start = time.time()
kmeans=GaussianMixture(2).fit(X_pca)
train_time = time.time() - start
print("Train time: " + str(train_time))

start = time.time()
kmeans.predict(X_pca)
query_time = time.time() - start
print("Query time: " + str(query_time))
y = kmeans.predict(X_pca)

result = pd.concat([pd.DataFrame(X_pca), pd.DataFrame(y)], axis=1, sort=False)
result.columns = [0, 1, 2]

X, y = load_data('wdbc-modified.data')

clf = MLPClassifier(max_iter= 5000, hidden_layer_sizes=(5,2), activation='logistic', verbose=False, learning_rate_init=0.001)
X_train, X_test, y_train, y_test = train_test_split(result, y, test_size=0.2)

cv_score = cross_val_score(clf, X_train, y_train, cv=20).mean()
print("Cross validation score: " + str(cv_score))

start = time.time()
clf.fit(X_train, y_train)
train_time = time.time() - start
print("Train time: " + str(train_time))

start = time.time()
y_pred = clf.predict(X_test)
query_time = time.time() - start
print("Query time: " + str(query_time))

score = accuracy_score(y_test, y_pred)
print("Test Accuracy: " + str(score))

Train time: 0.014967203140258789
Query time: 0.005136251449584961
Cross validation score: 0.9160079051383399
Train time: 1.0170092582702637
Query time: 0.0013041496276855469
Test Accuracy: 0.9210526315789473


In [42]:

# KMeans = 2
X, y = load_data('wdbc-modified.data')
pca = PCA(n_components= 2).fit(X)
X_pca = pca.transform(X)
start = time.time()
kmeans=KMeans(2).fit(X_pca)
labels = kmeans.predict(X_pca)

print(v_measure_score(y, labels))
print(adjusted_mutual_info_score(y, labels))
print(homogeneity_score(y, labels))

0.46479332792160793
0.46400471284520906
0.42229071246999117


In [44]:
# Gaussian = 2
X, y = load_data('wdbc-modified.data')
pca = PCA(n_components= 2).fit(X)
X_pca = pca.transform(X)
start = time.time()
kmeans=GaussianMixture(2).fit(X_pca)
labels = kmeans.predict(X_pca)

print(v_measure_score(y, labels))
print(adjusted_mutual_info_score(y, labels))
print(homogeneity_score(y, labels))

0.6470261554158523
0.646544762580409
0.6343604044495395
