In [21]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.decomposition import FastICA, PCA
from sklearn import random_projection
from sklearn.decomposition import TruncatedSVD

from sklearn.metrics import accuracy_score
from sklearn.model_selection import (GridSearchCV, train_test_split, validation_curve)   
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn import preprocessing
from yellowbrick.model_selection import LearningCurve, ValidationCurve

from sklearn.model_selection import cross_val_score
import time

np.random.seed(42)

# Utilities

In [22]:
def load_data(ds):
    df = pd.read_csv("data/" + ds)
    X, y = df.iloc[:, :-1], df.iloc[:, -1]
    X, y = X.to_numpy(), y.to_numpy()
    return X, y

# Breast Cancer

## PCA

In [23]:
X, y = load_data('wdbc-modified.data')
pca = PCA(n_components= 2).fit(X)
X_pca = pca.transform(X)

clf = MLPClassifier(max_iter= 5000, hidden_layer_sizes=(5,2), activation='logistic', verbose=False, learning_rate_init=0.001)
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2)

cv_score = cross_val_score(clf, X_train, y_train, cv=20).mean()
print("Cross validation score: " + str(cv_score))

start = time.time()
clf.fit(X_train, y_train)
train_time = time.time() - start
print("Train time: " + str(train_time))

start = time.time()
y_pred = clf.predict(X_test)
query_time = time.time() - start
print("Query time: " + str(query_time))

score = accuracy_score(y_test, y_pred)
print("Test Accuracy: " + str(score))

Cross validation score: 0.916600790513834
Train time: 1.3261313438415527
Query time: 0.0005731582641601562
Test Accuracy: 0.9385964912280702


## ICA

In [24]:
X, y = load_data('wdbc-modified.data')
ica = FastICA(n_components= 5, max_iter=10000, tol=0.1).fit(X)
X_ica = ica.transform(X)

clf = MLPClassifier(max_iter= 5000, hidden_layer_sizes=(5,2), activation='logistic', verbose=False, learning_rate_init=0.001)
X_train, X_test, y_train, y_test = train_test_split(X_ica, y, test_size=0.2)

cv_score = cross_val_score(clf, X_train, y_train, cv=20).mean()
print("Cross validation score: " + str(cv_score))

start = time.time()
clf.fit(X_train, y_train)
train_time = time.time() - start
print("Train time: " + str(train_time))

start = time.time()
y_pred = clf.predict(X_test)
query_time = time.time() - start
print("Query time: " + str(query_time))

score = accuracy_score(y_test, y_pred)
print("Test Accuracy: " + str(score))

Cross validation score: 0.6482213438735178
Train time: 0.33411288261413574
Query time: 0.0006079673767089844
Test Accuracy: 0.543859649122807


## Randomized Projections

In [27]:
X, y = load_data('wdbc-modified.data')
rp = random_projection.SparseRandomProjection(n_components=11)
X_rp=rp.fit_transform(X)

clf = MLPClassifier(max_iter= 5000, hidden_layer_sizes=(5,2), activation='logistic', verbose=False, learning_rate_init=0.001)
X_train, X_test, y_train, y_test = train_test_split(X_rp, y, test_size=0.2)

cv_score = cross_val_score(clf, X_train, y_train, cv=20).mean()
print("Cross validation score: " + str(cv_score))

start = time.time()
clf.fit(X_train, y_train)
train_time = time.time() - start
print("Train time: " + str(train_time))

start = time.time()
y_pred = clf.predict(X_test)
query_time = time.time() - start
print("Query time: " + str(query_time))

score = accuracy_score(y_test, y_pred)
print("Test Accuracy: " + str(score))

Cross validation score: 0.7482213438735179
Train time: 0.05169677734375
Query time: 0.0003731250762939453
Test Accuracy: 0.5877192982456141


## SVD

In [28]:
X, y = load_data('wdbc-modified.data')
tsvd = TruncatedSVD(n_components=2)
X_tsvd = tsvd.fit_transform(X)

clf = MLPClassifier(max_iter= 5000, hidden_layer_sizes=(5,2), activation='logistic', verbose=False, learning_rate_init=0.001)
X_train, X_test, y_train, y_test = train_test_split(X_tsvd, y, test_size=0.2)

cv_score = cross_val_score(clf, X_train, y_train, cv=20).mean()
print("Cross validation score: " + str(cv_score))

start = time.time()
clf.fit(X_train, y_train)
train_time = time.time() - start
print("Train time: " + str(train_time))

start = time.time()
y_pred = clf.predict(X_test)
query_time = time.time() - start
print("Query time: " + str(query_time))

score = accuracy_score(y_test, y_pred)
print("Test Accuracy: " + str(score))

Cross validation score: 0.7863636363636364
Train time: 0.06421875953674316
Query time: 0.0005259513854980469
Test Accuracy: 0.6666666666666666
