In [37]:
import os
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import Image
%matplotlib inline
import matplotlib.cm as cm
import seaborn as sns
sns.set_style("dark")

from yellowbrick.cluster import KElbowVisualizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import MinMaxScaler

# Clustering
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.mixture import GaussianMixture as EM

# Dim reduction
from sklearn.decomposition import PCA, FastICA, TruncatedSVD
from sklearn.random_projection import SparseRandomProjection, GaussianRandomProjection
from scipy.stats import kurtosis 

# Models
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score

from sklearn.datasets import load_digits
from utils import learning_curve_plotter, model_param_curve, metrics

In [38]:
# Random seed shared by the stochastic estimators defined in this notebook.
seed = 777

# Per-reducer `k` values, one per dataset suffix (`_diabet` / `_digit`).
# NOTE(review): presumably the component counts selected for each reducer --
# confirm against the cells that consume them.
pca_k_diabet = 3
pca_k_digit = 20

ica_k_diabet = 3
ica_k_digit = 13

rp_k_diabet = 2
rp_k_digit = 28

tsvd_k_diabet = 3
tsvd_k_digit = 20

In [39]:
# Load the diabetes dataset from CSV.
diabet_path = "./data/diabetes.csv"
df_diabet = pd.read_csv(diabet_path)

# Column holding the label, and the columns used as model inputs.
target = 'Outcome'
features = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
]

X_origin = df_diabet[features]
y = df_diabet[target]

# Normalise the features: the scaler is fitted on, and applied to, the full table.
sc = MinMaxScaler()
X = sc.fit_transform(X_origin)

# Rebuild df_diabet from the scaled matrix plus the label column.
# The scaled columns carry positional integer labels, not the original names.
X_df = pd.DataFrame(X)
df_diabet = pd.concat([X_df, y], axis=1)

In [40]:
def calc_score_func(X, y, nn_clf, dim=2, mode="None", cv=5):
    """Cross-validate ``nn_clf`` on ``X`` after an optional dimensionality reduction.

    Parameters
    ----------
    X : array-like
        Feature matrix. It is fed to the selected reducer, or used unchanged
        when ``mode == "mlp"``.
    y : array-like
        Target labels passed to ``cross_val_score``.
    nn_clf : estimator
        Classifier evaluated by ``cross_val_score``.
    dim : int, default 2
        ``n_components`` given to the reducer. Ignored when ``mode == "mlp"``.
    mode : str
        Which transformation to apply before scoring:
        ``"mlp"`` (none), ``"pca"`` (PCA), ``"ica"`` (FastICA),
        ``"rp"`` (SparseRandomProjection) or ``"tsvd"`` (TruncatedSVD).
        The default ``"None"`` is not a valid choice and is rejected.
    cv : int, default 5
        Forwarded to ``cross_val_score`` as ``cv``.

    Returns
    -------
    tuple
        ``(score, train_time)``: the mean cross-validated accuracy and the
        wall-clock seconds spent inside ``cross_val_score``. The time spent
        fitting the reducer is not included.

    Raises
    ------
    ValueError
        If ``mode`` is not one of the supported names.
    """
    # Every reducer is seeded with the notebook-level `seed` so that repeated
    # runs produce the same projected features.
    if mode == "mlp":
        X_feature = X
    elif mode == "pca":
        X_feature = PCA(random_state=seed, n_components=dim).fit_transform(X)
    elif mode == "ica":
        # A single fit_transform is enough; a separate fit() beforehand only
        # repeats the same seeded computation.
        X_feature = FastICA(random_state=seed, n_components=dim).fit_transform(X)
    elif mode == "rp":
        X_feature = SparseRandomProjection(random_state=seed, n_components=dim).fit_transform(X)
    elif mode == "tsvd":
        X_feature = TruncatedSVD(n_components=dim, random_state=seed).fit_transform(X)
    else:
        # Fail early with a readable message instead of hitting an unbound
        # X_feature further down.
        raise ValueError(
            f"Unsupported mode {mode!r}; expected one of "
            "'mlp', 'pca', 'ica', 'rp', 'tsvd'"
        )

    start_time = time.time()
    score = cross_val_score(nn_clf, X_feature, y, scoring="accuracy", cv=cv).mean()
    train_time = time.time() - start_time

    return (score, train_time)

# Shared MLP configuration reused as the classifier in the experiments below.
# NOTE(review): presumably the best hyper-parameters from an earlier search -- confirm.
best_nn_clf = MLPClassifier(
    activation="tanh",
    hidden_layer_sizes=30,
    learning_rate_init=0.001,
    max_iter=1000,
    random_state=seed,
)

## Dataset 1: performance and timing

- K-Means and EM clustering + MLP
- PCA

In [41]:
# Time a 2-cluster k-means fit on the scaled feature matrix.
# random_state pins the centroid initialisation so the cluster assignments
# are reproducible from one run to the next.
start_time = time.time()
kmeans = KMeans(n_clusters=2, random_state=seed).fit(X)
train_time = time.time() - start_time
print(f"Cluster Time: {train_time}")

Cluster Time: 0.03479290008544922


In [42]:
# Cluster index assigned by the fitted k-means model to each row of X.
y_cluster = kmeans.predict(X)

In [43]:
# Sanity check: dimensions of the scaled feature matrix.
X.shape

(768, 8)

In [47]:
def calc_score_func(X, y, dim=2, model=None, nn_clf=best_nn_clf, cv=5, n_clusters=2):
    """Score ``nn_clf`` on ``X`` augmented with a cluster-assignment feature.

    NOTE: this definition has the same name as the earlier
    dimensionality-reduction helper and replaces it once this cell has run.

    Parameters
    ----------
    X : array-like
        Feature matrix. It is clustered, then extended with one extra column
        holding each row's predicted cluster.
    y : array-like
        Target labels passed to ``cross_val_score``.
    dim : int, default 2
        Accepted for signature compatibility; not used by this function.
    model : class
        Clustering estimator class (e.g. ``KMeans`` or ``EM``). It is
        instantiated as ``model(n_clusters, random_state=seed)`` and must
        provide ``fit`` and ``predict``.
    nn_clf : estimator, default ``best_nn_clf``
        Classifier evaluated by ``cross_val_score``.
    cv : int, default 5
        Forwarded to ``cross_val_score`` as ``cv``.
    n_clusters : int, default 2
        Number of clusters / mixture components requested from ``model``.

    Returns
    -------
    tuple
        ``(score, train_time)``: the mean cross-validated accuracy and the
        wall-clock seconds for clustering plus cross-validation.

    Raises
    ------
    TypeError
        If ``model`` is not supplied.
    """
    if model is None:
        raise TypeError(
            "model must be a clustering estimator class, e.g. KMeans or EM"
        )

    start_time = time.time()

    # Seed the clustering so the extra feature, and therefore the score, is
    # reproducible across runs.
    cluster = model(n_clusters, random_state=seed).fit(X)
    # The clusterer is fitted on the full X before cross-validation splits it.
    y_cluster = cluster.predict(X)
    X_cluster_features = np.c_[X, y_cluster]

    score = cross_val_score(nn_clf, X_cluster_features, y, scoring="accuracy", cv=cv).mean()
    train_time = time.time() - start_time

    return (score, train_time)

In [45]:
# (mean CV accuracy, elapsed seconds) with a KMeans cluster label appended to X.
calc_score_func(X, y, dim=2, model=KMeans)

(0.7656905186316951, 2.9945151805877686)

In [46]:
# (mean CV accuracy, elapsed seconds) with an EM (GaussianMixture) cluster label appended to X.
calc_score_func(X, y, dim=2, model=EM)

(0.7630846277905101, 2.9043776988983154)