In [57]:
import os
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import Image
%matplotlib inline
import matplotlib.cm as cm
import seaborn as sns
sns.set_style("dark")

from yellowbrick.cluster import KElbowVisualizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import MinMaxScaler

# Clustering
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.mixture import GaussianMixture as EM

# Dim reduction
from sklearn.decomposition import PCA, FastICA, TruncatedSVD
from sklearn.random_projection import SparseRandomProjection, GaussianRandomProjection
from scipy.stats import kurtosis 

# Models
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score

from sklearn.datasets import load_digits
from utils import learning_curve_plotter, model_param_curve, metrics

In [58]:
# Random seed handed to the reducers and the MLP as `random_state`.
seed = 777

# Component counts per reducer and dataset (diabetes / digits).
# The diabetes values match the `dim` values used when scoring the reducers
# further down; presumably these were picked in an earlier analysis -- confirm.
pca_k_diabet = 6
pca_k_digit = 22

ica_k_diabet = 6
ica_k_digit = 13

rp_k_diabet = 3
rp_k_digit = 28

tsvd_k_diabet = 6
tsvd_k_digit = 22

In [59]:
# Load the diabetes dataset from disk.
diabet_path = "./data/diabetes.csv"
df_diabet = pd.read_csv(diabet_path)

# Column holding the label, and the columns used as model inputs.
target = 'Outcome'
features = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
]

X_origin = df_diabet[features]
y = df_diabet[target]

# Normalise the features: fit a min-max scaler on the raw feature frame and
# apply it to the same frame. `sc` stays bound to the fitted scaler.
sc = MinMaxScaler().fit(X_origin)
X = sc.transform(X_origin)

# NOTE: `df_diabet` is rebound here. From this point on it holds the scaled
# features (X_df is built without column names, so the original feature labels
# are not carried over) side by side with the target column.
X_df = pd.DataFrame(X)
df_diabet = pd.concat([X_df, y], axis=1)

In [60]:
def calc_score_func(X, y, nn_clf, dim=2, mode="None", cv=5):
    """Cross-validate ``nn_clf`` on ``X`` after an optional dimensionality reduction.

    Parameters
    ----------
    X
        Feature matrix. Passed through unchanged for ``mode="mlp"``; otherwise
        fed to the selected reducer's ``fit_transform``.
    y
        Targets forwarded to ``cross_val_score``.
    nn_clf
        Estimator evaluated by ``cross_val_score``.
    dim : int, default 2
        ``n_components`` for the reducer. Ignored when ``mode="mlp"``.
    mode : {"mlp", "pca", "ica", "rp", "tsvd"}
        Which feature representation to evaluate: raw features ("mlp"), PCA,
        FastICA, SparseRandomProjection or TruncatedSVD. The default ``"None"``
        is not a valid choice, so callers must always pass a mode.
    cv
        Forwarded to ``cross_val_score`` (default 5).

    Returns
    -------
    tuple
        ``(score, train_time)`` where ``score`` is the mean accuracy over the
        CV folds and ``train_time`` is the wall-clock time in seconds spent
        inside ``cross_val_score``. Fitting the reducer is not timed.

    Raises
    ------
    ValueError
        If ``mode`` is not one of the recognised values.
    """
    # NOTE(review): the reducer is fitted on all of X before the CV split, so
    # held-out folds influence the projection. Wrap reducer + classifier in a
    # Pipeline if strictly out-of-fold scores are required.
    if mode == "mlp":
        X_feature = X
    elif mode == "pca":
        X_feature = PCA(random_state=seed, n_components=dim).fit_transform(X)
    elif mode == "ica":
        # A single fit_transform is enough: the previous extra .fit(X) refitted
        # the same model with the same fixed random_state and was discarded.
        X_feature = FastICA(random_state=seed, n_components=dim).fit_transform(X)
    elif mode == "rp":
        X_feature = SparseRandomProjection(
            random_state=seed, n_components=dim
        ).fit_transform(X)
    elif mode == "tsvd":
        X_feature = TruncatedSVD(n_components=dim, random_state=seed).fit_transform(X)
    else:
        # Previously fell through and crashed later with UnboundLocalError.
        raise ValueError(
            f"Unknown mode {mode!r}; expected one of "
            "'mlp', 'pca', 'ica', 'rp', 'tsvd'."
        )

    start_time = time.time()
    score = cross_val_score(nn_clf, X_feature, y, scoring="accuracy", cv=cv).mean()
    train_time = time.time() - start_time

    return (score, train_time)

# Classifier instance passed to calc_score_func for the runs below.
# The hyper-parameters are presumably the best ones found in earlier tuning
# (original note: "best parameter tester") -- confirm against that notebook.
best_nn_clf = MLPClassifier(
    activation="tanh",
    hidden_layer_sizes=30,  # given as a bare int rather than a tuple
    learning_rate_init=0.001,
    max_iter=1000,
    random_state=seed,  # same seed as the reducers, for repeatable runs
)

## Dataset 1 (diabetes): accuracy and training time per reduction method

- pca
- ica
- rp
- tsvd

In [62]:
# Score the MLP on the raw scaled features and on each reduced representation.
# Component counts come from the configuration constants rather than repeated
# literals, so this cell cannot drift from the config cell.
# Each result is a (mean CV accuracy, seconds spent in cross_val_score) tuple.
diabet_mlp_score = calc_score_func(X, y, best_nn_clf, mode="mlp")
diabet_pca_score = calc_score_func(X, y, best_nn_clf, dim=pca_k_diabet, mode="pca")
diabet_ica_score = calc_score_func(X, y, best_nn_clf, dim=ica_k_diabet, mode="ica")
diabet_rp_score = calc_score_func(X, y, best_nn_clf, dim=rp_k_diabet, mode="rp")
diabet_tsvd_score = calc_score_func(X, y, best_nn_clf, dim=tsvd_k_diabet, mode="tsvd")

In [63]:
# Show the (accuracy, seconds) tuples in order: mlp, pca, ica, rp, tsvd.
print(
    diabet_mlp_score,
    diabet_pca_score,
    diabet_ica_score,
    diabet_rp_score,
    diabet_tsvd_score,
)

(0.7682964094728801, 2.3147709369659424) (0.7565232153467447, 0.9081752300262451) (0.7643663526016468, 1.9875288009643555) (0.712282488753077, 1.0444138050079346) (0.7591545709192767, 2.4633517265319824)
