# Reduction dimensions for iris dataset using MDS, Isomp, t-SNE, and PCA

In [1]:
#---------------
# import modules
#---------------

import numpy as np
import joblib as jb
import matplotlib.pyplot as plt

#datasets
from sklearn.datasets import load_breast_cancer

from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from collections import Counter

from sklearn.manifold import MDS
from sklearn.manifold import TSNE
from sklearn.manifold import Isomap
from sklearn.preprocessing import StandardScaler # for PCA
from sklearn.decomposition import PCA

from sklearn.neural_network import MLPClassifier

In [2]:
#----------------
# Iris Dataset
#----------------

data = load_breast_cancer()
X_org = data.data
y_org = data.target
print('Shape of X: ', X_org.shape)
print('Shape of y: ', y_org.shape)

Shape of X:  (569, 30)
Shape of y:  (569,)


In [3]:
# normalize data (0,1)
t = MinMaxScaler()
t.fit(X_org)
X_org = t.transform(X_org)

In [4]:
n_particulas = 100
max_iter = 1000
n_training = 10

## PCA reduction

### 25D

In [None]:
# Create the object
scaler = StandardScaler()
# Calculate the mean and standard deviation for each variable in the dataset 
scaler.fit(X_org)
# The transformed (scaled) values of X are stored in the variable X_scaled which is also 
# the same two-dimensional Numpy array.
X_scaled = scaler.transform(X_org)

embedding = PCA(n_components=25)
embedding.fit(X_scaled)
X_transformed = embedding.transform(X_scaled)
print ('Shape of X_train transformed: ', X_transformed.shape)

X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(X_transformed, y_org, test_size=0.2, random_state=100)
print(f"Training target statistics: {Counter(y_train_bal)}")
print(f"Testing target statistics: {Counter(y_test_bal)}")

X_sample = len(X_train_bal)
X_input = len(X_train_bal[1])
X_class = len(np.unique(y_train_bal))

X_train = X_train_bal
y_train = y_train_bal
X_test = X_test_bal
y_test = y_test_bal

cost_test = []

for i in range(n_training):
    print('Training ', i+1, '...')
    clf = MLPClassifier(hidden_layer_sizes=(X_input * 3, ), activation='tanh', solver='lbfgs', max_iter=max_iter, alpha=5e-4)
    clf.out_activation_ = 'multiclass'
    clf.fit(X_train, y_train)
    y_test_pred = clf.predict(X_test)
    cost_test.append(mean_squared_error(y_test, y_test_pred))
    print ('Test prediction cost with training: ', cost_test[i]) 
    min_cost = min(cost_test)
    print ("Min test prediction cost: ", min_cost)
    if  cost_test[i] <= min_cost:
        # save the model to disk
        filename = 'lbfgs_bc_model_113_100_1000_25_pca.sav'
        jb.dump(clf, filename)

In [6]:
loaded_model = jb.load(filename)

print("=====================================================================")
print("=====================================================================")
y_test_pred_load = loaded_model.predict(X_test)
cost_test_load = mean_squared_error(y_test, y_test_pred_load)
acc_test_load = accuracy_score(y_test, y_test_pred_load)

y_train_pred_load = loaded_model.predict(X_train)
cost_train_load = mean_squared_error(y_train, y_train_pred_load)
acc_train_load = accuracy_score(y_train, y_train_pred_load)

print('Training: MSE: ', cost_train_load, ' ACC score: ', acc_train_load)
print('Testing: MSE: ', cost_test_load, ' ACC score: ', acc_test_load)
print("=====================================================================")
print("=====================================================================")

Training: MSE:  0.0  ACC score:  1.0
Testing: MSE:  0.008771929824561403  ACC score:  0.9912280701754386


### 20D

In [None]:
# Create the object
scaler = StandardScaler()
# Calculate the mean and standard deviation for each variable in the dataset 
scaler.fit(X_org)
# The transformed (scaled) values of X are stored in the variable X_scaled which is also 
# the same two-dimensional Numpy array.
X_scaled = scaler.transform(X_org)

embedding = PCA(n_components=20)
embedding.fit(X_scaled)
X_transformed = embedding.transform(X_scaled)
print ('Shape of X_train transformed: ', X_transformed.shape)

X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(X_transformed, y_org, test_size=0.2, random_state=100)
print(f"Training target statistics: {Counter(y_train_bal)}")
print(f"Testing target statistics: {Counter(y_test_bal)}")

X_sample = len(X_train_bal)
X_input = len(X_train_bal[1])
X_class = len(np.unique(y_train_bal))

X_train = X_train_bal
y_train = y_train_bal
X_test = X_test_bal
y_test = y_test_bal

cost_test = []

for i in range(n_training):
    print('Training ', i+1, '...')
    clf = MLPClassifier(hidden_layer_sizes=(X_input * 3, ), activation='tanh', solver='lbfgs', max_iter=max_iter, alpha=5e-4)
    clf.out_activation_ = 'multiclass'
    clf.fit(X_train, y_train)
    y_test_pred = clf.predict(X_test)
    cost_test.append(mean_squared_error(y_test, y_test_pred))
    print ('Test prediction cost with training: ', cost_test[i]) 
    min_cost = min(cost_test)
    print ("Min test prediction cost: ", min_cost)
    if  cost_test[i] <= min_cost:
        # save the model to disk
        filename = 'lbfgs_bc_model_113_100_1000_20_pca.sav'
        jb.dump(clf, filename)

In [8]:
loaded_model = jb.load(filename)

print("=====================================================================")
print("=====================================================================")
y_test_pred_load = loaded_model.predict(X_test)
cost_test_load = mean_squared_error(y_test, y_test_pred_load)
acc_test_load = accuracy_score(y_test, y_test_pred_load)

y_train_pred_load = loaded_model.predict(X_train)
cost_train_load = mean_squared_error(y_train, y_train_pred_load)
acc_train_load = accuracy_score(y_train, y_train_pred_load)

print('Training: MSE: ', cost_train_load, ' ACC score: ', acc_train_load)
print('Testing: MSE: ', cost_test_load, ' ACC score: ', acc_test_load)
print("=====================================================================")
print("=====================================================================")

Training: MSE:  0.0  ACC score:  1.0
Testing: MSE:  0.017543859649122806  ACC score:  0.9824561403508771


### 15D

In [None]:
# Create the object
scaler = StandardScaler()
# Calculate the mean and standard deviation for each variable in the dataset 
scaler.fit(X_org)
# The transformed (scaled) values of X are stored in the variable X_scaled which is also 
# the same two-dimensional Numpy array.
X_scaled = scaler.transform(X_org)

embedding = PCA(n_components=15)
embedding.fit(X_scaled)
X_transformed = embedding.transform(X_scaled)
print ('Shape of X_train transformed: ', X_transformed.shape)

X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(X_transformed, y_org, test_size=0.2, random_state=100)
print(f"Training target statistics: {Counter(y_train_bal)}")
print(f"Testing target statistics: {Counter(y_test_bal)}")

X_sample = len(X_train_bal)
X_input = len(X_train_bal[1])
X_class = len(np.unique(y_train_bal))

X_train = X_train_bal
y_train = y_train_bal
X_test = X_test_bal
y_test = y_test_bal

cost_test = []

for i in range(n_training):
    print('Training ', i+1, '...')
    clf = MLPClassifier(hidden_layer_sizes=(X_input * 3, ), activation='tanh', solver='lbfgs', max_iter=max_iter, alpha=5e-4)
    clf.out_activation_ = 'multiclass'
    clf.fit(X_train, y_train)
    y_test_pred = clf.predict(X_test)
    cost_test.append(mean_squared_error(y_test, y_test_pred))
    print ('Test prediction cost with training: ', cost_test[i]) 
    min_cost = min(cost_test)
    print ("Min test prediction cost: ", min_cost)
    if  cost_test[i] <= min_cost:
        # save the model to disk
        filename = 'lbfgs_bc_model_113_100_1000_15_pca.sav'
        jb.dump(clf, filename)

In [10]:
loaded_model = jb.load(filename)

print("=====================================================================")
print("=====================================================================")
y_test_pred_load = loaded_model.predict(X_test)
cost_test_load = mean_squared_error(y_test, y_test_pred_load)
acc_test_load = accuracy_score(y_test, y_test_pred_load)

y_train_pred_load = loaded_model.predict(X_train)
cost_train_load = mean_squared_error(y_train, y_train_pred_load)
acc_train_load = accuracy_score(y_train, y_train_pred_load)

print('Training: MSE: ', cost_train_load, ' ACC score: ', acc_train_load)
print('Testing: MSE: ', cost_test_load, ' ACC score: ', acc_test_load)
print("=====================================================================")
print("=====================================================================")

Training: MSE:  0.0  ACC score:  1.0
Testing: MSE:  0.008771929824561403  ACC score:  0.9912280701754386


### 10D

In [None]:
# Create the object
scaler = StandardScaler()
# Calculate the mean and standard deviation for each variable in the dataset 
scaler.fit(X_org)
# The transformed (scaled) values of X are stored in the variable X_scaled which is also 
# the same two-dimensional Numpy array.
X_scaled = scaler.transform(X_org)

embedding = PCA(n_components=10)
embedding.fit(X_scaled)
X_transformed = embedding.transform(X_scaled)
print ('Shape of X_train transformed: ', X_transformed.shape)

X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(X_transformed, y_org, test_size=0.2, random_state=100)
print(f"Training target statistics: {Counter(y_train_bal)}")
print(f"Testing target statistics: {Counter(y_test_bal)}")

X_sample = len(X_train_bal)
X_input = len(X_train_bal[1])
X_class = len(np.unique(y_train_bal))

X_train = X_train_bal
y_train = y_train_bal
X_test = X_test_bal
y_test = y_test_bal

cost_test = []

for i in range(n_training):
    print('Training ', i+1, '...')
    clf = MLPClassifier(hidden_layer_sizes=(X_input * 3, ), activation='tanh', solver='lbfgs', max_iter=max_iter, alpha=5e-4)
    clf.out_activation_ = 'multiclass'
    clf.fit(X_train, y_train)
    y_test_pred = clf.predict(X_test)
    cost_test.append(mean_squared_error(y_test, y_test_pred))
    print ('Test prediction cost with training: ', cost_test[i]) 
    min_cost = min(cost_test)
    print ("Min test prediction cost: ", min_cost)
    if  cost_test[i] <= min_cost:
        # save the model to disk
        filename = 'lbfgs_bc_model_113_100_1000_10_pca.sav'
        jb.dump(clf, filename)

In [12]:
loaded_model = jb.load(filename)

print("=====================================================================")
print("=====================================================================")
y_test_pred_load = loaded_model.predict(X_test)
cost_test_load = mean_squared_error(y_test, y_test_pred_load)
acc_test_load = accuracy_score(y_test, y_test_pred_load)

y_train_pred_load = loaded_model.predict(X_train)
cost_train_load = mean_squared_error(y_train, y_train_pred_load)
acc_train_load = accuracy_score(y_train, y_train_pred_load)

print('Training: MSE: ', cost_train_load, ' ACC score: ', acc_train_load)
print('Testing: MSE: ', cost_test_load, ' ACC score: ', acc_test_load)
print("=====================================================================")
print("=====================================================================")

Training: MSE:  0.0  ACC score:  1.0
Testing: MSE:  0.02631578947368421  ACC score:  0.9736842105263158


### 5D

In [None]:
# Create the object
scaler = StandardScaler()
# Calculate the mean and standard deviation for each variable in the dataset 
scaler.fit(X_org)
# The transformed (scaled) values of X are stored in the variable X_scaled which is also 
# the same two-dimensional Numpy array.
X_scaled = scaler.transform(X_org)

embedding = PCA(n_components=5)
embedding.fit(X_scaled)
X_transformed = embedding.transform(X_scaled)
print ('Shape of X_train transformed: ', X_transformed.shape)

X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(X_transformed, y_org, test_size=0.2, random_state=100)
print(f"Training target statistics: {Counter(y_train_bal)}")
print(f"Testing target statistics: {Counter(y_test_bal)}")

X_sample = len(X_train_bal)
X_input = len(X_train_bal[1])
X_class = len(np.unique(y_train_bal))

X_train = X_train_bal
y_train = y_train_bal
X_test = X_test_bal
y_test = y_test_bal

cost_test = []

for i in range(n_training):
    print('Training ', i+1, '...')
    clf = MLPClassifier(hidden_layer_sizes=(X_input * 3, ), activation='tanh', solver='lbfgs', max_iter=max_iter, alpha=5e-4)
    clf.out_activation_ = 'multiclass'
    clf.fit(X_train, y_train)
    y_test_pred = clf.predict(X_test)
    cost_test.append(mean_squared_error(y_test, y_test_pred))
    print ('Test prediction cost with training: ', cost_test[i]) 
    min_cost = min(cost_test)
    print ("Min test prediction cost: ", min_cost)
    if  cost_test[i] <= min_cost:
        # save the model to disk
        filename = 'lbfgs_bc_model_113_100_1000_5_pca.sav'
        jb.dump(clf, filename)

In [14]:
loaded_model = jb.load(filename)

print("=====================================================================")
print("=====================================================================")
y_test_pred_load = loaded_model.predict(X_test)
cost_test_load = mean_squared_error(y_test, y_test_pred_load)
acc_test_load = accuracy_score(y_test, y_test_pred_load)

y_train_pred_load = loaded_model.predict(X_train)
cost_train_load = mean_squared_error(y_train, y_train_pred_load)
acc_train_load = accuracy_score(y_train, y_train_pred_load)

print('Training: MSE: ', cost_train_load, ' ACC score: ', acc_train_load)
print('Testing: MSE: ', cost_test_load, ' ACC score: ', acc_test_load)
print("=====================================================================")
print("=====================================================================")

Training: MSE:  0.0  ACC score:  1.0
Testing: MSE:  0.0  ACC score:  1.0
