# Dimensionality Reduction

In [63]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from scipy.stats import wilcoxon
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings('ignore')

In [64]:
datasets = ["abalone", "acute-inflammation", "acute-nephritis", "arrhythmia", "bank", "breast-cancer",
               "car", "cardiotocography-3clases", "congressional-voting", "credit-approval", "iris"]
path = "../UA-ECE523-EngrAppMLData/data/" + datasets[5] + ".csv"
data = np.loadtxt(path, delimiter=",")
X, y = data[:, :-1], data[:, -1]

In [65]:
def score_dataset(X, y, clf):
    pca = PCA(n_components=0.90)
    pcaX = pca.fit_transform(X)
    
    score0 = np.mean(cross_val_score(clf, X, y, cv=5))
    score1 = np.mean(cross_val_score(clf, pcaX, y, cv=5))
    
    return score0, score1

The following code tests whether PCA does better than not using a preprocessing technique with a Logistic Regression classifier.

In [68]:
scores = np.zeros((len(datasets), 2))
i = 0
for dataset in datasets:
    path = "../UA-ECE523-EngrAppMLData/data/" + dataset + ".csv"
    data = np.loadtxt(path, delimiter=",")
    X, y = data[:, :-1], data[:, -1]
    
    clf = LogisticRegression()
    
    scores[i] = score_dataset(X, y, clf)
    i += 1

print(scores)

w, p = wilcoxon(scores[:, 1], scores[:, 0], alternative="greater")
print(w, p)
if p < 0.1:
    print("There is strong evidence that using PCA to preprocess the data does better than not using it at the 0.05 level.")
else:
    print("There is not strong evidence to say that using PCA to preprocess the data does better than not using it at the 0.05 level.")

[[0.64424639 0.54895606]
 [1.         1.        ]
 [1.         1.        ]
 [0.63279761 0.63712768]
 [0.89493571 0.89117636]
 [0.70278282 0.68523896]
 [0.75279944 0.75279944]
 [0.85137586 0.85797073]
 [0.6161041  0.62064956]
 [0.84212625 0.84649554]
 [0.90666667 0.86666667]]
14.0 0.7122984885436232
There is not strong evidence to say that using PCA to preprocess the data does better than not using it at the 0.05 level.


The following code tests whether PCA does better than not using a preprocessing technique with a Naive Bayes Gaussian classifier.

In [69]:
scores = np.zeros((len(datasets), 2))
i = 0
for dataset in datasets:
    path = "../UA-ECE523-EngrAppMLData/data/" + dataset + ".csv"
    data = np.loadtxt(path, delimiter=",")
    X, y = data[:, :-1], data[:, -1]
    
    clf = GaussianNB()
    
    scores[i] = score_dataset(X, y, clf)
    i += 1

print(scores)

w, p = wilcoxon(scores[:, 1], scores[:, 0], alternative="greater")
print(w, p)
if p < 0.1:
    print("There is strong evidence that using PCA to preprocess the data does better than not using it at the 0.05 level.")

[[0.57052789 0.55591593]
 [0.82282609 0.94166667]
 [0.91666667 1.        ]
 [0.14167139 0.33788957]
 [0.82504327 0.88608786]
 [0.67120387 0.66067756]
 [0.69498252 0.77774566]
 [0.71309473 0.85422038]
 [0.55881267 0.57496051]
 [0.79724011 0.80878173]
 [0.95333333 0.89333333]]
57.0 0.016427109964321847
There is strong evidence that using PCA to preprocess the data does better than not using it at the 0.05 level.
