In [3]:
# Map each feature's column position (0-3) to its human-readable name.
feature_dict = dict(enumerate((
    'sepal length in cm',
    'sepal width in cm',
    'petal length in cm',
    'petal width in cm',
)))


In [4]:
import pandas as pd
import numpy as np

# Load the Iris measurements from the UCI repository. The file carries no
# header row (header=None), so the column names are assigned afterwards:
# the four feature names from feature_dict, in key order, plus the label.
# pd.read_csv is the documented entry point for the CSV parser.
df = pd.read_csv(
    filepath_or_buffer='https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data',
    header=None,
    sep=',',
    )
df.columns = [l for i,l in sorted(feature_dict.items())] + ['class label']
# Drop rows that are entirely empty (the empty line at file-end). The result
# is reassigned instead of mutating the frame in place.
df = df.dropna(how="all")

df.tail()


Unnamed: 0,sepal length in cm,sepal width in cm,petal length in cm,petal width in cm,class label
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica
149,5.9,3.0,5.1,1.8,Iris-virginica


In [5]:
from sklearn.preprocessing import LabelEncoder

# Feature matrix: the four measurement columns of df as a NumPy array.
feature_columns = [
    'sepal length in cm',
    'sepal width in cm',
    'petal length in cm',
    'petal width in cm',
]
X = df[feature_columns].values

# Fit the encoder on the string class labels, then shift the integer codes
# it produces up by one; label_dict below names the codes 1, 2 and 3.
raw_labels = df['class label'].values
enc = LabelEncoder()
label_encoder = enc.fit(raw_labels)
y = label_encoder.transform(raw_labels) + 1

# Short display names keyed by class code.
label_dict = {
    1: 'Setosa',
    2: 'Versicolor',
    3: 'Virginica',
}

# Step 1: Computing the d-dimensional mean vectors

In [6]:
# Print floating-point array entries with a precision of four digits.
np.set_printoptions(precision=4)

# One mean vector per class code (1, 2, 3): the column-wise average of the
# rows of X whose label equals that code.
mean_vectors = [X[y == cl].mean(axis=0) for cl in (1, 2, 3)]

for cl, class_mean in enumerate(mean_vectors, start=1):
    print('Mean Vector class %s: %s\n' % (cl, class_mean))


Mean Vector class 1: [ 5.006  3.418  1.464  0.244]

Mean Vector class 2: [ 5.936  2.77   4.26   1.326]

Mean Vector class 3: [ 6.588  2.974  5.552  2.026]



# Step 2: Computing the Within-class Scatter Matrices

In [7]:
# Within-class scatter matrix S_W: for every class, the sum over its samples
# x of the outer product (x - m)(x - m)^T with the class mean m; the
# per-class matrices are then added together.
n_features = X.shape[1]                             # taken from X, not hard-coded
S_W = np.zeros((n_features, n_features))
for cl, mv in zip(range(1, 4), mean_vectors):
    # Broadcasting subtracts the class mean from every row of the class at once.
    centered = X[y == cl] - mv
    # centered^T . centered equals the sum of the per-sample outer products,
    # so no Python-level loop over the rows is needed.
    class_sc_mat = centered.T.dot(centered)         # scatter matrix for this class
    S_W += class_sc_mat                             # sum class scatter matrices
print('within-class Scatter Matrix:\n', S_W)

within-class Scatter Matrix:
 [[ 38.9562  13.683   24.614    5.6556]
 [ 13.683   17.035    8.12     4.9132]
 [ 24.614    8.12    27.22     6.2536]
 [  5.6556   4.9132   6.2536   6.1756]]


In [8]:
# Eigendecomposition of the within-class scatter matrix S_W.
# The first item of the result holds the eigenvalues; the second is a matrix
# whose column i is the eigenvector paired with eigenvalue i.
eigen_result = np.linalg.eig(S_W)
eig_val_S_W = eigen_result[0]
eig_vec_S_W = eigen_result[1]
print("eig_val_S_W=", eig_val_S_W, "\neig_vec_S_W=", eig_vec_S_W, "\n")

eig_val_S_W= [ 65.237    3.2595   8.1396  12.7507] 
eig_vec_S_W= [[ 0.7375  0.2302 -0.6321  0.0593]
 [ 0.3219 -0.3266  0.1749 -0.8713]
 [ 0.5723 -0.3472  0.5843  0.4588]
 [ 0.158   0.8484  0.4779 -0.1637]] 



In [9]:
# Pair the magnitude of every eigenvalue with its eigenvector. The
# eigenvectors are the columns of eig_vec_S_W, i.e. the rows of its transpose.
eig_pairs = list(zip(np.abs(eig_val_S_W), eig_vec_S_W.T))

# Order the pairs by eigenvalue magnitude, largest first.
eig_pairs = sorted(eig_pairs, key=lambda pair: pair[0], reverse=True)

# Print the magnitudes so the descending order can be checked by eye.
for eigenvalue, eigenvector in eig_pairs:
    print(eigenvalue)

65.2370101074
12.7506937244
8.13955740658
3.25953876162


In [11]:
# Display the (eigenvalue, eigenvector) pairs as this cell's output.
eig_pairs

[(65.237010107399598, array([ 0.7375,  0.3219,  0.5723,  0.158 ])),
 (12.750693724401781, array([ 0.0593, -0.8713,  0.4588, -0.1637])),
 (8.13955740658289, array([-0.6321,  0.1749,  0.5843,  0.4779])),
 (3.2595387616156577, array([ 0.2302, -0.3266, -0.3472,  0.8484]))]

In [10]:
# Projection matrix W (4 x 2): its columns are the eigenvectors stored at
# positions 2 and 3 of eig_pairs, each turned into a 4 x 1 column first.
# NOTE(review): eig_pairs is sorted by decreasing eigenvalue, so positions
# 2 and 3 are the two smallest-eigenvalue directions - confirm this is intended.
selected_columns = [eig_pairs[k][1].reshape(4, 1) for k in (2, 3)]
matrix_w = np.hstack(selected_columns)
print('Matrix W:\n', matrix_w)

Matrix W:
 [[-0.6321  0.2302]
 [ 0.1749 -0.3266]
 [ 0.5843 -0.3472]
 [ 0.4779  0.8484]]


In [12]:
# Project every row of X onto the two columns of matrix_w, then show the
# shape of the result as the cell's output.
transformed = np.dot(X, matrix_w)
transformed.shape

(150, 2)

In [13]:
# Display the projected data: one row per row of X, one column per column of matrix_w.
transformed

array([[ -1.6978e+00,  -2.8550e-01],
       [ -1.6589e+00,  -1.6825e-01],
       [ -1.5559e+00,  -2.4488e-01],
       [ -1.3933e+00,  -3.0468e-01],
       [ -1.6171e+00,  -3.4117e-01],
       [ -1.5466e+00,  -2.8155e-01],
       [ -1.3515e+00,  -2.8309e-01],
       [ -1.5937e+00,  -3.1058e-01],
       [ -1.3603e+00,  -2.5068e-01],
       [ -1.6307e+00,  -3.2047e-01],
       [ -1.7940e+00,  -3.1648e-01],
       [ -1.4088e+00,  -3.9134e-01],
       [ -1.6434e+00,  -2.7611e-01],
       [ -1.5027e+00,  -2.8703e-01],
       [ -2.1697e+00,  -2.1821e-01],
       [ -1.7656e+00,  -3.0634e-01],
       [ -1.7803e+00,  -1.4266e-01],
       [ -1.6500e+00,  -2.0066e-01],
       [ -1.8015e+00,  -2.6469e-01],
       [ -1.5391e+00,  -3.3335e-01],
       [ -1.7296e+00,  -2.8796e-01],
       [ -1.5088e+00,  -2.1585e-01],
       [ -1.5980e+00,  -2.9435e-01],
       [ -1.4141e+00,  -6.9825e-02],
       [ -1.2335e+00,  -4.9550e-01],
       [ -1.6052e+00,  -2.1468e-01],
       [ -1.4396e+00,  -1.7562e-01],
 