In [1]:
# Map each feature's column position (0-3) to its human-readable name.
feature_dict = dict(enumerate((
    'sepal length in cm',
    'sepal width in cm',
    'petal length in cm',
    'petal width in cm',
)))

In [2]:
import pandas as pd
import numpy as np

# Load the Iris data set directly from the UCI repository.
# The file has no header row, so the column names are supplied explicitly:
# the four feature names from feature_dict (in key order) plus the class label.
column_names = [label for idx, label in sorted(feature_dict.items())] + ['class label']

# Use the public pd.read_csv entry point rather than the internal
# pd.io.parsers module path, and avoid in-place mutation so that re-running
# this cell always rebuilds df from the raw file.
df = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data',
    header=None,
    names=column_names,
    sep=',',
).dropna(how="all")  # drop any fully empty row (e.g. a blank line at file end)

df.tail()

Unnamed: 0,sepal length in cm,sepal width in cm,petal length in cm,petal width in cm,class label
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica
149,5.9,3.0,5.1,1.8,Iris-virginica


In [3]:
from sklearn.preprocessing import LabelEncoder

# Feature matrix: the four measurement columns as a NumPy array.
measurement_columns = [
    'sepal length in cm',
    'sepal width in cm',
    'petal length in cm',
    'petal width in cm',
]
X = df[measurement_columns].values

# Target vector: the raw class-label column.
y = df['class label'].values

# Encode the class labels as integers and shift by one so that the
# numbering starts at 1, matching the keys of label_dict.
enc = LabelEncoder()
label_encoder = enc.fit(y)
y = 1 + label_encoder.transform(y)

# Short display names for the encoded classes.
label_dict = dict(enumerate(('Setosa', 'Versicolor', 'Virginica'), start=1))

In [4]:
# Print floats with four digits of precision from here on.
np.set_printoptions(precision=4)

# Per-class mean of every feature, for class labels 1, 2 and 3 (in that order).
mean_vectors = [X[y == cl].mean(axis=0) for cl in range(1, 4)]

for cl, class_mean in enumerate(mean_vectors, start=1):
    print('Mean Vector class %s: %s\n' % (cl, class_mean))

Mean Vector class 1: [ 5.006  3.418  1.464  0.244]

Mean Vector class 2: [ 5.936  2.77   4.26   1.326]

Mean Vector class 3: [ 6.588  2.974  5.552  2.026]



In [5]:
# Mean of every feature over all samples, kept as a (4, 1) column vector.
overall_mean = np.mean(X, axis=0).reshape(4, 1)

# Between-class scatter matrix: for each class, add the outer product of
# (class mean - overall mean) with itself, weighted by the class size.
S_B = np.zeros((4, 4))
for cl, class_mean in enumerate(mean_vectors, start=1):
    class_size = X[y == cl, :].shape[0]
    deviation = class_mean.reshape(4, 1) - overall_mean  # column vector
    S_B += class_size * deviation.dot(deviation.T)

print('between-class Scatter Matrix:\n', S_B)

between-class Scatter Matrix:
 [[  63.2121  -19.534   165.1647   71.3631]
 [ -19.534    10.9776  -56.0552  -22.4924]
 [ 165.1647  -56.0552  436.6437  186.9081]
 [  71.3631  -22.4924  186.9081   80.6041]]


In [6]:
# Eigenvalues and eigenvectors of the between-class scatter matrix S_B.
# Column i of eig_vec_S_B is the eigenvector belonging to eig_val_S_B[i].
# NOTE(review): S_B is a sum of symmetric outer products, so np.linalg.eigh
# may be the better-suited routine here -- confirm before switching.
eig_val_S_B, eig_vec_S_B = np.linalg.eig(S_B)
print("eig_val_S_B=", eig_val_S_B, "\neig_vec_S_B=", eig_vec_S_B, "\n")

eig_val_S_B= [  5.8645e+02   4.9916e+00  -5.7567e-16   2.0433e-14] 
eig_vec_S_B= [[ 0.3269 -0.3326 -0.8282 -0.1528]
 [-0.1096 -0.8878  0.1561  0.3829]
 [ 0.8628  0.1364  0.1076  0.4468]
 [ 0.3698 -0.2874  0.5274 -0.794 ]] 



In [8]:
# Pair each eigenvalue magnitude with its eigenvector (the matching column
# of eig_vec_S_B), ordered from the largest magnitude down to the smallest.
eig_pairs = sorted(
    ((np.abs(eig_val_S_B[k]), eig_vec_S_B[:, k]) for k in range(len(eig_val_S_B))),
    key=lambda pair: pair[0],
    reverse=True,
)

# Print the magnitudes so the descending order can be checked by eye.
for pair in eig_pairs:
    print(pair[0])

586.445984021
4.99161597915
2.0432532655e-14
5.75672210462e-16


In [10]:
# Show the sorted (eigenvalue magnitude, eigenvector) pairs in full.
eig_pairs

[(586.44598402085001, array([ 0.3269, -0.1096,  0.8628,  0.3698])),
 (4.9916159791497172, array([-0.3326, -0.8878,  0.1364, -0.2874])),
 (2.0432532654978783e-14, array([-0.1528,  0.3829,  0.4468, -0.794 ])),
 (5.7567221046184073e-16, array([-0.8282,  0.1561,  0.1076,  0.5274]))]

In [9]:
# Projection matrix W: the eigenvectors of the two leading pairs in
# eig_pairs, placed side by side as 4x1 columns.
leading_columns = [vec.reshape(4, 1) for val, vec in eig_pairs[:2]]
matrix_w = np.hstack(leading_columns)
print('Matrix W:\n', matrix_w)

Matrix W:
 [[ 0.3269 -0.3326]
 [-0.1096 -0.8878]
 [ 0.8628  0.1364]
 [ 0.3698 -0.2874]]


In [11]:
# Project every sample onto the two columns of matrix_w.
transformed = np.dot(X, matrix_w)
transformed.shape

(150, 2)

In [12]:
# Display the projected samples.
# NOTE(review): this dumps the entire array; a slice such as transformed[:5]
# would keep the notebook output short.
transformed

array([[ 2.5653, -4.6701],
       [ 2.5548, -4.1597],
       [ 2.3812, -4.2843],
       [ 2.532 , -4.135 ],
       [ 2.5217, -4.7256],
       [ 2.9524, -5.1415],
       [ 2.4498, -4.4437],
       [ 2.6299, -4.5344],
       [ 2.4023, -3.9046],
       [ 2.5931, -4.2061],
       [ 2.7278, -4.9338],
       [ 2.6508, -4.4542],
       [ 2.4851, -4.0977],
       [ 2.0628, -3.9723],
       [ 2.5668, -5.3741],
       [ 2.8231, -5.7125],
       [ 2.6073, -5.1961],
       [ 2.6023, -4.6988],
       [ 3.0244, -5.1238],
       [ 2.6557, -4.9515],
       [ 2.9332, -4.6402],
       [ 2.7037, -4.8915],
       [ 2.0458, -4.6471],
       [ 2.957 , -4.5378],
       [ 2.9096, -4.4133],
       [ 2.76  , -4.1656],
       [ 2.7901, -4.5782],
       [ 2.6843, -4.6897],
       [ 2.609 , -4.6146],
       [ 2.64  , -4.2434],
       [ 2.6837, -4.1879],
       [ 2.8346, -4.7249],
       [ 2.5816, -5.1936],
       [ 2.6194, -5.4246],
       [ 2.5931, -4.2061],
       [ 2.393 , -4.3978],
       [ 2.6098, -4.8168],
 