In [2]:
import pandas as pd
import numpy as np

df = pd.read_csv(
    filepath_or_buffer='https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data', 
    header=None, 
    sep=',')

df.columns=['sepal_len', 'sepal_wid', 'petal_len', 'petal_wid', 'class']
df.dropna(how="all", inplace=True) # drops the empty line at file-end

df.tail()

Unnamed: 0,sepal_len,sepal_wid,petal_len,petal_wid,class
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica
149,5.9,3.0,5.1,1.8,Iris-virginica


In [3]:
X = df[['sepal_len', 'sepal_wid', 'petal_len', 'petal_wid']].values
y = df['class'].values

In [12]:
#X = U S V
U,S,V = np.linalg.svd(X)
print("U=", U , "\nS=", S , "\nV=" , V )

U= [[ -6.16171172e-02   1.29969428e-01  -5.58364155e-05 ...,  -9.34637342e-02
   -9.60224157e-02  -8.09922905e-02]
 [ -5.80722977e-02   1.11371452e-01   6.84386629e-02 ...,   3.66755322e-02
   -3.24463474e-02   1.27273399e-02]
 [ -5.67633852e-02   1.18294769e-01   2.31062793e-03 ...,   3.08252776e-02
    1.95234663e-01   1.35567696e-01]
 ..., 
 [ -9.40702260e-02  -4.98348018e-02  -4.14958083e-02 ...,   9.81822841e-01
   -2.17978813e-02  -8.85972146e-03]
 [ -9.48993908e-02  -5.62107520e-02  -2.12386574e-01 ...,  -2.14264126e-02
    9.42038920e-01  -2.96933496e-02]
 [ -8.84882764e-02  -5.16210172e-02  -9.51442925e-02 ...,  -8.52768485e-03
   -3.02139863e-02   9.73577349e-01]] 
S= [ 95.95066751  17.72295328   3.46929666   1.87891236] 
V= [[-0.75116805 -0.37978837 -0.51315094 -0.16787934]
 [ 0.28583096  0.54488976 -0.70889874 -0.34475845]
 [ 0.49942378 -0.67502499 -0.05471983 -0.54029889]
 [ 0.32345496 -0.32124324 -0.48077482  0.74902286]]


In [9]:
S=np.square(S)
S

array([  9.20653060e+03,   3.14103073e+02,   1.20360193e+01,
         3.53031167e+00])

In [17]:
V=V.T
V

array([[-0.75116805, -0.37978837, -0.51315094, -0.16787934],
       [ 0.28583096,  0.54488976, -0.70889874, -0.34475845],
       [ 0.49942378, -0.67502499, -0.05471983, -0.54029889],
       [ 0.32345496, -0.32124324, -0.48077482,  0.74902286]])

In [19]:
cov_mat = np.cov([X[0,:],X[1,:],X[2,:]],X[3,:])
print('Covariance Matrix:\n', cov_mat)

Covariance Matrix:
 [[ 4.75        4.42166667  4.35333333  4.16      ]
 [ 4.42166667  4.14916667  4.055       3.885     ]
 [ 4.35333333  4.055       3.99        3.81333333]
 [ 4.16        3.885       3.81333333  3.65666667]]


In [26]:
# eigenvectors and eigenvalues for the from the covariance matrix
eig_val_cov, eig_vec_cov = np.linalg.eig(cov_mat)
print( "eig_val_cov=" , eig_val_cov , "\neig_vec_cov=" , eig_vec_cov , "\n")

eig_val_cov= [  1.65178969e+01   2.11439743e-02   1.63378468e-15   6.79244079e-03] 
eig_vec_cov= [[-0.53594712 -0.48950914 -0.64998061 -0.22509264]
 [-0.50033874  0.7933919  -0.06717125 -0.34011537]
 [-0.49129599 -0.35241396  0.75666436 -0.24878044]
 [-0.47015964  0.08194211  0.02173187  0.8785005 ]] 



In [35]:
for ev in eig_vec_cov:
    numpy.testing.assert_array_almost_equal(1.0, np.linalg.norm(ev))

# So, in order to decide which eigenvector(s) we want to drop for our lower-dimensional subspace, we have to take a look at the corresponding eigenvalues of the eigenvectors. Roughly speaking, the eigenvectors with the lowest eigenvalues bear the least information about the distribution of the data, and those are the ones we want to drop.
The common approach is to rank the eigenvectors from highest to lowest corresponding eigenvalue and choose the top k eigenvectors.

In [37]:
# Make a list of (eigenvalue, eigenvector) tuples
eig_pairs = [(np.abs(eig_val_cov[i]), eig_vec_cov[:,i]) for i in range(len(eig_val_cov))]

# Sort the (eigenvalue, eigenvector) tuples from high to low
eig_pairs.sort(key=lambda x: x[0], reverse=True)

# Visually confirm that the list is correctly sorted by decreasing eigenvalues
for i in eig_pairs:
    print(i[0])



16.5178969182
0.0211439743331
0.006792440789
1.63378467723e-15


In [42]:
eig_pairs[0][1]

array([-0.53594712, -0.50033874, -0.49129599, -0.47015964])

In [44]:
eig_pairs[1][1]

array([-0.48950914,  0.7933919 , -0.35241396,  0.08194211])

In [45]:
eig_pairs

[(16.517896918211271,
  array([-0.53594712, -0.50033874, -0.49129599, -0.47015964])),
 (0.021143974333078828,
  array([-0.48950914,  0.7933919 , -0.35241396,  0.08194211])),
 (0.0067924407889953008,
  array([-0.22509264, -0.34011537, -0.24878044,  0.8785005 ])),
 (1.6337846772348803e-15,
  array([-0.64998061, -0.06717125,  0.75666436,  0.02173187]))]

In [48]:
matrix_w = np.hstack((eig_pairs[0][1].reshape(4,1), eig_pairs[1][1].reshape(4,1)))
print('Matrix W:\n', matrix_w)

Matrix W:
 [[-0.53594712 -0.48950914]
 [-0.50033874  0.7933919 ]
 [-0.49129599 -0.35241396]
 [-0.47015964  0.08194211]]


In [58]:
transformed = X.dot(matrix_w)
transformed.shape


(150, 2)

In [59]:
transformed

array([[ -5.2663622 ,  -0.19661608],
       [ -4.9090034 ,  -0.4954102 ],
       [ -4.85275213,  -0.2035886 ],
       [ -4.84738274,  -0.30445967],
       [ -5.26280136,  -0.06832598],
       [ -5.86870255,  -0.11544783],
       [ -4.99537073,  -0.02300649],
       [ -5.21186321,  -0.26224576],
       [ -4.59099597,  -0.32999483],
       [ -4.96115091,  -0.45950662],
       [ -5.57634368,  -0.22003184],
       [ -5.15380339,  -0.19958532],
       [ -4.80839273,  -0.4546535 ],
       [ -4.39303037,  -0.10417475],
       [ -5.79343535,  -0.07209374],
       [ -6.18139686,   0.20487816],
       [ -5.67218416,   0.02551775],
       [ -5.31337816,  -0.18842187],
       [ -5.93243685,  -0.34983397],
       [ -5.51260938,   0.0143543 ],
       [ -5.52450125,  -0.5285322 ],
       [ -5.50959147,  -0.05679068],
       [ -4.85190412,   0.26844326],
       [ -5.45473114,  -0.43643602],
       [ -5.30119218,  -0.30530951],
       [ -5.06085731,  -0.61484391],
       [ -5.35502474,  -0.28109873],
 