### PCA for Reduced Dimensionality in Clustering

**Loading the image data matrix (with rows as images and columns as features), performing min-max normalization on the data matrix so that each feature is scaled**

In [1]:
# Importing necessary packages
import pandas as pd
import numpy as np
import pylab as pl
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Loading the data
seg_class = pd.read_csv("C:/Users/SRINI/Downloads/segmentation_data/segmentation_classes.txt",delimiter='\t', header = None)
seg_class.head(10)

Unnamed: 0,0,1
0,GRASS,0
1,GRASS,0
2,GRASS,0
3,GRASS,0
4,GRASS,0
5,GRASS,0
6,GRASS,0
7,GRASS,0
8,GRASS,0
9,GRASS,0


In [3]:
seg_data = pd.read_csv("C:/Users/SRINI/Downloads/segmentation_data/segmentation_data.txt", header = None)
seg_data.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,110.0,189.0,9,0.0,0.0,1.0,0.666667,1.222222,1.186342,12.925926,10.888889,9.222222,18.666668,-6.111111,-11.111111,17.222221,18.666668,0.508139,1.910864
1,86.0,187.0,9,0.0,0.0,1.111111,0.720082,1.444444,0.750309,13.740741,11.666667,10.333334,19.222221,-6.222222,-10.222222,16.444445,19.222221,0.463329,1.941465
2,225.0,244.0,9,0.0,0.0,3.388889,2.195113,3.0,1.520234,12.259259,10.333334,9.333334,17.11111,-5.777778,-8.777778,14.555555,17.11111,0.480149,1.987902
3,47.0,232.0,9,0.0,0.0,1.277778,1.254621,1.0,0.894427,12.703704,11.0,9.0,18.11111,-5.111111,-11.111111,16.222221,18.11111,0.500966,1.875362
4,97.0,186.0,9,0.0,0.0,1.166667,0.691215,1.166667,1.00554,15.592592,13.888889,11.777778,21.11111,-5.111111,-11.444445,16.555555,21.11111,0.442661,1.863654
5,157.0,221.0,9,0.0,0.0,1.055556,0.64693,1.222222,0.620633,12.111111,10.222222,8.111112,18.0,-5.666666,-12.0,17.666666,18.0,0.54918,1.877146
6,62.0,224.0,9,0.0,0.0,0.944445,1.083547,2.333334,1.632993,14.62963,13.222222,11.444445,19.222221,-4.222222,-9.555555,13.777778,19.222221,0.408965,1.860191
7,27.0,248.0,9,0.111111,0.0,1.611111,0.64693,3.166667,1.722401,15.296296,14.777778,12.888889,18.222221,-1.555556,-7.222222,8.777778,18.222221,0.312227,1.783512
8,44.0,233.0,9,0.0,0.0,2.222222,2.146487,2.111111,1.327766,14.481482,12.555555,11.333333,19.555555,-5.777778,-9.444445,15.222222,19.555555,0.422174,1.950405
9,17.0,229.0,9,0.0,0.0,2.111111,1.98513,2.444445,1.614747,13.703704,11.222222,11.111111,18.777779,-7.444445,-7.777778,15.222222,18.777779,0.439852,2.099905


In [4]:
seg_names = pd.read_csv("C:/Users/SRINI/Downloads/segmentation_data/segmentation_names.txt", header = None)
seg_names.head(10)

Unnamed: 0,0
0,REGION-CENTROID-COL
1,REGION-CENTROID-ROW
2,REGION-PIXEL-COUNT
3,SHORT-LINE-DENSITY-5
4,SHORT-LINE-DENSITY-2
5,VEDGE-MEAN
6,VEDGE-SD
7,HEDGE-MEAN
8,HEDGE-SD
9,INTENSITY-MEAN


In [5]:
# Scaling the data
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range=(0, 1))

# Normalizing the data matrix
seg_data_norm = scaler.fit_transform(seg_data)

seg_data_norm

array([[0.43083004, 0.74166667, 0.        , ..., 0.12371135, 0.50813884,
        0.83184923],
       [0.33596838, 0.73333333, 0.        , ..., 0.12739322, 0.46332908,
        0.83698646],
       [0.88537549, 0.97083333, 0.        , ..., 0.11340205, 0.48014903,
        0.84478233],
       ...,
       [0.50197628, 0.625     , 0.        , ..., 0.07216495, 0.5409177 ,
        0.17591546],
       [0.58893281, 0.6125    , 0.        , ..., 0.08100147, 0.50308645,
        0.18478933],
       [0.48616601, 0.62916667, 0.        , ..., 0.09646539, 0.4799313 ,
        0.17037463]])

**Performing Kmeans clustering on the image data using Euclidean distance as distance measure for the clustering.**

In [6]:
# Getting actual labels
ground_truth = seg_class.iloc[:,1]
ground_truth = np.array(ground_truth)
ground_truth

array([0, 0, 0, ..., 3, 3, 3], dtype=int64)

In [7]:
from sklearn.cluster import KMeans
from sklearn.metrics import completeness_score, homogeneity_score

# K-means clustering
kmeans = KMeans(n_clusters=7, random_state=42)
kmeans.fit(seg_data_norm)

# Cluster labels and centroids
cluster_lbls = kmeans.labels_
cluster_cent = kmeans.cluster_centers_
print("7 Cluster Centroids:")

for i in cluster_cent:
    print(i)
    print("\n")

# Compute Completeness and Homogeneity scores
completeness = completeness_score(ground_truth, cluster_lbls)
homogeneity = homogeneity_score(ground_truth, cluster_lbls)

print("Completeness score:", completeness)
print("Homogeneity score:", homogeneity)

7 Cluster Centroids:
[0.25610323 0.39346814 0.         0.0745098  0.01911765 0.07734288
 0.00410042 0.06057362 0.00496749 0.1481873  0.13808514 0.18515984
 0.11853881 0.71690628 0.34401385 0.35545822 0.18541324 0.41198624
 0.20188985]


[5.35098814e-01 1.50166667e-01 0.00000000e+00 2.77777769e-02
 1.66666667e-03 3.02281387e-02 5.42887957e-04 2.67660451e-02
 5.86661900e-04 8.23246433e-01 7.79716377e-01 8.94170356e-01
 7.88760696e-01 2.70665440e-01 6.66372551e-01 2.89386481e-01
 8.94170356e-01 2.11804171e-01 1.25065773e-01]


[0.51399369 0.80893659 0.         0.07744108 0.00505051 0.05447376
 0.00140719 0.04633498 0.00140097 0.10878994 0.09140296 0.09241408
 0.14267644 0.67916102 0.07900179 0.82128688 0.1349008  0.41449132
 0.89233263]


[0.30250553 0.53086158 0.         0.05225989 0.04661017 0.10081685
 0.00942022 0.08397199 0.01104328 0.40060809 0.37034723 0.47246075
 0.35303578 0.49714616 0.57088236 0.2130544  0.47246075 0.30226303
 0.16387917]


[0.75069626 0.5345641  0.         0.04



**Performing PCA on the normalized image data matrix.** 

In [8]:
X = np.mat(seg_data_norm)
meanVals = X.mean(axis=0)

# Centered matrix
A = X - meanVals

# Covariance matrix
C = np.cov(A, rowvar=0)    
print(C)

[[ 8.29317897e-02  1.45509262e-03  0.00000000e+00 -1.70431507e-03
  -1.05339972e-04 -7.94529927e-05  4.29715112e-04 -2.40780200e-04
   1.11085429e-04  4.87627299e-03  4.36002172e-03  5.19708772e-03
   5.02563731e-03 -5.12127668e-03  2.69074206e-03  8.68830456e-04
   5.29437854e-03 -7.05449978e-03  2.84721323e-03]
 [ 1.45509262e-03  5.72633497e-02  0.00000000e+00  1.70167006e-03
   1.13291382e-03  6.04849780e-04 -5.80370695e-04  2.15023073e-03
  -1.73693312e-04 -2.95625796e-02 -2.85332519e-02 -3.32321896e-02
  -2.66223856e-02  1.65406151e-02 -2.45789083e-02  2.28203077e-02
  -3.11729485e-02  4.55180374e-03  3.67577984e-02]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [-1.70431507e-03  1.70167006e-03  0.00000000e+00  1.51

In [9]:
import numpy as np

# Compute eigenvalues and eigenvectors
eigen_values, eigen_vectors = np.linalg.eig(C)

# Create a mapping of eigenvalues to eigenvectors
eigen_mapping = [(eigen_values[i], eigen_vectors[:, i]) for i in range(len(eigen_values))]

# Sort the eigenvectors based on eigenvalues in descending order
sorted_eigen_mapping = sorted(eigen_mapping, key=lambda x: x[0], reverse=True)

# Extract the sorted eigenvalues and eigenvectors
sorted_eigen_values = np.array([eigen_value for eigen_value, _ in sorted_eigen_mapping])
sorted_eigen_vectors = np.array([eigen_vector for _, eigen_vector in sorted_eigen_mapping])

# Print the sorted eigenvalues
print("Sorted Eigen Values:")
print(sorted_eigen_values)

# Print the sorted eigenvectors
print("Sorted Eigen Vectors:")
print(sorted_eigen_vectors)

Sorted Eigen Values:
[4.78977925e-01 1.04111694e-01 7.98670006e-02 3.58442302e-02
 2.79853268e-02 1.56837212e-02 1.49258576e-02 1.27440488e-02
 8.40658362e-03 5.61178073e-03 3.09411514e-03 1.24252859e-03
 3.85893692e-04 2.47845811e-05 2.05722862e-16 1.32242989e-16
 1.19706167e-16 8.22313156e-17 0.00000000e+00]
Sorted Eigen Vectors:
[[ 3.00799647e-02 -1.87585464e-01  0.00000000e+00 -5.39702065e-03
  -4.11374144e-04  3.91583879e-03  8.76745722e-04  5.44932759e-03
   1.16892052e-03  3.77712234e-01  3.60493683e-01  4.14230637e-01
   3.55031196e-01 -2.39425062e-01  2.64348427e-01 -1.82854510e-01
   4.06079990e-01 -1.96584702e-01 -1.71427257e-01]
 [-3.48148525e-01 -3.84379820e-01  0.00000000e+00 -2.94801796e-02
   2.21840647e-02  1.74985131e-02  6.27908825e-03  9.64666549e-04
   4.64305337e-03 -1.09753122e-01 -1.06515216e-01 -6.74849642e-02
  -1.57435282e-01  5.73777975e-02  1.76635398e-01 -3.43253489e-01
  -1.04222891e-01  3.14871256e-01 -6.43632311e-01]
 [ 9.34838934e-01 -1.21941140e-01  0

In [10]:
newFeatures = sorted_eigen_vectors.T
XTrans = np.dot(newFeatures, A.T)
print(XTrans.T)

[[-0.09593153 -0.08667121  0.54997492 ...  0.05169349  0.11566379
  -0.07105285]
 [-0.09591327 -0.06473555  0.55511215 ...  0.01223036  0.13009671
  -0.05444071]
 [-0.16262853 -0.2641914   0.56290802 ...  0.21786786  0.09542548
  -0.30890214]
 ...
 [-0.05178109 -0.06041104 -0.10595885 ...  0.11577396  0.07992111
   0.01973536]
 [-0.04594641 -0.07639229 -0.09708498 ...  0.1578365   0.05088456
   0.0015278 ]
 [-0.04136852 -0.27690066 -0.11149968 ...  0.10254763  0.21956189
   0.06639398]]


In [11]:
reducedFeatures = sorted_eigen_vectors[:,0:7].T
reducedXTrans = np.dot(reducedFeatures, A.T)
print(reducedXTrans.T)

[[-0.09593153 -0.08667121  0.54997492 ... -0.03014838 -0.0947324
   0.12868099]
 [-0.09591327 -0.06473555  0.55511215 ... -0.0260817  -0.09038353
   0.12546691]
 [-0.16262853 -0.2641914   0.56290802 ...  0.02341377 -0.04876999
   0.14230493]
 ...
 [-0.05178109 -0.06041104 -0.10595885 ... -0.047677   -0.09841532
   0.0672471 ]
 [-0.04594641 -0.07639229 -0.09708498 ... -0.01772054 -0.06873862
   0.07521966]
 [-0.04136852 -0.27690066 -0.11149968 ... -0.05750938 -0.11597675
   0.05910449]]


In [12]:
reducedXTrans.T.shape

(2100, 7)

**Performing Kmeans again and computing the Completeness and Homogeneity values of the new clusters.**

In [13]:
# Perform K-means clustering
kmeans = KMeans(n_clusters=7, random_state=42)
kmeans.fit(np.asarray(reducedXTrans.T))

# Cluster labels and centroids
cluster_lbls = kmeans.labels_
cluster_cent = kmeans.cluster_centers_
print("7 Cluster Centroids:")

for i in cluster_cent:
    print(i)
    print("\n")

# Compute Completeness and Homogeneity scores
completeness = completeness_score(ground_truth, cluster_lbls)
homogeneity = homogeneity_score(ground_truth, cluster_lbls)

print("Completeness score:", completeness)
print("Homogeneity score:", homogeneity)

7 Cluster Centroids:
[-0.00121195 -0.02368154  0.10358806 -0.00499305 -0.05559966 -0.10568824
  0.08510924]


[-0.1140402  -0.17504583  0.61045832 -0.02223723 -0.01944111 -0.08339894
  0.11326918]


[ 0.10085936  0.17071637 -0.1557069   0.06516524  0.03713375  0.186405
 -0.22495365]


[ 0.04397123  0.13485743 -0.09815682 -0.00286455  0.02210102  0.01554255
 -0.00853417]


[-0.058358   -0.10964474 -0.11532363 -0.00727499  0.0260986   0.04551928
 -0.04320091]


[ 0.01625095 -0.0069439  -0.12504433 -0.0124671  -0.04219383 -0.092387
  0.06474016]


[ 0.04244172  0.08126548 -0.15146926 -0.06921595  0.32404983  0.46250624
  0.17353999]


Completeness score: 0.6585886426136276
Homogeneity score: 0.6057460081163977




**Observation**

In [14]:
# without PCA - observation 1
Completeness: 0.613187012485301
Homogeneity: 0.6115021163370862

# with PCA (reduced data) - observation 2
Completeness: 0.6585886426136276
Homogeneity: 0.6057460081163977

Comparing the two results, we can see that the Completeness score of the reduced data(obs 2) is better than the observation 1.
And the Homogeneity score of observation 1 is slightly better than observation 2. Comparing the overall performance using the completeness and homogeneity, performing clustering in the reduced data has improved the performance, as the observation 2 captures more data points from the true class than the observation 1.

In a nutshell, based on the completeness and homogeneity scores, Clustering Result 2 seems to perform slightly better than Result 1. It captures a larger portion of data points from the same true class within the same cluster