In [5]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import os
from random import sample
import shutil
from zipfile import ZipFile

# 1. Consider entire Yale Dataset.

In [8]:
# Extracting images from zipfile

my_dir = os.getcwd()
zip_folder = os.path.join(my_dir, 'yalefaces.zip')
print(f'Path to folder is, {zip_folder}')
with ZipFile(zip_folder, 'r') as zipfolder:
    zipfolder.extractall()
    
data_folder = os.path.join(my_dir, 'yalefaces')
file_list = os.listdir(data_folder)

Path to folder is, C:\Users\Swaroop\Desktop\IIT_ML_AI\Feature Extraction\Assignments\yalefaces.zip


In [9]:
# 3. Divide the dataset into training and testing sets

train_folder_path = os.path.join(my_dir, 'Faces_Train')
test_folder_path = os.path.join(my_dir, 'Faces_Test')

# - Delete the folder if it aleady exists
if os.path.exists(train_folder_path):
    shutil.rmtree(train_folder_path)
if os.path.exists(test_folder_path):
    shutil.rmtree(test_folder_path)
    
# creating train and test folders
os.mkdir(train_folder_path)
os.mkdir(test_folder_path)

# renames 1 to 01, 5 to 05
idx_list = [str(i).zfill(2) for i in range(1,16,1)] 
print(idx_list) 

# Creating a list of all the paths to all the induvidual files
file_name_list = [[] for i in range(15)]
for i in range(len(idx_list)):
    for fname in file_list:
        if fname.startswith('subject'+idx_list[i]):
            file_name_list[i].append(os.path.join(data_folder, fname))

['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12', '13', '14', '15']


In [10]:
# Copying the training set images and test set images into seperate folders
for i in range(len(idx_list)):
    ls = file_name_list[i]    
    
    test_idx = np.random.choice(11)
    test_file = ls[test_idx]
    shutil.copy(test_file, test_folder_path)
    
    ls.remove(ls[test_idx])
    
    for train_file in ls:
        shutil.copy(train_file, train_folder_path)

In [11]:
# Finding the eigen vectors

train_files_list = os.listdir(train_folder_path)
train_ls = []

for file in train_files_list:
    img_file = os.path.join(train_folder_path, file)
    arr = mpimg.imread(img_file)
    arr = arr.reshape(1, arr.shape[0]*arr.shape[1]) #reshaping to 1D to form a matrix
    train_ls.append(np.ravel(arr)) # converting to 1d array before appending

train_mat = np.matrix(train_ls)
mean_img = np.mean(train_mat, axis=0)

cov = np.cov(train_mat)
eig_val, eig_vec = np.linalg.eig(cov)

In [13]:
# Find the right Singular Matrix

eigen_vec_ls = []

for i in range(eig_vec.shape[1]):
    eig1 = train_mat.T@eig_vec[:,i]
    eig1 = eig1/eig_val[i]
    eigen_vec_ls.append(np.ravel(eig1))
print(len(eigen_vec_ls), eigen_vec_ls[0].shape)

151 (77760,)


In [15]:
# Find the principal components

sort_idx = np.argsort(eig_val)
sort_idx = sort_idx[::-1]

eig_val_sum = np.sum(eig_val)
temp_sum = 0

principal_eig_vec = []
principal_eig_val = []

i = 0

while(temp_sum <0.95*eig_val_sum):
    principal_eig_val.append(eig_val[sort_idx[i]])
    principal_eig_vec.append(eigen_vec_ls[sort_idx[i]])
    temp_sum += eig_val[sort_idx[i]]
    i += 1
print(f'Number of components is {i}') # reduced from 151

Number of components is 28



# 2. FInd the Transformation Matrix "Q".

In [19]:
# Finding Q
Q_hat = np.matrix(principal_eig_vec)
Q = np.linalg.pinv(Q_hat)
Q, Q.shape

(matrix([[ 3.20065795e-03+0.j, -5.40464086e-05+0.j, -3.28126169e-04+0.j,
          ..., -6.32259211e-05+0.j,  5.96446817e-05+0.j,
          -5.76096685e-05+0.j],
         [ 3.67923053e-03+0.j, -3.30131214e-04+0.j, -4.07367592e-04+0.j,
          ..., -7.60228545e-05+0.j,  6.93107244e-05+0.j,
          -2.94551321e-05+0.j],
         [ 4.24286108e-03+0.j, -4.24548775e-04+0.j, -3.04482647e-04+0.j,
          ...,  5.21364457e-05+0.j,  2.91760631e-05+0.j,
          -1.08432499e-07+0.j],
         ...,
         [ 2.06916412e-03+0.j,  2.59721031e-04+0.j,  3.98704309e-04+0.j,
          ...,  1.04584440e-04+0.j, -5.93482116e-05+0.j,
           6.25346802e-05+0.j],
         [ 2.06916412e-03+0.j,  2.59721031e-04+0.j,  3.98704309e-04+0.j,
          ...,  1.04584440e-04+0.j, -5.93482116e-05+0.j,
           6.25346802e-05+0.j],
         [ 2.06916412e-03+0.j,  2.59721031e-04+0.j,  3.98704309e-04+0.j,
          ...,  1.04584440e-04+0.j, -5.93482116e-05+0.j,
           6.25346802e-05+0.j]]),
 (77760, 28)

# 3. Find the Feature Vector for every Training Set.

In [21]:
# Obtain feature vectors

test_files = os.listdir(test_folder_path)
feat_vec_ls = []

for test_file in test_files:
    img_file = os.path.join(test_folder_path, test_file)
    test_img = mpimg.imread(img_file)
    test_img = test_img.reshape(test_img.shape[0]*test_img.shape[1], 1)
    test_img = test_img - mean_img.T
    
    feat_vec = Q.T@test_img
    feat_vec_ls.append(np.ravel(feat_vec))
feat_vec_ls

[array([  1995.18287859+0.j, -12204.20131495+0.j,   1669.00760705+0.j,
           847.12072768+0.j,  -2859.26619174+0.j,    781.80284594+0.j,
          1328.00804298+0.j,    736.82645184+0.j,    527.34392619+0.j,
           184.30555903+0.j,    316.20388382+0.j,   -999.79790147+0.j,
          -226.48533875+0.j,    -89.94582272+0.j,   -522.27190222+0.j,
           202.24494589+0.j,    -18.96823507+0.j,    151.37971377+0.j,
            62.5335776 +0.j,   -108.12346578+0.j,   -140.6261235 +0.j,
          -187.42892333+0.j,    -16.03085056+0.j,   -248.1027388 +0.j,
           362.05816627+0.j,   -293.94451154+0.j,    292.5197823 +0.j,
          -225.75034349+0.j]),
 array([ 2.09693450e+02+0.j, -2.06683733e+04+0.j,  4.06681780e+03+0.j,
         1.06466338e+03+0.j, -2.82538608e+02+0.j, -2.09664422e+03+0.j,
         2.48178916e+03+0.j,  2.93386361e+01+0.j, -1.79457257e+03+0.j,
         2.01242293e+02+0.j, -1.03440153e+03+0.j, -6.82899366e+02+0.j,
        -6.28421975e+02+0.j, -1.00331321e+02+0

# 4. Generate around 50 random vectors of dimension n_dim, where n_dim is the number of dimensions in v.

In [41]:
def genRandomVectors(m, length):
    random_vector = []
    for i in range(m):
        rand_vec = np.random.uniform(-1,1,length)
        rand_vec_norm = rand_vec/np.linalg.norm(rand_vec)
        random_vector.append(rand_vec_norm)
    return random_vector

In [44]:
# here,
m= 50
length = feat_vec_ls[0].size

rand_vec_50 = genRandomVectors(m, length)
rand_vec_50

[array([ 0.11608289,  0.08720936, -0.25670903,  0.04613902, -0.14329165,
        -0.13036193,  0.19552141,  0.24861812, -0.28043792, -0.04327443,
         0.11654703,  0.1640669 ,  0.18400888,  0.09528495, -0.06102304,
        -0.02189416, -0.18242617,  0.2035979 , -0.28264521, -0.23863574,
        -0.2033543 , -0.24004961, -0.24858038,  0.24866278, -0.07433158,
        -0.26950367, -0.27586987,  0.12935689]),
 array([ 0.00887715, -0.25880611,  0.10469505,  0.11294751, -0.13291682,
         0.30974493,  0.23926859, -0.02383351, -0.32676209,  0.17484443,
         0.29360493,  0.08283858, -0.27557475, -0.04874299, -0.04294978,
         0.19939783, -0.1670489 , -0.09002166, -0.26686736,  0.24791652,
        -0.2655357 , -0.11439429,  0.21117799,  0.15646812,  0.2192684 ,
         0.01789374, -0.0428187 ,  0.12597032]),
 array([-0.16208695, -0.1323287 ,  0.2150582 , -0.09853481, -0.22277685,
        -0.18873623,  0.29059562,  0.12343609, -0.06762298, -0.28245072,
        -0.1598735 , -0.22

# 5. Generate 50-bit hash representation of each of the feature vectors.

In [51]:
def generateHashRep(input_vec, rand_vec):
    hash_rep = []
    for i in range(len(input_vec)):
        if (input_vec@rand_vec_50[i]) >= 0:
            hash_rep.append('1')
        else:
            hash_rep.append('0')
    return hash_rep

In [65]:
feat_hash_dict = {}

for idx, i in enumerate(feat_vec_ls):
    feat_hash_dict[idx] = ''.join(generateHashRep(i, rand_vec_50))
feat_hash_dict

{0: '0111100001000110011010001101',
 1: '0111100101000110011110011100',
 2: '1100010010000100100011000001',
 3: '0010101010111011110001111110',
 4: '0110100111000110011110001110',
 5: '0010101011111011111100111010',
 6: '0101110101000100011110001101',
 7: '1000001000111001100001110010',
 8: '1000001000011001100011000010',
 9: '1000010010000000000011000001',
 10: '0111100101000110011110001101',
 11: '0111110101000100011110001101',
 12: '0100010110000100110111100001',
 13: '1011001010111011100101110010',
 14: '1000010000000000100011000001'}

In [70]:
i

4