In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from PIL import Image
import matplotlib.pyplot as plt
import os
import numpy as np
import cv2
from skimage.io import imread
from skimage.feature import hog,local_binary_pattern
from skimage.feature import local_binary_pattern
from skimage.transform import resize
from skimage.io import imread
import pandas as pd
from sklearn.decomposition import PCA
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms

In [None]:
# Load the LFW name table; later cells read its 'name' column and its
# 'images' column (used as a per-name repeat count when expanding rows).
lfw_allnames = pd.read_csv("lfw_allnames.csv")

In [None]:
# Expand the per-name table to one row per image: every row is repeated as
# many times as its 'images' value, after which that count column is dropped.
row_repeats = lfw_allnames.index.repeat(lfw_allnames['images'])
data_w_path = lfw_allnames.loc[row_repeats].drop(columns="images")

# Number the rows of each name 1, 2, 3, ... and left-pad the number with
# zeros to a width of four characters.
image_number = (data_w_path.groupby('name').cumcount() + 1).apply('{0:0>4}'.format)

# Relative path layout: <name>/<name>_<NNNN>.jpg
data_w_path['image_path'] = (
    data_w_path['name'] + "/" + data_w_path['name'] + "_" + image_number + ".jpg"
)

In [None]:
# Inspect the expanded table: one row per image, with its relative 'image_path'.
print(data_w_path)

In [None]:
# Bar chart of the 10 names with the most rows (images).
data_w_path['name'].value_counts()[:10].plot(kind = "bar")  #top 10 only; value_counts() sorts counts in descending order

In [None]:
# Split the per-image table into train (80%) and test (20%) partitions.
# A fixed random_state makes the split reproducible across kernel restarts,
# which matters because the train and test feature CSVs are written by
# separate cells and the PCA is fitted on the training partition only.
lfw_train, lfw_test = train_test_split(data_w_path, test_size=0.2, random_state=42)

# Renumber rows 0..n-1; drop=True discards the old index instead of first
# materialising it as an "index" column and then dropping that column.
lfw_train = lfw_train.reset_index(drop=True)
lfw_test = lfw_test.reset_index(drop=True)


In [None]:
# Inspect the test partition after its index has been reset.
print(lfw_test)

In [None]:
# Compare which names appear in each partition.
train_names = set(lfw_train.name)
test_names = set(lfw_test.name)

print(len(train_names & test_names))  # number of names present in both train and test
print(len(test_names - train_names))  # number of names present in test but absent from train



In [None]:
# Preview the first training image. The relative path has to be joined onto
# the dataset directory with a path separator; plain string concatenation
# would produce "dataset<name>/..." and fail to locate the file.
im = Image.open(os.path.join("dataset", str(lfw_train.image_path[0])))
plt.imshow(im)

<h3>HOG_95</h3>

In [None]:
# Root directory that the relative 'image_path' values are joined onto.
folder = "dataset"


# Define a function to compute HOG features for an image
def compute_hog(img):
    """Return the HOG feature vector of ``img``.

    The image is first resized to 128x64 so every image yields a descriptor
    of the same length, then described with 9 orientation bins, 8x8-pixel
    cells and 2x2-cell blocks.

    Parameters
    ----------
    img : array-like image
        Either a 2-D grayscale image or a 3-D image whose last axis holds
        the colour channels.

    Returns
    -------
    The HOG descriptor returned by ``skimage.feature.hog``.
    """
    # Resizing image
    resized_img = resize(img, (128, 64))
    # Colour images carry their channels on the last axis; a 2-D grayscale
    # image has no channel axis, and passing -1 for it would be wrong.
    channel_axis = -1 if resized_img.ndim == 3 else None
    # visualize=False: only the descriptor is needed, so skip rendering the
    # HOG visualisation image that was previously computed and thrown away.
    return hog(resized_img, orientations=9, pixels_per_cell=(8, 8),
               cells_per_block=(2, 2), visualize=False, channel_axis=channel_axis)

# Build the HOG feature list for the training partition: every 'image_path'
# in lfw_train is resolved under `folder`, read from disk and described.
hog_features = [
    compute_hog(imread(os.path.join(folder, rel_path)))
    for rel_path in lfw_train['image_path']
]

# One row per image, one column per HOG feature.
hog_features_df = pd.DataFrame(hog_features)

# Fit PCA on the training features, keeping as many components as needed to
# retain 0.95 of the variance. The fitted `pca_hog` is kept so the same
# projection can be applied to other data later.
pca_hog = PCA(n_components=0.95)
pca_result_hog = pca_hog.fit_transform(hog_features_df)

# Report the dimensionality before and after the reduction.
n_features_before = hog_features_df.shape[1]
n_features_after = pca_result_hog.shape[1]
print("Original number of features:", n_features_before)
print("Reduced number of features after PCA:", n_features_after)


In [None]:
# CNN feature extractor used by the feature-extraction cells below.

# Take a pre-trained ResNet-50 and list its top-level child modules.
pretrained_layers = list(models.resnet50(weights='ResNet50_Weights.DEFAULT').children())
# Keep everything except the last child (the final fully connected layer),
# so the network outputs features rather than class scores.
resnet = nn.Sequential(*pretrained_layers[:-1])
# Evaluation mode for inference.
resnet.eval()

# Define a function to extract features from an image
def extract_features(image_path, model):
    """Run one image through ``model`` and return its features as a 1-D array.

    Parameters
    ----------
    image_path : path of the image file to load.
    model : callable (e.g. a torch module) applied to a batch of one
        preprocessed image tensor.

    Returns
    -------
    numpy.ndarray
        The model output for the image, flattened to one dimension, with
        every value rounded to 4 decimal places.
    """
    # Load the image; the context manager closes the underlying file as soon
    # as the RGB copy has been made, instead of leaving the handle open.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert('RGB')

    # Resize -> centre crop to 224x224 -> tensor -> per-channel normalisation.
    preprocess = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    # Add batch dimension
    batch = preprocess(image).unsqueeze(0)

    # Extract features without tracking gradients
    with torch.no_grad():
        features = model(batch)

    # Drop the batch dimension and flatten whatever singleton spatial
    # dimensions remain; a single tolist() call converts all values to Python
    # floats at once instead of calling .item() on every element.
    values = features.squeeze(0).flatten().tolist()
    features_array = np.array([round(value, 4) for value in values])

    return features_array


# CNN features for the training partition: each 'image_path' in lfw_train is
# resolved under `folder` and passed through the truncated ResNet.
cnn_features = [
    extract_features(os.path.join(folder, rel_path), resnet)  # for cnn features
    for rel_path in lfw_train['image_path']
]

# Stack the per-image feature vectors, one row per image.
cnn_array = np.array(cnn_features)


# for applying pca on cnn features
# # Convert cnn_features list to DataFrame
# cnn_features_df = pd.DataFrame(cnn_features)

# # Apply PCA to reduce dimensionality while retaining 0.95 variance
# pca_cnn = PCA(n_components=0.95)
# pca_result_cnn = pca_cnn.fit_transform(cnn_features_df)

# # Now pca_result contains the reduced dimensional features with 0.95 variance
# print("Original number of features:", cnn_features_df.shape[1])
# print("Reduced number of features after PCA:", pca_result_cnn.shape[1])

In [None]:
def get_pixel(img, center, x, y):
    """Threshold one neighbour pixel against the centre value.

    Parameters
    ----------
    img : 2-D indexable image (``img[x][y]``).
    center : value of the centre pixel to compare against.
    x, y : indices of the neighbour along the first and second axis.

    Returns
    -------
    int
        1 if ``img[x][y] >= center``; 0 otherwise, including when ``(x, y)``
        lies outside the image.
    """
    # Negative indices do not raise in Python/NumPy -- they wrap around to the
    # opposite edge -- so they must be rejected explicitly to count as
    # "outside the image".
    if x < 0 or y < 0:
        return 0
    try:
        neighbour = img[x][y]
    except IndexError:
        # Past the last row or column: treat as outside the image.
        return 0
    return 1 if neighbour >= center else 0

def lbp_calculated_pixel(img, x, y):
    """Compute the 8-neighbour local binary pattern code of pixel ``(x, y)``.

    Each of the eight neighbours is thresholded against the centre value with
    ``get_pixel`` (1 if neighbour >= centre, else 0) and the resulting bits
    are packed into one integer, the first neighbour being the
    least-significant bit (weight 1) and the last the most-significant
    (weight 128).

    ``x`` indexes the first axis of ``img`` and ``y`` the second; calcLBP
    passes the row as ``x`` and the column as ``y``.
    """
    center = img[x][y]

    # (dx, dy) offsets in bit order. With x as the row and y as the column
    # these are: up-left, left, down-left, down, down-right, right, up-right, up.
    neighbour_offsets = (
        (-1, -1), (0, -1), (1, -1), (1, 0),
        (1, 1), (0, 1), (-1, 1), (-1, 0),
    )

    # Bit k contributes 2**k when the k-th neighbour passes the threshold.
    return sum(
        get_pixel(img, center, x + dx, y + dy) << bit
        for bit, (dx, dy) in enumerate(neighbour_offsets)
    )


def calcLBP(img):
    """Return the 256-bin histogram of local binary pattern codes of ``img``.

    Parameters
    ----------
    img : numpy.ndarray
        Either a BGR colour image (converted to grayscale with
        ``cv2.COLOR_BGR2GRAY``) or an already single-channel 2-D image.

    Returns
    -------
    numpy.ndarray
        The histogram computed by ``cv2.calcHist`` over the per-pixel LBP
        codes (256 bins covering the range [0, 256)), flattened to 1-D.

    Raises
    ------
    ValueError
        If ``img`` is None, which is what ``cv2.imread`` returns when a file
        cannot be read.
    """
    if img is None:
        raise ValueError("calcLBP received None; cv2.imread returns None when an image cannot be read")

    # A 2-D input is already grayscale; only colour input needs converting.
    img_gray = img if img.ndim == 2 else cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    height, width = img_gray.shape[:2]

    # One LBP code (0..255) per pixel; a single channel is enough because the
    # histogram below only ever reads channel 0.
    img_lbp = np.zeros((height, width), np.uint8)
    for i in range(height):
        for j in range(width):
            img_lbp[i, j] = lbp_calculated_pixel(img_gray, i, j)

    hist_lbp = cv2.calcHist([img_lbp], [0], None, [256], [0, 256])
    return hist_lbp.flatten()


In [None]:
# LBP histograms for the training partition: each 'image_path' in lfw_train
# is resolved under `folder`, read with OpenCV and passed to calcLBP.
lbp_features = [
    calcLBP(cv2.imread(os.path.join(folder, rel_path)))
    for rel_path in lfw_train['image_path']
]

# Stack the per-image histograms, one row per image.
lbp_features_array = np.array(lbp_features)

In [None]:
# Sanity check: one row per training image, one column per LBP histogram bin.
lbp_features_array.shape

In [None]:
# Class labels for the training rows: the 'name' of each image, as a NumPy array
labels=lfw_train['name'].values

In [None]:
# Place the three feature sets side by side (column-wise): PCA-reduced HOG,
# CNN features and LBP histograms. All three have one row per training image.
feature_blocks = (pca_result_hog, cnn_array, lbp_features_array)
concatenated_array = np.hstack(feature_blocks)

# Prepend the label as the first column of every row.
data_with_labels = np.column_stack((labels, concatenated_array))

# Write the combined table as comma-separated text; '%s' lets the label
# strings and the numeric values share one format.
np.savetxt('extracted_features_hog_95.csv', data_with_labels, fmt='%s', delimiter=',')

print("Extraction complete. Data saved to: extracted_features_hog_95.csv")

In [None]:
# Sanity check: one row per training image; columns = HOG-PCA + CNN + LBP features.
concatenated_array.shape

In [None]:
# Read the saved CSV back to verify it was written correctly
# (header=None because the file was written without a header row).
df = pd.read_csv("extracted_features_hog_95.csv",header=None)

In [None]:
# Show the first rows of the re-loaded CSV. head must be called: the bare
# attribute `df.head` only displays the bound-method repr.
df.head()

In [None]:
# For Test data
# HOG features for the test partition, computed the same way as for training.
hog_features_test = [
    compute_hog(imread(os.path.join(folder, rel_path)))
    for rel_path in lfw_test['image_path']
]

# One row per image, one column per HOG feature.
hog_features_test_df = pd.DataFrame(hog_features_test)

# Project with the PCA that was fitted on the training data (transform only,
# no refit) so the test features get the same number of columns as the
# training features.
pca_result_hog_test = pca_hog.transform(hog_features_test_df)

# Report the dimensionality before and after the projection.
n_features_before_test = hog_features_test_df.shape[1]
n_features_after_test = pca_result_hog_test.shape[1]
print("Original number of features:", n_features_before_test)
print("Reduced number of features after PCA:", n_features_after_test)

In [None]:
# CNN features for the test partition: each 'image_path' in lfw_test is
# resolved under `folder` and passed through the truncated ResNet.
cnn_features_test = [
    extract_features(os.path.join(folder, rel_path), resnet)  # for cnn features
    for rel_path in lfw_test['image_path']
]

# Stack the per-image feature vectors, one row per image.
cnn_array_test = np.array(cnn_features_test)

In [None]:
# LBP histograms for the test partition: each 'image_path' in lfw_test is
# resolved under `folder`, read with OpenCV and passed to calcLBP.
lbp_features_test = [
    calcLBP(cv2.imread(os.path.join(folder, rel_path)))
    for rel_path in lfw_test['image_path']
]

# Stack the per-image histograms, one row per image.
lbp_features_array_test = np.array(lbp_features_test)

In [None]:
# Class labels for the test rows: the 'name' of each image, as a NumPy array
labels_test=lfw_test['name'].values

In [None]:
# Concatenate the test feature sets along the columns (axis=1): PCA-reduced
# HOG, CNN features and LBP histograms, one row per test image.
concatenated_array_test = np.concatenate((pca_result_hog_test, cnn_array_test, lbp_features_array_test), axis=1)

# Combine concatenated features with labels (label becomes the first column)
data_with_labels_test = np.column_stack((labels_test, concatenated_array_test))

# Save the combined data as a CSV file. The path is held in one variable so
# the confirmation message always names the file that was actually written
# (the message previously said "..._hog95.csv" while the file is "..._hog_95.csv").
test_csv_path = 'extracted_features_test_hog_95.csv'
np.savetxt(test_csv_path, data_with_labels_test, delimiter=',', fmt='%s')

print("Extraction complete. Data saved to:", test_csv_path)
