**Installing necessary Libraries**

In [None]:
!pip install pyheif
!pip install deepface

**Importing necessary Libraries/Modules**

In [None]:
import cv2 as cv
import numpy as np
import pandas as pd
import pyheif
import os
import re
import time
from matplotlib import pyplot as plt
from google.colab import drive
from google.colab import files
from PIL import Image
from scipy import ndimage
from deepface import DeepFace
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from datetime import datetime

In [None]:
"""
Mounting the Google drive
"""
# `drive` is google.colab.drive; the mount point below is the root that the
# project directory constants in this notebook are written against.
drive.mount('/content/drive')

In [None]:
'''
Declaring constant variables for file paths
'''
# These names are read as globals by the data-preparation and data-loading
# functions below, so they must be defined before any of those cells run.
# Every directory ends with "/" because parts of the notebook build file paths
# by plain string concatenation (e.g. TRAINING_DATA_DIRECTORY + dirname).
# NOTE(review): the paths mix the "MyDrive" and "My Drive" spellings -- confirm
# both resolve on the mounted drive, and adjust to your own Drive layout.
TRAINING_DATA_DIRECTORY = "/content/drive/MyDrive/Project/training/"
TRAINING_DATA_RENAMED_DIRECTORY = "/content/drive/My Drive/Project/Release_Renamed/"
TRAINING_DATA_WITH_AUGMENTATION_DIRECTORY = "/content/drive/My Drive/Project/Release_with_flip/"
TESTING_DATA_DIRECTORY = "/content/drive/My Drive/Project/test/"

**Data Cleanup and augmentation functions for Training dataset**

In [None]:
def rename_image_files():
  """
  Copy the raw training images into TRAINING_DATA_RENAMED_DIRECTORY under serialized names.

  This method should be only run once on the initial raw training set.
  Each sub-directory of TRAINING_DATA_DIRECTORY is listed and its j-th entry (1-based) is
  written as "<sub-directory name>_IMG<j>.jpg". Files whose name ends in "HEIC" (any letter
  case) are decoded with pyheif and saved through PIL; every other file is read and
  re-written with OpenCV. Entries that are not directories, and files OpenCV cannot
  decode, are skipped.

  Reads the globals TRAINING_DATA_DIRECTORY and TRAINING_DATA_RENAMED_DIRECTORY.
  Returns nothing.
  """
  # makedirs (unlike mkdir) also creates missing parents and tolerates an existing directory.
  os.makedirs(TRAINING_DATA_RENAMED_DIRECTORY, exist_ok=True)
  for dirname in os.listdir(TRAINING_DATA_DIRECTORY):
    # os.path.join keeps the path valid whether or not the constant ends with a separator.
    directory = os.path.join(TRAINING_DATA_DIRECTORY, dirname)
    if not os.path.isdir(directory):
      continue
    for j, fname in enumerate(os.listdir(directory)):
      source_path = os.path.join(directory, fname)
      target_path = os.path.join(TRAINING_DATA_RENAMED_DIRECTORY, dirname+"_IMG"+str(j+1)+".jpg")
      if fname.upper().endswith('HEIC'):
        print(source_path)
        heif_file = pyheif.read(source_path)
        # Pass the raw decoder and the row stride so padded rows are decoded correctly.
        image = Image.frombytes(heif_file.mode, heif_file.size, heif_file.data,
                                "raw", heif_file.mode, heif_file.stride)
        image.save(target_path)
      else:
        image = cv.imread(source_path)
        if image is None:
          # cv.imread signals an unreadable / non-image file by returning None.
          print("Skipping unreadable file: "+source_path)
          continue
        cv.imwrite(target_path, image)

In [None]:
def extract_training_data_and_augment(TRAINING_DATA_RENAMED_DIRECTORY):
  """
  Extract one face per renamed training image and write it plus a mirrored copy.

  For every file in TRAINING_DATA_RENAMED_DIRECTORY the face is extracted with
  DeepFace.extract_faces (mtcnn backend, 110x110, grayscale), columns 11..98 are kept to
  drop the black margin added by default by DeepFace, and the crop is resized to 88x88
  with OpenCV. Two files are written to TRAINING_DATA_WITH_AUGMENTATION_DIRECTORY:
  "<name>_1.jpg" (the face) and "<name>_2.jpg" (the horizontally flipped face), where
  <name> is the file name up to its first ".".

  Files are addressed by full path, so the process working directory is left untouched.

  Parameters:
    TRAINING_DATA_RENAMED_DIRECTORY: directory holding the renamed training images.
  Returns:
    datetime.timedelta with the wall-clock time spent.

  NOTE(review): target_size / grayscale are passed exactly as before; confirm the
  installed deepface release still accepts these keyword arguments.
  """
  start_time = datetime.now()
  os.makedirs(TRAINING_DATA_WITH_AUGMENTATION_DIRECTORY, exist_ok=True)
  for fname in os.listdir(TRAINING_DATA_RENAMED_DIRECTORY):
    print(str(fname))
    source_path = os.path.join(TRAINING_DATA_RENAMED_DIRECTORY, fname)
    img = DeepFace.extract_faces(source_path, target_size=(110,110), detector_backend = "mtcnn", grayscale=True)
    img_data = img[0]['face']  # only the first detected face is used
    img_data = img_data[:,11:99]  #Removing the black margin added by default by DeepFace
    img_data = cv.resize(img_data,(88,88))
    # Name up to the first "." (the whole name when there is no "."), prefixed with the output directory.
    output_stem = os.path.join(TRAINING_DATA_WITH_AUGMENTATION_DIRECTORY, fname.split('.', 1)[0])
    # The pixel data is multiplied by 255 before writing, as in the un-augmented branch below.
    cv.imwrite(output_stem+"_1.jpg", img_data * 255)

    #Data Augmentation code:

    #Mirroring the extracted face
    flipped_img=cv.flip(img_data,1)
    cv.imwrite(output_stem+"_2.jpg", flipped_img * 255)

  execution_time = datetime.now() - start_time
  return execution_time

In [None]:
"""
Other data augmentations explored
"""
"""
      alpha = 1.5
      beta = 10

      #Image Rotation
      rotated_image_1=ndimage.rotate(img_data,10,reshape=False) #Rotating the extracted face by 10 degrees to the right
      cv.imwrite(curated_data_directory+fname[:fname.index('.')]+"_3.jpg", rotated_image_1*255)

      rotated_image_2=ndimage.rotate(img_data,-10,reshape=False) #Rotating the extracted face by 10 degrees to the left
      cv.imwrite(curated_data_directory+fname[:fname.index('.')]+"_4.jpg", rotated_image_2*255)

      #Sharpening images
      #sharpened_img=cv.filter2D(img[0]['face'], -1, kernel)

      #Brightness and contrast control
      img_gamma_1=np.power(img_data*255, 1.1).clip(0,255).astype(np.uint8) #Gamma Changes to images - making it brighter #Clip used to keep the value between 0 and 255
      cv.imwrite(curated_data_directory+fname[:fname.index('.')]+"_5.jpg", img_gamma_1)

      img_gamma_2=np.power(img_data*255, 0.9).clip(0,255).astype(np.uint8) #Gamma Changes to images - making it darker
      cv.imwrite(curated_data_directory+fname[:fname.index('.')]+"_6.jpg", img_gamma_2)

      img_gamma_1 = cv.convertScaleAbs(img_data*255, alpha=alpha, beta=beta)       #Gamma changes
      cv.imwrite(curated_data_directory+fname[:fname.index('.')]+"_5.jpg", img_gamma_1)

      #Stretch along x-axis
      h, w = img_data.shape[:2]
      elongated_image = cv.resize(img_data, None, fx=1.05, fy=1)
      trim_left = ((elongated_image.shape[1]-w)//2)
      cv.imwrite(augmented_data_directory+fname[:fname.index('.')]+"_3.jpg", elongated_image[0:h, trim_left:trim_left+w] * 255)

      #Zooming images
      h, w = img_data.shape[:2]
      zoom_factor=1.1
      zh = int(np.round(h / zoom_factor))
      zw = int(np.round(w / zoom_factor))
      top = (h - zh) // 2
      left = (w - zw) // 2

      out = zoom(img_data[top:top+zh, left:left+zw], zoom_factor)

      # `out` might still be slightly larger than `img` due to rounding, so
      # trim off any extra pixels at the edges
      trim_top = ((out.shape[0] - h) // 2)
      trim_left = ((out.shape[1] - w) // 2)
      out = out[trim_top:trim_top+h, trim_left:trim_left+w]
      cv.imwrite(curated_data_directory+fname[:fname.index('.')]+"_3.jpg", out * 255)
      """

**Function call to extract training data and perform augmentation on it. CAN BE SKIPPED.**

In [None]:
# One-off preprocessing: first copy the raw images under serialized names, then
# extract the faces and write the mirrored copies. Both steps write to the
# directories named by the path constants; per the heading above, this cell can
# be skipped (presumably once those directories are already populated).
rename_image_files()
total_execution_time = extract_training_data_and_augment(TRAINING_DATA_RENAMED_DIRECTORY)
print('Time required for extracting faces from files and performing augmentation: ', total_execution_time )

**Necessary functions**

In [None]:
def flatten_data(data):
  """
  Flatten every array in `data` and stack the results row-wise.

  Parameters:
    data: sequence of array-likes, assumed to all have the same number of elements.
  Returns:
    2D NumPy array with one row per input array; shape (0, 0) when `data` is empty.
  """
  if len(data) == 0:
    # Keep the "always 2D" contract instead of failing on data[0].
    return np.empty((0, 0))
  return np.array([np.ravel(item) for item in data])

In [None]:
def read_training_data():
  """
  Load the augmented training images and their class labels.

  Every file in TRAINING_DATA_WITH_AUGMENTATION_DIRECTORY is read as a grayscale image with
  OpenCV and divided by 255. The label is the part of the file name before the first
  underscore. Files OpenCV cannot decode are skipped. The directory listing is sorted so
  the sample order is the same on every run.

  Files are addressed by full path, so the process working directory is left untouched.

  Returns:
    (x_train, y_train): the flattened images from flatten_data and a NumPy array of labels.
  Raises:
    ValueError: if a readable image's file name contains no underscore.
  """
  data = []
  labels = []
  for fname in sorted(os.listdir(TRAINING_DATA_WITH_AUGMENTATION_DIRECTORY)):
    image = cv.imread(os.path.join(TRAINING_DATA_WITH_AUGMENTATION_DIRECTORY, fname), cv.IMREAD_GRAYSCALE)
    if image is None:
      # cv.imread returns None for files it cannot decode; dividing None by 255 would raise.
      print("Skipping unreadable file: "+fname)
      continue
    data.append(image/255)
    classname = fname[:fname.index('_')]
    labels.append(classname)
  y_train = np.array(labels)
  x_train = flatten_data(data)
  return x_train, y_train

In [None]:
def read_testing_data():
  """This function reads and preprocess testing data for a machine learning model.
  This code performs following action:
    1. Reads "labels.txt" from TESTING_DATA_DIRECTORY; each non-blank line supplies a file
       name (first whitespace-separated field) and its label (second field)
    2. Lists TESTING_DATA_DIRECTORY and keeps files whose name ends in .jpg, .png or .jpeg
       (any letter case)
    3. Crops the face using DeepFace (mtcnn backend, 110x110, grayscale)
    4. Keeps columns 11..98 and resizes the crop to 88x88

  Files are addressed by full path, so the process working directory is left untouched.

  Returns:
    (x_test, y_test, execution_time): flattened faces from flatten_data, NumPy array of
    labels, and the elapsed wall-clock time as a datetime.timedelta.
  Raises:
    KeyError: if an image file has no entry in labels.txt.
  """
  start_time = datetime.now()
  x_test = []
  y_test = []
  test_labels = {}

  # The context manager closes the label file even if parsing fails.
  with open(os.path.join(TESTING_DATA_DIRECTORY, "labels.txt"), "r") as label_file:
    for l in label_file:
      fields = l.split()
      if len(fields) < 2:
        # Blank or incomplete lines carry no (file, label) pair.
        continue
      test_labels[fields[0]] = fields[1]

  for fname in os.listdir(TESTING_DATA_DIRECTORY):
    # Match on the real extension rather than a substring anywhere in the name.
    if fname.lower().endswith(('.jpg', '.png', '.jpeg')):
      print(fname)
      img = DeepFace.extract_faces(os.path.join(TESTING_DATA_DIRECTORY, fname), target_size = (110,110), detector_backend = "mtcnn", grayscale = True)
      img_data = img[0]['face']  # only the first detected face is used
      img_data = img_data[:,11:99]  # same column crop as the training pipeline
      img_data = cv.resize(img_data,(88,88))
      x_test.append(img_data)
      y_test.append(test_labels[fname])

  y_test = np.array(y_test)
  x_test = flatten_data(x_test)
  execution_time = datetime.now() - start_time
  return x_test, y_test, execution_time

In [None]:
def lda(x_train, y_train, x_test, y_test):
  """
  Fit a LinearDiscriminantAnalysis classifier on the training set and score it on the test set.

  Returns:
    (accuracy, execution_time): accuracy_score of the predictions against y_test, and the
    time spent fitting and predicting as a datetime.timedelta.
  """
  started_at = datetime.now()
  classifier = LinearDiscriminantAnalysis()
  classifier.fit(x_train, y_train)
  predictions = classifier.predict(x_test)
  elapsed = datetime.now() - started_at  # scoring below is not included in the timing
  score = accuracy_score(y_test, predictions)
  return score, elapsed

In [None]:
def lda_dimension_reduction(x_train, y_train, no_of_components):
  """
  Fit a LinearDiscriminantAnalysis model configured with n_components = no_of_components.

  Returns:
    The fitted LinearDiscriminantAnalysis instance.
  """
  reducer = LinearDiscriminantAnalysis(n_components = no_of_components)
  reducer.fit(x_train, y_train)
  return reducer

In [None]:
def transformation_to_reduced_dimension(lda, data):
  """
  Project `data` with an already-fitted model.

  Parameters:
    lda: object exposing transform(data); the notebook passes a fitted LDA model.
    data: the samples to transform.
  Returns:
    Whatever lda.transform(data) returns.
  """
  projected = lda.transform(data)
  return projected

In [None]:
def knn(x_train, y_train, x_test, y_test):
  """
  Fit a distance-weighted 5-nearest-neighbours classifier and score it on the test set.

  Returns:
    (accuracy, execution_time): accuracy_score of the predictions against y_test, and the
    time spent fitting and predicting as a datetime.timedelta.
  """
  started_at = datetime.now()
  neighbours = KNeighborsClassifier(n_neighbors = 5, weights = 'distance')
  neighbours.fit(x_train, y_train)
  predictions = neighbours.predict(x_test)
  elapsed = datetime.now() - started_at  # scoring below is not included in the timing
  score = accuracy_score(y_test, predictions)
  return score, elapsed

In [None]:
def svm_one_vs_rest(x_train, y_train, x_test, y_test, c, n_runs = 10):
  """
  Fit a One-Vs-Rest SVC (C = c, probability = True) n_runs times and average the test accuracy.

  Every run refits on the same training set and scores on the same test set, so this is
  NOT k-fold cross-validation: the mean only smooths out any run-to-run variation in the fit.

  Parameters:
    x_train, y_train: training samples and labels.
    x_test, y_test: test samples and labels.
    c: value passed to SVC as C.
    n_runs: number of fit/predict repetitions to average over (default 10).
  Returns:
    (mean_accuracy, execution_time): mean accuracy_score over the runs, and the total
    elapsed time for all runs as a datetime.timedelta.
  Raises:
    ValueError: if n_runs is smaller than 1.
  """
  if n_runs < 1:
    raise ValueError("n_runs must be at least 1")
  start_time = datetime.now()
  accuracy = 0
  for _ in range(n_runs):
    svc = SVC(C = c, probability = True)
    ovr_clf = OneVsRestClassifier(svc)
    ovr_clf = ovr_clf.fit(x_train, y_train)
    y_pred = ovr_clf.predict(x_test)
    accuracy += accuracy_score(y_test, y_pred)
  execution_time = datetime.now() - start_time
  return (accuracy/n_runs), execution_time

In [None]:
def svm_one_vs_rest_with_gamma(x_train, y_train, x_test, y_test, gamma, degree):
  """
  Fit a One-Vs-Rest SVC with the given gamma and degree and score it on the test set.

  The model is trained and evaluated on the arrays passed in as arguments, not on
  notebook globals.

  Parameters:
    x_train, y_train: training samples and labels.
    x_test, y_test: test samples and labels.
    gamma: value passed to SVC as gamma.
    degree: value passed to SVC as degree.
      NOTE(review): no kernel is set here, so SVC uses its default kernel; per the
      scikit-learn documentation degree only affects the 'poly' kernel -- confirm
      whether kernel='poly' was intended.
  Returns:
    (accuracy, execution_time): accuracy_score of the predictions against y_test, and the
    time spent fitting and predicting as a datetime.timedelta.
  """
  start_time = datetime.now()
  svc_gamma = SVC (gamma = gamma, degree = degree)
  ovr_gamma = OneVsRestClassifier(svc_gamma)
  ovr_gamma = ovr_gamma.fit(x_train,y_train)
  y_pred = ovr_gamma.predict(x_test)
  execution_time = datetime.now() - start_time
  return accuracy_score(y_test,y_pred), execution_time

In [None]:
def ensemble_model(x_train, y_train, x_test, y_test):
  """
  Fit a stacking ensemble and score it on the test set.

  Base estimators: a distance-weighted 5-nearest-neighbours classifier, a One-Vs-Rest SVC
  (gamma = 0.001, degree = 1) and a LinearDiscriminantAnalysis classifier. The final
  estimator is a LogisticRegression.

  Returns:
    (accuracy, execution_time): accuracy_score of the predictions against y_test, and the
    time spent fitting and predicting as a datetime.timedelta.
  """
  started_at = datetime.now()

  nearest_neighbours = KNeighborsClassifier(n_neighbors = 5, weights = 'distance')
  one_vs_rest_svc = OneVsRestClassifier(SVC(gamma = 0.001, degree = 1))
  discriminant = LinearDiscriminantAnalysis()
  base_learners = [('knn', nearest_neighbours),
                   ('ovr_svc', one_vs_rest_svc),
                   ('lda', discriminant)]

  stacked = StackingClassifier(estimators=base_learners, final_estimator=LogisticRegression())
  stacked.fit(x_train, y_train)
  predictions = stacked.predict(x_test)

  elapsed = datetime.now() - started_at  # scoring below is not included in the timing
  score = accuracy_score(y_test, predictions)
  return score, elapsed

**Function calls to train model and predict accuracy of models on testing data**

In [None]:
# Load the augmented training set (flattened images + labels parsed from file names).
x_train, y_train = read_training_data()
print("Total images in training data: {}".format(len(x_train)))
print("Total number of classes: {}".format(len(set(y_train))))

# Load the test set; faces are extracted on the fly, so the elapsed time is reported.
x_test, y_test, execution_time = read_testing_data()
print("Total images in testing data: {}".format(len(x_test)), 'Executed in: ', execution_time)

# Calculate the accuracy of LDA used directly as a classifier on the raw pixel vectors.
lda_accuracy, execution_time = lda(x_train, y_train, x_test, y_test)
print("Accuracy using LDA model: {}".format(lda_accuracy), ' Executed in: ', execution_time)

# Fit LDA with 32 components and project both sets into that space.
# x_reduced_train / x_reduced_test are notebook globals that the later cells read.
# NOTE(review): 32 is a fixed choice; presumably the dataset has enough classes for LDA
# to provide 32 components -- confirm against the training data.
lda_data = lda_dimension_reduction(x_train, y_train, 32)
x_reduced_train = transformation_to_reduced_dimension(lda_data, x_train)
x_reduced_test = transformation_to_reduced_dimension(lda_data, x_test)

# Calculate the accuracy of the kNN classifier on the reduced-dimensional data.
knn_accuracy, execution_time = knn(x_reduced_train, y_train, x_reduced_test, y_test)
print("Accuracy using kNN classifier: {}".format(knn_accuracy), ' Executed in: ', execution_time)

# Calculate the accuracy of the One-Vs-Rest SVM on the reduced-dimensional data for each C.
C = [1,2]
for c in C:
  svm_accuracy, execution_time = svm_one_vs_rest(x_reduced_train, y_train, x_reduced_test, y_test, c)
  print("Accuracy using SVM One_Vs_Rest classifier c={}: {}".format(c, svm_accuracy), ' Executed in: ', execution_time)

# Same classifier family with explicit gamma = 0.001 and degree = 1.
svm_accuracy, execution_time = svm_one_vs_rest_with_gamma(x_reduced_train, y_train, x_reduced_test, y_test, 0.001, 1)
print("Accuracy using SVM One_Vs_Rest classifier gamma={} and degree={}: {}".format(0.001, 1, svm_accuracy), ' Executed in: ', execution_time)

# Calculate the accuracy of the Stacking Classifier (kNN, SVM and LDA) on the reduced-dimensional data.
ensemble_accuracy, execution_time = ensemble_model(x_reduced_train, y_train, x_reduced_test, y_test)
print("Accuracy using Ensemble model: {}".format(ensemble_accuracy), ' Executed in: ', execution_time)

**Below are code snippets for other algorithms that were explored**

In [None]:
# Random forest on the LDA-reduced features (150 trees, gini criterion, max depth 13).
# random_state pins the forest's randomness so the printed accuracy is the same on every
# Restart & Run All; change the seed to sample a different forest.
rfc_clf = RandomForestClassifier(n_estimators = 150, criterion = 'gini', max_depth = 13, random_state = 42)
rfc_clf = rfc_clf.fit(x_reduced_train, y_train)
y_pred = rfc_clf.predict(x_reduced_test)
rf_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy using random forest algorithm: {}".format(rf_accuracy))

In [None]:
# One-Vs-One SVM sweep over C on the LDA-reduced features.
# SVC is already imported in the notebook's import cell; OneVsOneClassifier is only imported here.
from sklearn.svm import SVC
from sklearn.multiclass import OneVsOneClassifier
# Note: this rebinds the notebook-global name C that the main evaluation cell also assigns.
C = [1,2,3,4,5,8,16,32]
for c in C:
  acc = 0
  # Each of the 10 repetitions refits on the same train/test split; acc/10 is their mean accuracy.
  for i in range(10):
    svc = SVC (C = c, probability = True)
    ovo = OneVsOneClassifier(svc)
    ovo = ovo.fit(x_reduced_train,y_train)
    y_pred = ovo.predict(x_reduced_test)
    acc += accuracy_score(y_test,y_pred)
  print("Average accuracy for C={} is: {}".format(c,acc/10))

In [None]:
# Stacking variant: same kNN and LDA base estimators and LogisticRegression final estimator
# as ensemble_model, but the One-Vs-Rest SVC is built with C = 1 instead of gamma/degree.
estimators = [('knn', KNeighborsClassifier(n_neighbors = 5, weights = 'distance')),
                ('ovr_svc',OneVsRestClassifier(SVC (C = 1))),
                ('lda',LinearDiscriminantAnalysis())]
ensemble_clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())
ensemble_clf = ensemble_clf.fit(x_reduced_train, y_train)
y_pred = ensemble_clf.predict(x_reduced_test)
# Prints the test-set accuracy of the stacked model.
print(accuracy_score(y_test,y_pred))

In [None]:
# Grid search over SVC hyper-parameters on the LDA-reduced training set.
from sklearn.model_selection import GridSearchCV
C_range = np.logspace(-2, 10, 13)
gamma_range = np.logspace(-9, 3, 13)
kernel_range=('linear','rbf','poly')
degree_range = (1,2,3,4,5,6,7,8,9,10)

# Only search the parameters each kernel actually uses. Per the scikit-learn SVC docs,
# gamma applies to 'rbf'/'poly'/'sigmoid' and degree only to 'poly', so one flat grid
# would refit identical linear and rbf models many times over.
kernel_specific_params = {
    'linear': {},
    'rbf': dict(gamma=gamma_range),
    'poly': dict(gamma=gamma_range, degree=degree_range),
}
param_grid = [dict(kernel=[kernel], C=C_range, **kernel_specific_params[kernel])
              for kernel in kernel_range]

svm_grid = SVC()
clf = GridSearchCV(svm_grid, param_grid)
clf.fit(x_reduced_train,y_train)
print(clf.best_estimator_)

In [None]:
# Plain SVC with hard-coded hyper-parameters, trained and scored on the LDA-reduced data.
# Presumably these values are the best_estimator_ reported by the grid search -- TODO confirm.
svc_best_fit = SVC(gamma=1e-09,kernel='linear',C=0.01)
svc_best_fit = svc_best_fit.fit(x_reduced_train,y_train)
y_pred = svc_best_fit.predict(x_reduced_test)
# Prints the test-set accuracy.
print(accuracy_score(y_test,y_pred))

In [None]:
# Gaussian Naive Bayes on the LDA-reduced features.
# CategoricalNB is imported alongside GaussianNB but is not used in this cell.
from sklearn.naive_bayes import GaussianNB, CategoricalNB
# Note: this rebinds the notebook-global name clf, which the grid-search cell also assigns.
clf = GaussianNB()
clf = clf.fit(x_reduced_train,y_train)
y_pred = clf.predict(x_reduced_test)
# Prints the test-set accuracy.
print(accuracy_score(y_test,y_pred))