# Build a deep-learning classifier that recognises students in the classroom from close-up facial images to register their attendance

## 1. Install all the necessary libraries 

In [None]:
import os 
import cv2
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
from collections import OrderedDict
import keras 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from PIL import Image
path_dataset = "Dataset/lfw-deepfunneled/lfw-deepfunneled"

%pip install mtcnn
from mtcnn.mtcnn import MTCNN

import shutil
from shutil import unpack_archive
from subprocess import check_output

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import MaxPooling2D
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Dense

## 2. Exploratory Data Analysis (EDA)

### 2.1 Loading and reading all the data 

In [None]:
# 1. Read every metadata CSV that ships with the dataset
def _read_dataset_csv(filename):
    """Load one CSV file from the ``Dataset/`` folder into a DataFrame."""
    return pd.read_csv("Dataset/" + filename)


lfw_allnames = _read_dataset_csv("lfw_allnames.csv")
matchpairsDevTest = _read_dataset_csv("matchpairsDevTest.csv")
matchpairsDevTrain = _read_dataset_csv("matchpairsDevTrain.csv")
mismatchpairsDevTest = _read_dataset_csv("mismatchpairsDevTest.csv")
mismatchpairsDevTrain = _read_dataset_csv("mismatchpairsDevTrain.csv")
pairs = _read_dataset_csv("pairs.csv")
people = _read_dataset_csv("people.csv")
peopleDevTest = _read_dataset_csv("peopleDevTest.csv")
# NOTE: the variable name below is misspelled, but later cells refer to it as-is.
peoplleDevTrain = _read_dataset_csv("peopleDevTrain.csv")

### 2.2 Initial Exploration of the data

In [None]:
# A notebook cell only renders its final expression, so the previews are
# printed explicitly; otherwise only the last table would be shown.
_table_previews = {
    "lfw_allnames": lfw_allnames,
    "matchpairsDevTest": matchpairsDevTest,
    "matchpairsDevTrain": matchpairsDevTrain,
    "mismatchpairsDevTest": mismatchpairsDevTest,
    "mismatchpairsDevTrain": mismatchpairsDevTrain,
    "pairs": pairs,
    "people": people,
    "peopleDevTest": peopleDevTest,
    "peoplleDevTrain": peoplleDevTrain,
}
for _table_name, _table in _table_previews.items():
    print(f"--- {_table_name} ---")
    print(_table.head())
    print()

### 2.3 Exploring the initial statistics and distributions of the data

In [None]:
# lfw_allnames.describe()
# lfw_allnames.hist()
# people.describe()
# people.hist()
# Total number of images per person, then keep the ten most photographed.
person_images = lfw_allnames.groupby('name')['images'].sum().reset_index()
# The count column is called 'images'; sorting on any other name raises KeyError.
person_images_sorted_desc = person_images.sort_values('images', ascending=False).head(10)

plt.figure(figsize=(12, 6))
plt.bar(person_images_sorted_desc['name'], person_images_sorted_desc['images'])
plt.xlabel('Person')
plt.ylabel('Number of Images')
plt.title('Number of images per person in Descending Order (Top 10)')
plt.xticks(rotation=90)  # names are long, so rotate them to keep them readable
plt.show()

### 2.4 A general overview of the data

In [None]:
# Headline statistics derived from the per-person image counts.
unique_persons = lfw_allnames.shape[0]
has_multiple_images = (lfw_allnames['images'] > 1).sum()
total_num_images = lfw_allnames['images'].sum()
# idxmax() returns an index *label*, so the row is looked up with .loc and
# the name is read by column name rather than by position.
has_most_images = lfw_allnames.loc[lfw_allnames['images'].idxmax(), 'name']
# NOTE: this is the largest image count held by a single person, not a count
# of distinct images. The variable name is kept unchanged in case other cells
# use it.
unique_images = lfw_allnames['images'].max()

# Printing the information
print("Important to note:")
print("\n")
print(f"1. The dataset has a total of {total_num_images} images. ")
print(f"2. In which there are a total of {unique_persons} unique names of people.")
print(f"3. {has_multiple_images} of people in the dataset have multiple images. ")
print(f"4. The person who has the most number of images is: {has_most_images}")
print(f"5. The largest number of images belonging to a single person is: {unique_images}")
print("\n")

# lfw_allnames.describe()
# lfw_allnames.hist()

### 2.5 Cleaning and handling missing data

In [None]:
# Split the pairs table: rows without a second name are matched pairs,
# rows with a second name are mismatched pairs.
pairs = pairs.rename(columns={'name': 'name1', 'Unnamed: 3': 'name2'})
_second_name_missing = pairs['name2'].isna()
matched_pairs = pairs.loc[_second_name_missing].drop(columns='name2')
mismatched_pairs = pairs.loc[~_second_name_missing]

# Discard rows that carry missing values
lfw_allnames.dropna(inplace=True)
people = people.loc[people['name'].notna()]

### 2.6 Organising data and splitting them into train/test

In [None]:
# Build one row per image: the 'name' column holds the person's name and the
# 'image_path' column holds the matching "<name>/<name>_<NNNN>.jpg" file path.
# .copy() makes the repeated frame independent before new columns are added.
path_of_image = lfw_allnames.loc[lfw_allnames.index.repeat(lfw_allnames['images'])].copy()
# Number the images of each person starting at 1, zero-padded to 4 characters.
_image_number = (path_of_image.groupby('name').cumcount() + 1).astype(str).str.zfill(4)
path_of_image['image_path'] = (
    path_of_image['name'] + "/" + path_of_image['name'] + "_" + _image_number + ".jpg"
)
# `columns=` keyword: recent pandas releases reject a positional axis argument.
path_of_image = path_of_image.drop(columns="images")


In [None]:
# 80/20 split of the per-image table.
train_data, test_data = train_test_split(path_of_image, test_size=0.2)
# reset_index(drop=True) renumbers the rows from 0 without keeping the old
# index as a column (and avoids the positional-axis form of DataFrame.drop,
# which recent pandas releases reject).
train_data = train_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)

In [None]:
# Report whether any person appears in both the training and the test split.
train_individuals = set(train_data['name'])
test_individuals = set(test_data['name'])
common_individuals = train_individuals & test_individuals
if common_individuals:
    print("There are common individuals in both the training and test set.")
else:
    print("There are no common individuals in the training and test set.")

In [None]:
# Verify that every image listed in path_of_image has the same resolution.
expected_width = None
expected_height = None
files = path_of_image.image_path

# Iterate over each image file path in the dataframe
for file in files:
    im_path = "Dataset/lfw-deepfunneled/lfw-deepfunneled/" + str(file)
    # Open the image with PIL; the context manager closes the file handle so
    # that handles do not pile up while scanning the whole dataset.
    with Image.open(im_path) as image:
        width, height = image.size

    # The first image defines the expected resolution
    if expected_width is None or expected_height is None:
        expected_width = width
        expected_height = height

    # Stop at the first image that deviates from the expected resolution
    if width != expected_width or height != expected_height:
        print(f"Inconsistent resolution found in image: {im_path}")
        print(f"Expected resolution: {expected_width}x{expected_height}")
        print(f"Actual resolution: {width}x{height}")
        break
else:
    # for/else: reached only when the loop finished without hitting `break`
    print("All images have consistent resolution.")

In [None]:
# Bar chart of the ten people with the most images
path_of_image['name'].value_counts().head(10).plot.bar()

In [None]:
# Count the people that are represented by exactly one image
person_with_single_img = int((lfw_allnames['images'] == 1).sum())
print(f"{person_with_single_img} number of people have only one image in their group/class.")

In [None]:
# Display the first image of the training split
_first_train_image = "Dataset/lfw-deepfunneled/lfw-deepfunneled/" + str(train_data['image_path'][0])
im = Image.open(_first_train_image)
plt.imshow(im)

## 3. Model 

### 3.1 Initial model

In [None]:
import matplotlib.patches as patches

# Detect the face in the first training image and draw the bounding box and
# facial keypoints on top of it.
face_detector = MTCNN()
_demo_image_path = "Dataset/lfw-deepfunneled/lfw-deepfunneled/" + str(train_data.image_path[0])
_image_bgr = cv2.imread(_demo_image_path)
if _image_bgr is None:
    # cv2.imread signals an unreadable/missing file by returning None
    raise FileNotFoundError(f"Could not read image: {_demo_image_path}")
# OpenCV loads images in BGR channel order, whereas both MTCNN and matplotlib
# expect RGB, so convert once up front.
image = cv2.cvtColor(_image_bgr, cv2.COLOR_BGR2RGB)
result = face_detector.detect_faces(image)

# Plot image
fig, ax = plt.subplots(1)
ax.imshow(image)

if result:
    # Boundaries of the first detected face
    box = result[0]['box']
    keypoints = result[0]['keypoints']

    rect = patches.Rectangle(box[0:2], box[2], box[3], linewidth=1, edgecolor='b', facecolor='none')
    ax.add_patch(rect)

    # Mark every keypoint with a small red square
    for key in keypoints:
        rect_key = patches.Rectangle(keypoints[key], 1, 1, linewidth=10, edgecolor='r', facecolor='none')
        ax.add_patch(rect_key)
else:
    # The detector returned an empty list; show the plain image instead of crashing
    print("No face detected in image: " + _demo_image_path)

plt.show()

In [None]:
# Image counts of the six most frequent people
path_of_image['name'].value_counts().head(6)

In [None]:
# Draw 75 random images for each of the six selected people and stack them.
_selected_people = [
    "George_W_Bush",
    "Colin_Powell",
    "Tony_Blair",
    "Donald_Rumsfeld",
    "Gerhard_Schroeder",
    "Ariel_Sharon",
]
data = pd.concat(
    [path_of_image[path_of_image['name'] == _person].sample(75) for _person in _selected_people]
)

In [None]:
# Copy images into train/test/val folders

def directory_mover(data, dir_name,
                    source_root='Dataset/lfw-deepfunneled/lfw-deepfunneled/',
                    dest_root='Core/'):
    """Copy every image listed in *data* into ``<dest_root>/<dir_name>/<name>/``.

    The source files are copied (not removed), so the original dataset stays
    intact. Destination folders, including *dest_root* itself, are created on
    demand.

    Args:
        data: DataFrame with an ``image_path`` column (path relative to
            *source_root*) and a ``name`` column (used as the class folder).
        dir_name: Name of the split folder, e.g. ``"train/"``.
        source_root: Folder that contains the original images.
        dest_root: Folder under which the split folders are created.

    Returns:
        None. A summary line is printed once all images have been copied.
    """
    co = 0
    # Walk both columns in lockstep instead of filtering the frame per image.
    for image, person in zip(data['image_path'], data['name']):
        target_dir = os.path.join(dest_root, dir_name, str(person))
        # makedirs creates missing parents too and tolerates existing folders
        os.makedirs(target_dir, exist_ok=True)
        shutil.copy(os.path.join(source_root, image), target_dir)
        co += 1

    print('Moved {} images to {} folder.'.format(co, dir_name))
    
#     source_p = source["image_path"].iloc[0] 
    
#     # Making directories for train/test/val
#     train_dir = os.path.join(dest, 'train')
#     test_dir = os.path.join(dest, 'test')
#     val_dir = os.path.join(dest, 'val')
#     os.makedirs(train_dir, exist_ok=True)
#     os.makedirs(test_dir, exist_ok=True)
#     os.makedirs(val_dir, exist_ok=True)
    
#     #Get image files 
#     img_files = os.listdir(source_p)
#     num_of_images = len(img_files)
    
#     # Move images to the train directory
#     source_path = os.path.join(source, img_files[i])
#     destination_path = os.path.join(train_dir, img_files[i])
#     shutil.move(source_path, destination_path)
    
#     # Move images to the test directory
#     source_path = os.path.join(source, img_files[i])
#     destination_path = os.path.join(test_dir, img_files[i])
#     shutil.move(source_path, destination_path)
    
#     # Move images to the val directory
#     source_path = os.path.join(source_dir, img_files[i])
#     destination_path = os.path.join(val_dir, img_files[i])
#     shutil.move(source_path, destination_path)

#     print("Images have moved to their respective train/test/val directories successfully.")

In [None]:

# Withhold final test data (20% of the sampled images)
d_train, d_test = train_test_split(data, test_size=0.2)
# Split the remainder into training and validation data
d_train, d_val = train_test_split(d_train, test_size=0.2)

# Model setup: one convolution + pooling stage followed by two dense layers.
# The output layer is sized from the number of distinct people in `data`.
num_classes = data['name'].nunique()
classifier = Sequential()
classifier.add(Conv2D(32, (3, 3), input_shape = (250, 250, 3), activation = 'relu'))
classifier.add(MaxPooling2D(pool_size = (2, 2)))
classifier.add(Flatten())
classifier.add(Dense(units = 128, activation = 'relu'))
classifier.add(Dense(units = num_classes, activation = 'softmax'))
classifier.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])

# The splits above are random, so folders left over from an earlier run would
# mix images of a previous split into the new one (train/test leakage).
# Start from empty split folders and make sure the Core/ root exists.
for _split in ("train", "val", "test"):
    shutil.rmtree(os.path.join("Core", _split), ignore_errors=True)
os.makedirs("Core", exist_ok=True)

# Copy each split to a separate directory
directory_mover(d_train,"train/")
directory_mover(d_val,"val/")
directory_mover(d_test,"test/")

# Create image data generators: augmentation is applied to training data only
train_datagen = keras.preprocessing.image.ImageDataGenerator(
    rescale=1./255,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True
)

test_datagen = keras.preprocessing.image.ImageDataGenerator(rescale=1./255)

training_set = train_datagen.flow_from_directory('Core/train/',
                                                 target_size = (250, 250),
                                                 batch_size = 32,
                                                 class_mode = 'categorical')

validation_set = test_datagen.flow_from_directory('Core/val',
                                            target_size = (250, 250),
                                            batch_size = 32,
                                            class_mode = 'categorical')

# Held-out test images must not be augmented, so they use the rescale-only
# generator; shuffle=False keeps batches in the same order as `filenames`.
testing_set = test_datagen.flow_from_directory('Core/test/',
                                               target_size = (250, 250),
                                               batch_size = 32,
                                               class_mode = 'categorical',
                                               shuffle = False)

# Derive the step counts from the generators so that every batch is used,
# whatever the size of the splits.
history = classifier.fit(training_set,
                         steps_per_epoch = len(training_set),
                         epochs = 20,
                         validation_data = validation_set,
                         validation_steps = len(validation_set))

In [None]:
# Draw the training and validation curves, one figure per recorded metric.
for _metric, _title in (("accuracy", "model accuracy"), ("loss", "model loss")):
    plt.plot(history.history[_metric])
    plt.plot(history.history["val_" + _metric])
    plt.title(_title)
    plt.ylabel(_metric)
    plt.xlabel('epoch')
    plt.legend(['train', 'val'], loc='upper left')
    plt.show()

In [None]:
# Class folder of every test file: the first component of its relative path.
# Backslashes are normalised first so this also works when the generator
# reports Windows-style paths.
multi_test_names = [
    file_name.replace("\\", "/").split("/")[0] for file_name in testing_set.filenames
]
# Unique class names, in order of first appearance
multi_test_name_order = list(OrderedDict.fromkeys(multi_test_names))

In [None]:
from tensorflow.keras.preprocessing.image import load_img,img_to_array
# Predict the class of every image in a directory, given a trained classifier
def predictions(directory, classifier, binary=False, class_labels=None):
    """Classify each ``.jpg``/``.png`` image found directly in *directory*.

    Args:
        directory: Folder holding the images of one class, e.g.
            ``"Core/test/<name>/"``.
        classifier: Trained model exposing ``predict(batch)``.
        binary: If true, the first output value is thresholded at 0.5 to get
            class 0 or 1; otherwise the arg-max of the output is used.
        class_labels: Optional list mapping class index to class name. When
            omitted, the sorted sub-folder names of the parent of *directory*
            are used (presumably the same alphabetical order in which Keras
            directory iterators number their classes; confirm if the model
            was trained differently).

    Returns:
        List of ``(image_path, predicted_class)`` tuples, ordered by file name.
    """
    if class_labels is None:
        # The class names are the sibling folders of `directory`, not the
        # image files stored inside it.
        parent = os.path.dirname(os.path.normpath(directory)) or os.curdir
        class_labels = sorted(
            entry for entry in os.listdir(parent)
            if os.path.isdir(os.path.join(parent, entry))
        )

    results = []
    # Sorted so that the output order is deterministic
    for filename in sorted(os.listdir(directory)):
        if not filename.lower().endswith((".jpg", ".png")):
            continue
        image_path = os.path.join(directory, filename)
        test_image = load_img(image_path, target_size=(250, 250))
        test_image = img_to_array(test_image)
        test_image = np.expand_dims(test_image, axis=0)
        test_image /= 255.0  # same rescaling as the data generators

        scores = np.asarray(classifier.predict(test_image))
        if binary:
            # Single output value: values above 0.5 map to class index 1
            result = int(scores.ravel()[0] > 0.5)
        else:
            result = int(np.argmax(scores))

        results.append((image_path, class_labels[result]))

    return results

In [None]:

# Run the classifier over the test folder of each of the six classes.
(
    multi_predictions_0,
    multi_predictions_1,
    multi_predictions_2,
    multi_predictions_3,
    multi_predictions_4,
    multi_predictions_5,
) = [
    predictions("Core/test/" + multi_test_name_order[_idx] + "/", classifier, binary=False)
    for _idx in range(6)
]

In [None]:
# Build one row per test image with integer class indices in both columns, so
# that 'Predictions' and 'Actual' can be compared directly. Each prediction
# list holds (image_path, predicted_label) tuples; the label is mapped to its
# position in multi_test_name_order, and a label that is not a known class
# name becomes -1 (which never matches an actual class).
_class_index = {name: idx for idx, name in enumerate(multi_test_name_order)}
_prediction_groups = [multi_predictions_0, multi_predictions_1, multi_predictions_2,
                      multi_predictions_3, multi_predictions_4, multi_predictions_5]

_predicted_indices = []
_actual_indices = []
for _actual_idx, _group in enumerate(_prediction_groups):
    for _image_path, _predicted_label in _group:
        _predicted_indices.append(_class_index.get(_predicted_label, -1))
        _actual_indices.append(_actual_idx)

multi_predictions_frame = pd.DataFrame({'Predictions': _predicted_indices,
                                        'Actual': _actual_indices},
                                       columns = ['Predictions','Actual'])

In [None]:
def prec_acc(predictions_frame):
    """Compute one-vs-rest precision, accuracy and recall for every class.

    Args:
        predictions_frame: DataFrame with an ``Actual`` and a ``Predictions``
            column holding comparable class identifiers.

    Returns:
        Tuple ``(precision, accuracy, recall)`` of three lists. Each list has
        one entry per class, ordered as the classes first appear in the
        ``Actual`` column. A metric whose denominator is zero is reported
        as 0.0.
    """
    actual = predictions_frame['Actual']
    predicted = predictions_frame['Predictions']
    total_preds = predictions_frame.shape[0]

    precision = []
    recall = []
    accuracy = []

    for i in actual.unique():
        is_actual = actual == i
        is_predicted = predicted == i

        tp = int((is_actual & is_predicted).sum())
        fp = int((~is_actual & is_predicted).sum())
        fn = int((is_actual & ~is_predicted).sum())
        # The four outcomes partition the rows, so tn is whatever remains
        tn = total_preds - tp - fp - fn

        # Guard the divisions explicitly instead of padding the denominator,
        # which would bias the reported values.
        precision.append(tp / (tp + fp) if (tp + fp) else 0.0)
        accuracy.append((tp + tn) / total_preds)
        recall.append(tp / (tp + fn) if (tp + fn) else 0.0)

    return precision, accuracy, recall

In [None]:
import warnings
warnings.filterwarnings("ignore")
multi_accuracy = prec_acc(multi_predictions_frame)
# prec_acc returns (precision, accuracy, recall) in that order, so unpack by
# position and label each list with the metric it actually holds.
_multi_precision, _multi_class_accuracy, _multi_recall = multi_accuracy
print('Precision:' + str(_multi_precision))
print('Accuracy:' + str(_multi_class_accuracy))
print('Recall:' + str(_multi_recall))
print(multi_test_name_order)
# Shows whatever `image` was last assigned by an earlier cell
plt.imshow(image)
plt.axis('off')
plt.show()

In [None]:
# Remove the split folders again. They are created under Core/ (that is where
# directory_mover copies the images and where the generators read them), so
# that is where they have to be deleted.
for _split in ("train", "val", "test"):
    _split_dir = os.path.join("Core", _split)
    if os.path.isdir(_split_dir):
        shutil.rmtree(_split_dir)

## 4. Custom Dataset Model