### Trying different classifiers on data of syndromic patients and ID controls
Overview of this notebook:

First, the deepface representations of the cropped images are read in from an Excel file. The data is then plotted using either t-SNE or PCA for dimensionality reduction. The plots show that the data does not separate into two clear clusters.

In the rest of the notebook the following classifiers are tested: k-NN, SVM, Random Forest, Gradient Boosting, AdaBoost and Gaussian Naive Bayes. Finally, an ensemble of all or some of these methods is tried. None of them outperforms the Gradient Boosting classifier.

To normalize the data, either Normalizer (scaling each sample to unit norm) or StandardScaler (z = (x - mean) / std) is used; so far there is no notable difference in performance between the two.

In [1]:
import pandas as pd
import numpy as np
import csv
from tqdm import tqdm
import itertools
from sklearn.model_selection import cross_val_score, LeaveOneOut
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import Normalizer, StandardScaler
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, VotingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix, roc_curve
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
%matplotlib inline
import matplotlib.pyplot as plt
from datetime import date
from os.path import join, isfile
from os import listdir
import time
import seaborn as sns

In [2]:
def _load_selected_reps(csv_path, wanted_names):
    """Read the representations of the wanted images from one CSV file.

    Parameters
    ----------
    csv_path : str
        Comma-separated file whose first column is an image filename and
        whose remaining columns are the numeric representation.
    wanted_names : set of str
        Filenames to keep; rows with any other first column are skipped.

    Returns
    -------
    reps : list of list of float
        Representations of the kept rows, in CSV order.
    zero_positions : set of int
        Positions *within* ``reps`` whose representation is all zeros.
    """
    reps, zero_positions = [], set()
    with open(csv_path, newline='') as file:
        for row in csv.reader(file, delimiter=','):
            # csv.reader yields [] for blank lines; row[0] would raise.
            if not row or row[0] not in wanted_names:
                continue
            rep = list(map(float, row[1:]))
            if all(v == 0 for v in rep):
                # Record the position in `reps`, not the CSV row number:
                # skipped rows make the two numberings diverge.
                zero_positions.add(len(reps))
            reps.append(rep)
    return reps, zero_positions


def read_rep(syn_name, syn_csv, ID_csv, data_dir):
    """Load paired patient / ID-control representations for one syndrome.

    Only CSV rows whose first column matches a ``.jpg`` file present in
    ``<data_dir>/<syn_name>-patients`` (patients) or
    ``<data_dir>/<syn_name>-selected-ID-controls`` (controls) are used.
    Patients and controls are paired by their order in the CSV files; a pair
    is dropped when either of its representations consists only of zeros.
    If the two lists differ in length, the surplus entries of the longer one
    are ignored (``zip`` semantics).

    Parameters
    ----------
    syn_name : str
        Syndrome name used to build the two image directory names.
    syn_csv : str
        Path of the CSV with the patient representations.
    ID_csv : str
        Path of the CSV with the ID-control representations.
    data_dir : str
        Directory that contains the two image directories.

    Returns
    -------
    data : numpy.ndarray
        Representations, alternating patient, control, patient, control, ...
    labels : numpy.ndarray
        1 for every patient row and 0 for every control row of ``data``.
    """
    # open directories
    syn_dir = join(data_dir, "{}-patients".format(syn_name))
    ID_dir = join(data_dir, "{}-selected-ID-controls".format(syn_name))

    # get the filenames as sets so the per-row membership test is O(1)
    files_syn = {f for f in listdir(syn_dir) if isfile(join(syn_dir, f)) and ".jpg" in f}
    files_ID = {f for f in listdir(ID_dir) if isfile(join(ID_dir, f)) and ".jpg" in f}

    data_syn, zero_syn = _load_selected_reps(syn_csv, files_syn)
    data_ID, zero_ID = _load_selected_reps(ID_csv, files_ID)
    positions_to_drop = zero_syn | zero_ID

    data, labels = [], []
    for position, (syn_item, ID_item) in enumerate(zip(data_syn, data_ID)):
        if position in positions_to_drop:
            continue
        data.append(syn_item)
        labels.append(1)
        data.append(ID_item)
        labels.append(0)

    return np.array(data), np.array(labels)

In [3]:
def read_rep2(syn_name, syn_csv, ID_csv, data_dir):
    """Load unpaired patient and ID-control representations for one syndrome.

    Unlike ``read_rep`` this does not pair patients with controls and does not
    drop all-zero representations. The control CSV is read first, then the
    patient CSV. A row is kept when its first column is the name of a patient
    image, or when that name with ``.jpg`` / ``.JPG`` appended is the name of
    a selected control image.

    Parameters
    ----------
    syn_name : str
        Syndrome name used to build the two image directory names.
    syn_csv : str
        Path of the CSV with the patient representations.
    ID_csv : str
        Path of the CSV with the ID-control representations.
    data_dir : str
        Directory that contains ``<syn_name>-patients`` and
        ``<syn_name>-selected-ID-controls``.

    Returns
    -------
    data : numpy.ndarray
        Numeric representations (the filename column is not included).
    labels : numpy.ndarray
        0 for rows taken from ``ID_csv`` and 1 for rows taken from ``syn_csv``.
    """
    # open directories
    syn_dir = join(data_dir, "{}-patients".format(syn_name))
    ID_dir = join(data_dir, "{}-selected-ID-controls".format(syn_name))

    # get the filenames as sets so the per-row membership test is O(1)
    files_syn = {f for f in listdir(syn_dir) if isfile(join(syn_dir, f)) and ".jpg" in f}
    files_ID = {
        f for f in listdir(ID_dir)
        if isfile(join(ID_dir, f)) and (".jpg" in f or ".JPG" in f)
    }

    data = []
    labels = []

    # The position in this list is the class label: 0 = control, 1 = patient.
    for label, csv_file in enumerate([ID_csv, syn_csv]):
        with open(csv_file, newline='') as file:
            for row in csv.reader(file, delimiter=','):
                # csv.reader yields [] for blank lines; row[0] would raise.
                if not row:
                    continue
                name = row[0]
                if name in files_syn or name + ".jpg" in files_ID or name + ".JPG" in files_ID:
                    # Store the parsed floats; the raw row still contains the
                    # filename and string-typed values.
                    data.append(list(map(float, row[1:])))
                    labels.append(label)

    return np.array(data), np.array(labels)

In [4]:
def normalize(data, i):
    """Normalize ``data`` with the method selected by ``i``.

    Parameters
    ----------
    data : array-like
        Samples to normalize; passed unchanged to the scaler.
    i : int
        Method selector. Only ``1`` is implemented: scikit-learn's
        ``Normalizer`` with its default settings.

    Returns
    -------
    The transformed data when ``i == 1``; ``None`` for every other value of
    ``i`` (no other method is implemented here).
    """
    if i == 1:
        scaler = Normalizer()
        normalized = scaler.fit_transform(data)
        return normalized

    # NOTE(review): callers that pass any other selector get None back and
    # will fail on the first attribute access (e.g. ``data.shape``).
    return None


In [5]:
def concatenate(syn_name, data_dir, data_combination, nr_feats):
    """Assemble the feature matrix and labels for one syndrome.

    Parameters
    ----------
    syn_name : str
        Syndrome name; used in the patient CSV filename and passed to
        ``read_rep``.
    data_dir : str
        Directory of the syndrome; the CSV files are expected in its
        ``representations`` sub-directory.
    data_combination : int
        Which feature combination to build. Only ``0`` (deepface
        representation only) is implemented.
    nr_feats : int
        Not used by this function.

    Returns
    -------
    tuple
        ``(0, data, labels)`` where ``data`` and ``labels`` are numpy arrays
        as returned by ``read_rep``. The leading ``0`` is a constant.

    Raises
    ------
    ValueError
        If ``data_combination`` is not ``0``. Checked before any file is
        read, so an unsupported value fails fast with a clear message.
    """
    if not data_combination == 0:
        raise ValueError(
            "Unsupported data_combination {!r}; only 0 (deepface) is implemented".format(
                data_combination
            )
        )

    method = "deepface"
    # Build the paths with join: the previous literals relied on the invalid
    # escape sequences "\{" and "\I" being passed through unchanged.
    rep_dir = join(data_dir, "representations")
    syn_csv = join(rep_dir, "{}-patients-{}.csv".format(syn_name, method))
    ID_csv = join(rep_dir, "ID-controls-{}.csv".format(method))

    # only deepface
    data_df, labels_df = read_rep(syn_name, syn_csv, ID_csv, data_dir)

    return 0, np.array(data_df), np.array(labels_df)

In [6]:
def get_header(data_combination, nr_feats):
    """Return the report header for a feature combination.

    Parameters
    ----------
    data_combination : int
        Feature-combination selector; only ``0`` has a header.
    nr_feats : int
        Not used by this function.

    Returns
    -------
    str or None
        The header text for combination ``0``, otherwise ``None``.
    """
    header = (
        "0: Classifying data with deepface representation\n\n"
        if data_combination == 0
        else None
    )
    return header

In [7]:
import pickle5 as pickle

def main(general_dir=r"H:\Genetica Projecten\Facial Recognition\Studenten en Onderzoekers\Fien",
         model_dir=r'C:\Users\manz616236\Documents\face-classification\web_application\models',
         syn_list=('ADNP', 'ANKRD11', 'DEAF1', 'DYRK1A')):
    """Train and pickle one k-NN classifier per syndrome.

    For every syndrome the deepface representations are loaded with
    ``concatenate``, normalized with ``normalize(data, 1)`` and used to fit a
    3-nearest-neighbour classifier with distance weighting on *all* samples.
    The fitted model is pickled to ``<model_dir>/knn-deepface-<syndrome>``.

    Parameters
    ----------
    general_dir : str, optional
        Directory containing one sub-directory per syndrome.
    model_dir : str, optional
        Directory the pickled models are written to; it must already exist.
    syn_list : iterable of str, optional
        Names of the syndromes to process.

    The defaults reproduce the machine-specific paths this notebook was
    originally run with; pass your own locations on any other machine.
    """
    for syn_name in syn_list:

        data_dir = join(general_dir, syn_name)
        print("Syndrome that will be classified: {} \n\n".format(syn_name))

        # The first returned value is a constant 0 and is not needed here.
        _, data, labels = concatenate(syn_name, data_dir, 0, 0)
        data = normalize(data, 1)

        print("Data shape: {} and labels shape: {}".format(data.shape, labels.shape))

        model = KNeighborsClassifier(n_neighbors=3, weights='distance')
        model.fit(data, labels)

        # Use a context manager so the file is flushed and closed even when
        # pickling fails; an unclosed handle can leave a truncated model file.
        model_path = join(model_dir, 'knn-deepface-{}'.format(syn_name))
        with open(model_path, 'wb') as model_file:
            pickle.dump(model, model_file)
            
main()
    

Syndrome that will be classified: ADNP 


Data shape: (66, 4096) and labels shape: (66,)
Syndrome that will be classified: ANKRD11 


Data shape: (50, 4096) and labels shape: (50,)
Syndrome that will be classified: DEAF1 


Data shape: (38, 4096) and labels shape: (38,)
Syndrome that will be classified: DYRK1A 


Data shape: (32, 4096) and labels shape: (32,)
