# ID control selection
This notebook selects age- and gender-matched ID controls for patients with a specific syndrome.

In [1]:
import pandas as pd
import os
import os.path
from os import listdir
from os.path import isfile, join
import numpy as np
import random
from PIL import Image
import matplotlib.pyplot as plt
import openpyxl
from tqdm import tqdm

In [2]:
# Open the patient Excel sheet of a given syndrome (no longer KdV-specific)
#kdv_file = r"H:\Genetica Projecten\Facial Recognition\Studenten en Onderzoekers\Fien\kdv-patients\DatabasePatients_edit_nov_2017.xlsx"
def open_syn_excel(syn_name):
    """Load the patient sheet of one syndrome and keep only the usable rows.

    Reads ``{syn_name}_Database.xlsx``, keeps the ``Patient``, ``Age on photo``
    and ``Gender`` columns (renamed to ``image``, ``age`` and ``gender``),
    drops every row with a missing value in one of them, and then drops every
    row for which no ``.jpg`` with a matching name is found in the
    ``{syn_name}-all-photos`` directory.

    Parameters
    ----------
    syn_name : str
        Syndrome name; it is substituted into both the folder and file names.

    Returns
    -------
    pandas.DataFrame
        Columns ``image``, ``age`` and ``gender``; the row labels of the
        Excel sheet are kept.

    Raises
    ------
    AssertionError
        If the Excel file does not exist.
    """
    syn_file = r"H:\Genetica Projecten\Facial Recognition\Studenten en Onderzoekers\Fien\{}\{}_Database.xlsx".format(syn_name, syn_name)

    assert os.path.exists(syn_file), "This path doesn't exist."

    df_syn = pd.read_excel(syn_file)
    # Select + rename in one chain so we never rename a slice in place
    # (avoids SettingWithCopyWarning), then drop incomplete rows. dropna()
    # replaces drop(labels, 0, inplace=True), whose positional `axis`
    # argument is no longer accepted by pandas >= 2.0.
    df_syn = (
        df_syn[['Patient', 'Age on photo', 'Gender']]
        .rename(columns={'Patient': 'image', 'Age on photo': 'age', 'Gender': 'gender'})
        .dropna()
    )

    ### check whether that image is actually present
    syn_dir = r"H:\Genetica Projecten\Facial Recognition\Studenten en Onderzoekers\Fien\{}\{}-all-photos".format(syn_name, syn_name)

    # List the photo directory once instead of once per patient row.
    photo_files = [f for f in listdir(syn_dir) if isfile(join(syn_dir, f))]

    def has_photo(image):
        # NOTE(review): every "ANKRD" is replaced by "ANKRD11", so an id that
        # already contains "ANKRD11" would become "ANKRD1111" - confirm the
        # sheet always uses the short form.
        wanted = image.replace("ANKRD", "ANKRD11") + ".jpg"
        return any(wanted in f for f in photo_files)

    drop_indices = [index for index, image in df_syn['image'].items() if not has_photo(image)]

    return df_syn.drop(drop_indices)

In [3]:
# Make a histogram of all ages
def make_hist(df_syn):
    """Show a histogram of the patients' ages.

    Parameters
    ----------
    df_syn : pandas.DataFrame
        Patient table; only its ``age`` column is read.
    """
    ages_syn = df_syn.age.values

    # Explicit figure/axes instead of the implicit pyplot state machine.
    fig, ax = plt.subplots()
    ax.hist(ages_syn)
    ax.set_xlabel("Age")
    # Title previously read "Syndromic patients patients" (duplicated word).
    ax.set_title("Syndromic patients")
    plt.show()

In [4]:
# open ID control excel sheet
def open_control_excel(syn_name):
    """Load the sheet with all ID controls that have a frontal face image.

    Parameters
    ----------
    syn_name : str
        Not used by this function; the control workbook is the same for
        every syndrome. Kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        Columns ``pnummer``, ``image`` (was ``frontal face image``), ``age``
        (was ``agecorrected``) and ``gender``, restricted to rows whose
        frontal face image is filled in.

    Raises
    ------
    AssertionError
        If the control workbook does not exist.
    """
    ID_file = r"H:\Genetica Projecten\Facial Recognition\Studenten en Onderzoekers\Fien\ID-controls\all_ID_controls_info_complete.xlsm" 
    assert os.path.exists(ID_file), "This path doesn't exist."

    wanted_columns = ['pnummer', 'frontal face image', 'agecorrected', 'gender']
    controls = pd.read_excel(ID_file)[wanted_columns]

    # Only controls with a frontal face image can be used later on.
    has_image = controls['frontal face image'].notnull()
    new_names = {"frontal face image": "image", "agecorrected": "age"}
    return controls[has_image].rename(columns=new_names)

In [5]:
def _rows_to_frame(rows, columns):
    """Stack row Series into one DataFrame with `columns` first; safe for an empty list."""
    if not rows:
        return pd.DataFrame(columns=columns)
    frame = pd.DataFrame(rows)
    extra = [c for c in frame.columns if c not in columns]
    return frame.reindex(columns=list(columns) + extra)


def select_controls(df_syn, df_ID, low_age=0, high_age=100, max_age_dif=6, rng=None):
    """Pick one unique, age- and gender-matched control for every patient.

    For each patient whose age lies in ``[low_age, high_age]`` a control with
    the same gender and the same age is looked up. If there is none, the age
    offsets +1, -1, +2, -2, ... up to +/- ``max_age_dif`` are tried in that
    order and the first offset with at least one candidate is used. One of the
    candidates is drawn at random and removed from the pool, so a control is
    never assigned twice. Patients without any candidate are reported with
    ``print`` and skipped.

    Parameters
    ----------
    df_syn : pandas.DataFrame
        Patients with columns ``image``, ``age`` and ``gender``.
    df_ID : pandas.DataFrame
        Controls with columns ``pnummer``, ``image``, ``age`` and ``gender``.
        The caller's frame is not modified; the pool is shrunk on a local copy.
        NOTE(review): the patient gender is lower-cased before comparison but
        the control gender is compared as-is - presumably the control sheet
        already stores lower-case values; confirm.
    low_age, high_age : int, optional
        Inclusive age window of patients to match (default: 0-100, i.e. all).
    max_age_dif : int, optional
        Largest age difference (in years) that is still accepted (default 6).
    rng : random.Random, optional
        Source of randomness; pass ``random.Random(seed)`` for a reproducible
        selection. Defaults to the global ``random`` module.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The matched patients and, in the same order, their selected controls.
        Row labels are those of the input frames.
    """
    rng = random if rng is None else rng

    # +1, -1, +2, -2, ...: nearest ages first, older before younger.
    age_dif = [sign * dif for dif in range(1, max_age_dif + 1) for sign in (1, -1)]

    # Rows are collected in lists and turned into frames once at the end;
    # DataFrame.append no longer exists in pandas >= 2.0.
    syn_rows = []
    ID_rows = []

    # find control ID for each syndromic patients
    for index, row in df_syn.iterrows():
        age_syn = int(row['age'])
        if not low_age <= age_syn <= high_age:
            continue

        gender_syn = row['gender'].lower()
        same_gender = df_ID['gender'] == gender_syn

        # find a control ID with exact same age, then widen the search
        matches_ID = df_ID.loc[(df_ID['age'] == age_syn) & same_gender]
        for dif in age_dif:
            if matches_ID.shape[0] > 0:
                break
            matches_ID = df_ID.loc[(df_ID['age'] == age_syn + dif) & same_gender]

        if matches_ID.shape[0] == 0:
            print("For patient {}, gender: {}, age: {}".format(row['image'], row['gender'], row['age']))
            print("No match found within {} and {} years".format(max_age_dif, -max_age_dif))
            continue

        # a match is found, so keep the patient
        syn_rows.append(row)

        # pick a random control from the candidates
        random_index = rng.randint(0, matches_ID.shape[0] - 1)
        select_ID = matches_ID.iloc[random_index]
        ID_rows.append(select_ID)

        # remove the selected control from the pool of all controls
        selected = df_ID[(df_ID.image == select_ID.image) & (df_ID.pnummer == select_ID.pnummer)].index
        n_before = df_ID.shape[0]
        df_ID = df_ID.drop(selected)

        # More than one row gone means the control sheet lists the same
        # pnummer/image combination more than once.
        if n_before - df_ID.shape[0] > 1:
            print("Error")

    print("Done finding all ID controls.")

    df_select_syn = _rows_to_frame(syn_rows, ['image', 'age', 'gender'])
    df_select_ID = _rows_to_frame(ID_rows, ['pnummer', 'image', 'age', 'gender'])
    return df_select_syn, df_select_ID

In [6]:
def save_info(syn_name, df_select_syn, df_select_ID):
    """Write the selected patients and their matched controls to Excel.

    Two workbooks are written into the syndrome's folder:
    ``{syn_name}_patients_info.xlsx`` and
    ``{syn_name}_matched_ID_controls_info.xlsx``.

    Parameters
    ----------
    syn_name : str
        Syndrome name used in the folder and file names.
    df_select_syn : pandas.DataFrame
        Selected patients.
    df_select_ID : pandas.DataFrame
        Controls matched to those patients.
    """
    syn_info_save = r"H:\Genetica Projecten\Facial Recognition\Studenten en Onderzoekers\Fien\{}\{}_patients_info.xlsx".format(syn_name, syn_name)
    ID_info_save = r"H:\Genetica Projecten\Facial Recognition\Studenten en Onderzoekers\Fien\{}\{}_matched_ID_controls_info.xlsx".format(syn_name, syn_name)

    # Patients first, controls second.
    for frame, target in ((df_select_syn, syn_info_save), (df_select_ID, ID_info_save)):
        frame.to_excel(target)


In [7]:
def empty_dir(directory):
    """Delete every entry found directly inside `directory`.

    Each entry is removed with ``os.remove``, so the directory must exist and
    is expected to contain files only.
    """
    for entry in listdir(directory):
        os.remove(join(directory, entry))

### Open Excel files and write the found images to a new directory

In [8]:
def save_img_from_excel_controls(syn_name):
    """Save the images of the matched controls into the syndrome's control folder.

    The folder ``{syn_name}-selected-ID-controls`` is emptied first. Then, for
    every row of ``{syn_name}_matched_ID_controls_info.xlsx``, the file in the
    ID-controls directory whose name contains
    ``<pnummer>_small_<image without ".JPG">`` is opened and saved under the
    same name in that folder. When zero or several files match, nothing is
    saved and a message asks for a manual lookup.

    Parameters
    ----------
    syn_name : str
        Syndrome name used in the folder and file names.
    """
    ID_dir = r"H:\Genetica Projecten\Facial Recognition\Studenten en Onderzoekers\Fien\ID-controls"
    select_ID_dir =  r"H:\Genetica Projecten\Facial Recognition\Studenten en Onderzoekers\Fien\{}\{}-selected-ID-controls".format(syn_name, syn_name)
    empty_dir(select_ID_dir)

    ID_info_save = r"H:\Genetica Projecten\Facial Recognition\Studenten en Onderzoekers\Fien\{}\{}_matched_ID_controls_info.xlsx".format(syn_name, syn_name)
    df_ID = pd.read_excel(ID_info_save)

    # List the control directory once instead of once per row.
    ID_files = [f for f in listdir(ID_dir) if isfile(join(ID_dir, f))]

    for index, rows in df_ID.iterrows():
        # str(): Excel can return a numeric pnummer, and number + str raises
        # a TypeError.
        wanted = str(rows['pnummer']) + "_small_" + str(rows['image']).replace(".JPG", "")

        files = [f for f in ID_files if wanted in f]
        if len(files) == 1:
            with Image.open(join(ID_dir, files[0])) as im:
                im.save(join(select_ID_dir, files[0]))
        else:
            print("Manually find image for " + wanted)
            print("in " + str(ID_dir))

    print("Done saving ID files.")

In [9]:
def save_img_from_excel_patients(syn_name):
    """Save the photos of the selected patients into the syndrome's patient folder.

    The folder ``{syn_name}-patients`` is emptied first. Then, for every row
    of ``{syn_name}_patients_info.xlsx``, the file in
    ``{syn_name}-all-photos`` whose name contains ``<image>.jpg`` is opened
    and saved under the same name in that folder. When zero or several files
    match, nothing is saved and a message asks for a manual lookup.

    Parameters
    ----------
    syn_name : str
        Syndrome name used in the folder and file names.
    """
    syn_dir = r"H:\Genetica Projecten\Facial Recognition\Studenten en Onderzoekers\Fien\{}\{}-all-photos".format(syn_name, syn_name)
    select_syn_dir = r"H:\Genetica Projecten\Facial Recognition\Studenten en Onderzoekers\Fien\{}\{}-patients".format(syn_name, syn_name)
    empty_dir(select_syn_dir)

    syn_info_save = r"H:\Genetica Projecten\Facial Recognition\Studenten en Onderzoekers\Fien\{}\{}_patients_info.xlsx".format(syn_name, syn_name)
    df_syn = pd.read_excel(syn_info_save)

    # List the photo directory once instead of once per row.
    syn_files = [f for f in listdir(syn_dir) if isfile(join(syn_dir, f))]

    for index, rows in df_syn.iterrows():
        image = rows['image']
        # NOTE(review): every "ANKRD" becomes "ANKRD11"; an id that already
        # contains "ANKRD11" would turn into "ANKRD1111" - confirm the sheet
        # always uses the short form.
        wanted = image.replace("ANKRD", "ANKRD11") + ".jpg"

        files = [f for f in syn_files if wanted in f]
        if len(files) == 1:
            with Image.open(join(syn_dir, files[0])) as im:
                im.save(join(select_syn_dir, files[0]))
        else:
            print("Manually find image for image: {}".format(image))

    print("Done saving syndrome files.")

## Write the file names of the selected patients and controls to a text file

In [10]:
def save_control_patients_info(syn_name, trial_nr):
    """Record which patient and control images were used in a run.

    Writes ``results/{syn_name}/{syn_name}-patient-control-info-run-{trial_nr}.txt``
    (relative to the working directory) listing the image file names found in
    ``{syn_name}-patients`` and then those in
    ``{syn_name}-selected-ID-controls``.

    Parameters
    ----------
    syn_name : str
        Syndrome name used in the folder and file names.
    trial_nr : int
        Run number that ends up in the output file name.
    """
    def jpg_files(directory):
        # The parentheses matter: `a and b or c` parses as `(a and b) or c`,
        # which let any entry containing ".JPG" through without the isfile
        # check.
        return [f for f in listdir(directory)
                if isfile(join(directory, f)) and (".jpg" in f or ".JPG" in f)]

    control_dir = r"H:/Genetica Projecten/Facial Recognition/Studenten en Onderzoekers/Fien/{}/{}-selected-ID-controls".format(syn_name, syn_name)
    control_files = jpg_files(control_dir)

    syn_dir = r"H:/Genetica Projecten/Facial Recognition/Studenten en Onderzoekers/Fien/{}/{}-patients".format(syn_name, syn_name)
    syn_files = jpg_files(syn_dir)

    info_path = "results/{}/{}-patient-control-info-run-{}.txt".format(syn_name, syn_name, trial_nr)
    # open(..., "w") does not create missing parent directories.
    os.makedirs(os.path.dirname(info_path), exist_ok=True)

    # `with` closes the file even when a write fails.
    with open(info_path, "w") as control_patient_info:
        control_patient_info.write("Patients for syndrome {}\n".format(syn_name))
        for syn in syn_files:
            control_patient_info.write(syn + "\n")

        control_patient_info.write("\nControls for syndrome {}\n".format(syn_name))
        for control in control_files:
            control_patient_info.write(control + "\n")


In [11]:
#MAIN

trial_nr = 3
syn_list = ['KDVS','ADNP', 'ANKRD11', 'CDK13', 'DEAF1', 'DYRK1A', 'EHMT1', 'FBXO11', 'SON', 'WAC', 'YY1']

# Controls are drawn with the global `random` module; seeding it with the
# trial number makes a given trial reproducible while different trials still
# get different draws.
random.seed(trial_nr)

print("Selecting controls for trial {} \nfor syndromes: {}".format(trial_nr, syn_list))

# The control workbook is the same for every syndrome, so read it once.
# open_control_excel does not use its argument and select_controls does not
# modify the frame it is given, so every syndrome starts from the full pool.
df_ID = open_control_excel(None)

for syn_name in tqdm(syn_list):
    df_syn = open_syn_excel(syn_name)
    #make_hist(df_syn)

    df_select_syn, df_select_ID = select_controls(df_syn, df_ID)
    print(df_select_syn.shape)
    print(df_select_ID.shape)

    save_info(syn_name, df_select_syn, df_select_ID)

    save_img_from_excel_controls(syn_name)
    save_img_from_excel_patients(syn_name)

    save_control_patients_info(syn_name, trial_nr)




  0%|                                                   | 0/11 [00:00<?, ?it/s]

Selecting controls for trial 3 
for syndromens: ['KDVS', 'ADNP', 'ANKRD11', 'CDK13', 'DEAF1', 'DYRK1A', 'EHMT1', 'FBXO11', 'SON', 'WAC', 'YY1']
For patient kdvs_22, gender: F, age: 46
No match found within 6 and -6 years


  0%|                                                   | 0/11 [00:03<?, ?it/s]

Done finding all ID controls.
(74, 3)
(74, 4)





TypeError: save_img_from_excel_controls() missing 1 required positional argument: 'ID_info_save'