# Plot most important features Random Forest 
This notebook plots the most important features of Model 3, which is based on a Random Forest. The features (i.e. the landmark distances) that together reach a combined importance of at least 0.8 are visualized on a random image of the given syndrome.

In [None]:
from os.path import join, isfile
from os import listdir
import numpy as np
import csv
import itertools

import matplotlib as mpl
import matplotlib.pyplot as plt
from pylab import figure
from mpl_toolkits.mplot3d import Axes3D

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import Normalizer

%matplotlib notebook

In [None]:
import sys
sys.path.append("..")
from global_variables import GENERAL_DIR, syn_list, LEFT, RIGHT

In [None]:
def _read_labelled_rows(image_dir, csv_path, label):
    """Return the CSV feature rows whose first column names a .jpg file in `image_dir`."""
    # A set makes the per-row filename lookup constant time.
    wanted = {f for f in listdir(image_dir) if isfile(join(image_dir, f)) and ".jpg" in f}

    rows, labels = [], []
    with open(csv_path, newline='') as file:
        for row in csv.reader(file, delimiter=','):
            # csv.reader yields [] for blank lines; skip them instead of failing on row[0]
            if row and row[0] in wanted:
                rows.append([float(value) for value in row[1:]])
                labels.append(label)

    return rows, labels


def read_rep(syn, syn_csv, ID_csv, data_dir):
    """Load the feature rows of the patients and the selected ID controls of one syndrome.

    Only CSV rows whose first column matches a ``.jpg`` file found in the
    corresponding image directory are kept; rows keep the order of the CSV file.

    Parameters
    ----------
    syn : str
        Syndrome name, used to build the ``<syn>-patients`` and
        ``<syn>-selected-ID-controls`` directory names.
    syn_csv : str
        Path of the CSV file with the patient features (filename, then values).
    ID_csv : str
        Path of the CSV file with the control features (filename, then values).
    data_dir : str
        Directory that contains the two image directories.

    Returns
    -------
    tuple of np.ndarray
        ``(data_syn, data_ID, labels_syn, labels_ID)``; patient rows are
        labelled 1 and control rows are labelled 0.
    """
    # join() instead of a hard-coded "\\" so the paths also resolve outside Windows
    syn_dir = join(data_dir, "{}-patients".format(syn))
    ID_dir = join(data_dir, "{}-selected-ID-controls".format(syn))

    data_syn, labels_syn = _read_labelled_rows(syn_dir, syn_csv, 1)
    data_ID, labels_ID = _read_labelled_rows(ID_dir, ID_csv, 0)

    return np.array(data_syn), np.array(data_ID), np.array(labels_syn), np.array(labels_ID)

In [None]:
def load_data(syn, GENERAL_DIR, data_dir):
    """Load the landmark-distance features and labels of one syndrome.

    Reads the patient and control distance features through `read_rep` and
    drops every index at which either the patient row or the control row
    consists of zeros only.

    Parameters
    ----------
    syn : str
        Syndrome name, forwarded to `read_rep`.
    GENERAL_DIR : str
        Directory that contains the two distance-feature CSV files.
    data_dir : str
        Directory with the image folders of this syndrome, forwarded to `read_rep`.

    Returns
    -------
    tuple of np.ndarray
        ``(data, labels)`` with the kept patient rows first, followed by the
        kept control rows.
    """
    # join() instead of a hard-coded "\\" so the paths also resolve outside Windows
    syn_csv = join(GENERAL_DIR, "features_facereader_landmarks_distances_patient_groups_left_right.csv")
    ID_csv = join(GENERAL_DIR, "features_facereader_landmarks_distances_all_controls_left_right.csv")
    data_syn_dis, data_ID_dis, labels_syn_dis, labels_ID_dis = read_rep(syn, syn_csv, ID_csv, data_dir)

    # NOTE(review): patient row i is compared with control row i, so the two
    # arrays are presumably index-aligned pairs -- confirm against the data.
    # An all-zero row is treated as "no representation available".
    indices_to_keep = [
        index
        for index in range(len(data_syn_dis))
        if not all(v == 0 for v in data_syn_dis[index])
        and not all(v == 0 for v in data_ID_dis[index])
    ]

    # only distances of pairs where both rows are usable
    data_syn_dis = data_syn_dis[indices_to_keep]
    data_ID_dis = data_ID_dis[indices_to_keep]
    data_dis = data_syn_dis.tolist() + data_ID_dis.tolist()

    # matching labels, in the same order as the data
    labels_syn_dis = labels_syn_dis[indices_to_keep]
    labels_ID_dis = labels_ID_dis[indices_to_keep]
    labels = labels_syn_dis.tolist() + labels_ID_dis.tolist()

    return np.array(data_dis), np.array(labels)


In [None]:
def randomforest_classifier(data, labels, n_estimators=10, random_state=0, threshold=0.8, max_features=29):
    """Fit a random forest and select its most important features.

    Features are ranked by descending importance and the smallest leading
    group whose summed importance reaches `threshold` is returned.

    Parameters
    ----------
    data : array-like
        Feature matrix passed to ``RandomForestClassifier.fit``.
    labels : array-like
        Class labels passed to ``RandomForestClassifier.fit``.
    n_estimators : int, optional
        Number of trees (default 10; the original author notes that 10 gave
        the best AROC scores).
    random_state : int, optional
        Seed of the forest, fixed by default so the selection is reproducible.
    threshold : float, optional
        Combined importance at which the selection stops (default 0.8).
    max_features : int or None, optional
        Upper bound on the number of returned features (default 29). When the
        bound is hit first, the returned importance is below `threshold`.
        Pass ``None`` to remove the bound.

    Returns
    -------
    important_indices : np.ndarray
        Indices of the selected features, most important first.
    importance : float
        Summed importance of the selected features.
    """
    forest = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    forest.fit(data, labels)
    importances = forest.feature_importances_

    ranked = np.argsort(importances)[::-1]
    if max_features is not None:
        ranked = ranked[:max_features]

    # running total of the importances, most important feature first
    cumulative = np.cumsum(importances[ranked])
    reached = np.flatnonzero(cumulative >= threshold)
    # first position at which the threshold is met, otherwise everything allowed
    n_selected = reached[0] + 1 if reached.size else len(ranked)

    important_indices = ranked[:n_selected]
    importance = cumulative[n_selected - 1] if n_selected else 0.0

    return important_indices, importance

In [None]:
def random_image(GENERAL_DIR, syn):
    """Return the left and right landmarks of one image of the given syndrome.

    Despite its name, the function is deterministic: it returns the first row
    of the landmark CSV whose name contains `syn` and that holds more than 93
    landmark values.

    Parameters
    ----------
    GENERAL_DIR : str
        Directory that contains ``features_facereader_landmarks_patient_groups.csv``.
    syn : str
        Syndrome name that must occur in the first column of the row.

    Returns
    -------
    landmarks_left : list of tuple of float
        ``(x, y, z)`` of every landmark whose position is listed in ``LEFT``.
    landmarks_right : list of tuple of float
        ``(x, y, z)`` of every landmark whose position is listed in ``RIGHT``.
    image_name : str
        First column of the selected row.

    Raises
    ------
    ValueError
        If no row of the file matches.
    """
    # join() instead of a hard-coded "\\" so the path also resolves outside Windows
    file = join(GENERAL_DIR, "features_facereader_landmarks_patient_groups.csv")

    with open(file, newline='') as csvfile:
        for row in csv.reader(csvfile, delimiter=','):
            # csv.reader yields [] for blank lines; skip them instead of failing on row[0]
            if not row or syn not in row[0] or len(row) - 1 <= 93:
                continue

            values = row[1:]
            landmarks_left, landmarks_right = [], []

            # values are consecutive (x, y, z) triples; only complete triples are read
            for count in range(len(values) // 3):
                in_left = count in LEFT
                in_right = count in RIGHT
                if not (in_left or in_right):
                    continue

                x, y, z = values[3 * count:3 * count + 3]
                point = (float(x), float(y), float(z))
                if in_left:
                    landmarks_left.append(point)
                if in_right:
                    landmarks_right.append(point)

            return landmarks_left, landmarks_right, row[0]

    # make the failure explicit instead of returning None to a caller that unpacks
    raise ValueError(
        "no image of syndrome {!r} with more than 93 landmark values found in {}".format(syn, file)
    )

In [None]:
def get_pairs():
    """Return every unordered pair of landmark positions as ``[a, b]`` lists.

    Positions run from 0 to ``len(LEFT) - 1`` and the pairs follow the order
    produced by ``itertools.combinations`` (``[0, 1], [0, 2], ..., [1, 2], ...``).
    """
    n_landmarks = len(LEFT)
    return [
        [first, second]
        for first, second in itertools.combinations(range(n_landmarks), 2)
    ]

In [None]:
def visualize_image(landmarks_left, landmarks_right, important_indices, pairs, image_name, nr_feats, importance):
    """Plot the landmarks of one face in 3D and draw the most important distances.

    Parameters
    ----------
    landmarks_left, landmarks_right : sequence of (x, y, z)
        Landmark coordinates of the left and the right side of the face.
    important_indices : iterable of int
        Indices of the distance features to draw. Indices below ``len(pairs)``
        are drawn between left landmarks, the remaining ones (after
        subtracting ``len(pairs)``) between right landmarks.
    pairs : sequence of [a, b]
        Landmark positions that each distance feature connects.
        Presumably the result of `get_pairs` -- verify against the caller.
    image_name : str
        Name shown in the title.
    nr_feats : int
        Number of features shown in the title.
    importance : float
        Combined importance shown in the title.
    """
    fig = plt.figure(figsize=(6, 6))
    # Axes3D(fig) is no longer attached to the figure automatically in recent
    # Matplotlib releases, which leaves the plot empty; add_subplot always works.
    ax = fig.add_subplot(111, projection='3d')

    # one scatter call per side instead of one per landmark;
    # depthshade=False keeps every landmark the same solid blue
    for landmarks in (landmarks_left, landmarks_right):
        if len(landmarks) > 0:
            xs, ys, zs = zip(*landmarks)
            ax.scatter(xs, ys, zs, color='b', s=5, depthshade=False)

    amount_dis = len(pairs)

    for index in important_indices:
        if index < amount_dis:
            landmarks = landmarks_left
        else:
            # the second half of the feature indices maps onto the right side
            landmarks = landmarks_right
            index = index - amount_dis

        a, b = pairs[index]
        x, y, z = landmarks[a]
        x2, y2, z2 = landmarks[b]
        ax.plot((x, x2), (y, y2), (z, z2))

    ax.set_xlabel('x')
    ax.set_ylabel('y')
    ax.set_zlabel('z')
    title = "{}: with {} features and {:.3f} importance".format(image_name, nr_feats, importance)
    ax.set_title(title)
    plt.show()

In [None]:
# syndrome to visualize
syn = '22q11'

# load all data of this syn
# join() instead of a hard-coded "\\" so the path also resolves outside Windows
data_dir = join(GENERAL_DIR, syn)
data, labels = load_data(syn, GENERAL_DIR, data_dir)
data = Normalizer().fit_transform(data)

# train complete random forest model and return indices
indices, importance = randomforest_classifier(data, labels)
landmark_pairs = get_pairs()

# get landmarks and image name of a face of this syndrome and draw the selected distances
landmarks_left, landmarks_right, image_name = random_image(GENERAL_DIR, syn)
visualize_image(landmarks_left, landmarks_right, indices, landmark_pairs, image_name, len(indices), importance)