# Data Exploration: Distances

Exploring Euclidean distances between facial points captured by nuiCapture as candidate features.
<p>
    <img src="nuicapture.png">
    <em>Source: CadavidConcepts</em>
</p>

In [41]:
import glob as gl
from typing import List

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.io import loadmat
from scipy.spatial import distance
%matplotlib notebook

In [70]:
def labelname(file_name):
    """Derive a lower-case label from a sample file path.

    Every occurrence of ``"data/points/sample"`` is removed, the remainder is
    lower-cased, and then every occurrence of ``".mat"`` is removed.
    """
    without_prefix = "".join(file_name.split("data/points/sample"))
    lowered = without_prefix.lower()
    return "".join(lowered.split(".mat"))


class Signal:
    """Container pairing the x and y coordinate data of a sign with its label."""

    def __init__(self, x, y, label):
        # Store the three values exactly as given; no copying or validation.
        self.x, self.y, self.label = x, y, label

def dst(sig, points, n_frames=5):
    """Return the per-frame Euclidean distance between two tracked points.

    ``sig.x`` and ``sig.y`` are 2-D arrays with one row per recording. Their
    columns are read as ``n_frames`` equally sized, consecutive chunks (one
    chunk per frame). ``points[0]`` and ``points[1]`` are the column indices,
    within a frame, of the two points to compare.

    Args:
        sig: Object exposing ``x`` and ``y`` 2-D array attributes.
        points: Sequence whose first two items are the point indices.
        n_frames: Number of frames the columns are divided into (default 5).

    Returns:
        Float array of shape ``(n_recordings, n_frames)`` holding the distance
        between the two points for every recording and frame.

    Raises:
        ValueError: If the column count is not a multiple of ``n_frames``.
        IndexError: If a point index falls outside a frame.
    """
    # Cast to float so integer inputs cannot wrap around on subtraction.
    x = np.asarray(sig.x, dtype=float)
    y = np.asarray(sig.y, dtype=float)

    n_recs, n_cols = x.shape
    if n_frames <= 0 or n_cols % n_frames:
        raise ValueError(
            "cannot divide %d columns into %d equal frames" % (n_cols, n_frames)
        )
    width = n_cols // n_frames

    # View the data as (recording, frame, point) so both points can be picked
    # for every recording and frame at once instead of looping in Python.
    frame_x = x.reshape(n_recs, n_frames, width)
    frame_y = y.reshape(n_recs, n_frames, width)

    p, q = points[0], points[1]
    delta_x = frame_x[:, :, p] - frame_x[:, :, q]
    delta_y = frame_y[:, :, p] - frame_y[:, :, q]
    return np.hypot(delta_x, delta_y)

In [104]:
# glob returns paths in arbitrary order; sort them so that the integer label
# assigned to each sign is the same on every run and machine.
files = sorted(gl.glob("data/points/*.mat"))  # type: List[str]
if not files:
    raise FileNotFoundError("no .mat files found in data/points/")

signals = []  # type: List[Signal]

for f in files:
    data = loadmat(f).get('pontosSinal')
    # Even columns are stored as x, odd columns as y.
    signals.append(Signal(data[:, ::2], data[:, 1::2], labelname(f)))

n_signs = len(signals)
n_recs, n_x = np.shape(signals[0].x)  # Number of recordings and number of features
n_frames = 5

# arbitrarily defined points, refer to notebook 'Distances'
points = [[6, 3], [6, 11], [65, 32], [8, 9], [49, 16], [50, 17], [91, 92], [20, 25], [53, 58]]
n_points, n_dim = np.shape(points)

signals_feat = []  # Updated signals, according to each experiment
signals_labels = []
labels_dict = {}  # Dictionary of signals' labels, for reference

for i, s in enumerate(signals):
    # One block of distance columns per point pair, stacked side by side.
    feats = np.hstack([dst(s, p) for p in points])
    signals_feat.append(feats)
    # One label per feature row of this signal, so the labels stay aligned
    # with sig_features even if signals differ in their number of recordings.
    signals_labels.append([i] * len(feats))
    labels_dict[i] = s.label

sig_features = np.vstack(signals_feat)
sig_labels = np.concatenate(signals_labels)

(100,)

In [100]:
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from pprint import pprint

(20, 45)

In [None]:
# Candidate hyper-parameter values for the random forest search.

# Tree counts: 10 evenly spaced values from 200 to 2000.
n_estimators = list(map(int, np.linspace(start=200, stop=2000, num=10)))
# Strategies for how many features are considered at each split.
max_features = ['log2', 'sqrt']
# Tree depth limits: 11 evenly spaced values from 10 to 110; None is also tried.
max_depth = list(map(int, np.linspace(10, 110, num=11))) + [None]
# Minimum sample count needed before a node may be split.
min_samples_split = [2, 5, 7]
# Minimum sample count allowed in a leaf.
min_samples_leaf = [1, 2, 4]
# Whether each tree is trained on a bootstrap sample.
bootstrap = [True, False]

# Bundle everything into the grid passed to the search.
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)

In [None]:
# Repeated hold-out evaluation: each iteration draws a fresh stratified
# train/test split, tunes a random forest on the training part and records
# accuracy, confusion matrix, per-class report and feature importances.
niter = 30
results = []
train_acc = []
test_acc = []
train_report = []
class_report = []
selected_params = []
cm = []  # one confusion matrix per iteration
feature_importance = []

# Class ids in ascending order with their display names in the same order, so
# every confusion matrix and report uses the same row/column layout.
class_ids = sorted(labels_dict)
class_names = [labels_dict[c] for c in class_ids]

# 1-based feature numbers, derived from the actual feature count.
col_names = range(1, sig_features.shape[1] + 1)

for i in range(niter):
    print("Iteration:: ", i)

    # The seed varies with the iteration: a constant random_state would hand
    # back the identical split every time and make the repetitions redundant.
    # Only one split is used per iteration, so only one is generated.
    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42 + i)
    train_index, test_index = next(sss.split(sig_features, sig_labels))
    train_x, test_x = sig_features[train_index], sig_features[test_index]
    train_y, test_y = sig_labels[train_index], sig_labels[test_index]

    # NOTE(review): random_grid spans 10*2*12*3*3*2 = 4320 combinations and
    # GridSearchCV fits every one of them cv=3 times per iteration.
    # RandomizedSearchCV is imported but unused - confirm which was intended.
    rf = RandomForestClassifier()
    rf_grid = GridSearchCV(estimator=rf, param_grid=random_grid, cv=3, verbose=1, n_jobs=-1)

    rf_grid.fit(train_x, train_y)
    predictions = rf_grid.predict(test_x)

    selected_params.append(rf_grid.best_params_)
    train_acc.append(accuracy_score(train_y, rf_grid.predict(train_x)))
    test_acc.append(accuracy_score(test_y, predictions))
    # `labels` must list the classes, not the per-sample targets.
    cm.append(confusion_matrix(test_y, predictions, labels=class_ids))
    class_report.append(classification_report(test_y, predictions,
                                              labels=class_ids,
                                              target_names=class_names))

    # (importance, feature number) pairs, most important feature first.
    importances = rf_grid.best_estimator_.feature_importances_
    ranked = sorted(zip((round(v, 4) for v in importances), col_names), reverse=True)
    feature_importance.append(pd.DataFrame(data=ranked))