In [None]:
%matplotlib inline

from IPython.display import display, clear_output
from IPython.core.pylabtools import figsize
figsize(12, 5)
import ipywidgets as widgets
import os
import time
import random
import math
import pickle
import numpy as np
np.warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
from collections import namedtuple
from sklearn import metrics

from esper.widget import *
from esper.prelude import *
from esper.plot_util import *
import esper.face_embeddings as face_embeddings

def split_list(l, idx):
    """Split ``l`` at position ``idx``.

    Returns a ``(head, tail)`` pair where ``head`` is ``l[:idx]`` and ``tail``
    is ``l[idx:]``.
    """
    head = l[:idx]
    tail = l[idx:]
    return head, tail

# Reference data for one identity: its name plus the ids, embeddings and
# images of the faces used as references.
ReferenceFaces = namedtuple('ReferenceFaces', 'name ids embs imgs')

def show_reference_imgs(refs):
    """Show the reference images of ``refs`` as one tiled figure.

    Every image in ``refs.imgs`` is resized to 100x100 and handed to
    ``tile_images`` with ``cols=10`` and ``blank_value=255``.

    NOTE(review): ``cv2``, ``tile_images`` and ``imshow`` are not imported by
    name in this notebook; presumably they come from the ``esper`` star-imports.
    """
    thumbnails = []
    for img in refs.imgs:
        thumbnails.append(cv2.resize(img, (100, 100)))
    mosaic = tile_images(thumbnails, cols=10, blank_value=255)

    print('Your reference images for {}.'.format(refs.name))
    plt.figure()
    imshow(mosaic)
    plt.tight_layout()
    plt.show()

In [None]:
# Class labels: used as training targets in train_model and to separate the
# two score distributions in plot_binary_score_histograms.
# POS_LABEL marks examples of the target identity, NEG_LABEL all others.
POS_LABEL = 1
NEG_LABEL = 0

####
# DEBUG PLOTS
####

def plot_roc(y_true, y_pred, title='Receiver Operating Characteristic'):
    """Plot the ROC curve of scores ``y_pred`` against labels ``y_true``.

    The area under the curve is reported in the legend and a dashed red
    diagonal from (0, 0) to (1, 1) is drawn for reference.
    """
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(y_true, y_pred)
    auc_label = 'AUC = %0.2f' % metrics.auc(false_pos_rate, true_pos_rate)
    unit_range = [0, 1]

    plt.figure()
    plt.title(title)
    plt.plot(false_pos_rate, true_pos_rate, 'b', label=auc_label)
    plt.legend(loc='lower right')
    # Reference diagonal between opposite corners of the unit square.
    plt.plot(unit_range, unit_range, 'r--')
    plt.xlim(unit_range)
    plt.ylim(unit_range)
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()
    
def plot_binary_score_histograms(y_true, y_pred, face_references, y_max=None,
                                 title='Score Distribution by Class'):
    """Overlay score histograms for the positive and the negative class.

    Args:
        y_true: labels (``POS_LABEL`` / ``NEG_LABEL``), indexable in step with
            ``y_pred``.
        y_pred: predicted scores; binned with 100 edges spanning [0, 1].
        face_references: object whose ``name`` is used in the legend.
        y_max: optional upper limit for the y axis.
        title: figure title.
    """
    bin_edges = np.linspace(0, 1, 100)
    plt.figure()

    # (class label, legend text) for each histogram, positives drawn first.
    series = (
        (POS_LABEL, face_references.name),
        (NEG_LABEL, 'Not {}'.format(face_references.name)),
    )
    for class_label, legend_text in series:
        class_scores = [
            score for idx, score in enumerate(y_pred)
            if y_true[idx] == class_label
        ]
        plt.hist(class_scores, bin_edges, alpha=0.5, label=legend_text)

    plt.title(title)
    plt.xlabel('Predicted Score')
    if y_max is not None:
        plt.ylim(0, y_max)
    plt.legend()
    plt.show()
    
def plot_score_histogram(predictions, sample, x_min=None):
    """Plot a log-scaled histogram of predicted scores.

    Args:
        predictions: sequence of ``(id, score)`` pairs.
        sample: maximum number of predictions to plot; when there are more, a
            random sample of this size is drawn.
        x_min: optional left limit for the x axis.
    """
    bin_edges = np.linspace(0, 1, 100)
    plt.figure()

    if sample < len(predictions):
        shown = random.sample(predictions, sample)
    else:
        shown = predictions
    shown_scores = [score for _, score in shown]
    plt.hist(shown_scores, bin_edges, alpha=1)

    shown_count = min(sample, len(predictions))
    plt.title('Predicted Score Distribution (sample={})'.format(shown_count))
    plt.xlabel('Predicted Score')
    plt.xticks(np.arange(11) / 10)
    if x_min is not None:
        plt.xlim(left=x_min)
    # NOTE(review): newer Matplotlib releases spell this keyword `nonpositive`;
    # confirm the installed version before upgrading.
    plt.yscale('log', nonposy='clip')
    plt.show()
    
def plot_estimated_cdf(predictions, sample, x_min=None):
    """Plot the score-weighted CDF of predictions and print estimated positives.

    Scores are interpreted as probabilities: every prediction that is used adds
    its score to the bin its score falls into, so a bin's sum estimates the
    number of positives in that score range. The sums are then scaled from the
    predictions actually used up to all of ``predictions``.

    Args:
        predictions: sequence of ``(id, score)`` pairs; scores are expected to
            lie in [0, 1] (larger scores are counted in the last bin).
        sample: maximum number of predictions to use; when there are more, a
            random sample of this size is drawn.
        x_min: optional left limit for the x axis.
    """
    n_bins = 100

    def score_to_bin(s):
        # Bin k collects scores in ((k - 1) / n_bins, k / n_bins]; only a score
        # of exactly 0 lands in bin 0.
        v = math.ceil(s * n_bins)
        return min(v, n_bins)

    bins = np.zeros(n_bins + 1)
    sampled_pred = (
        random.sample(predictions, sample)
        if sample < len(predictions) else predictions
    )
    for _, s in sampled_pred:
        bins[score_to_bin(s)] += s

    # Extrapolate from the predictions actually used. Scaling by the requested
    # `sample` under-counts whenever fewer than `sample` predictions exist,
    # because then every prediction is used and no extrapolation is needed.
    n_used = len(sampled_pred)
    scale = len(predictions) / n_used if n_used else 0.0

    sample_est_pos = np.sum(bins)
    total_est_pos = int(sample_est_pos * scale)

    # With no score mass there is nothing to normalize; plot zeros, not NaNs.
    if sample_est_pos > 0:
        norm_bins = bins / sample_est_pos
    else:
        norm_bins = np.zeros_like(bins)
    cdf_bins = np.cumsum(norm_bins)
    inds = np.arange(bins.size) / n_bins
    plt.figure()
    plt.title('CDF of Positive Predictions ' +
              '(total estimated positives={})'.format(
              total_est_pos))
    plt.plot(inds, cdf_bins, label='Est. Cumulative Proportion')
    plt.plot(inds, norm_bins, label='Est. Bin Proportion ({} bins)'.format(n_bins))
    plt.ylabel('Proportion')
    plt.xlabel('Predicted Score')
    plt.ylim(bottom=0)
    plt.xticks(np.arange(11) / 10)
    if x_min is not None:
        plt.xlim(left=x_min)
    plt.legend()
    plt.show()

    print('Est. Positives by Threshold')
    total_est_bins = bins * scale
    # Cumulative sum from the highest bin downwards: entry j is the estimated
    # number of positives in the top j + 1 bins.
    total_est_cdf_thresh = np.cumsum(total_est_bins[::-1])
    num_thresh = 10
    bins_per_step = n_bins // num_thresh
    for i in range(num_thresh):
        t = 1. - 0.1 * (i + 1)
        # The top bins_per_step * (i + 1) bins hold exactly the scores above t.
        print('  t={:0.1f}\t{:0.1f}'.format(
              t, total_est_cdf_thresh[bins_per_step * (i + 1) - 1]))

def train_model(params, face_references, pos_examples, neg_examples, score_threshold, 
                train_val_ratio=10):
    """Train a logistic classifier on labeled face ids and show debugging plots.

    Args:
        params: keyword arguments forwarded to ``face_embeddings.logreg``; must
            contain ``num_epochs``, ``learning_rate`` and ``l2_penalty``.
        face_references: object whose ``name`` labels the class histograms.
        pos_examples: ids of positive examples.
        neg_examples: ids of negative examples.
        score_threshold: passed as ``min_thresh`` when requesting predictions.
        train_val_ratio: ``1 / train_val_ratio`` of each class (after
            shuffling) is held out for validation.

    Returns:
        ``(weights, predictions)``; ``predictions`` is iterated as
        ``(id, score)`` pairs.
    """
    print('Training logistic classifier with {}:1 train to validation split'.format(
          train_val_ratio))

    print('Hyperparameters')
    print('  Epochs:', params['num_epochs'])
    print('  Learning rate:', params['learning_rate'])
    print('  L2 penalty:', params['l2_penalty'])

    start_time = time.time()

    def holdout_split(examples):
        # Shuffle a copy so the caller's collection is untouched, then return
        # (validation, training) with the first 1/train_val_ratio held out.
        shuffled = list(examples)
        random.shuffle(shuffled)
        return split_list(shuffled, int(len(shuffled) / train_val_ratio))

    val_pos, train_pos = holdout_split(pos_examples)
    val_neg, train_neg = holdout_split(neg_examples)

    train_ids = train_pos + train_neg
    train_y = ([POS_LABEL] * len(train_pos)) + ([NEG_LABEL] * len(train_neg))

    val_ids = val_pos + val_neg
    val_y = ([POS_LABEL] * len(val_pos)) + ([NEG_LABEL] * len(val_neg))

    weights = face_embeddings.logreg(train_ids, train_y, **params)
    # NOTE(review): `logreg` is called as a function above and used as a
    # namespace here; confirm `face_embeddings.logreg.logreg_predict` resolves.
    predictions = face_embeddings.logreg.logreg_predict(weights, min_thresh=score_threshold)

    model_time = time.time()
    print('Trained model and obtained predictions: {:0.4f}s'.format(model_time - start_time))

    # Scores default to 0 for ids that do not appear in `predictions`.
    train_id_to_idx = {v: i for i, v in enumerate(train_ids)}
    train_pred_y = [0] * len(train_ids)
    val_id_to_idx = {v: i for i, v in enumerate(val_ids)}
    val_pred_y = [0] * len(val_ids)

    for v, s in predictions:
        if v in train_id_to_idx:
            train_pred_y[train_id_to_idx[v]] = s
        if v in val_id_to_idx:
            val_pred_y[val_id_to_idx[v]] = s

    num_tabs = 3
    outputs = [widgets.Output() for _ in range(num_tabs)]
    tabs = widgets.Tab(children=outputs)

    with outputs[0]:
        tabs.set_title(0, 'Entire Dataset')
        # Predictions were requested with min_thresh=score_threshold, so use it
        # as the left edge of the plots unless params names its own
        # 'min_thresh'. Reading only params left x_min unset for callers that
        # pass the threshold via score_threshold.
        x_min = params.get('min_thresh', score_threshold)
        if x_min is not None:
            print('Minimum score threshold: {}'.format(x_min))
        plot_score_histogram(predictions, sample=100000, x_min=x_min)
        print('If we interpret the scores produced by the model as probabilities, '
              'we can estimate the number of true positives that we expect to find '
              'in the dataset. The following plot makes this assumption and shows '
              'the expected contribution of faces of varying scores to the total.')
        plot_estimated_cdf(predictions, sample=100000, x_min=x_min)

    with outputs[1]:
        tabs.set_title(1, 'Training Set')
        plot_roc(train_y, train_pred_y)
        plot_binary_score_histograms(train_y, train_pred_y, face_references)

    with outputs[2]:
        tabs.set_title(2, 'Validation Set')
        plot_roc(val_y, val_pred_y)
        plot_binary_score_histograms(val_y, val_pred_y, face_references)

    print('Generate debugging plots: {:0.4f}s'.format(time.time() - model_time))
    display(tabs)
    return weights, predictions

In [None]:
def load_model_and_examples(path):
    """Load a pickled identity model from ``path``.

    Returns:
        ``(references, pos_examples, neg_examples, weights)`` where
        ``references`` is a ``ReferenceFaces`` built from the model's ``init_*``
        entries and the example collections are converted to sets.
    """
    print('Loading model: {}'.format(path))
    # NOTE(review): unpickling can execute arbitrary code; only load model
    # files from a trusted location.
    with open(path, 'rb') as model_file:
        model = pickle.load(model_file)

    ref_embs = model['init_embs']
    ref_ids = set(model['init_ids'])
    ref_imgs = model['init_imgs']
    weights = model['weights']
    references = ReferenceFaces(
        name=model['name'], ids=ref_ids, imgs=ref_imgs, embs=ref_embs)

    pos_examples, neg_examples = (
        set(model[key]) for key in ('pos_examples', 'neg_examples'))

    summary = 'Done! Loaded {} reference faces; {} positive and {} negative examples'
    print(summary.format(len(ref_embs), len(pos_examples), len(neg_examples)))
    return references, pos_examples, neg_examples, weights

def list_models(model_dir):
    """Return the path of every entry directly inside ``model_dir``.

    Entries come back in ``os.listdir`` order, each joined onto ``model_dir``.
    """
    return [os.path.join(model_dir, entry) for entry in os.listdir(model_dir)]

# Directories that hold the pickled identity models, one file per identity.
NATIVE_MODEL_DIR = '/app/data/identity_models_v2'
CONVERTED_MODEL_DIR = '/app/data/identity_models_v2_converted'

# Model file paths; the cells further down iterate over these lists.
native_model_files = list_models(NATIVE_MODEL_DIR)
print('Found {} native models'.format(len(native_model_files)))

converted_model_files = list_models(CONVERTED_MODEL_DIR)
print('Found {} converted models'.format(len(converted_model_files)))

In [None]:
# Maximum number of FaceIdentity rows that run_model passes to a single
# bulk_create call.
MAX_TRANSACTION_SIZE = 100000

def run_model(path, labeler_name_prefix, save_to_db=False, score_threshold=0.2):
    """Retrain the identity model stored at ``path`` and optionally save its labels.

    The model file supplies the reference faces and the positive/negative
    examples; a new classifier is trained on them with fixed hyperparameters
    (the weights stored in the file are not used).

    Args:
        path: pickled model file to load.
        labeler_name_prefix: the labeler is named ``'<prefix>:<identity name>'``
            with the identity name lowercased.
        save_to_db: when true, store one ``FaceIdentity`` per predicted face
            id that exists in ``Face``. Nothing is trained or stored if
            labels for this identity and labeler already exist.
        score_threshold: forwarded to ``train_model``.
    """
    face_references, pos_examples, neg_examples, weights = load_model_and_examples(path)
    print('Running:', face_references.name)
    name = face_references.name.lower()
    labeler_name = '{}:{}'.format(labeler_name_prefix, name)

    if save_to_db:
        # Do not retrain when this labeler already labeled this identity.
        already_saved = FaceIdentity.objects.filter(
            identity__name=name, labeler__name=labeler_name).count()
        if already_saved > 0:
            print('Identities already saved for {} (count={})'.format(
                  face_references.name, already_saved))
            return

    show_reference_imgs(face_references)
    params = dict(num_epochs=40, learning_rate=1, l2_penalty=1e-5)
    weights, predictions = train_model(
        params, face_references, pos_examples, neg_examples, score_threshold)
    print('{} faces passed the threshold'.format(len(predictions)))

    if not save_to_db:
        print()
        return

    labeler, labeler_created = Labeler.objects.get_or_create(name=labeler_name)
    if labeler_created:
        print('Created labeler:', labeler_name)
    identity, identity_created = Identity.objects.get_or_create(name=name)
    if identity_created:
        print('Created identity:', name)

    # Only ids that exist in the Face table can be labeled.
    predicted_ids = [face_id for face_id, _ in predictions]
    pending_face_ids = {
        row['id']
        for row in Face.objects.filter(id__in=predicted_ids).values('id')
    }

    face_identities = []
    for face_id, score in predictions:
        if face_id not in pending_face_ids:
            continue
        face_identities.append(FaceIdentity(
            face_id=face_id, identity=identity,
            probability=score, labeler=labeler))
        # Drop the id so a repeated prediction cannot produce a second row.
        pending_face_ids.remove(face_id)

    total = len(face_identities)
    print('Saving {} face identities to the DB. ({} face ids were missing)'.format(
          total, len(predictions) - total))

    saved_count = 0
    for start in range(0, total, MAX_TRANSACTION_SIZE):
        batch = face_identities[start:start + MAX_TRANSACTION_SIZE]
        FaceIdentity.objects.bulk_create(batch)
        saved_count += len(batch)
        print('  saved {} / {}'.format(saved_count, total))
    print('Done!')
    print()

In [None]:
# Passed to run_model as save_to_db: when True the labels are written to the DB.
SAVE_TO_DB = True
# run_model names the labeler '<prefix>:<lowercased identity name>'.
NATIVE_LABELER_NAME_PREFIX = 'face-identity'

# Run every native model, in sorted path order.
for model_path in sorted(native_model_files):
    run_model(model_path, NATIVE_LABELER_NAME_PREFIX, save_to_db=SAVE_TO_DB)

In [None]:
# Passed to run_model as save_to_db: when True the labels are written to the DB.
SAVE_TO_DB = True
# Labeler name prefix used for labels produced from the converted models.
CONV_LABELER_NAME_PREFIX = 'face-identity-converted'

def get_name_from_path(p):
    """Return the lowercased file name of ``p`` up to its first ``'.'``.

    The file name is whatever follows the last ``'/'`` in ``p``.
    """
    file_name = p.split('/')[-1]
    stem = file_name.split('.')[0]
    return stem.lower()

# Import a converted model only when no labeler with the 'face-identity'
# prefix exists for the identity name derived from the file name.
for model_path in sorted(converted_model_files):
    name = get_name_from_path(model_path).replace('_', ' ')
    if Labeler.objects.filter(name='face-identity:{}'.format(name)).count() > 0:
        print('Native labels already exist for {}. Skipping import.'.format(name))
        continue
    run_model(model_path, CONV_LABELER_NAME_PREFIX, save_to_db=SAVE_TO_DB)

In [None]:
# Same settings as the native-model cell above, repeated so that this cell can
# be run without it.
SAVE_TO_DB = True
NATIVE_LABELER_NAME_PREFIX = 'face-identity'

# Run three specific native models. With save_to_db=True, run_model returns
# early for any of them that already has saved labels.
for model_path in sorted([
    '/app/data/identity_models_v2/mika_brzezinski.pkl', 
    '/app/data/identity_models_v2/willie_geist.pkl',
    '/app/data/identity_models_v2/joe_scarborough.pkl'
]):
    run_model(model_path, NATIVE_LABELER_NAME_PREFIX, save_to_db=SAVE_TO_DB)