In [None]:
// load openCV libraries and headers
#pragma cling add_library_path("/usr/local/Cellar/opencv@3/3.4.5/lib/")
#pragma cling add_include_path("/usr/local/Cellar/opencv@3/3.4.5/include/")
#pragma cling add_include_path("/usr/local/Cellar/opencv@3/3.4.5/include/opencv2/")
#pragma cling load("/usr/local/Cellar/opencv@3/3.4.5/lib/libopencv_aruco.3.4.5.dylib")
#pragma cling load("/usr/local/Cellar/opencv@3/3.4.5/lib/libopencv_dnn_objdetect.3.4.5.dylib")
#pragma cling load("/usr/local/Cellar/opencv@3/3.4.5/lib/libopencv_dnn.3.4.5.dylib")
#pragma cling load("/usr/local/Cellar/opencv@3/3.4.5/lib/libopencv_objdetect.3.4.5.dylib")

In [None]:
// openCV specific includes
#include <opencv2/opencv.hpp>

In [None]:
// standard C++ headers used later in the notebook
#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

In [None]:
// Print the OpenCV version reported by the included headers (CV_VERSION).
// Starting from version 3.3, OpenCV supports Caffe models, which the
// DNN-based face detection in this notebook relies on.
std::cout <<"OpenCV version: " << CV_VERSION << std::endl;

In [None]:
// Helper struct: the centre point of a detected circular feature (an eye).
// Only the centre is kept; no radius is stored.
struct cv_circle {
    int x;  // horizontal pixel coordinate of the centre
    int y;  // vertical pixel coordinate of the centre

    cv_circle(int center_x, int center_y)
        : x(center_x)
        , y(center_y)
    {}
};

In [None]:
// Helper struct: an axis-aligned rectangle (a face) described by two opposite
// corners in pixel coordinates.
// NOTE(review): the field names say "left bottom" / "right top"; whether that
// matches the image coordinate convention of the callers should be confirmed.
struct cv_rect {
    int x_left_bottom;  // x of the first corner
    int y_left_bottom;  // y of the first corner
    int x_right_top;    // x of the opposite corner
    int y_right_top;    // y of the opposite corner

    cv_rect(int x0, int y0, int x1, int y1)
        : x_left_bottom(x0)
        , y_left_bottom(y0)
        , x_right_top(x1)
        , y_right_top(y1)
    {}
};

In [None]:
// Decide whether a face "pays attention", i.e. whether its two eyes are
// mirror-symmetric about the vertical centre line of the face rectangle.
//
// input:  eyes  - detected eye centres; exactly two are required
//         face  - rectangle representing the face, in the same coordinate
//                 system as the eye centres
//         x_distance_diff_threshold_pixel
//               - maximum allowed asymmetry in pixels (default: 20)
// output: true  if the eyes are symmetric with respect to the vertical line
//               x = const = middle of the rectangle,
//         false otherwise, including when the number of eyes is not two
bool analyze_attention(const std::vector<cv_circle>& eyes,
                       const cv_rect& face,
                       int x_distance_diff_threshold_pixel = 20) {
    // without exactly one pair of eyes there is nothing to compare
    if (eyes.size() != 2)
        return false;

    std::cout << "eye-1 = [" << eyes[0].x << ", " << eyes[0].y << "]" << std::endl;
    std::cout << "eye-2 = [" << eyes[1].x << ", " << eyes[1].y << "]" << std::endl;

    const int rect_middle_x = face.x_left_bottom + (face.x_right_top - face.x_left_bottom) / 2;
    std::cout << "face = [" << face.x_left_bottom << ", " << face.y_left_bottom << "], right-top = [" << face.x_right_top << ", " << face.y_right_top << "] -- middle x =  " << rect_middle_x << std::endl;

    // Signed offsets from the centre line. For eyes on opposite sides the
    // offsets have opposite signs and their sum equals the difference of the
    // two distances. For eyes on the same side the sum grows instead of
    // cancelling, so such a pair is not mistaken for a symmetric one (which
    // comparing absolute distances would do).
    const int x_offset_1 = eyes[0].x - rect_middle_x;
    const int x_offset_2 = eyes[1].x - rect_middle_x;
    const int x_distance_diff = std::abs(x_offset_1 + x_offset_2);

    std::cout << "diff = " << x_distance_diff << std::endl;

    return x_distance_diff < x_distance_diff_threshold_pixel;
}

In [None]:
#define WEB_CAM 0
#define USB_CAM 17  // NOTE(review): index of an external camera on the author's machine — confirm

// Show a plain, unprocessed video stream from the camera selected by WEB_CAM
// (swap in USB_CAM to use a different camera).
// The stream is displayed until a key is pressed or the camera stops
// delivering frames. Nothing is returned; failures are reported on std::cout.
void show_video_stream() {
    // one name for both creating the window and drawing into it
    static const std::string window_name = "Video";

    // open the default video camera
    cv::VideoCapture cap(WEB_CAM);

    // if not successful, leave the function
    if (!cap.isOpened()) {
        std::cout << "Cannot open the video camera" << std::endl;
        return;
    }

    const double dWidth = cap.get(cv::CAP_PROP_FRAME_WIDTH);   // width of the video frames
    const double dHeight = cap.get(cv::CAP_PROP_FRAME_HEIGHT); // height of the video frames

    std::cout << "Resolution of the video : " << dWidth << " x " << dHeight << std::endl;

    cv::namedWindow(window_name, cv::WINDOW_AUTOSIZE);

    cv::Mat frame;
    for (;;) {
        cap >> frame; // get a new frame from the camera

        // cv::imshow throws on an empty image, so stop when the stream ends
        if (frame.empty()) {
            std::cout << "No more frames from the video camera" << std::endl;
            break;
        }

        cv::imshow(window_name, frame);

        // any key press ends the stream
        if (cv::waitKey(30) >= 0)
            break;
    }

    // the camera is deinitialized automatically in the VideoCapture destructor
}

In [None]:
// Load the Caffe face-detection network.
//
// input:  prototxt_path   - the .prototxt file defining the model
//                           architecture (the layers)
//         caffemodel_path - the .caffemodel file containing the weights for
//                           the actual layers
//         Both default to the files in the author's download folder, so
//         existing calls without arguments keep working; pass your own paths
//         on any other machine.
// output: the network as returned by cv::dnn::readNetFromCaffe
//
// About the default weights, res10_300x300_ssd_iter_140000.caffemodel:
// the model was created with the SSD framework using a ResNet-10 like
// architecture as a backbone. The channel count in the ResNet-10 convolution
// layers was significantly dropped (2x- or 4x- fewer channels). The model was
// trained in the Caffe framework on some huge and available online dataset.
cv::dnn::Net initialize_network(
    const cv::String& prototxt_path =
        "/Users/robert/Downloads/deep-learning-face-detection/deploy.prototxt.txt",
    const cv::String& caffemodel_path =
        "/Users/robert/Downloads/deep-learning-face-detection/res10_300x300_ssd_iter_140000.caffemodel") {

    // the face detector needs both the architecture and the weights
    CV_Assert(!prototxt_path.empty());
    CV_Assert(!caffemodel_path.empty());

    // argument order of readNetFromCaffe: architecture first, weights second
    return cv::dnn::readNetFromCaffe(prototxt_path, caffemodel_path);
}

In [None]:
// Load the two Haar cascades used for the classic (non-DNN) detection.
//
// input:  cascade       - receives the frontal-face cascade
//         nestedCascade - receives the eye cascade, applied inside each
//                         detected face
// output: 1 if both cascades were loaded, -1 otherwise (an error message is
//         written to std::cerr)
int initialize_cascade_classifier(
    cv::CascadeClassifier& cascade,
    cv::CascadeClassifier& nestedCascade) {

    const std::string cascadeName = "/Users/robert/Downloads/deep-learning-face-detection/haarcascade_frontalface_alt.xml";
    const std::string nestedCascadeName = "/Users/robert/Downloads/deep-learning-face-detection/haarcascade_eye_tree_eyeglasses.xml";

    try {
        if (!nestedCascade.load(cv::samples::findFile(nestedCascadeName))) {
            std::cerr << "ERROR: Could not load classifier cascade for nested objects" << std::endl;
            return -1;
        }

        if (!cascade.load(cv::samples::findFile(cascadeName))) {
            std::cerr << "ERROR: Could not load classifier cascade" << std::endl;
            return -1;
        }
    } catch (const cv::Exception& e) {
        // cv::samples::findFile raises cv::Exception for a file it cannot
        // find; report it through the same -1 contract the caller checks
        std::cerr << "ERROR: Could not load classifier cascade: " << e.what() << std::endl;
        return -1;
    }

    return 1;
}

In [None]:
// Detect eyes with the Haar cascade classifiers and return them as an STL
// vector of eye centres.
//
// Faces are searched first (on a downscaled, histogram-equalized grayscale
// copy of the image); eyes are then searched inside every detected face.
//
// input:  cascade       - face cascade
//         nestedCascade - eye cascade; if it is empty no eyes are searched
//         img           - BGR image to analyze. It is modified: every
//                         detected eye is drawn into it as a circle.
//         draw_face     - if true, every detected face is drawn into img too
// output: centres of all detected eyes in pixel coordinates of img; empty if
//         img is empty or nothing was found
std::vector<cv_circle> detect_eyes(
    cv::CascadeClassifier& cascade,
    cv::CascadeClassifier& nestedCascade,
    cv::Mat& img,
    bool draw_face = false) {

    std::vector<cv_circle> result;

    // cv::cvtColor asserts on an empty image (e.g. a zero-sized crop)
    if (img.empty())
        return result;

    // detection runs on an image shrunk by this factor; all coordinates are
    // multiplied by it again before they are drawn or returned
    const double scale = 1.3;

    // palette used to tell the detected faces apart
    const static cv::Scalar colors[] =
    {
        cv::Scalar(255,0,0),
        cv::Scalar(255,128,0),
        cv::Scalar(255,255,0),
        cv::Scalar(0,255,0),
        cv::Scalar(0,128,255),
        cv::Scalar(0,255,255),
        cv::Scalar(0,0,255),
        cv::Scalar(255,0,255)
    };
    const size_t color_count = sizeof(colors) / sizeof(colors[0]);

    cv::Mat gray, smallImg;
    cv::cvtColor(img, gray, cv::COLOR_BGR2GRAY);

    const double fx = 1 / scale;

    cv::resize(gray, smallImg, cv::Size(), fx, fx, cv::INTER_LINEAR_EXACT);
    cv::equalizeHist(smallImg, smallImg);

    std::vector<cv::Rect> faces;

    double t = (double)cv::getTickCount();
    cascade.detectMultiScale(smallImg, faces,
        1.1, 2, 0
        //|CASCADE_FIND_BIGGEST_OBJECT
        //|CASCADE_DO_ROUGH_SEARCH
        |cv::CASCADE_SCALE_IMAGE,
        cv::Size(30, 30) );

    t = (double)cv::getTickCount() - t;
    printf( "detection time = %g ms\n", t*1000/cv::getTickFrequency());

    for ( size_t i = 0; i < faces.size(); i++ )
    {
        const cv::Rect r = faces[i];
        const cv::Scalar color = colors[i % color_count];
        cv::Point center;
        int radius;

        if(draw_face) {
            // roughly square faces are drawn as circles, the rest as rectangles
            const double aspect_ratio = (double)r.width/r.height;
            if( 0.75 < aspect_ratio && aspect_ratio < 1.3 )
            {
                center.x = cvRound((r.x + r.width*0.5)*scale);
                center.y = cvRound((r.y + r.height*0.5)*scale);
                radius = cvRound((r.width + r.height)*0.25*scale);
                cv::circle( img, center, radius, color, 3, 8, 0 );
            }
            else
                cv::rectangle( img, cv::Point(cvRound(r.x*scale), cvRound(r.y*scale)),
                           cv::Point(cvRound((r.x + r.width-1)*scale), cvRound((r.y + r.height-1)*scale)),
                           color, 3, 8, 0);
        }

        if( nestedCascade.empty() )
            continue;

        // search for eyes only inside the detected face
        cv::Mat smallImgROI = smallImg(r);
        std::vector<cv::Rect> nestedObjects;
        nestedCascade.detectMultiScale( smallImgROI, nestedObjects,
            1.1, 2, 0
            //|CASCADE_FIND_BIGGEST_OBJECT
            //|CASCADE_DO_ROUGH_SEARCH
            //|CASCADE_DO_CANNY_PRUNING
            |cv::CASCADE_SCALE_IMAGE,
            cv::Size(30, 30) );

        for ( size_t j = 0; j < nestedObjects.size(); j++ ) {
            const cv::Rect nr = nestedObjects[j];
            // eye position is relative to the face, so add the face offset
            // before scaling back to coordinates of img
            center.x = cvRound((r.x + nr.x + nr.width*0.5)*scale);
            center.y = cvRound((r.y + nr.y + nr.height*0.5)*scale);
            radius = cvRound((nr.width + nr.height)*0.25*scale);
            cv::circle(img, center, radius, color, 3, 8, 0);

            result.push_back(cv_circle(center.x, center.y));
        }
    }

    return result;
}

In [None]:
// Draw a text label of the form "Face: <confidence>% | attention: <0|1>"
// into the frame: black text on a filled white box.
//
// input:  frame           - image to draw into (modified)
//         attention_given - attention flag, printed as 1 or 0
//         confidence      - detection confidence; printed as an integer
//                           percentage (fraction is truncated)
//         xLeftBottom, yLeftBottom
//                         - position of the text baseline origin
void add_label(cv::Mat& frame,
               bool attention_given,
               float confidence,
               int xLeftBottom,
               int yLeftBottom) {
    // assemble the complete label text in one go
    std::ostringstream text;
    text << "Face: " << static_cast<int>(confidence*100)
         << "% | attention: " << attention_given;
    const cv::String label(text.str());

    const int font_face = cv::FONT_HERSHEY_SIMPLEX;
    const double font_scale = 0.5;
    const cv::Point text_origin(xLeftBottom, yLeftBottom);

    int baseLine = 0;
    const cv::Size labelSize = cv::getTextSize(label, font_face, font_scale, 1, &baseLine);

    // white background: starts one text height above the baseline origin and
    // also covers the part below the baseline
    const cv::Point box_origin(xLeftBottom, yLeftBottom-labelSize.height);
    const cv::Size box_size(labelSize.width, labelSize.height + baseLine);
    cv::rectangle(frame, cv::Rect(box_origin, box_size), cv::Scalar(255,255,255), cv::FILLED);

    cv::putText(frame, label, text_origin, font_face, font_scale, cv::Scalar(0,0,0));
}

In [None]:
#define WEB_CAM 0
#define USB_CAM 17

// Run the face detection demo on the live stream of the camera selected by
// WEB_CAM.
//
// For every frame the Caffe DNN detects faces. Each face with a confidence
// above the minimum is framed, searched for eyes with the cascade
// classifiers and labelled with its confidence and the attention flag.
// With use_cascade_classifier_only set, only the cascade classifiers are
// used. The demo ends after a fixed number of frames, when the stream ends or
// when a key is pressed. Failures are reported on the console; nothing is
// returned.
void video_face_detect() {

    const bool use_cascade_classifier_only = false;

    cv::CascadeClassifier cascade;
    cv::CascadeClassifier nestedCascade;

    if(initialize_cascade_classifier(cascade, nestedCascade) == -1)
        return;

    cv::dnn::Net net = initialize_network();
    if(net.empty()) {
        std::cout << "Could not initialize network!" << std::endl;
        // an empty network cannot run a forward pass
        return;
    }

    // open the default video camera
    cv::VideoCapture cap(WEB_CAM);

    // if not successful, leave the function
    if (!cap.isOpened()) {
        std::cout << "Cannot open the video camera" << std::endl;
        return;
    }

    static const std::string window_name = "OpenCV/Caffe face detection demo";
    cv::namedWindow(window_name, 1);

    const int max_frames = 500;   // the demo stops after this many frames
    const int in_width = 300;     // network input size
    const int in_height = 300;
    const double in_scaling_factor = 1.0;
    // mean subtraction is used to help combat illumination changes
    const cv::Scalar mean_val(104.0, 177.0, 123.0);
    const double min_confidence = 0.5;
    const int roi_margin = 20;    // pixels added around a face before the eye search

    for(int k = 0; k < max_frames; ++k) {
        cv::Mat frame;
        cap >> frame; // get a new frame from the camera

        if(frame.empty()) {
            cv::waitKey();
            break;
        }

        if(use_cascade_classifier_only) {
            detect_eyes(cascade, nestedCascade, frame, true);
        }
        else {
            // process the image in the DNN algorithm:
            //  to send the image to the DNN we convert the OpenCV Mat structure to a DNN structure
            //  called a "blob". OpenCV uses the Mat class to hold the blob.
            cv::Mat input_blob;
            cv::dnn::blobFromImage(frame, input_blob, in_scaling_factor, cv::Size(in_width, in_height), mean_val, false, false);

            // now the frame is converted into a blob. let's feed it to the DNN and make detection using the forward function
            net.setInput(input_blob);
            cv::Mat detection = net.forward();
            // view the result as a 2D matrix with one detection per row
            cv::Mat detectionMat(detection.size[2], detection.size[3], CV_32F, detection.ptr<float>());

            const cv::Rect frame_bounds(0, 0, frame.cols, frame.rows);

            for(int i = 0; i < detectionMat.rows; ++i) {
                const float confidence = detectionMat.at<float>(i, 2);
                if(confidence <= min_confidence)
                    continue;

                std::cout << "confidence for detection #" << i << " = " << confidence << std::endl;

                // box corners are stored as fractions of the frame size
                const int xLeftBottom = static_cast<int>(detectionMat.at<float>(i, 3) * frame.cols);
                const int yLeftBottom = static_cast<int>(detectionMat.at<float>(i, 4) * frame.rows);
                const int xRightTop = static_cast<int>(detectionMat.at<float>(i, 5) * frame.cols);
                const int yRightTop = static_cast<int>(detectionMat.at<float>(i, 6) * frame.rows);

                const cv::Rect object(xLeftBottom,
                                      yLeftBottom,
                                      xRightTop - xLeftBottom,
                                      yRightTop - yLeftBottom);

                cv::rectangle(frame, object, cv::Scalar(0,255,0));

                // To reduce computational complexity the eye search is limited
                // to the detected face plus a margin. Intersecting with the
                // frame clamps the region on every side, so a face close to
                // the border is still cropped.
                const cv::Rect rect_roi =
                    cv::Rect(xLeftBottom - roi_margin,
                             yLeftBottom - roi_margin,
                             object.width + 2 * roi_margin,
                             object.height + 2 * roi_margin) & frame_bounds;

                std::cout << xLeftBottom << ", " << yLeftBottom << ", " << xRightTop-xLeftBottom << ", " << yRightTop-yLeftBottom << " / mat rows = " << frame.rows << ", frame.cols " << frame.cols << std::endl;

                std::vector<cv_circle> eyes;
                if(rect_roi.area() > 0) {
                    // crop shares its pixels with frame, so eye circles drawn
                    // by detect_eyes become visible in the frame
                    cv::Mat crop = frame(rect_roi);
                    eyes = detect_eyes(cascade, nestedCascade, crop, false);
                }

                // the eyes are reported relative to the crop, so the face is
                // expressed in the same coordinate system
                cv_rect rec(xLeftBottom - rect_roi.x,
                            yLeftBottom - rect_roi.y,
                            xRightTop - rect_roi.x,
                            yRightTop - rect_roi.y);
                const bool attention_given = analyze_attention(eyes, rec);
                std::cout << "XX2 attention_given = " << attention_given << std::endl;

                std::cout << std::endl;
                add_label(frame, attention_given, confidence, xLeftBottom, yLeftBottom);
            }
        }

        // show every frame, with or without detections, and leave the frame
        // loop (not only the detection loop) on any key press
        cv::imshow(window_name, frame);
        if(cv::waitKey(30) >= 0)
            break;
    }
}

In [None]:
// run the demo: opens the camera and shows the detection window
video_face_detect();

In [None]:
// interesting resources:

// Jupyter kernel for C++
// -> https://github.com/jupyter-xeus/xeus-cling

// various approaches for face detection
// -> https://medium.com/@walmaly/let-the-face-meets-machine-learning-8dd18ff96efd


// more information about Caffe, DNN, etc.

// When using OpenCV’s deep neural network module with Caffe models, you’ll need two sets of files:
// - The .prototxt file(s) which define the model architecture (i.e., the layers themselves)
// - The .caffemodel file which contains the weights for the actual layers

// OpenCV’s deep learning face detector is based on the Single Shot Detector (SSD) framework with a ResNet
//  base network (unlike other OpenCV SSDs that you may have seen which typically use MobileNet as the base network).

// Cascade classifier:
// https://docs.opencv.org/3.4.5/d4/d26/samples_2cpp_2facedetect_8cpp-example.html#a20

// https://books.google.com/books?id=ZVqWDwAAQBAJ&pg=PA437&lpg=PA437&dq=res10_300x300_ssd_iter_140000.caffemodel&source=bl&ots=CkV8wVZ1X0&sig=ACfU3U2xCG9kvGB2HWI0AG7tgwv85fSQ-g&hl=en&sa=X&ved=2ahUKEwjz1KnIoZvqAhUDYKwKHR8tB-4Q6AEwCXoECAsQAQ#v=onepage&q=res10_300x300_ssd_iter_140000.caffemodel&f=false

// mean subtraction:
// https://www.pyimagesearch.com/2017/11/06/deep-learning-opencvs-blobfromimage-works/
//  Mean subtraction is used to help combat illumination changes in the input images in our dataset. We can therefore view mean subtraction as a technique used to aid our Convolutional Neural Networks.

// general overview of ML and DNN: http://adilmoujahid.com/posts/2016/06/introduction-deep-learning-python-caffe/