In [1]:
import cv2 
import os
import numpy as np 
import torch
import torchvision
from torch.utils.data import DataLoader
from torchvision import models, datasets, transforms
from PIL import Image
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

import cv2
import numpy as np 
import argparse
import time

In [2]:
# --- Face detector -----------------------------------------------------------
# Haar cascade used by face(); the XML file is resolved relative to the
# current working directory.
faceCascade = cv2.CascadeClassifier('haarcascade_frontalface_default.xml')
if faceCascade.empty():
    # CascadeClassifier does not raise when the file is missing; fail here with
    # a readable message instead of an opaque error in detectMultiScale later.
    raise IOError("could not load 'haarcascade_frontalface_default.xml'")

# --- Text drawing defaults ---------------------------------------------------
font = cv2.FONT_HERSHEY_SIMPLEX
fontScale = 1
color = (255, 0, 0)
thickness = 2

# --- Mask classifier ---------------------------------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ResNet-34 backbone with a 2-way classification head.
maskModel = models.resnet34()
maskModel.fc = nn.Linear(in_features=512, out_features=2, bias=True)
# map_location lets a checkpoint that was saved on a GPU load on a CPU-only host.
maskModel.load_state_dict(torch.load('./model98.pth', map_location=device))
maskModel.to(device)
# Inference only: switch BatchNorm/Dropout layers to evaluation behaviour.
maskModel.eval()

# Pre-processing applied by maskDectector() before the classifier.
# NOTE(review): the std values here differ from transform_video below --
# confirm which set matches the statistics used during training.
train_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
])


# Alternative pipeline (resize to 256, centre crop 256); not referenced by the
# functions visible in this notebook.
transform_video = transforms.Compose([transforms.Resize(256),
    transforms.CenterCrop(256),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465),(0.247, 0.243, 0.261))])

In [3]:


def load_yolo():
    """Load the YOLOv3-tiny network and its class metadata from disk.

    Reads ``yolov3-tiny.weights``, ``yolov3-tiny.cfg`` and ``coco.names`` from
    the current working directory.

    Returns:
        tuple: ``(net, classes, colors, output_layers)`` where ``classes`` is
        the list of stripped lines of ``coco.names``, ``colors`` is a random
        ``(len(classes), 3)`` array with values in [0, 255) and
        ``output_layers`` lists the names of the network's unconnected output
        layers.
    """
    net = cv2.dnn.readNet("yolov3-tiny.weights", "yolov3-tiny.cfg")
    with open("coco.names", "r") as f:
        classes = [line.strip() for line in f]
    layers_names = net.getLayerNames()
    # Depending on the OpenCV build, getUnconnectedOutLayers() yields either an
    # Nx1 array or a flat 1-D array of 1-based layer ids. Flattening supports
    # both; the previous `i[0]` indexing crashed on the flat form.
    out_layer_ids = np.asarray(net.getUnconnectedOutLayers()).flatten()
    output_layers = [layers_names[int(layer_id) - 1] for layer_id in out_layer_ids]
    colors = np.random.uniform(0, 255, size=(len(classes), 3))
    return net, classes, colors, output_layers


def detect_objects(img, net, outputLayers):
    """Run one forward pass of ``net`` on ``img``.

    The image is packed into a 320x320 blob (pixel values scaled by 0.00392,
    zero mean subtraction, R/B channels swapped, no cropping), set as the
    network input, and forwarded through the layers named in ``outputLayers``.

    Returns:
        tuple: ``(blob, outputs)`` -- the blob that was fed to the network and
        whatever ``net.forward(outputLayers)`` returned.
    """
    network_input = cv2.dnn.blobFromImage(
        img,
        scalefactor=0.00392,
        size=(320, 320),
        mean=(0, 0, 0),
        swapRB=True,
        crop=False,
    )
    net.setInput(network_input)
    return network_input, net.forward(outputLayers)


def get_box_dimensions(outputs, height, width, conf_threshold=0.3):
    """Convert raw detection rows into pixel-space boxes.

    Each row of each array in ``outputs`` is read as
    ``[cx, cy, w, h, <ignored>, score_0, score_1, ...]``: the first four values
    are scaled by ``width``/``height`` and everything from index 5 onward is
    treated as per-class scores.

    Args:
        outputs: iterable of 2-D arrays of detection rows.
        height: frame height in pixels, used to scale ``cy`` and ``h``.
        width: frame width in pixels, used to scale ``cx`` and ``w``.
        conf_threshold: a row is kept only when its best class score is
            strictly greater than this value (default 0.3).

    Returns:
        tuple: ``(boxes, confs, class_ids)`` -- parallel lists holding
        ``[x, y, w, h]`` (top-left corner plus size, ints), the best score as a
        float, and the index of the best-scoring class.
    """
    boxes = []
    confs = []
    class_ids = []
    for output in outputs:
        for detect in output:
            scores = detect[5:]
            # No per-row printing here: this loop visits every candidate
            # detection on every frame, so debug output floods the notebook.
            class_id = int(np.argmax(scores))
            conf = scores[class_id]
            if conf > conf_threshold:
                center_x = int(detect[0] * width)
                center_y = int(detect[1] * height)
                w = int(detect[2] * width)
                h = int(detect[3] * height)
                # Shift from box centre to top-left corner; this can go
                # negative when the box overhangs the frame edge.
                x = int(center_x - w / 2)
                y = int(center_y - h / 2)
                boxes.append([x, y, w, h])
                confs.append(float(conf))
                class_ids.append(class_id)
    return boxes, confs, class_ids


def draw_labels(boxes, confs, colors, class_ids, classes, img):
    """Draw the "person" detections that survive NMS and return their boxes.

    Args:
        boxes: list of ``[x, y, w, h]`` boxes.
        confs: confidence for each box (parallel to ``boxes``).
        colors: per-class colour table, indexed by class id.
        class_ids: class index for each box (parallel to ``boxes``).
        classes: list of class names, indexed by class id.
        img: image that is drawn on in place.

    Returns:
        list: the ``[x, y, w, h]`` boxes labelled ``"person"`` that were kept by
        non-maximum suppression, in ascending box-index order.
    """
    people = []
    indexes = cv2.dnn.NMSBoxes(boxes, confs, 0.5, 0.4)
    # NMSBoxes may hand back an empty tuple, a flat array or an Nx1 array;
    # normalise to a sorted list of plain ints.
    kept = sorted(int(k) for k in np.asarray(indexes, dtype=int).flatten())
    font = cv2.FONT_HERSHEY_PLAIN
    for i in kept:
        class_id = class_ids[i]
        label = str(classes[class_id])
        if label != "person":
            continue
        x, y, w, h = boxes[i]
        # The colour table has one row per class, so it must be indexed by the
        # class id; indexing by box position overran it whenever there were
        # more boxes than classes.
        color = tuple(float(c) for c in colors[class_id])
        people.append([x, y, w, h])
        cv2.rectangle(img, (x, y), (x + w, y + h), color, 2)
        cv2.putText(img, label, (x, y - 5), font, 1, color, 1)
    return people


# Load the YOLO network, class names, colour table and output-layer names once,
# when this cell runs; humanDetection() below reads these module-level names on
# every call instead of reloading the model.
yolo_model, yolo_classes, yolo_colors, yolo_output_layers = load_yolo()

def humanDetection(frame):
    """Detect people in ``frame`` with the module-level YOLO network.

    ``frame`` must expose a three-element ``shape`` (height, width, channels).
    The frame goes through detect_objects(), get_box_dimensions() and
    draw_labels(), the last of which also draws on the frame.

    Returns:
        tuple: ``(found, people)`` where ``people`` is the list returned by
        draw_labels() and ``found`` is False exactly when that list is empty.
    """
    height, width, _ = frame.shape
    _, raw_outputs = detect_objects(frame, yolo_model, yolo_output_layers)
    boxes, confs, class_ids = get_box_dimensions(raw_outputs, height, width)
    people = draw_labels(boxes, confs, yolo_colors, class_ids, yolo_classes, frame)

    found = people != []
    return found, people


In [4]:

# Class-name table for the torchvision detector. "person" sits at index 1,
# which is the label value predict() filters on (`labels == 1`).
# NOTE(review): presumably the tuple is meant to be indexed by the detector's
# integer label ids, with the "N/A" entries acting as placeholders that keep
# the positions aligned -- confirm against the torchvision COCO label map.
# Nothing else visible in this notebook reads this tuple.
COCO_INSTANCE_CATEGORY_NAMES = (
    "__background__",
    "person",
    "bicycle",
    "car",
    "motorcycle",
    "airplane",
    "bus",
    "train",
    "truck",
    "boat",
    "traffic light",
    "fire hydrant",
    "N/A",
    "stop sign",
    "parking meter",
    "bench",
    "bird",
    "cat",
    "dog",
    "horse",
    "sheep",
    "cow",
    "elephant",
    "bear",
    "zebra",
    "giraffe",
    "N/A",
    "backpack",
    "umbrella",
    "N/A",
    "N/A",
    "handbag",
    "tie",
    "suitcase",
    "frisbee",
    "skis",
    "snowboard",
    "sports ball",
    "kite",
    "baseball bat",
    "baseball glove",
    "skateboard",
    "surfboard",
    "tennis racket",
    "bottle",
    "N/A",
    "wine glass",
    "cup",
    "fork",
    "knife",
    "spoon",
    "bowl",
    "banana",
    "apple",
    "sandwich",
    "orange",
    "broccoli",
    "carrot",
    "hot dog",
    "pizza",
    "donut",
    "cake",
    "chair",
    "couch",
    "potted plant",
    "bed",
    "N/A",
    "dining table",
    "N/A",
    "N/A",
    "toilet",
    "N/A",
    "tv",
    "laptop",
    "mouse",
    "remote",
    "keyboard",
    "cell phone",
    "microwave",
    "oven",
    "toaster",
    "sink",
    "refrigerator",
    "N/A",
    "book",
    "clock",
    "vase",
    "scissors",
    "teddy bear",
    "hair drier",
    "toothbrush",
)


# Pre-processing for the person detector: the image is only converted to a
# tensor, with no resizing or normalisation at this stage.
transform = transforms.Compose([transforms.ToTensor()])

# Forwarded to the detector constructor as its `min_size` argument.
MIN_SIZE = 800

# Prefer the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Faster R-CNN (ResNet-50 FPN backbone) with pretrained weights, downloaded on
# first use or loaded from the local cache.
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(
    pretrained=True,
    min_size=MIN_SIZE,
)


def predict(image, detection_threshold):
    """Return the boxes of confidently detected persons in ``image``.

    The image is converted with the module-level ``transform``, moved to
    ``device``, given a batch dimension and passed through the module-level
    ``model``. Only detections whose label equals 1 (person) and whose score is
    at least ``detection_threshold`` are kept.

    Args:
        image: input accepted by ``transform`` (run() passes a PIL image).
        detection_threshold: minimum score, inclusive.

    Returns:
        numpy.ndarray: int32 array with one row per kept detection, taken from
        the model's ``"boxes"`` output. torchvision documents these as corner
        coordinates ``(x1, y1, x2, y2)``, not ``(x, y, w, h)``.

    Note:
        The function reads ``outputs[0]`` as a dict of predictions, so the
        model is presumably expected to already be in eval mode (run() does
        this) -- confirm if calling predict() directly.
    """
    batch = transform(image).to(device).unsqueeze(0)  # add a batch dimension
    # Pure inference: disabling autograd avoids building a graph per frame.
    with torch.no_grad():
        outputs = model(batch)
    detection = outputs[0]
    labels = detection["labels"].cpu().numpy()
    scores = detection["scores"].detach().cpu().numpy()
    bboxes = detection["boxes"].detach().cpu().numpy()
    # Keep class "person" (label 1) at or above the score threshold.
    keep = (labels == 1) & (scores >= detection_threshold)
    return bboxes[keep].astype(np.int32)



In [5]:
def draw_boxes(box, image, color, message):
    """Draw a labelled rectangle on ``image`` and return the drawn array.

    Args:
        box: indexable with at least four entries; read as
            ``(x, y, width, height)`` because the far corner is computed as
            ``(x + box[2], y + box[3])``.
        image: anything ``np.asarray`` accepts; an ndarray is drawn on in place.
        color: colour passed unchanged to the OpenCV drawing calls.
        message: text written 5 pixels above the box's top-left corner.

    Returns:
        The array view of ``image`` that was drawn on.
    """
    canvas = np.asarray(image)
    left, top = int(box[0]), int(box[1])
    right = int(box[2]) + left
    bottom = int(box[3]) + top
    cv2.rectangle(canvas, (left, top), (right, bottom), color, 4)
    label_origin = (left, int(box[1] - 5))
    cv2.putText(canvas, message, label_origin, cv2.FONT_HERSHEY_SIMPLEX, 0.8, color, 2)
    return canvas


def run(input_image):
    """Detect persons in an OpenCV frame with the torchvision detector.

    Args:
        input_image: H x W x 3 array in OpenCV's BGR channel order (the capture
            loop passes frames straight from ``cv2.VideoCapture``).

    Returns:
        The array produced by ``predict(image, 0.85)``: one int32 row per
        person detected with a score of at least 0.85.
    """
    # OpenCV delivers BGR, but the array was previously labelled 'RGB' as-is,
    # so the detector saw red and blue swapped. Reverse the channel axis first.
    rgb = np.ascontiguousarray(input_image.astype('uint8')[:, :, ::-1])
    image = Image.fromarray(rgb, 'RGB')
    # Idempotent: keeps the detector in eval mode and on the right device.
    model.eval().to(device)
    boxes = predict(image, 0.85)
    return boxes

In [11]:
def crop(image , x, y, h, w):
    """Return the ``h`` x ``w`` window of ``image`` whose top-left corner is (x, y).

    Note the argument order: height comes before width.

    Coordinates that fall before the start of the image are clamped to 0.
    Without the clamp a negative start index would be interpreted by Python
    slicing as "counted from the end" and silently return the wrong region.
    Windows that extend past the far edges are truncated by normal slicing.
    """
    top, left = max(y, 0), max(x, 0)
    bottom, right = max(y + h, 0), max(x + w, 0)
    return image[top:bottom, left:right]


def face(image):
    """Detect frontal faces in a BGR image with the module-level Haar cascade.

    Args:
        image: BGR image array (it is converted with ``COLOR_BGR2GRAY``).

    Returns:
        tuple: ``(found, faces)``. ``faces`` is the result of
        ``faceCascade.detectMultiScale`` -- the capture loop unpacks each entry
        as ``(x, y, w, h)`` relative to ``image`` -- and ``found`` is True when
        at least one face was returned. An empty or missing image yields
        ``(False, ())`` without touching OpenCV.
    """
    # A zero-sized crop would make cvtColor raise, so bail out early.
    if image is None or image.size == 0:
        return False, ()

    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    faces = faceCascade.detectMultiScale(
        gray,
        scaleFactor=1.1,
        minNeighbors=4
    )
    # The former per-face crop loop only rebound a local that was never
    # returned; whether any face was found is all the caller needs.
    return len(faces) > 0, faces

def humanDetector(image):
    """Run the person detector on ``image`` and report whether anyone was found.

    Returns:
        tuple: ``(found, boxes)`` where ``boxes`` is whatever run() returned and
        ``found`` is True when it contains at least one box.
    """
    box = run(image)
    # run() returns a NumPy array; comparing an array with `()` is an
    # element-wise operation, not an emptiness test, so count the rows instead.
    return len(box) > 0, box

def maskDectector(image):
    """Classify an image crop with the module-level mask model.

    Args:
        image: H x W x 3 array; it is cast to uint8, wrapped as an 'RGB' PIL
            image and pre-processed with ``train_transform``.

    Returns:
        bool: True when the model's highest-scoring class index is 0, False
        otherwise. The capture loop treats True as "mask on".

    NOTE(review): the capture loop passes OpenCV (BGR) crops, yet the array is
    labelled 'RGB' here without a channel swap. Whether that matches how the
    checkpoint was trained cannot be told from this notebook -- confirm.
    """
    pil_image = Image.fromarray(image.astype('uint8'), 'RGB')
    batch = train_transform(pil_image).to(device).unsqueeze(0)
    # Inference only: evaluation-mode BatchNorm/Dropout and no autograd graph.
    maskModel.eval()
    with torch.no_grad():
        output = maskModel(batch)
    _, pred = torch.max(output, 1)
    return int(pred.item()) == 0



In [13]:
# Live demo: read webcam frames, find people, find faces inside each person
# crop, classify every face as "mask on" / "mask off" and display the result.
# Press "q" in the preview window to stop.
cap = cv2.VideoCapture(0)

try:
    while True:
        ret, frame = cap.read()
        if not ret or frame is None:
            # Camera unavailable or stream ended: there is no frame to process.
            break

        isHuman, humanBoxes = humanDetector(frame)
        if isHuman:
            # The detector's boxes are corner coordinates (x1, y1, x2, y2),
            # so width/height have to be derived before cropping.
            for (x1, y1, x2, y2) in humanBoxes:
                human = crop(frame, x1, y1, y2 - y1, x2 - x1)
                if human.size == 0:
                    continue

                isFace, faces = face(human)
                if not isFace:
                    continue

                # Haar faces are (x, y, w, h) relative to the person crop.
                for (xf, yf, wf, hf) in faces:
                    faceHuman = crop(human, xf, yf, hf, wf)
                    if faceHuman.size == 0:
                        continue
                    # Classify the face crop, not the whole person crop.
                    isMask = maskDectector(faceHuman)
                    # Translate the face box back into full-frame coordinates.
                    face_box = [x1 + xf, y1 + yf, wf, hf]
                    if isMask:
                        frame = draw_boxes(face_box, frame, (0, 255, 0), "mask on")
                    else:
                        frame = draw_boxes(face_box, frame, (0, 0, 255), "mask off")

        cv2.imshow("capture", frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Always free the camera and close the window, even if a frame raised.
    cap.release()
    cv2.destroyAllWindows()



In [8]:

    
# isHuman = True
# isFace  = False
# isMask  = False

# frame = cv2.imread("/home/woolllff/Desktop/vr/miniproj/data/mask/augmented_image_231.jpg")

# isHuman, humanBoxes= humanDetector(frame)
# if isHuman:
#     for [x,y,w,h] in humanBoxes:
#         human = crop(frame,x,y,h,w)
#         isFace , faces = face(human)
#         if isFace :
#             for (xf,yf,wf,hf) in faces:
#                 faceHuman = crop(human,xf,yf,hf,wf)
#                 isMask = maskDectector(faceHuman)
#                 if isMask:
#                     frame = draw_boxes([x+xf,y+yf,wf,hf], frame,(0,255,0), "mask on")
                 
#                 else:
#                     frame = draw_boxes([x+xf,y+yf,wf,hf], frame,(0,0,255), "mask off")
               


# cv2.imshow("capture", frame)
# cv2.waitKey(0)