In [2]:
from __future__ import division, print_function
from PIL import Image
from torchvision import datasets, models, transforms
import copy
import cv2
import dlib
import matplotlib.pyplot as plt
import numpy as np
import os
import time
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision

In [3]:
# Identifiers of the supported backbones.
INCEPTION = 'inception'
VGG19 = 'vgg-19'

# Selects which type of model the rest of the notebook uses.
MODEL = INCEPTION

# Input side length (in pixels) associated with the selected backbone.
IMG_SIZE = {INCEPTION: 299, VGG19: 224}[MODEL]

# Per-channel mean / std passed to transforms.Normalize.
NORMALIZE_MEAN = [0.485, 0.456, 0.406]
NORMALIZE_STD = [0.229, 0.224, 0.225]

In [17]:
# Label order: test_image maps the model's predicted class index to
# class_names[index].
class_names = ['mask', 'no-mask']

# Haar-cascade face detector loaded from an XML file given by a relative path.
# cv2.CascadeClassifier does not raise when the file cannot be read; the
# classifier just stays empty and only fails later inside detectMultiScale,
# so verify the load right away.
faceDetector = cv2.CascadeClassifier('haarcascade_frontalface_default.xml')
if faceDetector.empty():
    raise IOError("Failed to load face cascade 'haarcascade_frontalface_default.xml'")

# Pre-processing shared by both image loaders: resize, convert to a tensor,
# then normalize each channel.
# NOTE(review): with an int argument, Resize scales the shorter side to
# IMG_SIZE and keeps the aspect ratio, so non-square inputs stay non-square —
# confirm this is intended.
transform = transforms.Compose([
    transforms.Resize(IMG_SIZE),
    transforms.ToTensor(),
    transforms.Normalize(NORMALIZE_MEAN, NORMALIZE_STD),
])

def image_loader(image_name):
    """Load an image file and turn it into a single-image batch.

    Args:
        image_name: Path (or file object) accepted by ``PIL.Image.open``.

    Returns:
        The float tensor produced by ``transform`` with a leading batch
        dimension of size 1 added.
    """
    # The context manager closes the underlying file once the pixels are read.
    with Image.open(image_name) as image:
        # Force three channels: Normalize is configured with 3-channel
        # statistics, so RGBA / grayscale / palette files would otherwise fail.
        image = image.convert('RGB')
    image = transform(image).float()
    return image.unsqueeze(0)

def image_loader_from_image(image):
    """Turn an in-memory image array into a single-image batch.

    Args:
        image: Array accepted by ``PIL.Image.fromarray``.

    Returns:
        The float tensor produced by ``transform`` with a leading batch
        dimension of size 1 added.
    """
    pil_image = Image.fromarray(image)
    tensor = transform(pil_image).float()
    return tensor.unsqueeze(0)

def get_face(frame):
    """Find the largest face in a frame and return its box and crop.

    Args:
        frame: Image array in BGR channel order (it is converted with
            ``cv2.COLOR_BGR2GRAY`` / ``cv2.COLOR_BGR2RGB``).

    Returns:
        An empty list when no face with a positive area is detected.
        Otherwise a two-item list ``[bbox, face]`` where ``bbox`` is
        ``[column, row, width, height]`` in ``frame`` pixel coordinates and
        ``face`` is the RGB crop of that box resized to
        ``(IMG_SIZE, IMG_SIZE)``.
    """
    gray_image = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    detected_faces = faceDetector.detectMultiScale(gray_image)

    # Only remember the winning box while scanning; cropping and resizing
    # every intermediate candidate would be wasted work.
    best_bbox = None
    largest = 0.0
    for (column, row, width, height) in detected_faces:
        area = width * height
        # Strict comparison: on equal areas the first detection wins.
        if area > largest:
            largest = area
            best_bbox = [column, row, width, height]

    if best_bbox is None:
        return []

    column, row, width, height = best_bbox
    # The RGB conversion is only needed when there is a face to crop.
    original_image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    face = original_image[row : row + height, column : column + width]
    return [best_bbox, cv2.resize(face, (IMG_SIZE, IMG_SIZE))]

def test_image(image):
    """Classify one pre-processed image batch with the global ``model``.

    Args:
        image: Input tensor passed straight to ``model``; only the first
            prediction of the batch is reported.

    Returns:
        The entry of ``class_names`` selected by the highest-scoring output.
    """
    # Put the model in evaluation mode before every prediction.
    model.eval()

    # Gradients are not needed for prediction.
    with torch.no_grad():
        scores = model(image)
        _, predicted = torch.max(scores, 1)
        return class_names[predicted[0]]

# Display the label list; test_image indexes it with the predicted class id.
print(class_names)

['mask', 'no-mask']


In [5]:
# Instantiate the selected backbone (pretrained = False: no pretrained weights
# are requested here) and locate the layer that acts as its final classifier.
if MODEL == INCEPTION:
  model = models.inception_v3(pretrained = False, progress = True)
  original_head = model.fc
elif MODEL == VGG19:
  model = models.vgg19(pretrained = False, progress = True)
  original_head = model.classifier[6]
else:
  # Fail here instead of printing and then hitting a NameError on `model` below.
  raise ValueError('ERRO: Nenhum tipo de modelo definido!')

print(original_head)

# Freeze every existing parameter. The replacement head is created afterwards,
# so its parameters keep the default requires_grad = True.
for param in model.parameters():
  param.requires_grad = False

# Swap the final layer for one with a single output per entry in class_names.
num_features = original_head.in_features
new_head = nn.Linear(num_features, len(class_names))
if MODEL == INCEPTION:
  model.fc = new_head
else:
  model.classifier[6] = new_head

criterion = nn.CrossEntropyLoss()

print(model)

el_size=(1, 1), stride=(1, 1), bias=False)
      (bn): BatchNorm2d(48, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
    )
    (branch5x5_2): BasicConv2d(
      (conv): Conv2d(48, 64, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2), bias=False)
      (bn): BatchNorm2d(64, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
    )
    (branch3x3dbl_1): BasicConv2d(
      (conv): Conv2d(192, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn): BatchNorm2d(64, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
    )
    (branch3x3dbl_2): BasicConv2d(
      (conv): Conv2d(64, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn): BatchNorm2d(96, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
    )
    (branch3x3dbl_3): BasicConv2d(
      (conv): Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn): BatchNorm2d(96, eps=0.001, momentum=0.1, affine=True, track_run

In [6]:
# NOTE: torch.load unpickles the file, so only load checkpoints from a trusted source.
# map_location = 'cpu' lets a checkpoint that was saved from a GPU load on a
# CPU-only machine; no cell visible here moves the model to another device.
# NOTE(review): the file name is Inception-specific even though MODEL is
# configurable — confirm the checkpoint to use when MODEL is VGG19.
model.load_state_dict(torch.load('model-inception.pth', map_location = 'cpu'))

<All keys matched successfully>

In [7]:
# Sanity check: classify one image taken from the test set's 'mask' folder.
test_image(image_loader('./dataset/test/mask/Img1.png'))

Took 0m 3s to predict


'mask'

In [19]:
# Frame size and frame rate of the annotated output video.
OUTPUT_SIZE = (600, 600)
OUTPUT_FPS = 25

vcap = cv2.VideoCapture('./video.mp4')
video = cv2.VideoWriter('./video-inception.avi', cv2.VideoWriter_fourcc('M', 'J', 'P', 'G'), OUTPUT_FPS, OUTPUT_SIZE)
n_frames = 0
try:
  while True:
    ret, frame = vcap.read()
    if not ret:
      # read() reports failure when no further frame can be grabbed.
      print('Frame is empty!')
      break

    # Detection and drawing both happen on the IMG_SIZE x IMG_SIZE copy, so
    # the returned box coordinates apply to it directly.
    resized_frame = cv2.resize(frame, (IMG_SIZE, IMG_SIZE))
    output = get_face(resized_frame)

    # Frames without a detected face are written unannotated.
    if len(output) > 0:
      bbox, face = output
      label = test_image(image_loader_from_image(face))
      cv2.rectangle(
        resized_frame,
        (bbox[0], bbox[1]),
        (bbox[0] + bbox[2], bbox[1] + bbox[3]),
        (0, 255, 0),
        2
      )
      # Put the predicted label just above the box.
      cv2.putText(resized_frame, label, (bbox[0], bbox[1] - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)

    # Scale to the frame size the writer was opened with.
    frame = cv2.resize(resized_frame, OUTPUT_SIZE)
    video.write(frame)
    n_frames += 1
    print('Frame: {}'.format(n_frames))
finally:
  # Release the reader and the writer even if a frame fails, so the output
  # file is finalized and the input handle is not leaked.
  vcap.release()
  video.release()
  cv2.destroyAllWindows()

Frame: 1
Frame: 2
Frame: 3
Frame: 4
Frame: 5
Frame: 6
Frame: 7
Frame: 8
Frame: 9
Frame: 10
Frame: 11
Frame: 12
Frame: 13
Frame: 14
Frame: 15
Frame: 16
Frame: 17
Frame: 18
Frame: 19
Frame: 20
Frame: 21
Frame: 22
Frame: 23
Frame: 24
Frame: 25
Frame: 26
Frame: 27
Frame: 28
Frame: 29
Frame: 30
Frame: 31
Frame: 32
Frame: 33
Frame: 34
Frame: 35
Frame: 36
Frame: 37
Frame: 38
Frame: 39
Frame: 40
Frame: 41
Frame: 42
Frame: 43
Frame: 44
Frame: 45
Frame: 46
Frame: 47
Frame: 48
Frame: 49
Frame: 50
Frame: 51
Frame: 52
Frame: 53
Frame: 54
Frame: 55
Frame: 56
Frame: 57
Frame: 58
Frame: 59
Frame: 60
Frame: 61
Frame: 62
Frame: 63
Frame: 64
Frame: 65
Frame: 66
Frame: 67
Frame: 68
Frame: 69
Frame: 70
Frame: 71
Frame: 72
Frame: 73
Frame: 74
Frame: 75
Frame: 76
Frame: 77
Frame: 78
Frame: 79
Frame: 80
Frame: 81
Frame: 82
Frame: 83
Frame: 84
Frame: 85
Frame: 86
Frame: 87
Frame: 88
Frame: 89
Frame: 90
Frame: 91
Frame: 92
Frame: 93
Frame: 94
Frame: 95
Frame: 96
Frame: 97
Frame: 98
Frame: 99
Frame: 100
Frame: 1