In [1]:
from numpy import expand_dims
from keras.models import load_model
from keras_preprocessing.image import load_img
from keras_preprocessing.image import img_to_array   

from keras import losses 
from keras import optimizers 
from keras import metrics 


import numpy as np
from numpy import expand_dims
from keras.models import load_model
from keras_preprocessing.image import load_img
from keras_preprocessing.image import img_to_array
from matplotlib import pyplot
from matplotlib.patches import Rectangle

from numpy import expand_dims
from keras.models import load_model
from keras_preprocessing.image import load_img
from keras_preprocessing.image import img_to_array


from PIL import Image, ImageFont, ImageDraw 
import cv2
import pyttsx3 as p 
from threading import Thread


In [2]:
class BoundBox:
	def __init__(self, xmin, ymin, xmax, ymax, objness = None, classes = None):
		self.xmin = xmin
		self.ymin = ymin
		self.xmax = xmax
		self.ymax = ymax
		self.objness = objness
		self.classes = classes
		self.label = -1
		self.score = -1

	def get_label(self):
		if self.label == -1:
			self.label = np.argmax(self.classes)

		return self.label

	def get_score(self):
		if self.score == -1:
			self.score = self.classes[self.get_label()]
		return self.score
    
def _sigmoid(x):
	return 1. / (1. + np.exp(-x))
    
    

def decode_netout(netout, anchors, obj_thresh, net_h, net_w):
	grid_h, grid_w = netout.shape[:2] 
	nb_box = 3 
	netout = netout.reshape((grid_h, grid_w, nb_box, -1)) 
	nb_class = netout.shape[-1] - 5
	boxes = []
	netout[..., :2]  = _sigmoid(netout[..., :2])
	netout[..., 4:]  = _sigmoid(netout[..., 4:])
	netout[..., 5:]  = netout[..., 4][..., np.newaxis] * netout[..., 5:]
	netout[..., 5:] *= netout[..., 5:] > obj_thresh
	for i in range(grid_h*grid_w):
		row = i / grid_w
		col = i % grid_w
		for b in range(nb_box):
			objectness = netout[int(row)][int(col)][b][4]
			if(objectness.all() <= obj_thresh): continue
			x, y, w, h = netout[int(row)][int(col)][b][:4]
			x = (col + x) / grid_w 
			y = (row + y) / grid_h 
			w = anchors[2 * b + 0] * np.exp(w) / net_w 
			h = anchors[2 * b + 1] * np.exp(h) / net_h 
			classes = netout[int(row)][col][b][5:]
			box = BoundBox(x-w/2, y-h/2, x+w/2, y+h/2, objectness, classes)
			boxes.append(box)
	return boxes    

def correct_yolo_boxes(boxes, image_h, image_w, net_h, net_w):
	new_w, new_h = net_w, net_h
	for i in range(len(boxes)):
		x_offset, x_scale = (net_w - new_w)/2./net_w, float(new_w)/net_w
		y_offset, y_scale = (net_h - new_h)/2./net_h, float(new_h)/net_h
		boxes[i].xmin = int((boxes[i].xmin - x_offset) / x_scale * image_w)
		boxes[i].xmax = int((boxes[i].xmax - x_offset) / x_scale * image_w)
		boxes[i].ymin = int((boxes[i].ymin - y_offset) / y_scale * image_h)
		boxes[i].ymax = int((boxes[i].ymax - y_offset) / y_scale * image_h)
    
    

def _interval_overlap(interval_a, interval_b):
	x1, x2 = interval_a
	x3, x4 = interval_b
	if x3 < x1:
		if x4 < x1:
			return 0
		else:
			return min(x2,x4) - x1
	else:
		if x2 < x3:
			 return 0
		else:
			return min(x2,x4) - x3 


def bbox_iou(box1, box2):
	intersect_w = _interval_overlap([box1.xmin, box1.xmax], [box2.xmin, box2.xmax])
	intersect_h = _interval_overlap([box1.ymin, box1.ymax], [box2.ymin, box2.ymax])
	intersect = intersect_w * intersect_h
    
    
	w1, h1 = box1.xmax-box1.xmin, box1.ymax-box1.ymin  
	w2, h2 = box2.xmax-box2.xmin, box2.ymax-box2.ymin
	union = w1*h1 + w2*h2 - intersect
	return float(intersect) / union

def do_nms(boxes, nms_thresh):   
	if len(boxes) > 0:
		nb_class = len(boxes[0].classes)
	else:
		return
	for c in range(nb_class):
		sorted_indices = np.argsort([-box.classes[c] for box in boxes])
		for i in range(len(sorted_indices)):
			index_i = sorted_indices[i]
			if boxes[index_i].classes[c] == 0: continue
			for j in range(i+1, len(sorted_indices)):
				index_j = sorted_indices[j]
				if bbox_iou(boxes[index_i], boxes[index_j]) >= nms_thresh:
					boxes[index_j].classes[c] = 0 
    

# load and prepare an image
def load_image_pixels(filename, shape):
	image = load_img(filename)
	width, height = image.size
	image = load_img(filename, target_size=shape) 
	image = img_to_array(image)
	image = image.astype('float32')
	image /= 255.0 
	image = expand_dims(image, 0)
	return image, width, height
    
    

def get_boxes(boxes, labels, thresh):
	v_boxes, v_labels, v_scores = list(), list(), list()
	for box in boxes:
		for i in range(len(labels)):
			if box.classes[i] > thresh:
				v_boxes.append(box)
				v_labels.append(labels[i])
				v_scores.append(box.classes[i]*100)
	return v_boxes, v_labels, v_scores 
    
    

# -------------------------------------------- distance of object  ---------------------------------------------- 

def dis(real_width,ref_width,width):
    known_dis = 45 
    focal_length = (known_dis * real_width ) / ref_width
    distance = (focal_length * real_width ) / width
    return distance   





# -------------------------------------------- voice output ---------------------------------------------- 

def speak(txt):
    speak = p.init()
    speak.say(txt)
    speak.runAndWait()  

def talk(view):
    for k,v in view.items():
        if len(v) != 0:
            for k1,v1 in v.items():
                if k == 'front':
                    if len(v1) == 1: 
                        txt = 'one ' + k1 + ' are in ' + k + ' of you ' + str(min(v1)) + ' meters'
                        speak(txt)  
                    else:
                        txt = str(len(v1)) + ' ' + k1 + 's are in ' + k + ' of you '  + str(min(v1)) + ' meters'
                        speak(txt) 
                else:
                    if len(v1) == 1: 
                        txt = 'one ' + k1 + ' is in ' + k + ' side'+ ' from you '  + str(min(v1)) + ' meters'
                        speak(txt)
                    else:
                        txt = str(len(v1)) + ' ' + k1 + 's are in ' + k  + ' side'+ ' from you '  + str(min(v1)) + ' meters'
                        speak(txt) 


# --------------------------------------- direction dictionary ------------------------------------------

def check(x,y,w,view,display,label,real_width,ref_width):
    mid_point = x + w // 2
    if (x + w) < display['left'][1] :
        if label not in view['left'].keys(): 
            view['left'][label] = [] 
        view['left'][label].append(round(dis(real_width[label],ref_width[label],w)*10,2)) 
        return 
    if x > display['right'][0]:  
        if label not in view['right'].keys(): 
            view['right'][label] = []
        view['right'][label].append(round(dis(real_width[label],ref_width[label],w)*10,2))
        return 
    for k,v in display.items():
        if v[0] <= mid_point and mid_point <= v[1]:
            if label not in view[k].keys(): 
                view[k][label] = []
            view[k][label].append(round(dis(real_width[label],ref_width[label],w)*10,2))
            return 
    if mid_point < display['left'][0]:
        if label not in view['left'].keys(): 
            view['left'][label] = []
        view['left'][label].append(round(dis(real_width[label],ref_width[label],w)*10,2))
        return 
    if label not in view['right'].keys(): 
        view['right'][label] = []
    view['right'][label].append(round(dis(real_width[label],ref_width[label],w)*10,2)) 
    return 
    


# --------------------------------------- distance estimation varaibles ---------------------------------------

p2 = ''

real_width = {"person" : 16, "bicycle" : 66, "car" : 109, "motorbike" : 57, "aeroplane" : 590, "bus": 472 , "train": 529 , "truck" : 496,
        "boat": 120 , "traffic light" : 11, "fire hydrant": 12  , "stop sign" : 20 , "parking meter": 177 , "bench":48 ,
        "bird":9 , "cat":15 , "dog":25 , "horse": 35 , "sheep": 27 , "cow": 35 , "elephant": 145 , "bear": 96, "zebra":96 , "giraffe":192 ,
        "backpack":12 , "umbrella":51 , "handbag":9 , "tie": 3 , "suitcase":20 , "frisbee":3937  , "skis":70 , "snowboard":63 ,
        "sports ball":9 , "kite":9 , "baseball bat":34 , "baseball glove":13 , "skateboard":9 , "surfboard":77 ,
        "tennis racket":27 , "bottle":3 , "wine glass":3 , "cup":4 , "fork":6 , "knife":8 , "spoon":6 , "bowl":5 , "banana":6 ,
        "apple":3 , "sandwich":4 , "orange":3 , "broccoli":4 , "carrot":5 , "hot dog":6 , "pizza":8 , "donut":5 , "cake":10 ,
        "chair":17 , "sofa":77 , "pottedplant":9 , "bed":85 , "diningtable":50 , "toilet":16 , "tvmonitor":32 , "laptop":15 , "mouse":4 ,
        "remote":6 , "keyboard":15 , "cell phone":5 , "microwave":14 , "oven":17 , "toaster":8 , "sink":19 , "refrigerator":23 ,
        "book":11 , "clock":12 , "vase":9 , "scissors":7 , "teddy bear":15 , "hair drier":12 , "toothbrush":7 }  

ref_width =   {'person': 179, 'bicycle': 387, 'car': 334, 'motorbike': 395, 'aeroplane': 296, 'bus': 283, 'train': 290, 'truck': 329, 'boat': 230,
           'traffic light': 349, 'fire hydrant': 389, 'stop sign': 191, 'parking meter': 382, 'bench': 342, 'bird': 315, 'cat': 341
           , 'dog': 242, 'horse': 386, 'sheep': 377, 'cow': 339, 'elephant': 329, 'bear': 270, 'zebra': 385,
           'giraffe': 211, 'backpack': 335, 'umbrella': 387, 'handbag': 285, 'tie': 374, 
           'suitcase': 170, 'frisbee': 303, 'skis': 174, 'snowboard': 128, 'sports ball': 361, 'kite': 338, 'baseball bat': 300,
           'baseball glove': 160, 'skateboard': 378, 'surfboard': 229, 'tennis racket': 417, 'bottle': 70, 'wine glass': 256,
           'cup': 255, 'fork': 315, 'knife': 395, 'spoon': 379, 'bowl': 376, 'banana': 356, 'apple': 272, 'sandwich': 322,
           'orange': 295, 'broccoli': 261, 'carrot': 243, 'hot dog': 379, 'pizza': 388, 'donut': 342, 'cake': 378, 
           'chair': 258, 'sofa': 336, 'pottedplant': 407, 'bed': 394, 'diningtable': 350, 'toilet': 292, 
           'tvmonitor': 375, 'laptop': 378, 'mouse': 275, 'remote': 434, 'keyboard': 180, 'cell phone': 256, 
           'microwave': 367, 'oven': 417, 'toaster': 136, 'sink': 120, 'refrigerator': 257, 'book': 462, 
           'clock': 299, 'vase': 157, 'scissors': 379, 'teddy bear': 273, 'hair drier': 109, 'toothbrush': 412}


# --------------------------------------- drawing rectangle in cv2 ---------------------------------------
    
def cvdraw(photo_filename, v_boxes, v_labels, v_scores):
    img=cv2.imread(photo_filename) 
    direction = ''
    height,width,channels = img.shape
    image_size = width
    ratio = '1:2:1'
    ratio_list = list(map(int,ratio.split(':'))) 
    parts = sum(ratio_list)
    one_part = image_size // parts 
    l = ratio_list[0] * one_part 
    f = ratio_list[1] * one_part 
    r =  ratio_list[2] * one_part  
    display = {
    'left'  : [0 , l],
    'front' : [l + 1 , f + l] , 
    'right' : [f + l + 1 , image_size] 
    } 
    view = {
        'left' :dict(),
        'front':dict(),
        'right':dict()
    }
    count = 0 
    op = []
    for i in range(len(v_boxes)):
        count += 1 
        font = cv2.FONT_HERSHEY_SIMPLEX
        org = (50, 50)
        fontScale = 0.4
        color = (0, 0, 0) 
        thickness = 1
        x = v_boxes[i].xmin 
        y = v_boxes[i].ymin
        w = v_boxes[i].xmax
        h = v_boxes[i].ymax  
        op.append([x,y,w])
        
        check(x,y,w,view,display,v_labels[i],real_width,ref_width)  
        
        confi = str(round(v_scores[i],2)/100) 
        confi = confi[0:4]
        text_size, _ = cv2.getTextSize(v_labels[i] + '  ' +confi, font, fontScale,thickness)
        text_w, text_h = text_size
        cv2.rectangle(img, (x,y) , (x + text_w + 5, y - text_h - 5), (0,255,0) , -1)
        cv2.rectangle(img, (x, y), (w, h),color=(0, 255, 0), thickness=2)
        cv2.putText(img, v_labels[i] + '  ' +confi,(x+2,y-2), font, fontScale, color, thickness, cv2.LINE_AA)
    for k,v in view.items():
        for k1,v1 in v.items():
            print(k,k1,v1)
    print("total detections : ",count) 
    print(op)   
#     print('width : ',w)
    cv2.imshow('img',img)
#     cv2.waitKey(0)
#     cv2.destroyAllWindows() 
#     talk(view) 
    if len(view) != 0:
        p2 = Thread(target = talk , args = (view,))
        p2.start()
    else:
        talk('You can walk freely now') 
#     if cv2.waitKey(48) == ord('q'):
#     cv2.waitKey(0)   
#     cv2.destroyAllWindows() 
#     return w 

# --------------------------------------- drawing rectangle in pillow --------------------------------------- 
# def cvdraw(photo_filename, v_boxes, v_labels, v_scores):
#     image = Image.open(photo_filename)
#     for i in range(len(v_boxes)):
#         x = v_boxes[i].xmin
#         y = v_boxes[i].ymin
#         w = v_boxes[i].xmax
#         h = v_boxes[i].ymax 
#         draw = ImageDraw.Draw(image) 
#         left = (x,y)
#         right = (w,h) 
#         position = (x+3,y+3)
#         text = v_labels[i] + '  ' +str(round(v_scores[i],2) )
#         bbox = draw.textbbox(position, text ) 
#         draw.rectangle(bbox, fill=(0,255,0))
#         draw.text(position, text,  fill="black") 
#         draw.rectangle((left,right),outline=(0,255,0),width=3) 
#     image.show()
#     image.close() 
    
    

In [3]:
import cv2 
model = load_model('model.h5')
model.compile(loss = 'mean_squared_error',optimizer = 'sgd', metrics = [metrics.categorical_accuracy]) 
def image_detection(photo_filename): 
#     photo_filename = cv2.resize(photo_filename,(416,416))
    input_w, input_h = 416, 416   
    input_w, input_h = 416, 416 
    image, image_w, image_h = load_image_pixels(photo_filename, (input_w, input_h))
    yhat = model.predict(image) 
    anchors = [[116,90, 156,198, 373,326], [30,61, 62,45, 59,119], [10,13, 16,30, 33,23]]  
    class_threshold = 0.6
    boxes = list()
    for i in range(len(yhat)):
        boxes += decode_netout(yhat[i][0], anchors[i], class_threshold, input_h, input_w)
    correct_yolo_boxes(boxes, image_h, image_w, input_h, input_w)
    do_nms(boxes, 0.5)   
    labels = ["person", "bicycle", "car", "motorbike", "aeroplane", "bus", "train", "truck",
        "boat", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench",
        "bird", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe",
        "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard",
        "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
        "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana",
        "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake",
        "chair", "sofa", "pottedplant", "bed", "diningtable", "toilet", "tvmonitor", "laptop", "mouse",
        "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator",
        "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush"]  
    v_boxes, v_labels, v_scores = get_boxes(boxes, labels, class_threshold)
    cvdraw(photo_filename, v_boxes, v_labels, v_scores)          




In [4]:
import os     
import cv2    
video = cv2.VideoCapture(0)
# video = cv2.VideoCapture('sample.mp4')    
count = 0 
while True:    
    ret , frame = video.read()
    if ret == True:  
        count += 1 
        if count % 48 == 0:  
            name = 'frame {} in {} sec.png'.format((count//5),count//32) 
            cv2.imwrite(name,frame)
            image_detection(name)  
#             p1 = Thread(target = image_detection , agrs = (name,) )
#             p1.start() 
    
            os.remove(name) 
    if cv2.waitKey(1) == ord('q'):
        break
video.release()  
cv2.destroyAllWindows()        

total detections :  0
[]
total detections :  0
[]
front person [1.07]
total detections :  1
[[53, 70, 600]]
front person [1.07]
total detections :  1
[[60, 81, 600]]


In [5]:
# image_detection('people.png')