

Let's see how to use the state of the art in object detection! Please make sure to watch the video, there is no code along here, since we can't reasonably train the YOLOv3 network ourself, instead we will use a pre-established version.

CODE SOURCE: https://github.com/xiaochus/YOLOv3

REFERENCE (for original YOLOv3): 

        @article{YOLOv3,  
              title={YOLOv3: An Incremental Improvement},  
              author={J Redmon, A Farhadi },
              year={2018} 
--------
----------
-------
-------

In [1]:
import os
import time
import cv2
import numpy as np
from model.yolo_model import YOLO
from time import sleep
from google.cloud import texttospeech
from google.cloud import texttospeech_v1
from playsound import playsound


Using TensorFlow backend.


In [2]:
def readyText(text):
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'C:/Users/Rikki/Downloads/Computer-Vision-with-Python/Computer-Vision-with-Python/DATA/blindapp-310321-10ccdaacf88e.json'
    client = texttospeech_v1.TextToSpeechClient()
    #text = 'Hi my name is Ricky'

    synthesis_input = texttospeech_v1.SynthesisInput(text=text)
    voice = texttospeech_v1.VoiceSelectionParams(
    language_code='en',
    ssml_gender=texttospeech_v1.SsmlVoiceGender.MALE
    #name="en-US-Wavenet-J",
    #language_codes='en-US'
    )    
    audio_config = texttospeech_v1.AudioConfig(
        audio_encoding = texttospeech_v1.AudioEncoding.MP3
    )

    response = client.synthesize_speech(
        input = synthesis_input,
        voice = voice,
        audio_config = audio_config
    )

    with open('theaudio.mp3', 'wb') as output:
        output.write(response.audio_content)

    playsound('C:/Users/Rikki/Downloads/Computer-Vision-with-Python/Computer-Vision-with-Python/06-Deep-Learning-Computer-Vision/06-YOLOv3/theaudio.mp3')
    os.remove('C:/Users/Rikki/Downloads/Computer-Vision-with-Python/Computer-Vision-with-Python/06-Deep-Learning-Computer-Vision/06-YOLOv3/theaudio.mp3')


In [3]:
def process_image(img):
    """Resize, reduce and expand image.

    # Argument:
        img: original image.

    # Returns
        image: ndarray(64, 64, 3), processed image.
    """
    image = cv2.resize(img, (416, 416),
                       interpolation=cv2.INTER_CUBIC)

                       
    image = np.array(image, dtype='float32')         
    image /= 255.
    image = np.expand_dims(image, axis=0)

    return image

In [4]:
def get_classes(file):
    """Get classes name.

    # Argument:
        file: classes name for database.

    # Returns
        class_names: List, classes name.

    """
    with open(file) as f:
        class_names = f.readlines()
    class_names = [c.strip() for c in class_names]

    return class_names

In [33]:
def draw(image, boxes, scores, classes, all_classes):
    """Draw the boxes on the image.

    # Argument:
        image: original image.
        boxes: ndarray, boxes of objects.
        classes: ndarray, classes of objects.
        scores: ndarray, scores of objects.
        all_classes: all classes name.
    """
    for box, score, cl in zip(boxes, scores, classes):
        x, y, w, h = box

        top = max(0, np.floor(x + 0.5).astype(int))
        left = max(0, np.floor(y + 0.5).astype(int))
        right = min(image.shape[1], np.floor(x + w + 0.5).astype(int))
        bottom = min(image.shape[0], np.floor(y + h + 0.5).astype(int))

        cv2.rectangle(image, (top, left), (right, bottom), (255, 0, 0), 2)
        cv2.putText(image, '{0} {1:.2f}'.format(all_classes[cl], score),
                    (top, left - 6),
                    cv2.FONT_HERSHEY_SIMPLEX,
                    0.6, (0, 0, 255), 1,
                    cv2.LINE_AA)

        print('class: {0}, score: {1:.2f}'.format(all_classes[cl], score))
        print('box coordinate x,y,w,h: {0}'.format(box))
        theClass = '{0}'.format(all_classes[cl])
        theRegion = findRegion(image, top, left, right, bottom)
        prox = findSize(image, top, left, right, bottom)
        final = 'There is a '+theClass+' in the '+theRegion+' that is '+prox
        print(final)
        print(findRegion(image, top, left, right, bottom))
        readyText(final)

    print()

In [6]:
def detect_image(image, yolo, all_classes):
    """Use yolo v3 to detect images.

    # Argument:
        image: original image.
        yolo: YOLO, yolo model.
        all_classes: all classes name.

    # Returns:
        image: processed image.
    """
    pimage = process_image(image)

    start = time.time()
    boxes, classes, scores = yolo.predict(pimage, image.shape)
    end = time.time()

    print('time: {0:.2f}s'.format(end - start))

    if boxes is not None:
        draw(image, boxes, scores, classes, all_classes)
        #print(str(draw(image, boxes, scores, classes, all_classes)).partition('\n')[0])

    return image

In [7]:
def detect_video(video, yolo, all_classes):
    """Use yolo v3 to detect video.

    # Argument:
        video: video file.
        yolo: YOLO, yolo model.
        all_classes: all classes name.
    """
    video_path = os.path.join("videos", "test", video)
    camera = cv2.VideoCapture(video_path)
    cv2.namedWindow("detection", cv2.WINDOW_AUTOSIZE)

    # Prepare for saving the detected video
    sz = (int(camera.get(cv2.CAP_PROP_FRAME_WIDTH)),
        int(camera.get(cv2.CAP_PROP_FRAME_HEIGHT)))
    fourcc = cv2.VideoWriter_fourcc(*'mpeg')

    
    vout = cv2.VideoWriter()
    vout.open(os.path.join("videos", "res", video), fourcc, 20, sz, True)

    while True:
        res, frame = camera.read()

        if not res:
            break

        image = detect_image(frame, yolo, all_classes)
        cv2.imshow("detection", image)

        # Save the video frame by frame
        vout.write(image)

        if cv2.waitKey(110) & 0xff == 27:
                break

    vout.release()
    camera.release()
    

In [46]:
yolo = YOLO(0.6, 0.5)
file = 'data/coco_classes.txt'
all_classes = get_classes(file)

### Detecting Images

In [48]:
f = 'puppyyay.jpg'
path = 'images/'+f
image = cv2.imread(path)
image = detect_image(image, yolo, all_classes)
cv2.imwrite('images/res/' + f, image)

time: 29.80s
class: dog, score: 1.00
box coordinate x,y,w,h: [ 764.4168148   387.37153959 2374.701828   2566.18217182]
There is a dog in the middle right that is close
middle right



True

In [38]:
def findSize(img, x, y, w, h):
    (W, H) = img.shape[0],img.shape[1]
    area = W * H
    objArea = w * h
    if objArea / area >= 0.5:
        return 'close'
    elif objArea / area >= 0.3:
        return 'medium'
    else:
        return 'far'
def findRegion(img, x, y, w, h):
    (W, H) = img.shape[0],img.shape[1]
    (centerX, centerY) = (x+(w/2), y+(h/2))
    region = ''
    if centerX < W/3:
        region = region + 'left'
    elif centerX < 2*W/3:
        region = region + 'middle'
    else:
        region = region + 'right'
    if centerY < H/3:
        region = 'top ' + region
    elif centerY < 2 * H / 3:
        region = 'middle ' + region
    else:
        region = 'bottom ' + region
    if region == 'middle middle':
        region = 'center'
    return region


# Detecting on Video

In [25]:
# detect videos one at a time in videos/test folder    
video = ''
detect_video(video, yolo, all_classes)

In [37]:
cap = cv2.VideoCapture(0)
cv2.namedWindow("detection", cv2.WINDOW_AUTOSIZE)
counter = 0
while(True):
    # Capture frame-by-frame
    ret, frame = cap.read()
    # Our operations on the frame come here
    frame = cv2.flip(frame, 1)
    frame_copy = frame.copy()
    if(counter == 5):
        counter = 0
    
    if(counter == 0):
        image = detect_image(frame_copy, yolo, all_classes)
    
    else:
        image = frame_copy
    
    counter+=1

    

    # Display the resulting frame
    
    cv2.imshow("detection", image)
    sleep(1)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# When everything done, release the capture
cap.release()
cv2.destroyAllWindows()

time: 7.56s
time: 7.61s
class: person, score: 0.70
box coordinate x,y,w,h: [251.85112    221.90494537 101.67886734 203.46209049]
There is a person in the bottom right that is close
bottom right

time: 7.79s
time: 7.80s
time: 7.84s
class: person, score: 0.70
box coordinate x,y,w,h: [190.54901123 209.05441761 173.22324753 215.46286583]
There is a person in the middle right that is close
middle right

time: 7.98s
time: 8.01s
time: 8.05s
time: 8.02s
class: person, score: 1.00
box coordinate x,y,w,h: [116.79619789 208.14376831 277.98002243 216.79654598]
There is a person in the center that is close
center

time: 8.15s
class: person, score: 0.98
box coordinate x,y,w,h: [143.13367844 232.49763966 241.2537384  184.91875648]
There is a person in the bottom right that is close
bottom right

time: 8.26s
time: 8.31s
class: person, score: 0.73
box coordinate x,y,w,h: [ 44.92977142  50.48140526 539.790802   374.12747383]
There is a person in the middle right that is close
middle right

time: 8.34s
t