## Sign Language Translator using Object Detection
### Sadat Taseen

This script uses an object detection model to translate sign language and display the translation on screen. Starting from [SSD ResNet101 V1 FPN 640x640](https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/tf2_detection_zoo.md), I used transfer learning to train the model on 7 words/classes (hello, clear, day, good, look_around, below and peace). This prototype explores extracting words from signs using object detection, and the possibility of feeding that output to a natural language processing model to create sentences.

Detections are read on every frame. Once per second, the class with the most detections (at least 8) is appended to a buffer, provided it differs from the last word added. If no new sign is detected for about 2 seconds, the buffer is cleared and the sentence is removed from the screen.


In [18]:
from tensorflow.python.keras.utils.data_utils import get_file
import numpy as np
import os
import cv2
import tensorflow as tf
import time

In [19]:
# Fix NumPy's global random seed so results (e.g. the random class colours) are reproducible

np.random.seed(seed=123)

In [20]:
class Detector:
    """Translate signs to words with a TensorFlow object-detection SavedModel.

    Typical use: ``readClasses`` -> ``setModel`` (or ``downloadModel``) ->
    ``loadModel`` -> ``predictSign`` / ``predictImage``.

    Attributes set over the object's lifetime:
        wordCount: detections per class label since the buffer last changed.
        buffer: words recognised so far, in order; rendered as the caption.
        translateTime: ``time.time()`` of the last buffer change.
        classesList / colorList: labels and one random colour per label.
        modelName / cacheDir / model: which model to load and the loaded model.
    """

    def __init__(self):
        self.wordCount = {}
        self.buffer = []
        # Initialised here so fillBuffer() is usable even before predictSign()
        # has started a stream (predictSign resets it when it starts).
        self.translateTime = time.time()

    def readClasses(self, classesFilePath):
        """Load class labels (one per line) and pick a random colour for each.

        Args:
            classesFilePath: path to a text file with one class name per line.
        """
        with open(classesFilePath, 'r') as f:
            self.classesList = f.read().splitlines()

        # One random colour triple (values in [0, 255)) per class label
        self.colorList = np.random.uniform(low=0, high=255, size=(len(self.classesList), 3))

    def downloadModel(self, modelURL):
        """Download and extract a model archive (e.g. from the TF model zoo).

        The archive is stored under ``./pretrained_models/checkpoints``.

        Args:
            modelURL: URL of the archive; its base name must contain a '.'
                (otherwise ``str.index`` raises ``ValueError``).
        """
        fileName = os.path.basename(modelURL)
        # Model name is the file name up to the first dot ("a.tar.gz" -> "a")
        self.modelName = fileName[:fileName.index('.')]

        self.cacheDir = './pretrained_models'
        os.makedirs(self.cacheDir, exist_ok=True)
        get_file(fname=fileName,
                 origin=modelURL,
                 cache_dir=self.cacheDir,
                 cache_subdir='checkpoints',
                 extract=True)

    def setModel(self, modelName):
        """Select an already-present custom model by its folder name.

        Args:
            modelName: folder under ``./pretrained_models/checkpoints``.
        """
        self.cacheDir = './pretrained_models'
        self.modelName = modelName

    def loadModel(self):
        """Load the selected SavedModel into ``self.model``.

        Reads ``<cacheDir>/checkpoints/<modelName>/saved_model``; requires
        ``setModel`` or ``downloadModel`` to have been called first.
        """
        print("Loading Model " + self.modelName)
        tf.keras.backend.clear_session()  # Release memory from older models
        self.model = tf.saved_model.load(
            os.path.join(self.cacheDir, 'checkpoints', self.modelName, "saved_model"))

        print("Model " + self.modelName + " loaded successfully...")

    def createBoundingBox(self, image, threshold = 0.5):
        """Run detection on one frame, draw the results, and count the labels.

        Side effects: draws boxes/labels on ``image`` in place and increments
        ``self.wordCount`` once for every detection that is kept.

        Args:
            image: colour frame as returned by OpenCV (height, width, channels).
            threshold: minimum detection score for a box to be kept.

        Returns:
            The same ``image`` object, annotated.
        """
        # Swap the channel order of a copy so the caller's frame is untouched
        # by the conversion (OpenCV frames are BGR).
        inputTensor = cv2.cvtColor(image.copy(), cv2.COLOR_BGR2RGB)
        inputTensor = tf.convert_to_tensor(inputTensor, dtype=tf.uint8)
        inputTensor = inputTensor[tf.newaxis, ...]  # prepend a batch axis

        detections = self.model(inputTensor)

        # [0] selects the single image of the batch built above
        bboxs = detections['detection_boxes'][0].numpy()
        classIndexes = detections['detection_classes'][0].numpy().astype(np.int32)
        classScores = detections['detection_scores'][0].numpy()

        imH, imW = image.shape[:2]

        # Honour the caller's threshold (it used to be hard-coded to 0.5 here,
        # silently ignoring the parameter).
        keptIdx = tf.image.non_max_suppression(bboxs, classScores, max_output_size=50,
                                               iou_threshold=0.5,
                                               score_threshold=threshold).numpy()

        for i in keptIdx:
            classConfidence = round(100*classScores[i])
            classIndex = classIndexes[i]

            classLabelText = self.classesList[classIndex]
            classColor = self.colorList[classIndex]

            displayText = f'{classLabelText}: {classConfidence}%'

            # Tally this detection; fillBuffer() later picks the top label
            self.wordCount[classLabelText] = self.wordCount.get(classLabelText, 0) + 1

            # Boxes are scaled by the frame size below, i.e. they are treated
            # as (ymin, xmin, ymax, xmax) fractions of the frame.
            ymin, xmin, ymax, xmax = bboxs[i].tolist()
            xmin, xmax = int(xmin * imW), int(xmax * imW)
            ymin, ymax = int(ymin * imH), int(ymax * imH)

            cv2.rectangle(image, (xmin, ymin), (xmax, ymax), color = classColor, thickness=1)
            cv2.putText(image, displayText, (xmin, ymin-10), cv2.FONT_HERSHEY_PLAIN, 1, classColor, thickness=2)

        return image

    def predictImage(self, imagePath, threshold):
        """Detect signs in a single image, save the result, and display it.

        The annotated image is written to ``<modelName>.jpg`` and shown in a
        window until a key is pressed.

        Args:
            imagePath: path of the image to read.
            threshold: minimum detection score, passed to createBoundingBox.

        Raises:
            FileNotFoundError: if the image cannot be read.
        """
        image = cv2.imread(imagePath)
        if image is None:
            # cv2.imread signals failure by returning None rather than raising
            raise FileNotFoundError(f"Could not read image: {imagePath}")

        bboxImage = self.createBoundingBox(image, threshold)

        cv2.imwrite(self.modelName + ".jpg", bboxImage)
        cv2.imshow("Result", bboxImage)
        cv2.waitKey(0)
        cv2.destroyAllWindows()

    def fillBuffer(self):
        """Move the most-detected word into the buffer, or clear a stale buffer.

        Acts at most once per second since the last buffer change:
          * if the top word differs from the last buffered word and has at
            least 8 detections, it is appended and the counters are reset;
          * otherwise (same word again, or no detections at all), once
            2 seconds have passed the buffer and counters are cleared.
        """
        elapsed = time.time() - self.translateTime
        if elapsed < 1:
            return

        # Handle "nothing detected" explicitly. Previously max() raised
        # ValueError on the empty dict and the error was swallowed, so the
        # buffer was never cleared when no signs were shown.
        word = max(self.wordCount, key=self.wordCount.get) if self.wordCount else None
        isNewWord = word is not None and (not self.buffer or self.buffer[-1] != word)

        if isNewWord:
            if self.wordCount[word] < 8:
                return  # not enough evidence yet; keep accumulating counts
            self.buffer.append(word)
        elif elapsed >= 2:
            self.buffer.clear()
        else:
            return

        self.translateTime = time.time()
        self.wordCount.clear()

    def getTranslation(self):
        """Return the buffered words as one string, each followed by a space."""
        return ''.join(f'{word} ' for word in self.buffer)

    def predictSign(self, videoPath, threshold = 0.5):
        """Translate signs from a video source and display the live result.

        Shows each annotated frame with an FPS counter and the current
        translation centred near the bottom. Press 'q' to stop.

        Args:
            videoPath: anything accepted by ``cv2.VideoCapture`` (a file path
                or a camera index).
            threshold: minimum detection score, passed to createBoundingBox.
        """
        cap = cv2.VideoCapture(videoPath)

        if not cap.isOpened():
            print("Error opening file...")
            cap.release()
            return

        captionScale = 2
        captionThickness = 2

        startTime = 0
        self.translateTime = time.time()

        try:
            (success, image) = cap.read()

            while success:
                currentTime = time.time()

                # Count and display FPS (guard against a zero time delta)
                frameTime = currentTime - startTime
                fps = 1/frameTime if frameTime > 0 else 0
                startTime = currentTime
                bboxImage = self.createBoundingBox(image, threshold)
                cv2.putText(bboxImage, 'FPS: '+ str(int(fps)), (20, 30), cv2.FONT_HERSHEY_PLAIN, 1, (0, 235, 141), 2)

                # Store detections in buffer and print out the buffer
                self.fillBuffer()
                translation = self.getTranslation()

                # Measure with the same scale/thickness used for drawing;
                # measuring at scale 1 while drawing at 2 broke the centring.
                textsize = cv2.getTextSize(translation, cv2.FONT_HERSHEY_PLAIN,
                                           captionScale, captionThickness)[0]
                textX = max(int((bboxImage.shape[1] - textsize[0]) / 2), 0)
                # 50 px above the bottom edge (430 on a 480-px-high frame),
                # so the caption stays visible at other frame heights.
                textY = bboxImage.shape[0] - 50

                cv2.putText(bboxImage, translation, (textX, textY), cv2.FONT_HERSHEY_PLAIN,
                            captionScale, (2, 210, 238), captionThickness)
                cv2.imshow('Result', bboxImage)

                # Press q to exit
                key = cv2.waitKey(1) & 0xFF
                if key == ord('q'):
                    break

                (success, image) = cap.read()
        finally:
            # Always free the camera/file handle and close the windows,
            # even if detection raises mid-stream.
            cap.release()
            cv2.destroyAllWindows()
            

### Setup Detector

In [21]:
# Configuration: the label file (one class name per line) and the folder name
# of the custom model under ./pretrained_models/checkpoints
classFile = 'sign-language.names'
modelName = 'my_model'

# Build the detector, read its labels, then select and load the custom model
detector = Detector()
detector.readClasses(classFile)
detector.setModel(modelName)
detector.loadModel()

### Run Detector

In [22]:
# Video source: 0 selects the webcam; use a path such as 'test/video-2.mp4' for a file
videoPath = 0
# Score threshold handed to the detector
threshold = 0.9

# To run on a single image instead, uncomment:
# imagePath = r"C:\Users\sttas\OneDrive\Documents\sign-language\training_demo\images\train\thumbsup.a97238da-04af-11ed-a1fb-9078414817bd.jpg"
# detector.predictImage(imagePath, threshold)

detector.predictSign(videoPath=videoPath, threshold=threshold)

Loading Model my_model
Model my_model loaded successfully...
