# 사전 세팅
- 필요한 라이브러리를 import 합니다.
- 모델 구현 함수를 작성합니다.
- id 정보를 레이블(한글)로 바꿔 줄 딕셔너리(id_to_label)를 작성합니다.
- colab에서 video를 사용하기 위한 코드를 작성합니다.

In [None]:
# https://tutorials.pytorch.kr/intermediate/torchvision_tutorial.html
# https://hyungjobyun.github.io/machinelearning/FasterRCNN2/
# 다음을 참조: https://github.com/HyungjoByun/Projects/blob/main/Faster%20RCNN/Faster_RCNN_Test.ipynb
    
import os
# os.environ['PYTORCH_ENABLE_MPS_FALLBACK']='1'

# Image drawing
import cv2  # open cv
from PIL import Image
from PIL import ImageFont
from PIL import ImageDraw
import matplotlib.pyplot as plt

# pytorch
import torch             
import torch.nn as nn     #pytorch network
import torch.optim as optim         #pytorch optimizer

import torch.utils.data
from torch.utils.data import Dataset, DataLoader 

import torchvision                  #torchvision
from torchvision import transforms as T  #torchvision transform

# Image augmentation 
import imgaug as ia        #imgaug
from imgaug import augmenters as iaa

# tensorboard(training status check)
import torch.utils.tensorboard as tensorboard
from torch.utils.tensorboard import SummaryWriter    

# xml parsing
import time
import xml.etree.ElementTree as Et      
from xml.etree.ElementTree import Element, ElementTree

import time                
from collections import OrderedDict # 라벨 dictionary를 만들 때 필요
import numpy as np

In [None]:
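# NOTE(review): this note records a box score threshold change 0.05 -> 0.1,
# but build_model below still passes box_score_thresh=0.05 — confirm which value is intended.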

def build_model(class_n):
    """Assemble a Faster R-CNN detector on top of a pretrained VGG16 backbone.

    Args:
        class_n: Number of foreground classes. The box predictor is built
            with ``class_n + 1`` outputs (one extra for the background).

    Returns:
        The ``torchvision.models.detection.FasterRCNN`` instance, with its
        RPN and RoI-head parameters re-initialised.

    Side effects:
        Prints a torchsummary report of the backbone for a (3, 600, 1000)
        input on CPU, then prints the whole model.
    """
    detection = torchvision.models.detection

    # Backbone: every module of VGG16's `features` except the last one.
    feature_channels = 512
    vgg_features = torchvision.models.vgg16(pretrained=True).features[:-1]
    vgg_features.out_channels = feature_channels

    anchors = detection.rpn.AnchorGenerator(
        sizes=((128, 256, 512),),
        aspect_ratios=((0.5, 1.0, 2.0),),
    )

    pooled_size = 7
    roi_align = torchvision.ops.MultiScaleRoIAlign(
        featmap_names=['0'],
        output_size=pooled_size,
        sampling_ratio=2,
    )

    hidden_units = 4096
    mlp_head = detection.faster_rcnn.TwoMLPHead(
        in_channels=feature_channels * (pooled_size ** 2),
        representation_size=hidden_units,
    )
    # classes + 1 for the background
    predictor = detection.faster_rcnn.FastRCNNPredictor(hidden_units, class_n + 1)

    rpn_options = dict(
        rpn_anchor_generator=anchors,
        rpn_pre_nms_top_n_train=6000,
        rpn_pre_nms_top_n_test=6000,
        rpn_post_nms_top_n_train=2000,
        rpn_post_nms_top_n_test=300,
        rpn_nms_thresh=0.7,
        rpn_fg_iou_thresh=0.7,
        rpn_bg_iou_thresh=0.3,
        rpn_batch_size_per_image=256,
        rpn_positive_fraction=0.5,
    )
    box_options = dict(
        box_roi_pool=roi_align,
        box_head=mlp_head,
        box_predictor=predictor,
        box_score_thresh=0.05,
        box_nms_thresh=0.7,
        box_detections_per_img=300,
        box_fg_iou_thresh=0.5,
        box_bg_iou_thresh=0.5,
        box_batch_size_per_image=128,
        box_positive_fraction=0.25,
    )

    # num_classes is None because an explicit box predictor is supplied.
    model = detection.FasterRCNN(
        vgg_features,
        num_classes=None,
        min_size=600,
        max_size=1000,
        **rpn_options,
        **box_options,
    )

    # Weight initialisation. The call order is kept as-is so the random
    # number generator is consumed in the same sequence.
    for rpn_param in model.rpn.parameters():
        torch.nn.init.normal_(rpn_param, mean=0.0, std=0.01)

    for param_name, tensor in model.roi_heads.named_parameters():
        if "bbox_pred" in param_name:
            torch.nn.init.normal_(tensor, mean=0.0, std=0.001)
        elif "weight" in param_name:
            torch.nn.init.normal_(tensor, mean=0.0, std=0.01)
        # Biases (including the bbox_pred bias drawn above) end up as zeros.
        if "bias" in param_name:
            torch.nn.init.zeros_(tensor)

    from torchsummary import summary
    summary(vgg_features, (3, 600, 1000), device='cpu')
    print(model)

    return model

In [None]:
# Label preparation: Korean jamo -> class id.
# (The digit classes '1'-'9' are currently disabled.)
label_to_id = {
    'ㄱ': 1, 'ㄴ': 2, 'ㄷ': 3, 'ㄹ': 13, 'ㅁ': 14, 'ㅂ': 15, 'ㅅ': 16, 'ㅇ': 17,
    'ㅈ': 18, 'ㅊ': 19, 'ㅋ': 20, 'ㅌ': 21, 'ㅍ': 22, 'ㅎ': 23,
    'ㅏ': 24, 'ㅐ': 25, 'ㅑ': 26, 'ㅒ': 27, 'ㅓ': 28, 'ㅔ': 29, 'ㅕ': 30,
    'ㅖ': 31, 'ㅗ': 32, 'ㅚ': 33, 'ㅛ': 34, 'ㅜ': 35, 'ㅟ': 36, 'ㅠ': 37,
    'ㅡ': 38, 'ㅢ': 39, 'ㅣ': 40,
}

# Reverse lookup keyed by the 1-based position of each jamo in label_to_id
# (as a string), not by the ids stored above, which are non-contiguous.
id_to_label = {
    str(position): jamo
    for position, jamo in enumerate(label_to_id, start=1)
}

display(id_to_label)

print(len(id_to_label))

{'1': 'ㄱ',
 '2': 'ㄴ',
 '3': 'ㄷ',
 '4': 'ㄹ',
 '5': 'ㅁ',
 '6': 'ㅂ',
 '7': 'ㅅ',
 '8': 'ㅇ',
 '9': 'ㅈ',
 '10': 'ㅊ',
 '11': 'ㅋ',
 '12': 'ㅌ',
 '13': 'ㅍ',
 '14': 'ㅎ',
 '15': 'ㅏ',
 '16': 'ㅐ',
 '17': 'ㅑ',
 '18': 'ㅒ',
 '19': 'ㅓ',
 '20': 'ㅔ',
 '21': 'ㅕ',
 '22': 'ㅖ',
 '23': 'ㅗ',
 '24': 'ㅚ',
 '25': 'ㅛ',
 '26': 'ㅜ',
 '27': 'ㅟ',
 '28': 'ㅠ',
 '29': 'ㅡ',
 '30': 'ㅢ',
 '31': 'ㅣ'}

31


In [None]:
# https://www.youtube.com/watch?v=YjWh7QvVH60

# import dependencies
from IPython.display import display, Javascript, Image
from google.colab.output import eval_js
from base64 import b64decode, b64encode
import cv2
import numpy as np
import PIL
import io
import html
import time

In [None]:
# function to convert the JavaScript object into an OpenCV image
def js_to_image(js_reply):
  """Decode a base64 data-URL string from the browser into an OpenCV image.

  Params:
          js_reply: comma-separated data-URL string; the second field is the
                    base64-encoded image captured from the webcam
  Returns:
          img: OpenCV BGR image
  """
  # Take the base64 payload (second comma-separated field) and decode it.
  payload = js_reply.split(',')[1]
  raw_pixels = np.frombuffer(b64decode(payload), dtype=np.uint8)
  # Let OpenCV decode the compressed bytes as a colour image.
  return cv2.imdecode(raw_pixels, flags=cv2.IMREAD_COLOR)

# function to convert OpenCV Rectangle bounding box image into base64 byte string to be overlayed on video stream
def bbox_to_bytes(bbox_array):
  """Serialise an RGBA overlay array as a base64 PNG data URL.

  Params:
          bbox_array: Numpy array (pixels) containing rectangle to overlay on video stream.
  Returns:
        bytes: Base64 image byte string, prefixed with the PNG data-URL header
  """
  png_buffer = io.BytesIO()
  # Interpret the array as RGBA and write it to the in-memory buffer as PNG.
  PIL.Image.fromarray(bbox_array, 'RGBA').save(png_buffer, format='png')
  encoded_png = b64encode(png_buffer.getvalue()).decode('utf-8')
  return 'data:image/png;base64,{}'.format(encoded_png)

In [None]:
# JavaScript to properly create our live video stream using our webcam as input
def video_stream():
  """Inject the webcam-streaming JavaScript into the notebook output.

  Displays a ``Javascript`` object that defines, in the browser:

  - ``createDom()``: on first use builds the UI (status label, a ``<video>``
    element fed by ``getUserMedia``, an absolutely positioned overlay
    ``<img>``, a stop hint) and a 640x480 capture canvas.
  - ``onAnimationFrame()``: when a capture is pending, draws the current
    video frame onto the canvas and resolves it as a JPEG data URL
    (quality 0.8), or as an empty string if shutdown was requested.
  - ``stream_frame(label, imgData)``: updates the label and the overlay,
    waits for the next captured frame and returns an object holding the
    timings ``create``/``show``/``capture`` and the frame under ``img``.
    If shutdown was requested before the call it tears the DOM down and
    returns ``''`` instead.
  - ``removeDom()``: stops the video track and removes the elements.

  Clicking the video, the overlay image or the instruction text sets the
  shutdown flag.

  Returns:
      None. The only effect is the ``display(js)`` call.
  """
  js = Javascript('''
    var video;
    var div = null;
    var stream;
    var captureCanvas;
    var imgElement;
    var labelElement;
    
    var pendingResolve = null;
    var shutdown = false;
    
    function removeDom() {
       stream.getVideoTracks()[0].stop();
       video.remove();
       div.remove();
       video = null;
       div = null;
       stream = null;
       imgElement = null;
       captureCanvas = null;
       labelElement = null;
    }
    
    function onAnimationFrame() {
      if (!shutdown) {
        window.requestAnimationFrame(onAnimationFrame);
      }
      if (pendingResolve) {
        var result = "";
        if (!shutdown) {
          captureCanvas.getContext('2d').drawImage(video, 0, 0, 640, 480);
          result = captureCanvas.toDataURL('image/jpeg', 0.8)
        }
        var lp = pendingResolve;
        pendingResolve = null;
        lp(result);
      }
    }
    
    async function createDom() {
      if (div !== null) {
        return stream;
      }

      div = document.createElement('div');
      div.style.border = '2px solid black';
      div.style.padding = '3px';
      div.style.width = '100%';
      div.style.maxWidth = '600px';
      document.body.appendChild(div);
      
      const modelOut = document.createElement('div');
      modelOut.innerHTML = "<span>Status:</span>";
      labelElement = document.createElement('span');
      labelElement.innerText = 'No data';
      labelElement.style.fontWeight = 'bold';
      labelElement.style.fontSize = '30px';
      modelOut.appendChild(labelElement);
      div.appendChild(modelOut);
           
      video = document.createElement('video');
      video.style.display = 'block';
      video.width = div.clientWidth - 6;
      video.setAttribute('playsinline', '');
      video.onclick = () => { shutdown = true; };
      stream = await navigator.mediaDevices.getUserMedia(
          {video: { facingMode: "environment"}});
      div.appendChild(video);

      imgElement = document.createElement('img');
      imgElement.style.position = 'absolute';
      imgElement.style.zIndex = 1;
      imgElement.onclick = () => { shutdown = true; };
      div.appendChild(imgElement);
      
      const instruction = document.createElement('div');
      instruction.innerHTML = 
          '<span style="color: red; font-weight: bold;">' +
          'When finished, click here or on the video to stop this demo</span>';
      div.appendChild(instruction);
      instruction.onclick = () => { shutdown = true; };
      
      video.srcObject = stream;
      await video.play();

      captureCanvas = document.createElement('canvas');
      captureCanvas.width = 640; //video.videoWidth;
      captureCanvas.height = 480; //video.videoHeight;
      window.requestAnimationFrame(onAnimationFrame);
      
      return stream;
    }
    async function stream_frame(label, imgData) {
      if (shutdown) {
        removeDom();
        shutdown = false;
        return '';
      }

      var preCreate = Date.now();
      stream = await createDom();
      
      var preShow = Date.now();
      if (label != "") {
        labelElement.innerHTML = label;
      }
            
      if (imgData != "") {
        var videoRect = video.getClientRects()[0];
        imgElement.style.top = videoRect.top + "px";
        imgElement.style.left = videoRect.left + "px";
        imgElement.style.width = videoRect.width + "px";
        imgElement.style.height = videoRect.height + "px";
        imgElement.src = imgData;
      }
      
      var preCapture = Date.now();
      var result = await new Promise(function(resolve, reject) {
        pendingResolve = resolve;
      });
      shutdown = false;
      
      return {'create': preShow - preCreate, 
              'show': preCapture - preShow, 
              'capture': Date.now() - preCapture,
              'img': result};
    }
    ''')

  display(js)
  
def video_frame(label, bbox):
  """Send the next label/overlay to the browser and fetch a webcam frame.

  Calls the JavaScript function ``stream_frame(label, imgData)`` through
  ``eval_js`` and returns whatever it resolves to.

  Params:
          label: text (may contain HTML) shown in the status label
          bbox: overlay image as a data-URL string, or '' for no overlay
  Returns:
          data: the value produced by ``stream_frame``
  """
  # Local import keeps this cell runnable on its own.
  import json

  # Encode both arguments as JS string literals so that quotes, backslashes
  # or newlines in the label cannot break (or inject into) the evaluated
  # JavaScript. str() mirrors the implicit conversion "{}".format did before.
  js_call = 'stream_frame({}, {})'.format(json.dumps(str(label)),
                                          json.dumps(str(bbox)))
  data = eval_js(js_call)
  return data

# 모델 불러오기
- 미리 저장해 둔 모델 파일을 불러오기 위해 구글 드라이브를 마운트합니다.
- GPU를 연결합니다(사용할 수 없으면 CPU를 사용합니다).
- 사전 세팅에서 작성한 모델 구현 함수(build_model)로 모델을 구성합니다.
- 학습된 가중치를 구성한 모델에 불러옵니다.


In [None]:
# Mount Google Drive at /content/drive so files stored there can be
# copied into the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# NOTE: the source path below depends on where the checkpoint is stored in your own Drive — adjust it before running.
!cp '/content/drive/MyDrive/Colab Notebooks/model_vgg16_01.pth' '.'

cp: cannot stat '/content/drive/MyDrive/Colab Notebooks/ Master-2023/인공지능1/model_vgg16_01.pth': No such file or directory


In [None]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

print('현재 사용중인 device: ', device)

현재 사용중인 device:  cpu


In [None]:
import warnings

# Suppress every Python warning from here on.
warnings.filterwarnings(action='ignore')

print('build a model...')
print('backbone is vgg16')

# Number of foreground classes handed to build_model.
class_n = 31
model = build_model(class_n)

print('model building is done!')


# Load the fine-tuned weights from the checkpoint in the working directory,
# mapping tensors onto the selected device.
check_point = torch.load("./model_vgg16_01.pth", map_location=device) 
model.load_state_dict(check_point)
print('loaded fine-tuned model')

# Move the model to the selected device.
model.to(device)

build a model...
backbone is vgg16


Downloading: "https://download.pytorch.org/models/vgg16-397923af.pth" to /root/.cache/torch/hub/checkpoints/vgg16-397923af.pth
100%|██████████| 528M/528M [00:03<00:00, 177MB/s]


----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1        [-1, 64, 600, 1000]           1,792
              ReLU-2        [-1, 64, 600, 1000]               0
            Conv2d-3        [-1, 64, 600, 1000]          36,928
              ReLU-4        [-1, 64, 600, 1000]               0
         MaxPool2d-5         [-1, 64, 300, 500]               0
            Conv2d-6        [-1, 128, 300, 500]          73,856
              ReLU-7        [-1, 128, 300, 500]               0
            Conv2d-8        [-1, 128, 300, 500]         147,584
              ReLU-9        [-1, 128, 300, 500]               0
        MaxPool2d-10        [-1, 128, 150, 250]               0
           Conv2d-11        [-1, 256, 150, 250]         295,168
             ReLU-12        [-1, 256, 150, 250]               0
           Conv2d-13        [-1, 256, 150, 250]         590,080
             ReLU-14        [-1, 256, 1

FileNotFoundError: ignored

# Application
- signLang Translation
- video를 촬영하며 영상 속 지어(수어)를 번역하여 하단에 출력합니다.

In [None]:
!cp '/content/drive/MyDrive/Colab Notebooks/ Master-2023/인공지능1/signLang/AppleGothic.ttf' '.'

In [None]:
!cp '/content/drive/MyDrive/Colab Notebooks/ Master-2023/인공지능1/signLang/unicode.py' '.'

In [None]:
from unicode import join_jamos ,split_syllable_char

In [None]:
def remove_repeated_letters(word):
    """Collapse repeated letters, compose them, and drop leftover fragments.

    Steps:
      1. Keep a character only if it differs from the one right before it.
      2. Pass the collapsed string through ``join_jamos``.
      3. Drop every resulting character for which ``split_syllable_char``
         reports exactly two ``None`` entries.
         NOTE(review): presumably these are lone jamo that did not combine
         into a syllable — confirm against unicode.py.

    Args:
        word: Sequence of recognised letters.

    Returns:
        The filtered string.
    """
    collapsed = ''.join(
        letter
        for position, letter in enumerate(word)
        if position == 0 or letter != word[position - 1]
    )
    composed = join_jamos(collapsed)

    return ''.join(
        char
        for char in composed
        if split_syllable_char(char).count(None) != 2
    )

In [None]:
from PIL import Image, ImageFont, ImageDraw

# Live fingerspelling demo: grab webcam frames, run the detector, overlay the
# best detection and accumulate the recognised letters into text.

# Minimum confidence for a detection to be drawn and transcribed.
SCORE_THRESHOLD = 0.5
# Once the composed text grows past this many characters it is cleared.
MAX_TEXT_LENGTH = 10
# Colour used for the box outline and the letter drawn next to it.
OVERLAY_COLOR = (0, 255, 0)

# Loop-invariant setup, done once instead of on every frame. Loading the font
# here also makes a missing font file fail loudly before the camera starts,
# instead of silently suppressing every detection.
font_size = 40
font = ImageFont.truetype("AppleGothic.ttf", font_size)
to_tensor = T.ToTensor()
model.eval()

# start streaming video from webcam
video_stream()
# label for video
label_html = 'Capturing...'
# initialize bounding box overlay to empty
bbox = ''
count = 0
txt = ''

while True:
    js_reply = video_frame(label_html, bbox)
    # The JS side returns '' once shutdown was requested, and a reply with an
    # empty 'img' when the stop click arrives while a capture is pending;
    # an empty frame cannot be decoded, so stop in both cases.
    if not js_reply or not js_reply["img"]:
        break

    # convert JS response to OpenCV Image, then to the RGB order fed to the model
    img = js_to_image(js_reply["img"])
    frame = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    input_tensor = to_tensor(frame).unsqueeze(0)  # dtype float32

    # inference
    with torch.no_grad():
        predict = model(input_tensor.to(device))

    boxes = predict[0]['boxes']
    labels = predict[0]['labels']
    scores = predict[0]['scores']

    # Keep only the single most confident detection, and only when it
    # reaches the threshold. label == "" means "nothing to show".
    label = ""
    box = None
    if scores.numel() > 0:
        idx = int(torch.argmax(scores))
        score = float(scores[idx])
        if score >= SCORE_THRESHOLD:
            box = boxes[idx].detach().cpu().numpy()
            label = int(labels[idx])

    # create transparent overlay the same size as the captured frame
    frame_height, frame_width = frame.shape[:2]
    bbox_array = np.zeros([frame_height, frame_width, 4], dtype=np.uint8)

    if label != "":
        letter = id_to_label[str(label)]
        xmin, ymin, xmax, ymax = box.tolist()

        bbox_pil = Image.fromarray(bbox_array)
        draw = ImageDraw.Draw(bbox_pil)
        draw.rectangle((xmin, ymin, xmax, ymax), outline=OVERLAY_COLOR, width=4)
        draw.text((xmin, ymin - 5), letter, fill=OVERLAY_COLOR, font=font)
        bbox_array = np.array(bbox_pil)

        # Make every drawn pixel fully opaque and leave the rest transparent.
        drawn = bbox_array.max(axis=2) > 0
        bbox_array[:, :, 3] = drawn.astype(np.uint8) * 255

        txt += letter

    # convert overlay of bbox into bytes so the next frame gets the new overlay
    bbox = bbox_to_bytes(bbox_array)

    result_txt = join_jamos(remove_repeated_letters(txt))
    if len(result_txt) > MAX_TEXT_LENGTH:
        result_txt = ""
        txt = ""
    label_html = result_txt