# Now that I've applied for industry jobs, I know how important it is to write presentable, reusable code that I can link from my resume.
- Not sure why I never used to think people would care about this.


---


# This file will contain the code for Image Fraud Detection via my modified YOLO-based method (including data preprocessing, pre-training, finetuning, etc.)

### Imports

In [None]:
import cv2
import re
import yaml
import numpy as np
import matplotlib.pyplot as plt
import zipfile
import glob
import random
import os
import torch
import tensorflow as tf
from torchvision.io import read_image
import torchvision.transforms.functional as F
import torchvision.transforms as T
from torchvision.ops import masks_to_boxes

## Unzip/extract 'masks.zip' and 'images.zip' (uploaded as 2 separate zip files)

In [None]:
# directory that holds the uploaded images.zip and masks.zip archives
# (everything else in this notebook is created underneath it)
root = "/content/"

In [None]:
# unpack both uploaded archives into root, masks first and then images
for archive_name in ('masks.zip', 'images.zip'):
  with zipfile.ZipFile(root + archive_name, 'r') as archive:
    archive.extractall(root)

## Notes/TODO:

### may need to try resizing via padding only to avoid adding additional "tampering" or artifacts
### can also use larger squares as input to YOLO
### add more databases (DBs) and more data augmentation

## Create Labeled Train/Val/Test Sets in Proper File Structure (from Masks for specific IF DB)

In [None]:
# side length (in pixels) of the square images YOLO is fed
im_size = 416
# COVERAGE is a very small DB, so validation and test use the same small size
val_size = 15
test_size = val_size
# which image-fraud database the masks/images come from
DB = 'coverage'

In [None]:
# create the YOLO file structure: <root>/<subset>/{labels,images}
# for each of the train / test / valid subsets
for subset_dir in ('train', 'test', 'valid'):
  for content_dir in ('labels', 'images'):
    # makedirs also creates the parent subset folder; exist_ok keeps the
    # cell safe to re-run instead of failing with FileExistsError
    os.makedirs(os.path.join(root, subset_dir, content_dir), exist_ok=True)

In [None]:
# image fraud DBs only include masks (no BBs) so this function extracts BBs
# from masks to create the labels while placing ims and labels in proper YOLO
# file structures (for each train/val/test set)

def createLabeledSets(root, DB, test_size, val_size, im_size):
  """Build labeled YOLO train/valid/test sets from a mask-only image-fraud DB.

  For every COVERAGE 'paste' mask found in <root>/masks the function:
    * extracts one bounding box around the forged area and writes it as a
      normalized YOLO label (class 0) for the tampered image '<n>t',
    * writes a label containing no box for the matching untampered image
      '<n>',
    * resizes both images to im_size x im_size and stores them, together
      with their labels, under <root>/<subset>/{images,labels}.

  The first val_size image pairs go to 'valid', the next test_size pairs to
  'test' and all remaining pairs to 'train'. A tampered image and its
  untampered counterpart always land in the same subset. Pairs are taken in
  the order os.listdir returns the masks.

  Args:
    root: folder containing the 'masks' and 'images' folders and the
      train/valid/test output folders.
    DB: database identifier; only 'coverage' is implemented.
    test_size: number of image pairs placed in the test subset.
    val_size: number of image pairs placed in the validation subset.
    im_size: side length of the square output images.

  Raises:
    ValueError: if DB is not 'coverage'.
    FileNotFoundError: if an image belonging to a mask cannot be read.
  """
  if DB != 'coverage':
    # the file-name matching below is COVERAGE specific; fail clearly rather
    # than with a NameError half-way through the first mask
    raise ValueError("Unsupported DB %r: only 'coverage' is implemented" % (DB,))

  masks_dir = os.path.join(root, 'masks')
  images_dir = os.path.join(root, 'images')
  pairs_done = 0

  for mask_file in os.listdir(masks_dir):
    # for COVERAGE, matching numbers in the names mean corresponding
    # image/mask pair, so strip everything except the digits
    real_name = re.sub("[^0-9]", "", mask_file)
    # 'paste' image mask is the forged area (skip other masks)
    if mask_file != real_name + 'paste.jpeg':
      continue
    # tampered images carry a trailing 't'; the ones without are untampered
    name = real_name + 't'

    box = _mask_to_yolo_box(os.path.join(masks_dir, mask_file), im_size)
    if box is None:
      print('Skipping ' + mask_file + ': mask has no foreground pixels')
      continue

    # choose the subset once per pair, counting only masks actually used
    subset = _subset_for_pair(pairs_done, val_size, test_size)
    pairs_done += 1
    labels_dir = os.path.join(root, subset, 'labels')
    subset_images_dir = os.path.join(root, subset, 'images')

    # label with no box means 'no object' for the untampered image
    with open(os.path.join(labels_dir, real_name + '.txt'), 'w') as f:
      f.write(' ')
    _save_resized_image(os.path.join(images_dir, real_name + '.jpeg'),
                        os.path.join(subset_images_dir, real_name + '.jpeg'),
                        im_size)

    # YOLO label line: class x_center y_center width height (all normalized)
    center_x, center_y, width, height = box
    with open(os.path.join(labels_dir, name + '.txt'), 'w') as f:
      f.write(f"0 {center_x} {center_y} {width} {height}")
    # put actual tampered image into the same set
    _save_resized_image(os.path.join(images_dir, name + '.jpeg'),
                        os.path.join(subset_images_dir, name + '.jpeg'),
                        im_size)


def _subset_for_pair(pair_index, val_size, test_size):
  """Return the subset name for the pair_index-th processed image pair."""
  if pair_index < val_size:
    return 'valid'
  if pair_index < val_size + test_size:
    return 'test'
  return 'train'


def _mask_to_yolo_box(mask_path, im_size):
  """Return (x_center, y_center, width, height), normalized, or None if the mask is empty."""
  # Pytorch reads image as tensor
  mask = read_image(mask_path)
  # YOLO requires square images, so measure the box on the resized mask
  mask = T.Resize((im_size, im_size))(mask)
  mask = F.convert_image_dtype(mask, dtype=torch.float)
  # JPEG compression and resizing leave intermediate gray values, so
  # binarize and collapse the channels into one foreground mask instead of
  # treating every distinct gray level as a separate object
  foreground = (mask > 0.5).any(dim=0, keepdim=True)
  if not foreground.any():
    return None
  x_min, y_min, x_max, y_max = masks_to_boxes(foreground)[0].tolist()
  # YOLO requires center/size coordinates normalized by the image size
  return ((x_min + x_max) / 2 / im_size,
          (y_min + y_max) / 2 / im_size,
          (x_max - x_min) / im_size,
          (y_max - y_min) / im_size)


def _save_resized_image(src_path, dst_path, im_size):
  """Read src_path, resize it to im_size x im_size and write it to dst_path."""
  im = cv2.imread(src_path)
  if im is None:
    # cv2.imread returns None instead of raising when the file is unreadable
    raise FileNotFoundError('Could not read image: ' + src_path)
  cv2.imwrite(dst_path, cv2.resize(im, (im_size, im_size)))



In [None]:
# build the labeled train/valid/test folders from the settings chosen above
createLabeledSets(
  root=root,
  DB=DB,
  test_size=test_size,
  val_size=val_size,
  im_size=im_size,
)

In [None]:
%%writefile /content/data.yaml
# image folders of the three subsets created under /content/
train: /content/train/images
val: /content/valid/images
test: /content/test/images


# a single class: every labeled box is written with class id 0
nc: 1
names: ['tampered']

Writing /content/data.yaml


# TODO copy weights from synthetic data trained model (add SyntheticImageFraudYOLO.ipynb code)

## YOLOv5

In [None]:
!git clone https://github.com/ultralytics/yolov5  # clone repo
%cd yolov5
# move the checkout to one specific commit of the repo
!git reset --hard 886f1c03d839575afecb059accf74296fad395b6

### Had to change `torch>=1.7.0` to `torch==1.7.0` in requirements.txt and then reinstall the requirements (see e.g. https://github.com/ultralytics/yolov5/issues/8405)

In [None]:
# install dependencies as necessary
!pip install -qr requirements.txt  # install dependencies (ignore errors)
import torch

from IPython.display import Image, clear_output  # to display images
from utils.google_utils import gdrive_download  # to download models/datasets

# clear_output()
print('Setup complete. Using torch %s %s' % (torch.__version__, torch.cuda.get_device_properties(0) if torch.cuda.is_available() else 'CPU'))

In [None]:
%%writefile /content/yolov5/models/custom_yolov5m.yaml

# parameters
# %%writefile does no template substitution, so the class count is written
# as a literal value; it matches nc in /content/data.yaml
nc: 1  # number of classes
depth_multiple: 0.67  # model depth multiple
width_multiple: 0.75  # layer channel multiple

# anchors
anchors:
  - [10,13, 16,30, 33,23]  # P3/8
  - [30,61, 62,45, 59,119]  # P4/16
  - [116,90, 156,198, 373,326]  # P5/32

# YOLOv5 backbone
backbone:
  # [from, number, module, args]
  [[-1, 1, Focus, [64, 3]],  # 0-P1/2
   [-1, 1, Conv, [128, 3, 2]],  # 1-P2/4
   [-1, 3, BottleneckCSP, [128]],
   [-1, 1, Conv, [256, 3, 2]],  # 3-P3/8
   [-1, 9, BottleneckCSP, [256]],
   [-1, 1, Conv, [512, 3, 2]],  # 5-P4/16
   [-1, 9, BottleneckCSP, [512]],
   [-1, 1, Conv, [1024, 3, 2]],  # 7-P5/32
   [-1, 1, SPP, [1024, [5, 9, 13]]],
   [-1, 3, BottleneckCSP, [1024, False]],  # 9
  ]

# YOLOv5 head
head:
  [[-1, 1, Conv, [512, 1, 1]],
   [-1, 1, nn.Upsample, [None, 2, 'nearest']],
   [[-1, 6], 1, Concat, [1]],  # cat backbone P4
   [-1, 3, BottleneckCSP, [512, False]],  # 13

   [-1, 1, Conv, [256, 1, 1]],
   [-1, 1, nn.Upsample, [None, 2, 'nearest']],
   [[-1, 4], 1, Concat, [1]],  # cat backbone P3
   [-1, 3, BottleneckCSP, [256, False]],  # 17 (P3/8-small)

   [-1, 1, Conv, [256, 3, 2]],
   [[-1, 14], 1, Concat, [1]],  # cat head P4
   [-1, 3, BottleneckCSP, [512, False]],  # 20 (P4/16-medium)

   [-1, 1, Conv, [512, 3, 2]],
   [[-1, 10], 1, Concat, [1]],  # cat head P5
   [-1, 3, BottleneckCSP, [1024, False]],  # 23 (P5/32-large)

   [[17, 20, 23], 1, Detect, [nc, anchors]],  # Detect(P3, P4, P5)
  ]

Writing /content/yolov5/models/custom_yolov5m.yaml


In [None]:
%%time
%cd /content/yolov5/
# train on /content/data.yaml with 416 px images (the same value as im_size), batch size 16, 60 epochs
# NOTE(review): --cfg points at the stock models/yolov5m.yaml, not the custom_yolov5m.yaml
# written by the previous cell, and the run is named 'yolov5s_results' although the config
# is yolov5m — confirm both are intended
# NOTE(review): --weights '' presumably trains from scratch — confirm against train.py
!python train.py --img 416 --batch 16 --epochs 60 --data '/content/data.yaml' --cfg /content/yolov5/models/yolov5m.yaml --weights '' --name yolov5s_results  --cache

In [None]:

# evaluate best.pt with --task test (data.yaml declares the test images folder), conf threshold 0.2
# NOTE(review): the run folder 'yolov5s_results4' is hard-coded; presumably the numeric suffix
# depends on how many training runs exist — confirm the path before running
!python test.py --weights /content/yolov5/runs/train/yolov5s_results4/weights/best.pt --data /content/data.yaml --img 416 --conf 0.2 --task test --save-txt


In [None]:
%cd /content/yolov5/
# run detection on the images in ../test/images (i.e. /content/test/images) with conf threshold 0.5
# NOTE(review): 'yolov5s_results4' is a hard-coded run folder — confirm it matches the training run to use
!python detect.py --weights runs/train/yolov5s_results4/weights/best.pt --img 416 --conf 0.5 --source ../test/images