# Generalized code to prepare and combine (both synthetic and real) Image Fraud Datasets for use with YOLOv5

### imports

In [1]:
import cv2
import re
import yaml
import numpy as np
import matplotlib.pyplot as plt
import zipfile
import glob
from random import randint
import os
import torch
import tensorflow as tf
from torchvision.io import read_image
import torchvision.transforms.functional as F
import torchvision.transforms as T
from torchvision.ops import masks_to_boxes

### Unzip/extract images and masks files (uploaded as zip files)

In [None]:
# directory that holds the uploaded images/masks zip archive(s);
# the archives are extracted into this same directory
root = "/content/"

### COVERAGE

In [None]:
# COVERAGE: unpack the mask archive first, then the image archive, both into root
for archive_name in ('masks.zip', 'images.zip'):
  with zipfile.ZipFile(root + archive_name, 'r') as zip_ref:
    zip_ref.extractall(root)

In [None]:
# Settings that mostly depend on which dataset is being processed.
# identifier of the dataset (selects the file-naming rules used when labeling)
DB = 'coverage'
# COVERAGE is a very small DB, so only 15 entries each go to test and validation
test_size = 15
val_size = test_size
# name of the folder (under root) whose mask files are listed and iterated
files = 'masks'

### IEEE IFS-TC Image Forensics Challenge

In [None]:
# IEEE IFS-TC Image Forensics Challenge: unpack fake.zip into root
fake_archive = root + 'fake.zip'
with zipfile.ZipFile(fake_archive, 'r') as zip_ref:
  zip_ref.extractall(root)

### other constants

In [None]:
# side length (in pixels) of the square images prepared for YOLO;
# pick a value appropriate for the model being trained
im_size = 416

## Create and label synthetic Tampered Dataset (and divide into train/valid)
- either save the weights (trained with synthetic data) and then retrain or fine-tune them on the non-synthetic dataset, or add the synthetic images to the same training set




## TODO:

Datasets:
- try other image fraud db
- compare synthetic Pascal Voc images (download masks)
- compare synthetic COCO images (download masks)

Testing:
- add in untampered images
- add more augmented data from actual training set since there is a class imbalance

Preprocessing:
- try resizing via padding only to avoid adding additional "tampering" or artifacts
- move around masks pasted



**root:** directory where the test/train/valid folders will be created and where the original tampered/authentic images and masks are unzipped

**files:** name of the folder under root that holds the mask files (e.g. `masks`)

In [None]:
# Image fraud DBs only include masks (no bounding boxes), so this code extracts
# the boxes from the masks to create the labels, and places resized images and
# label files in the YOLO folder structure (one folder per train/valid/test set).
# If synthetic is True, the masked region of the tampered image is pasted onto a
# randomly chosen untampered image instead of using the tampered image as-is.

_AUGMENTATIONS = ('None', 'brightness', 'dullness', 'sharpness')


def _subset_for_index(index, test_size, val_size):
  """Map a position in the sorted mask listing to 'test', 'valid' or 'train'."""
  if index < test_size:
    return 'test'
  if index < test_size + val_size:
    return 'valid'
  return 'train'


def _rotation_for(j):
  """Return the cv2 rotation code for pass j, or None for the unrotated pass 0."""
  if j == 0:
    return None
  if j == 1:
    return cv2.ROTATE_90_COUNTERCLOCKWISE
  if j == 2:
    return cv2.ROTATE_180
  return cv2.ROTATE_90_CLOCKWISE


def _read_image(path):
  """Read an image with OpenCV, raising instead of returning None when it cannot be loaded."""
  image = cv2.imread(path)
  if image is None:
    raise FileNotFoundError('could not read image: ' + path)
  return image


def _mask_to_yolo_boxes(mask, im_size, bb_pad):
  """Return one normalized (c_x, c_y, w, h) tuple per white region of a BGR mask."""
  gray = cv2.cvtColor(mask, cv2.COLOR_BGR2GRAY)
  thresh = cv2.threshold(gray, 250, 255, cv2.THRESH_BINARY)[1]
  found = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
  # findContours returns 2 or 3 values depending on the OpenCV version
  contours = found[0] if len(found) == 2 else found[1]
  boxes = []
  for contour in contours:
    x, y, w1, h1 = cv2.boundingRect(contour)
    # YOLO wants the box centre and size, all divided by the image side length;
    # true division keeps the half pixel that `//` would drop
    c_x = (x + w1 / 2) / im_size
    c_y = (y + h1 / 2) / im_size
    # padding must not push the normalized size past the image itself
    w = min((w1 + bb_pad) / im_size, 1.0)
    h = min((h1 + bb_pad) / im_size, 1.0)
    boxes.append((c_x, c_y, w, h))
  return boxes


def createLabeledSets(root, DB, test_size, val_size, im_size, synthetic, percent_real=0, augmentation='None', n_rotations=1, bb_pad=0, mask_dir=None):
  """Build labeled train/valid/test sets in the YOLO folder layout.

  For every '<number>paste.jpeg' mask found in the mask folder this writes:
    * the untampered image '<number>.jpeg' (resized) with a blank label file, and
    * up to n_rotations rotated copies of the tampered image '<number>t.jpeg'
      (resized, optionally composited and augmented) with a label file holding
      one '0 c_x c_y w h' line per white region of the mask.

  The subset is chosen by the position of the mask in the sorted listing: the
  first test_size entries go to 'test', the next val_size to 'valid', the rest
  to 'train'. The test subset only receives the unrotated, un-augmented copy.

  Args:
    root: directory containing the 'images' folder and the mask folder; the
      train/valid/test folders are created below it.
    DB: dataset identifier; only 'coverage' is implemented.
    test_size: number of leading listing entries assigned to 'test'.
    val_size: number of following listing entries assigned to 'valid'.
    im_size: side length every image and mask is resized to (im_size x im_size).
    synthetic: when truthy, paste the masked region onto a random untampered
      image drawn from the same group (test vs. non-test) of the listing.
    percent_real: currently unused; kept for interface compatibility.
    augmentation: one of 'None', 'brightness', 'dullness', 'sharpness'.
    n_rotations: number of passes per mask (0: unrotated, 1: 90 deg
      counter-clockwise, 2: 180 deg, 3 and above: 90 deg clockwise).
    bb_pad: pixels added to the width and height of every bounding box.
    mask_dir: name of the mask folder under root; when None the module-level
      `files` constant is used, as before.

  Raises:
    NotImplementedError: DB is not 'coverage'.
    ValueError: augmentation is not a supported name.
    FileNotFoundError: an image or mask cannot be read.
  """
  import warnings

  if DB != 'coverage':
    # the 'ifs-tc' branch was an empty stub; fail clearly instead of hitting undefined names
    raise NotImplementedError('unsupported DB: ' + repr(DB))
  if augmentation not in _AUGMENTATIONS:
    raise ValueError('unknown augmentation: ' + repr(augmentation))
  if mask_dir is None:
    mask_dir = files

  # create the folder structure; exist_ok lets the function run repeatedly
  # (e.g. once per augmentation) against the same root
  for subset_name in ('train', 'test', 'valid'):
    for content in ('labels', 'images'):
      os.makedirs(os.path.join(root, subset_name, content), exist_ok=True)

  # sorted so the same masks always land in the test set
  all_files = sorted(os.listdir(os.path.join(root, mask_dir)))

  for i, mask_name in enumerate(all_files):
    subset = _subset_for_index(i, test_size, val_size)

    # for COVERAGE, matching numbers in the names mean a corresponding
    # image/mask pair; the tampered image carries a trailing 't'
    auth_name = re.sub("[^0-9]", "", mask_name)
    tamp_name = auth_name + 't'
    if mask_name != auth_name + 'paste.jpeg':
      continue

    # COVERAGE has an untampered image for each tampered one: add it with a
    # blank label file, meaning 'no object'
    auth_im = _read_image(os.path.join(root, "images", auth_name + '.jpeg'))
    with open(os.path.join(root, subset, "labels", auth_name + '.txt'), 'w') as f:
      f.write(' ')
    cv2.imwrite(os.path.join(root, subset, "images", auth_name + '.jpeg'),
                cv2.resize(auth_im, (im_size, im_size)))

    # loop-invariant inputs: read once, rotate per pass
    tampered = _read_image(os.path.join(root, "images", tamp_name + '.jpeg'))
    mask_full = _read_image(os.path.join(root, mask_dir, mask_name))

    for j in range(n_rotations):
      # the test set gets neither rotated nor augmented copies
      if subset == 'test' and (j > 0 or augmentation != 'None'):
        break

      rotate = _rotation_for(j)
      mask = mask_full if rotate is None else cv2.rotate(mask_full, rotate)
      mask = cv2.resize(mask, (im_size, im_size))

      boxes = _mask_to_yolo_boxes(mask, im_size, bb_pad)
      if not boxes:
        # rotating cannot make a region appear, so give up on this mask
        warnings.warn('no tampered region found in mask ' + mask_name + '; skipped')
        break

      # rotate and resize the image exactly like the mask
      im = tampered if rotate is None else cv2.rotate(tampered, rotate)
      im = cv2.resize(im, (im_size, im_size))

      if synthetic:
        # pick the background from the same group as the foreground so test
        # backgrounds never come from non-test entries (randint is inclusive)
        if i < test_size:
          num = randint(0, min(test_size, len(all_files)) - 1)
        else:
          num = randint(test_size, len(all_files) - 1)
        # digits only -> name of an untampered image, never a tampered one
        im2_name = re.sub("[^0-9]", "", all_files[num])
        im2 = _read_image(os.path.join(root, "images", im2_name + ".jpeg"))
        im2 = cv2.resize(im2, (im_size, im_size))
        # wherever the mask is exactly black the background shows through
        im = np.where(mask == 0, im2, im)

      if augmentation == 'brightness':
        im = cv2.add(im, np.full(im.shape, 70, dtype='uint8'))
      elif augmentation == 'dullness':
        im = cv2.subtract(im, np.full(im.shape, 70, dtype='uint8'))
      elif augmentation == 'sharpness':
        kernel = np.array([[-1, -1, -1], [-1, 10, -1], [-1, -1, -1]])
        im = cv2.filter2D(im, -1, kernel)

      stem = tamp_name + '_' + augmentation + str(j)
      cv2.imwrite(os.path.join(root, subset, "images", stem + '.jpeg'), im)
      with open(os.path.join(root, subset, "labels", stem + '.txt'), 'w') as f:
        # class 0 ('tampered'), one line per region
        f.write('\n'.join('0 ' + ' '.join(str(v) for v in box) for box in boxes))




## Label actual Tampered Dataset (and divide into test/train/valid)

In [None]:
# createLabeledSets takes no `files` argument (it reads the module-level `files`
# constant itself); passing it shifted every following argument by one position
createLabeledSets(root, DB, test_size, val_size, im_size, synthetic=True, percent_real=0, augmentation='None', n_rotations=4)

### Create sets (non-synthetic)

In [None]:
# createLabeledSets takes no `files` argument (it reads the module-level `files`
# constant itself); passing it shifted every following argument by one position
createLabeledSets(root, DB, test_size, val_size, im_size, synthetic=False, percent_real=0, augmentation='None', n_rotations=4)

In [None]:
# createLabeledSets takes no `files` argument (it reads the module-level `files`
# constant itself); passing it shifted every following argument by one position
createLabeledSets(root, DB, test_size, val_size, im_size, synthetic=False, percent_real=0, augmentation='brightness', n_rotations=1)

In [None]:
# createLabeledSets takes no `files` argument (it reads the module-level `files`
# constant itself); passing it shifted every following argument by one position
createLabeledSets(root, DB, test_size, val_size, im_size, synthetic=False, percent_real=0, augmentation='dullness', n_rotations=1)

In [None]:
%%writefile /content/data.yaml
# dataset config: image folder of each split, then the class count and class names
train: /content/train/images
val: /content/valid/images
test: /content/test/images


nc: 1
names: ['tampered']

Writing /content/data.yaml


### move images and masks folders into syn directory before running below

In [None]:
# from here on work inside the synthetic-dataset directory
root = "/content/syn/"

In [None]:
# create file structures: <root>/{train,test,valid}/{labels,images}
# makedirs(exist_ok=True) creates root itself when it is missing and does not
# fail when root already exists (the images and masks folders are moved into it
# before this cell runs) or when the cell is executed a second time
for subset_dir in ('train', 'test', 'valid'):
  for content_dir in ('labels', 'images'):
    os.makedirs(os.path.join(root, subset_dir, content_dir), exist_ok=True)

In [None]:
# keyword arguments keep 'None' and 2 out of the percent_real / augmentation
# slots, where the purely positional call used to put them
createLabeledSets(root, DB, test_size, val_size, im_size, True, augmentation='None', n_rotations=2)

In [None]:
%%writefile /content/data_syn.yaml
# dataset config for the synthetic set: image folder of each split, then the class count and class names
train: /content/syn/train/images
val: /content/syn/valid/images
test: /content/syn/test/images


nc: 1
names: ['tampered']

Writing /content/data_syn.yaml


# Setup

Clone repo, install dependencies and check PyTorch and GPU.

In [None]:
# fetch YOLOv5, switch into the checkout and install its requirements
!git clone https://github.com/ultralytics/yolov5  # clone
%cd yolov5
%pip install -qr requirements.txt  # install

#import torch
# NOTE(review): presumably resolves to the `utils` package of the yolov5 checkout
# because of the %cd above — confirm
import utils
display = utils.notebook_init()  # checks

INFO:yolov5:YOLOv5 🚀 v6.1-386-g2e57b84 Python-3.7.13 torch-1.12.1+cu113 CUDA:0 (Tesla P100-PCIE-16GB, 16281MiB)
YOLOv5 🚀 v6.1-386-g2e57b84 Python-3.7.13 torch-1.12.1+cu113 CUDA:0 (Tesla P100-PCIE-16GB, 16281MiB)


Setup complete ✅ (4 CPUs, 25.5 GB RAM, 37.6/166.8 GB disk)


In [None]:
%%time
# train for 240 epochs on /content/data.yaml at image size 416 (same value as im_size),
# starting from the weights in /content/syn240.pt
# NOTE(review): the run is named yolov5s_results although --cfg points at yolov5m.yaml — confirm intent
%cd /content/yolov5/
!python train.py --img 416 --batch 16 --epochs 240 --data '/content/data.yaml' --cfg /content/yolov5/models/yolov5m.yaml --weights '/content/syn240.pt' --name yolov5s_results  --cache

In [None]:
%%time
# evaluate the best checkpoint on the test split of /content/data.yaml (--task test)
# NOTE(review): reads from run folder yolov5s_results3, while the detect cell
# uses yolov5s_results — make sure both point at the intended run
%cd /content/yolov5/
!python val.py --weights /content/yolov5/runs/train/yolov5s_results3/weights/best.pt --data /content/data.yaml --img 416 --conf 0.2 --task test --save-txt


In [None]:
# run detection on the test images (../test/images relative to the yolov5 folder)
# with a confidence threshold of 0.2
%cd /content/yolov5/
!python detect.py --weights runs/train/yolov5s_results/weights/best.pt --img 416 --conf 0.2 --source ../test/images

## Freeze backbone

In [None]:
%%time
# same training command as the unfrozen run, plus --freeze 10
# NOTE(review): per the section heading this is meant to freeze the backbone;
# presumably 10 is the number of leading layers to freeze — confirm against train.py
%cd /content/yolov5/
!python train.py --freeze 10 --img 416 --batch 16 --epochs 240 --data '/content/data.yaml' --cfg /content/yolov5/models/yolov5m.yaml --weights '/content/syn240.pt' --name yolov5s_results  --cache