# Generalized code to prepare and combine (both synthetic and real) Image Fruad Datasets for use with YOLOv5

### imports

In [None]:
import cv2
import re
import yaml
import numpy as np
import matplotlib.pyplot as plt
import zipfile
import glob
from random import randint
import os
import torch
import tensorflow as tf
from torchvision.io import read_image
import torchvision.transforms.functional as F
import torchvision.transforms as T
from torchvision.ops import masks_to_boxes

### Unzip/extract images and masks files (uploaded as zip files)

In [None]:
# path whereimages and masks zip folder(s) are located 
root = '/content/'

In [None]:
# Create Labeled Train/Val/Test Sets in Proper File Structure
# train 
os.mkdir(root + 'train')
os.mkdir(root + 'train/' + 'labels')
os.mkdir(root + 'train/' + 'images')
# test
os.mkdir(root + 'test')
os.mkdir(root + 'test/' + 'labels')
os.mkdir(root + 'test/' + 'images')
# valid
os.mkdir(root + 'valid')
os.mkdir(root + 'valid/' + 'labels')
os.mkdir(root + 'valid/' + 'images')

### COVERAGE

In [None]:
with zipfile.ZipFile(root +'masks.zip' , 'r') as zip_ref:
  zip_ref.extractall(root)

In [None]:
with zipfile.ZipFile(root + 'images.zip' , 'r') as zip_ref:
  zip_ref.extractall(root)

In [None]:
# name of DB
DB = 'coverage'
# COVERAGE is a very small DB
val_size = test_size = 15
# name 
files = 'masks'

### IEEE IFS-TC ImageForensics Challenge

In [None]:
with zipfile.ZipFile(root +'fake.zip' , 'r') as zip_ref:
  zip_ref.extractall(root)

In [47]:
with zipfile.ZipFile(root + 'pristine.zip' , 'r') as zip_ref:
  zip_ref.extractall(root)

In [None]:
# name of DB
DB = 'ifs-tc'
# COVERAGE is a very small DB
val_size = test_size = 150
# name 
files = 'fake'

### other constants

In [None]:
# YOLO requires images to be squares 
# choose appropriate side length
im_size = 416

## Create and label synthetic Tampered Dataset (and divide into train/valid)
- either save the weights (trained with synthetic data) to retrain or fine-tune with non-synthetic dataset or run add them to the same training set




## TODO:

Datasets:
- try other image fraud db
- compare synthetic Pascal Voc images (download masks)
- compare synthetic COCO images (download masks)

Testing:
- add in untampered images
- add more augmented data from actual training set since there is a class imbalance

Preprossing:
- try resizing via padding only to avoid adding additional "tampering" or artifacts
- move around masks pasted



##image fraud DBs only include masks (no BBs) so this `createLabeledSets` function extracts BBs from masks to create the labels while placing properly formatted images and labels in proper YOLO file structures (for each train/val/test set)


**root:** where directories with test/train/val will be created 
and where the original tampered/authentic images and masks will be unzipped

**DB:**

**files:** name of folder holding all masks and/or corresponding tampered images

**test_size:**

**val_size:**

**im_size:**

**synthetic:** if True then a synthetically crop object from another image and paste onto untampered image

num_real = 0, augmentation = 'None', n_rotations = 1, bb_pad = 0

In [61]:
def createLabeledSets(root, DB, files, test_size, val_size, im_size, synthetic, num_real = 0, augmentation = 'None', n_rotations = 1, bb_pad = 0):

  # get all mask names in a list (sorted so first images will always be in test set)
  all_files = list(sorted(os.listdir(os.path.join(root, files))))
  # get all images in a list 
  # images = list(sorted(os.listdir(os.path.join(root, 'images'))))
  subset = 'test'
  count = num_real

  #def crop_resize(im,im_size,bb = None):
    # if bb != None:
    #   x1,x2,w,h = 
    # w0, h0 = im.size[:2]
    # r_x, r_y = random.randint( w0-im_size,im_size-bb-1), random.randint(bb+1, im_size-bb-1) 
    # new_im = im[w0-im_size:(orig_im_coords[1] + (im_size-r_x)), (orig_im_coords[1] - r_y):(orig_im_coords [1] + (im_size-r_y))]
    #   if new_ims[name + '_' + str(line[0:3]) + '_' + str(i)].shape != (416,416,3):
    #     del new_ims[name + '_' + str(line[0:3]) + '_' + str(i)]
    #   else:
    #     labels[name + '_' + str(line[0:3]) + '_' + str(i)] = [r_x - bb, r_y- bb, r_x + bb, r_y + bb ]


  # iterate through each mask 
  for i in range(len(all_files)):

    if test_size <= i < (test_size + val_size):
      subset = 'valid' 
    elif i >= (test_size + val_size):
      subset = 'train'

    if i < test_size:
      num = randint(0,test_size)
    else:
      num = randint(test_size,len(all_files)-1)

    if DB == 'coverage':
      ext = '.jpg'
      # remove forgery type info from image number so it matches with the corresponding image
      # for COVERAGE dataset matching numbers in names mean corresponding image/mask pair
      auth_name = re.sub("[^0-9]", "", all_files[i])
      
      if all_files[i] != auth_name + 'paste.jpeg':
        continue

      tamp_name = auth_name + 't' 
      # make sure to not paste on tampered image for synthetic generation
      auth2_name = re.sub("[^0-9]", "", all_files[num]) 

      # get image and mask using above
      tamp_im = cv2.imread(os.path.join(root, "images", tamp_name + ext))
      mask_im = cv2.imread(os.path.join(root, files, all_files[i]))
      # COVERAGE contains a corresponding 'real' image for each tampered image
      auth_im = cv2.imread(os.path.join(root, "images", auth_name + ext))
      auth2_im = cv2.imread(os.path.join(root, "images", auth2_name + ext)) 
    
    if DB == 'ifs-tc':
      ext = '.png'

      if all_files[i] == all_files[i].split('.mask')[0]:
        continue
      tamp_name = all_files[i].split('.mask')[0]
      auth_name = all_files[i].split(ext)[0]
      auth2_name = list(sorted(os.listdir(os.path.join(root, 'pristine'))))[i].split(ext)[0]
      print(auth2_name)

      tamp_im = cv2.imread(os.path.join(root, files, tamp_name + ext))
      mask_im = cv2.imread(os.path.join(root, files, all_files[i]))
      auth_im = cv2.imread(os.path.join(root, "pristine", auth_name + ext)) 
      auth2_im = cv2.imread(os.path.join(root, "pristine", auth2_name + ext)) 
      #print(list(sorted(os.listdir(os.path.join(root, 'pristine'))))[i])


    if num_real > 0 and count > 0:
        count -= 1
        
        # write empty file meaning 'no object' for untampered images (ones without the 't' are untampered)
        with open(os.path.join(root, subset, "labels", auth_name + '.txt'), 'w') as f:     
          f.write(' ')
        # then add the real image to the set folder

        ### HERE
        auth_im = cv2.resize(auth_im, (im_size,im_size))
        cv2.imwrite(os.path.join(root, subset, "images", auth_name + ext), auth_im)
      

    for j in range(n_rotations):

      
      # read in image to extract masked region
      # im = cv2.imread(os.path.join(root, "images", tamp_name + '.jpeg'))

      # read corresponding mask
      # mask = cv2.imread(os.path.join(root, "masks", all_files[i]))

      # rotate both the mask and the tamped image its extracted from 
      if j == 0:
        # no rotation (full 360)
        mask_im = cv2.rotate(mask_im, cv2.ROTATE_180)
        tamp_im = cv2.rotate(tamp_im, cv2.ROTATE_180)
        rotate = cv2.ROTATE_180
      elif j == 1:
        if subset == 'test':
          break
        rotate = cv2.ROTATE_90_COUNTERCLOCKWISE
      elif j == 2:
        rotate = cv2.ROTATE_180
      else:
        rotate = cv2.ROTATE_90_CLOCKWISE
      
      # get BBs from mask
      # first rotate and resize
      
      mask_im = cv2.rotate(mask_im, rotate)

      ### HERE
      mask_im = cv2.resize(mask_im, (im_size,im_size))

      # make sure image is binary and threshold
      gray = cv2.cvtColor(mask_im, cv2.COLOR_BGR2GRAY)
      thresh = cv2.threshold(gray,250,255,cv2.THRESH_BINARY)[1]
      cnts = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
      cnts = cnts[0] if len(cnts) == 2 else cnts[1]
      for c in cnts:
        x,y,w1,h1 = cv2.boundingRect(c)

        
      # YOLO requires coordinates in x,y center, x,y height/width
      c_x = x + (w1 // 2) 
      c_y = y + (h1 // 2)
      # YOLO requires normalization of coordinates      
      c_x /= im_size
      c_y /= im_size 
      w = (w1 + bb_pad) / im_size 
      h = (h1 + bb_pad)/ im_size
      
      # rotate and resize image (same ask mask)
      im_r = cv2.rotate(tamp_im, rotate)

      ### HERE
      out_im = cv2.resize(im_r, (im_size,im_size))

      if synthetic == True:  
        # read in random image to paste extracted masked region onto it
        # and keep BG and FG images in same sets:

        ### HERE
        auth2_im = cv2.resize(auth2_im, (im_size, im_size))
      
        # mask of im is 0/black everywhere where im2 will go   
        out_im = np.where(mask_im == 0, auth2_im, out_im) # also works
        #im = np.where(mask >= 250, im, im2)

      if augmentation != 'None':
        # imagaug and albumentations (augmentations for BB included)
        if subset == 'test':
          break
        if augmentation == 'brightness':
          b = np.ones(out_im.shape, dtype='uint8')*70
          out_im = cv2.add(out_im,b)

        elif augmentation == 'dullness':
          d = np.ones(out_im.shape, dtype='uint8')*70
          out_im = cv2.subtract(out_im,d)

        elif augmentation == 'sharpness':
          s = np.array([[-1,-1,-1],[-1,10,-1],[-1,-1,-1]])
          out_im = cv2.filter2D(out_im, -1,s)

      # debug:
      #im = cv2.rectangle(im, (x- (bb_pad//2),y- (bb_pad//2)), (x+w1 + (bb_pad//2),y+h1+ (bb_pad//2)), (255, 255, 255), 2)
      
      cv2.imwrite(os.path.join(root, subset, "images", tamp_name + '_' + augmentation + str(j) + ext), out_im)

      with open(os.path.join(root, subset, "labels", tamp_name + '_' + augmentation + str(j) + '.txt'), 'w') as f:
        f.write(str(0) + ' ' + str(c_x) + ' ' + str(c_y) + ' ' + str(w) + ' ' + str(h))




## Label actual Tampered Dataset (and divide into test/train/valid)

In [62]:
createLabeledSets(root, DB, files, test_size, val_size, im_size, True, 0, 'None', 4)

000bc3906100ede4b1374cea075adedb
003f2d6e572e2587ffa809f78ff4a5c4
00dd93f9e4af553929e54e66c2a3186e
00e079b66d9e9f99892bbb81d9d6cd57
00ec0ffe0cd029ad4551680484a007d7
00f2eadf3c83c55da30868177dfdc62d
01aa04db92eb2617ebfe69ffe32b1bdd
01ca35c7722fc0553bad629e14c00d2c
01e6ccb1d207ce31033251643aa55f9d
02b3c3a0865816af3992f188a5b6c94a
02c4c79f3fb145f4c80fa12cfece8b87
02e60bdefd5ddc034000c93fffdf645e
02efa34b5034a41fb487ef38f07c321c
03a529bdfeb4484b679ce36d419c0a04
03c794c1aa296f4566d2ee760128de3f
03cd095b978da91503f8aa88c5dcfa82
03eaa5163b84efa2b14c5d1431107200
03ecdd7f56567b5e64a9f9c768af4295
04b9dd7d5e3d24e0bca41ff9173626f1
04c1b2b6bfe74356016a1a4c43b5d207
04d607a8124fa06ee737411f24444bcf
04f7cceb8160dcec285a0a79ee7bae7a
05d3a37752dd9de965da4b5f88ed97d0
05d9c3c9bb54b8beca02ac89553bd84a
05e7e21796871e4303cac587265b5d60
06a6e29f235cf41ed4c3ca707d994a30
06c190007f980b9d58a25a6db433327a
06ea436a8f36419fa350dab4e135fecb
06f0bd10b361346e4dfe56d185421df1
06fd147009294369e6cd87cdf5a6e77d
07a65b4912

### Create sets (non-synthetic)

In [None]:
createLabeledSets(root, DB, files, test_size, val_size, im_size, False, 0, 'None', 4)

In [None]:
createLabeledSets(root, DB, files, test_size, val_size, im_size, False, 0, 'brightness', 1)

In [None]:
createLabeledSets(root, DB, files, test_size, val_size, im_size, False, 0, 'dullness', 1)

In [None]:
%%writefile /content/data.yaml
train: /content/train/images
val: /content/valid/images
test: /content/test/images


nc: 1
names: ['tampered']

Writing /content/data.yaml


### move images and masks folders into syn directory before running below

In [None]:
root = '/content/syn/'

In [None]:
# create file structures:
os.mkdir(root)
# train 
os.mkdir(root + 'train')
os.mkdir(root + 'train/' + 'labels')
os.mkdir(root + 'train/' + 'images')
# test
os.mkdir(root + 'test')
os.mkdir(root + 'test/' + 'labels')
os.mkdir(root + 'test/' + 'images')
# valid
os.mkdir(root + 'valid')
os.mkdir(root + 'valid/' + 'labels')
os.mkdir(root + 'valid/' + 'images')

In [None]:
createLabeledSets(root, DB, test_size, val_size, im_size, True, 'None', 2)

In [None]:
%%writefile /content/data_syn.yaml
train: /content/syn/train/images
val: /content/syn/valid/images
test: /content/syn/test/images


nc: 1
names: ['tampered']

Writing /content/data_syn.yaml


# Setup

Clone repo, install dependencies and check PyTorch and GPU.

In [None]:
!git clone https://github.com/ultralytics/yolov5  # clone
%cd yolov5
%pip install -qr requirements.txt  # install

#import torch
import utils
display = utils.notebook_init()  # checks

INFO:yolov5:YOLOv5 🚀 v6.1-386-g2e57b84 Python-3.7.13 torch-1.12.1+cu113 CUDA:0 (Tesla P100-PCIE-16GB, 16281MiB)
YOLOv5 🚀 v6.1-386-g2e57b84 Python-3.7.13 torch-1.12.1+cu113 CUDA:0 (Tesla P100-PCIE-16GB, 16281MiB)


Setup complete ✅ (4 CPUs, 25.5 GB RAM, 37.6/166.8 GB disk)


In [None]:
%%time
%cd /content/yolov5/
!python train.py --img 416 --batch 16 --epochs 240 --data '/content/data.yaml' --cfg /content/yolov5/models/yolov5m.yaml --weights '/content/syn240.pt' --name yolov5s_results  --cache

In [None]:
%%time
%cd /content/yolov5/
!python val.py --weights /content/yolov5/runs/train/yolov5s_results3/weights/best.pt --data /content/data.yaml --img 416 --conf 0.2 --task test --save-txt


In [None]:
%cd /content/yolov5/
!python detect.py --weights runs/train/yolov5s_results/weights/best.pt --img 416 --conf 0.2 --source ../test/images

## Freeze backbone

In [None]:
%%time
%cd /content/yolov5/
!python train.py --freeze 10 --img 416 --batch 16 --epochs 240 --data '/content/data.yaml' --cfg /content/yolov5/models/yolov5m.yaml --weights '/content/syn240.pt' --name yolov5s_results  --cache