<a href="https://colab.research.google.com/github/sawyermade/detectron2_pkgs/blob/master/colab/Detectron2_Train_COCO_2017.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# [Detectron2 Training: Mask R-CNN with COCO 2017](https://github.com/sawyermade/detectron2_pkgs)
<img src="https://dl.fbaipublicfiles.com/detectron2/Detectron2-Logo-Horz.png" width="500">

# **Github:** https://github.com/sawyermade/detectron2_pkgs/tree/master/colab

# **COCO 2017 Setup:** https://github.com/sawyermade/detectron2_pkgs/tree/master/dataset_download

# **Google Drive Setup**: 

## Google Drive: https://drive.google.com

## Youtube Video: https://youtu.be/I-sfiHEL2Sw

## 1. Create directory in root of 'My Drive' called 'detectron2_models' (case sensitive).

## 2. Add COCO dataset share to your google drive account via this link (login if necessary): https://drive.google.com/drive/folders/1EVsLBRwT2njNWOrmBAhDHvvB8qrd9pXT?usp=sharing

## 3. Now COCO directory should be in your 'Shared with me' section of google drive

## 4. Create shortcut of COCO share to your drive, right click and select 'Add Shortcut to drive', choose root of 'My Drive'. Now you should see COCO directory under 'My Drive' section.

## 5. ***Resume Training*** by setting resume_training to True in the 'Resume or Debug?' cell, then selecting Run All under Runtime.

# **Keep Colab From Timing Out:**
Prevents Colab from disconnecting due to inactivity while training and lets you sleep :)

## Youtube Video: https://youtu.be/TCuklhUHDKQ
Press Ctrl+Shift+i in your browser then select console tab.

Copy and paste this into console and press return:
```
function ClickConnect(){
console.log("Working"); 
document.querySelector("colab-toolbar-button#connect").click() 
}
setInterval(ClickConnect,60000)
```
"colab-toolbar-button#connect"

"connect-button-resource-display"

# **Mount Google Drive**
Mounts Google Drive so you can read COCO from it and save models to it. Will also allow you to resume.

You will be prompted with an HTTP link to get your Google OAuth credential key. Paste the key into the input box and press return.

## Youtube Video: https://youtu.be/Qa6lS2lGPl4

## Mount Google Drive:
Mounts Google Drive to /content/gdrive

In [0]:
# Imports Colab's Google Drive helper and mounts the drive at /content/gdrive.
# Later cells in this notebook read COCO from '/content/gdrive/My Drive/COCO/2017'
# and link ./output to '/content/gdrive/My Drive/detectron2_models', so this
# cell has to run before them.
from google.colab import drive 
drive.mount('/content/gdrive')

# **Install Detectron2 and Deps**
Installs all the deps needed through pip and restarts runtime.

Takes about 5 minutes to run.


In [0]:
# Starts timer
from datetime import timedelta
import time
from tqdm import tqdm
time_start = time.time()

try: 
  import detectron2

except:
  # Clone git repo, you can fork and make your own configs
  !git clone --recurse-submodules https://github.com/sawyermade/detectron2_pkgs.git

  # Installs detectron2
  !pip install -U torch torchvision
  !pip install git+https://github.com/facebookresearch/fvcore.git
  !rm -rf detectron2_pkgs/detectron2/build
  !pip install -U -e detectron2_pkgs/detectron2

  # Installs additional packages for other types of training
  !pip install -U 'git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI'
  !pip install -U 'git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI'
  !pip install -U 'git+https://github.com/cocodataset/panopticapi.git'
  !pip install -U 'git+https://github.com/mcordts/cityscapesScripts.git'
  !pip install imageio shapely
  # !pip install -U 'git+https://github.com/lvis-dataset/lvis-api.git' #LVIS not working atm

  # Prints running time
  time_stop = time.time()
  time_delta = timedelta(seconds=(time_stop - time_start))
  print(f'Running Time: {time_delta}')

  # Resets runtime
  import os
  os.kill(os.getpid(), 9)


# **Resume or Debug?**
Set resume_training to True to resume, set to False to start new.

Set debug_flag to True to enable some debugging functions, like random image sampling.

In [0]:
# True: the argument list is built with '--resume' and the optional dataset
# check cells are skipped. False: the argument list is built without '--resume'.
resume_training = True
# True: args.debug is forced on after argument parsing. The optional dataset
# check cells run only when this is True AND resume_training is False.
debug_flag = True

# **COCO 2017 Import**
Loads symlinks for COCO 2017 from Google Drive to local disk.


## Create Local Symlinks:
Creates symlinks from Google Drive to local /content/datasets/coco.

This avoids the Google Drive caching bug that occurs when reading a directory containing a very large number of files.

There are 24 subdirectories of 5000 for train2017 and 9 subdirectories of 5000 for test2017.

Takes about 25 minutes to run.

In [0]:
# Create directories
!rm -rf datasets
!mkdir -p datasets/coco/train2017 datasets/coco/test2017

# Imports and starts timer
import time, os
from tqdm import tqdm
from datetime import timedelta
time_start = time.time()

# Symlink COCO
gdrive_path = r'/content/gdrive/My Drive/COCO/2017'
local_path = 'datasets/coco/'
dnc_dirs = ['train2017', 'test2017']
for d in os.listdir(gdrive_path):
  if d not in dnc_dirs:
    os.system(f'ln -s \"{os.path.join(gdrive_path, d)}\" {local_path}')

# Create symlinks for sub directories for train2017 and test2017
dir_sub_list = [
  os.path.join(gdrive_path, 'train2017_sub'), 
  os.path.join(gdrive_path, 'test2017_sub')
]
for d, d_sub in zip(dnc_dirs, dir_sub_list):
  for root, dirs, files in tqdm(os.walk(d_sub)):
    if files:
      for filename in files:
        os.system(f'ln -s \"{os.path.join(root, filename)}\" {os.path.join(local_path, d)}')

# Prints the file count in train2017 and test2017
train_count = len(os.listdir(os.path.join(local_path, 'train2017')))
test_count = len(os.listdir(os.path.join(local_path, 'test2017')))
print(f'Train2017 Count(118287): {train_count}')
print(f'Test2017 Count(40670): {test_count}')

# Links output
!unlink output 
!ln -s /content/gdrive/My\ Drive/detectron2_models ./output

# Prints running time
time_stop = time.time()
time_delta = timedelta(seconds=(time_stop - time_start))
print(f'Running Time: {time_delta}')

# **Imports**
Import all the modules needed to train.

In [0]:
import os, sys, argparse, logging, torch, random, cv2
from google.colab.patches import cv2_imshow

from collections import OrderedDict
import detectron2.utils.comm as comm
from detectron2.checkpoint import DetectionCheckpointer
from detectron2.config import get_cfg
from detectron2.data import MetadataCatalog, DatasetCatalog
from detectron2.engine import DefaultTrainer, default_argument_parser, default_setup, hooks, launch
from detectron2.evaluation import (
	CityscapesEvaluator,
	COCOEvaluator,
	COCOPanopticEvaluator,
	DatasetEvaluators,
	LVISEvaluator,
	PascalVOCDetectionEvaluator,
	SemSegEvaluator,
	verify_results,
)
from detectron2.modeling import GeneralizedRCNNWithTTA
from detectron2.data.datasets import register_coco_instances
from detectron2.utils.visualizer import Visualizer

# **Arguments**
Setup all the arguments needed for training like configs, resume, opts, etc.

## Argument Parser Definition:
Run to create the arg parse function definition.

In [0]:
# Arg parser
def argument_parser(arg_list=None):
    """
    Build the training argument parser and parse arguments with it.

    Args:
        arg_list: list of argument strings to parse. When None, the process
            command line (sys.argv) is parsed instead. An empty list is
            parsed as-is, i.e. it yields the defaults.

    Returns:
        argparse.Namespace: the parsed arguments.
    """
    parser = argparse.ArgumentParser(description="Detectron2 Training")
    parser.add_argument(
        "--config-file",
        '-cf',
        default='detectron2_pkgs/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml',
        metavar="FILE",
        help="path to config file",
    )
    parser.add_argument(
        "--resume",
        action="store_true",
        help="whether to attempt to resume from the checkpoint directory",
    )
    parser.add_argument("--eval-only", action="store_true", help="perform evaluation only")
    parser.add_argument("--num-gpus", type=int, default=1, help="number of gpus *per machine*")
    parser.add_argument("--num-machines", type=int, default=1)
    parser.add_argument(
        "--machine-rank", type=int, default=0, help="the rank of this machine (unique per machine)"
    )

    # PyTorch still may leave orphan processes in multi-gpu training.
    # Therefore we use a deterministic way to obtain port,
    # so that users are aware of orphan processes by seeing the port occupied.
    port = 2 ** 15 + 2 ** 14 + hash(os.getuid()) % 2 ** 14
    parser.add_argument("--dist-url", default="tcp://127.0.0.1:{}".format(port))
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=['SOLVER.IMS_PER_BATCH', '2', 'SOLVER.BASE_LR', '0.0025'],
        nargs=argparse.REMAINDER,
    )

    parser.add_argument(
        '--dataset-name',
        '-dn',
        dest='dataset_name',
        help='Name of dataset',
        type=str,
        default='coco_2017'
    )
    parser.add_argument(
        '--train-gt',
        '-tgt',
        dest='train_gt',
        help='Path to train json',
        type=str,
        default=None
    )
    parser.add_argument(
        '--val-gt',
        '-vgt',
        dest='val_gt',
        help='Path to val json',
        type=str,
        default=None
    )
    parser.add_argument(
        '--train-dir',
        '-tdir',
        dest='train_dir',
        help='Path to train directory',
        type=str,
        default=None
    )
    parser.add_argument(
        '--val-dir',
        '-vdir',
        dest='val_dir',
        help='Path to val directory',
        type=str,
        default=None
    )
    parser.add_argument(
        "--debug",
        action="store_true",
        help="Enable DEBUG",
    )
    parser.add_argument(
        '--cuda',
        '-cu',
        dest='cuda',
        help='CUDA card to use',
        type=str,
        default='0'
    )

    # Compare against None instead of truthiness: an explicit empty list must
    # not fall through to sys.argv, which in a notebook holds the kernel's own
    # launch arguments rather than training options.
    if arg_list is not None:
        return parser.parse_args(args=arg_list)
    return parser.parse_args()

## Create Arguments List: 
This is where you set your config file.

Run New Model Training for a new model or Resume Model Training to resume.

### Config File:
Path to config file.

In [0]:
# Config file handed to the argument parser as '--config-file' by the
# following cells. setup() also uses its base name (extension stripped) as the
# sub directory name under ./output.
config_path = 'detectron2_pkgs/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml'

### New Model Training:
For training new model.

In [0]:
# Fresh run: config file first, then the solver overrides collected as opts
if not resume_training:
  arg_list = ['--config-file', config_path]
  arg_list += ['SOLVER.IMS_PER_BATCH', '2', 'SOLVER.BASE_LR', '0.0025']

### Resume Model Training:
Resume model training.

In [0]:
# Resumed run: same as a fresh run plus the '--resume' switch
if resume_training:
  arg_list = ['--config-file', config_path, '--resume']
  arg_list += ['SOLVER.IMS_PER_BATCH', '2', 'SOLVER.BASE_LR', '0.0025']

## Parse Arguments:
Parses args.

In [0]:
# Parse the argument list built above; the notebook-level debug switch can
# only turn debug mode on, never off.
args = argument_parser(arg_list)
if debug_flag:
  args.debug = True
print(args)

# **Check Dataset**: Optional
Checks that dataset and ground truth are working properly. This step is optional. 

Requires debug_flag set to True (which turns on args.debug) and resume_training set to False.

## Loads Meta Data and Dicts:

In [0]:
# Runs only for a new (non-resumed) run with debugging enabled. The names
# defined here are consumed by the random sampling cell further down, which is
# guarded by the same condition.
if not resume_training and args.debug:
  # Loads training meta data and the list of training records
  # NOTE(review): presumably 'coco_2017_train' / 'coco_2017_val' are detectron2's
  # built-in registrations resolved against ./datasets/coco -- confirm.
  coco_train_metadata = MetadataCatalog.get('coco_2017_train')
  coco_train_dicts = DatasetCatalog.get('coco_2017_train')

  # Loads validation meta data and the list of validation records
  coco_val_metadata = MetadataCatalog.get('coco_2017_val')
  coco_val_dicts = DatasetCatalog.get('coco_2017_val')



## Meta Check Definition:
Defines the meta check.

In [0]:
# Random sample check
def random_meta_check(dataset_dicts, dataset_metadata, name='Test'):
    """
    Display up to three randomly chosen dataset records with their annotations.

    Args:
        dataset_dicts: list of dataset records; each record's "file_name" entry
            is read as the image path.
        dataset_metadata: metadata object passed to the Visualizer.
        name: unused; kept so existing callers keep working.

    Records whose image cannot be read are reported and skipped.
    """
    # Never ask for more samples than exist; random.sample raises ValueError
    # otherwise (e.g. on an empty or tiny dataset).
    sample_size = min(3, len(dataset_dicts))
    for record in random.sample(dataset_dicts, sample_size):
        img = cv2.imread(record["file_name"])
        if img is None:
            # cv2.imread signals an unreadable/missing file by returning None.
            print(f'Could not read image: {record["file_name"]}')
            continue
        # Channel order is reversed before drawing and reversed back for display.
        visualizer = Visualizer(img[:, :, ::-1], metadata=dataset_metadata, scale=0.5)
        vis = visualizer.draw_dataset_dict(record)
        cv2_imshow(vis.get_image()[:, :, ::-1])
        cv2.waitKey(0)
        cv2.destroyAllWindows()

## Runs random sampling check:
Runs random sampling of data with ground truth.

In [0]:
# Same guard as the metadata loading cell: the coco_*_dicts / coco_*_metadata
# names used below only exist when that cell actually ran.
if not resume_training and args.debug:
  # Samples training data
  random_meta_check(coco_train_dicts, coco_train_metadata, 'Train')

  # Samples validation data
  random_meta_check(coco_val_dicts, coco_val_metadata, 'Valid')


# **Config Setup Function:**
Sets up configuration for network.

In [0]:
# Config setup
def setup(args):
    """
    Build the frozen detectron2 config for this run.

    The output directory defaults to ./output/<config file name without its
    extension>. It is assigned before the config file and the command-line
    opts are merged, so either of them may still override it.

    Args:
        args: parsed arguments; reads ``config_file`` and ``opts``.

    Returns:
        The frozen config, after ``default_setup`` has been run on it.
    """
    config_filename = args.config_file.split(os.sep)[-1]
    run_name = config_filename.rsplit('.', 1)[0]

    cfg = get_cfg()
    cfg.OUTPUT_DIR = os.path.join('./output', run_name)
    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    default_setup(cfg, args)
    return cfg

# **Trainer Class:**
Class used to train and evaluate the network.

In [0]:
class Trainer(DefaultTrainer):
    """
    Trainer based on "DefaultTrainer", which bundles pre-defined logic for the
    standard training workflow. That logic may not suit a new research
    project; in that case use the cleaner "SimpleTrainer" or write your own
    training loop, with "tools/plain_train_net.py" as an example.
    """

    @classmethod
    def build_evaluator(cls, cfg, dataset_name, output_folder=None):
        """
        Create the evaluator(s) for a dataset.

        The choice is driven by the "evaluator_type" metadata of the dataset.
        For a custom dataset you can build an evaluator by hand instead of
        relying on this dispatch.

        Args:
            cfg: config; OUTPUT_DIR and the SEM_SEG_HEAD settings are read.
            dataset_name: name looked up in MetadataCatalog.
            output_folder: where evaluators write results; defaults to
                "<cfg.OUTPUT_DIR>/inference".

        Returns:
            A single evaluator, or DatasetEvaluators when several apply.

        Raises:
            NotImplementedError: no evaluator matches the evaluator type.
        """
        if output_folder is None:
            output_folder = os.path.join(cfg.OUTPUT_DIR, "inference")
        evaluator_type = MetadataCatalog.get(dataset_name).evaluator_type

        # Types handled by exactly one dedicated evaluator return right away.
        if evaluator_type == "cityscapes":
            assert (
                torch.cuda.device_count() >= comm.get_rank()
            ), "CityscapesEvaluator currently do not work with multiple machines."
            return CityscapesEvaluator(dataset_name)
        if evaluator_type == "pascal_voc":
            return PascalVOCDetectionEvaluator(dataset_name)
        if evaluator_type == "lvis":
            return LVISEvaluator(dataset_name, cfg, True, output_folder)

        # Remaining types may combine several evaluators; panoptic uses all three.
        selected = []
        if evaluator_type in ("sem_seg", "coco_panoptic_seg"):
            sem_seg_evaluator = SemSegEvaluator(
                dataset_name,
                distributed=True,
                num_classes=cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES,
                ignore_label=cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE,
                output_dir=output_folder,
            )
            selected.append(sem_seg_evaluator)
        if evaluator_type in ("coco", "coco_panoptic_seg"):
            selected.append(COCOEvaluator(dataset_name, cfg, True, output_folder))
        if evaluator_type == "coco_panoptic_seg":
            selected.append(COCOPanopticEvaluator(dataset_name, output_folder))

        if not selected:
            raise NotImplementedError(
                "no Evaluator for the dataset {} with the type {}".format(
                    dataset_name, evaluator_type
                )
            )
        if len(selected) == 1:
            return selected[0]
        return DatasetEvaluators(selected)

    @classmethod
    def test_with_TTA(cls, cfg, model):
        """
        Evaluate ``model`` wrapped in test-time augmentation.

        Only some R-CNN models are supported. One evaluator is built per
        dataset in cfg.DATASETS.TEST, writing to "<cfg.OUTPUT_DIR>/inference_TTA".

        Returns:
            OrderedDict: the test results with "_TTA" appended to every key.
        """
        logger = logging.getLogger("detectron2.trainer")
        logger.info("Running inference with test-time augmentation ...")
        tta_model = GeneralizedRCNNWithTTA(cfg, model)

        tta_output_folder = os.path.join(cfg.OUTPUT_DIR, "inference_TTA")
        evaluators = []
        for name in cfg.DATASETS.TEST:
            evaluators.append(
                cls.build_evaluator(cfg, name, output_folder=tta_output_folder)
            )

        results = cls.test(cfg, tta_model, evaluators)
        return OrderedDict((key + "_TTA", value) for key, value in results.items())

# **Main Function:**
Main function that gets launched for training.

In [0]:
# Main function to get launched
def main(args):
    """
    Entry point handed to ``launch``: train, or only evaluate, a model.

    Args:
        args: parsed arguments; reads ``eval_only`` and ``resume`` here and is
            passed on to ``setup``.

    Returns:
        With ``eval_only`` set, the evaluation results (extended with the TTA
        results when cfg.TEST.AUG.ENABLED). Otherwise whatever
        ``trainer.train()`` returns.
    """
    # Gets and sets up config
    cfg = setup(args)

    if not args.eval_only:
        # If you'd like to do anything fancier than the standard training
        # logic, consider writing your own training loop or subclassing the
        # trainer.
        trainer = Trainer(cfg)
        trainer.resume_or_load(resume=args.resume)
        if cfg.TEST.AUG.ENABLED:
            tta_hook = hooks.EvalHook(
                0, lambda: trainer.test_with_TTA(cfg, trainer.model)
            )
            trainer.register_hooks([tta_hook])
        return trainer.train()

    # Evaluation only: build the model, load its weights, and test it.
    model = Trainer.build_model(cfg)
    checkpointer = DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR)
    checkpointer.resume_or_load(cfg.MODEL.WEIGHTS, resume=args.resume)

    results = Trainer.test(cfg, model)
    if comm.is_main_process():
        verify_results(cfg, results)
    if cfg.TEST.AUG.ENABLED:
        results.update(Trainer.test_with_TTA(cfg, model))
    return results

# **Launch Training:**
Starts training by launching main function.

In [0]:
# Launches main and starts training.
# The process layout (GPUs per machine, machine count, this machine's rank and
# the distributed URL) comes straight from the parsed args; with the parser
# defaults that is 1 GPU on 1 machine. The parsed namespace itself is forwarded
# as the single positional argument of main via args=(args,).
launch(
  main,
  args.num_gpus,
  num_machines=args.num_machines,
  machine_rank=args.machine_rank,
  dist_url=args.dist_url,
  args=(args,),
)