## Research Paper Segmentation 

*CS445 Computational Photography Final Project*

- Bruno Seo (sbseo2)
- Michal Gryga (mgryga2)

References
- https://blog.francium.tech/object-detection-with-faster-rcnn-bc2e4295bf49
- https://pytorch.org/tutorials/intermediate/torchvision_tutorial.html

### Colab Setting

- Uncomment and run the cell below to download the `paper dataset`
- Change runtime type to `GPU`

In [None]:
# !wget https://uofi.box.com/shared/static/zaer1y9ob4lnb9r1sihmyp5pyivmbrop.zip -O paper_dataset.zip
# !unzip paper_dataset.zip

### Load Packages

In [None]:
import pandas as pd
import numpy as np
import cv2
import torch
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection import FasterRCNN
from torch.utils.data import DataLoader, Dataset
import os
from collections import defaultdict
from matplotlib import pyplot as plt
import copy

### Set Data Directory

In [None]:
# Directories for the training images, their bounding-box CSVs, and the
# test images, in that order.
img_dir, csv_dir, tst_dir = (
    './paper_dataset/img/',
    './paper_dataset/csv/',
    './tst/',
)

### Define Training Dataset

In [None]:
class PaperDataset(object):
    """In-memory detection dataset of paper pages with labelled regions.

    Images are read from ``<img_dir>/<i>.jpg`` and annotations from
    ``<csv_dir>/<i>.csv`` for ``i = 1..N``, where ``N`` is the number of
    ``.jpg`` files found in ``img_dir``.  Every CSV must provide the columns
    ``name``, ``xmin``, ``ymin``, ``xmax`` and ``ymax`` and contain one row
    for each entry of ``CLASS_NAMES``.
    """

    # Region names looked up in every CSV, in the order they are emitted.
    CLASS_NAMES = ('title', 'author', 'abstract')
    # Integer label per region; 0 is not used (kept free for background).
    LABEL_IDS = {'title': 1, 'author': 2, 'abstract': 3}

    def __init__(self, img_dir, csv_dir):
        """Load every image and annotation table into memory.

        Args:
            img_dir: Directory holding ``1.jpg`` ... ``N.jpg``.
            csv_dir: Directory holding ``1.csv`` ... ``N.csv``.

        Raises:
            FileNotFoundError: If an expected image cannot be read.
        """
        self.img_dic = {}
        self.csv_dic = {}

        # Count only image files so stray directory entries (hidden files,
        # checkpoint folders, ...) do not inflate the dataset size.
        self.num_images = sum(
            1 for fname in os.listdir(img_dir) if fname.endswith('.jpg')
        )

        for i in range(1, self.num_images + 1):
            img_path = os.path.join(img_dir, '{}.jpg'.format(i))
            im = cv2.imread(img_path)
            if im is None:
                # cv2.imread reports failure by returning None, not raising.
                raise FileNotFoundError(
                    'Could not read image: {}'.format(img_path)
                )
            # Scale to float32 in [0, 1].
            # NOTE: cv2.imread yields BGR channel order; it is kept as loaded.
            self.img_dic[i - 1] = (im / 255.0).astype('float32')

            csv_path = os.path.join(csv_dir, '{}.csv'.format(i))
            self.csv_dic[i - 1] = pd.read_csv(csv_path)

    def __getitem__(self, idx):
        """Return ``(image, target)`` for the zero-based sample ``idx``.

        Returns:
            image: float32 tensor laid out as (channels, height, width).
            target: dict with ``boxes`` (one ``[xmin, ymin, xmax, ymax]`` row
                per class, float32), ``labels`` (int64), ``image_id``,
                ``area`` and ``iscrowd`` (all zeros).

        Raises:
            KeyError: If ``idx`` is not a loaded sample.
            IndexError: If the CSV has no row for one of ``CLASS_NAMES``.
        """
        im = self.img_dic[idx]
        csv = self.csv_dic[idx]

        boxes = []
        labels = []
        for name in self.CLASS_NAMES:
            # Filter once per class and take the first matching row.
            rows = csv.loc[csv['name'] == name]
            if rows.empty:
                raise IndexError(
                    'No {!r} row in annotations for sample {}'.format(name, idx)
                )
            row = rows.iloc[0]
            boxes.append([
                int(row['xmin']),
                int(row['ymin']),
                int(row['xmax']),
                int(row['ymax']),
            ])
            labels.append(self.LABEL_IDS[name])

        boxes = torch.as_tensor(boxes, dtype=torch.float32)
        labels = torch.as_tensor(labels, dtype=torch.int64)

        # height * width of every box
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])

        target = {
            "boxes": boxes,
            "labels": labels,
            "image_id": torch.tensor([idx]),
            "area": area,
            # No instance is treated as a crowd region.
            "iscrowd": torch.zeros((len(labels),), dtype=torch.int64),
        }

        # HWC float array -> CHW tensor.  This is what ToTensor does for
        # non-uint8 input (no rescaling is applied to float arrays).
        img = torch.from_numpy(im.transpose((2, 0, 1))).contiguous()

        return img, target

    def __len__(self):
        """Return the number of samples loaded by ``__init__``."""
        return self.num_images

In [None]:
# Build the training dataset from the image and annotation directories.
train_dataset = PaperDataset(img_dir=img_dir, csv_dir=csv_dir)
def collate_fn(batch):
    """Regroup a list of per-sample tuples into one tuple per field.

    For samples shaped ``(image, target)`` this turns
    ``[(img0, tgt0), (img1, tgt1)]`` into ``((img0, img1), (tgt0, tgt1))``
    without stacking anything into a single tensor.
    """
    per_field = zip(*batch)
    return tuple(per_field)
# Training loader.  Shuffling makes each epoch visit the samples in a new
# order; collate_fn keeps images and targets as tuples instead of stacking
# them into one tensor.
train_data_loader = DataLoader(
    train_dataset,
    batch_size=2,
    shuffle=True,
    num_workers=2,
    collate_fn=collate_fn,
)

In [None]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Pull a single batch and move its images and target tensors to the device.
images, targets = next(iter(train_data_loader))
images = [image.to(device) for image in images]
targets = [
    {key: value.to(device) for key, value in target.items()}
    for target in targets
]

### Sanity Check

Check whether the training set was created properly by drawing the ground-truth boxes on one sample.

In [None]:
# Draw the ground-truth boxes of the first sample in the batch.
boxes = targets[0]['boxes'].cpu().numpy().astype(np.int32)

# CHW tensor -> HWC array.  The dataset reads images with cv2 (BGR order)
# while matplotlib expects RGB, so swap the channels before displaying.
img = images[0].permute(1, 2, 0).cpu().numpy().copy()
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
plt.figure(figsize=(10, 10))

for box in boxes:
    # Plain Python ints are accepted by every OpenCV build; numpy scalars
    # are rejected by some.
    x_min, y_min, x_max, y_max = (int(v) for v in box)
    # The image holds floats in [0, 1], so red is 1.0 (220 would be clipped).
    cv2.rectangle(img, (x_min, y_min), (x_max, y_max), (1.0, 0.0, 0.0), 2)

plt.imshow(img)

### Modeling

- Define the model here. We use Faster R-CNN as our baseline.
- Please change `num_classes` if you are training on a different number of classes (remember to count the background class).

In [None]:
num_classes = 4 # 3 classes (title, author, abstract) + background

In [None]:
# Backbone selector read by the next cell:
#   'res' -> torchvision Faster R-CNN with a ResNet-50 FPN backbone
#   'eff' -> model built by utils.effnet_create_model
model = 'res' # Choose between 'res' and 'eff'

In [None]:
# `model` arrives here as the backbone name chosen above and is rebound to the
# network itself.  The isinstance guard keeps a re-run of this cell from
# silently falling through both branches once `model` is already a network.
if isinstance(model, str):
    backbone = model
    if backbone == 'res':
        # NOTE(review): newer torchvision releases replace `pretrained=` with
        # `weights=`; kept as-is for compatibility with the installed version.
        model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True, progress=True)
        # Swap the box-classification head for one sized to our class count.
        in_features = model.roi_heads.box_predictor.cls_score.in_features
        model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
    elif backbone == 'eff':
        from utils import effnet_create_model
        model = effnet_create_model(num_classes)
    else:
        raise ValueError(
            "Unknown model choice {!r}; expected 'res' or 'eff'".format(backbone)
        )

# Move the network to the target device whichever backbone was chosen
# (previously only the 'eff' branch did this).
model.to(device)

In [None]:
# Optimise only the parameters that require gradients.
params = list(filter(lambda p: p.requires_grad, model.parameters()))
optimizer = torch.optim.SGD(
    params,
    lr=0.005,
    momentum=0.9,
    weight_decay=0.0005,
)
# Multiplies the learning rate by 0.1 after every 3 scheduler steps.
lr_scheduler = torch.optim.lr_scheduler.StepLR(
    optimizer,
    step_size=3,
    gamma=0.1,
)
num_epochs = 2

In [None]:
itr = 1

# Move the model once, outside the loops, and force training mode: in eval
# mode the torchvision detection models return predictions, not a loss dict,
# so re-running this cell after evaluation would otherwise fail.
model.to(device)
model.train()

for epoch in range(num_epochs):
    epoch_loss_sum = 0.0
    num_batches = 0

    for images, targets in train_data_loader:
        images = [image.to(device) for image in images]
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        # In training mode the model returns a dict of individual losses.
        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())
        loss_value = losses.item()

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        epoch_loss_sum += loss_value
        num_batches += 1

        if itr % 50 == 0:
            print("Iteration #{} loss: #{}".format(itr, loss_value))

        itr += 1

    # Step the scheduler once per epoch (StepLR counts epochs, not batches).
    lr_scheduler.step()

    # Report the mean batch loss of the epoch rather than the last batch only.
    print("Epoch #{} loss: {}".format(epoch, epoch_loss_sum / max(num_batches, 1)))

### Save model  (optional)

In [None]:
# `model` is the network object by now (no longer the backbone string), so
# interpolating it directly would embed its entire multi-line repr in the
# filename.  Use the class name as the model tag instead.
torch.save(model.state_dict(), f'./model-{type(model).__name__}-epoch-{num_epochs}.pth')

### Evaluation

Run the trained model on the test data and visualize the predicted boxes.

In [None]:
class TestDataset(object):
    """In-memory dataset of unlabelled test pages.

    Images are read from ``<img_dir>/<i>.jpg`` for ``i = 1..N``, where ``N``
    is the number of ``.jpg`` files found in ``img_dir``.
    """

    def __init__(self, img_dir):
        """Load every test image into memory.

        Args:
            img_dir: Directory holding ``1.jpg`` ... ``N.jpg``.

        Raises:
            FileNotFoundError: If an expected image cannot be read.
        """
        self.img_dic = {}

        # Count only image files so stray directory entries (hidden files,
        # checkpoint folders, ...) do not inflate the dataset size.
        self.num_images = sum(
            1 for fname in os.listdir(img_dir) if fname.endswith('.jpg')
        )

        for i in range(1, self.num_images + 1):
            img_path = os.path.join(img_dir, '{}.jpg'.format(i))
            im = cv2.imread(img_path)
            if im is None:
                # cv2.imread reports failure by returning None, not raising.
                raise FileNotFoundError(
                    'Could not read image: {}'.format(img_path)
                )
            # Scale to float32 in [0, 1].
            # NOTE: cv2.imread yields BGR channel order; it is kept as loaded.
            self.img_dic[i - 1] = (im / 255.0).astype('float32')

    def __getitem__(self, idx):
        """Return ``(image, target)`` for the zero-based sample ``idx``.

        The image is a float32 tensor laid out as (channels, height, width).
        Test images have no annotations, so the target is an empty dict; the
        original code returned the undefined name ``_`` here.
        """
        im = self.img_dic[idx]
        # HWC float array -> CHW tensor.  This is what ToTensor does for
        # non-uint8 input (no rescaling is applied to float arrays).
        img = torch.from_numpy(im.transpose((2, 0, 1))).contiguous()

        return img, {}

    def __len__(self):
        """Return the number of images loaded by ``__init__``."""
        return self.num_images

In [None]:
# Load the unlabelled test images into memory.
test_dataset = TestDataset(img_dir=tst_dir)
def collate_fn(batch):
    """Transpose a list of per-sample tuples into one tuple per field.

    ``[(img0, tgt0), (img1, tgt1)]`` becomes ``((img0, img1), (tgt0, tgt1))``;
    nothing is stacked into a single tensor.
    """
    columns = zip(*batch)
    return tuple(columns)
# Test loader: one image per batch, served in dataset order.
test_data_loader = DataLoader(
    dataset=test_dataset,
    collate_fn=collate_fn,
    num_workers=1,
    shuffle=False,
    batch_size=1,
)

In [None]:
model.eval()

# At most one box is drawn per foreground class (all classes but background).
max_regions = num_classes - 1

# Inference only: no_grad avoids building autograd graphs for every image.
with torch.no_grad():
    for i, (images, _) in enumerate(test_data_loader):
        images = [image.to(device) for image in images]
        outputs = model(images)

        # The test loader uses batch_size=1, so the batch's only sample (and
        # its prediction) sits at index 0.
        idx = 0
        sample = images[idx].permute(1, 2, 0).cpu().numpy().copy()
        # The dataset reads images with cv2 (BGR order) while matplotlib
        # expects RGB, so swap the channels before drawing and displaying.
        sample = cv2.cvtColor(sample, cv2.COLOR_BGR2RGB)
        boxes = outputs[idx]['boxes'].detach().cpu().numpy()
        scores = outputs[idx]['scores'].detach().cpu().numpy()
        labels = outputs[idx]['labels'].detach().cpu().numpy()

        plt.figure(figsize=(10, 10))
        path = []  # labels that already have a box drawn
        ranked = sorted(zip(labels, scores, boxes), key=lambda det: det[1], reverse=True)
        for lbl, score, box in ranked:
            if len(path) == max_regions:
                break

            # Keep only the highest-scoring detection of each label.
            if lbl not in path:
                x_min, y_min = int(box[0]), int(box[1])
                x_max, y_max = int(box[2]), int(box[3])

                # The image holds floats in [0, 1], so red is 1.0
                # (220 would be clipped on display).
                cv2.rectangle(sample, (x_min, y_min), (x_max, y_max), (1.0, 0.0, 0.0), 2)

                path.append(lbl)

        plt.imshow(sample)