In [None]:
!pip install efficientnet_pytorch
!pip install cleanlab

In this kernel, I use Confident Learning to remove some false positives from train.csv. To learn more about Confident Learning, see the links below.

* 中文：https://zhuanlan.zhihu.com/p/146557232
* English：https://arxiv.org/pdf/1911.00068.pdf
* 日本語：https://aotamasaki.hatenablog.com/entry/confident_learning

In [None]:
import pandas as pd
import numpy as np
import os
import re
import cv2
import matplotlib.pyplot as plt
import random
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from efficientnet_pytorch import EfficientNet
from torch.utils.data import Dataset
from torchvision import transforms
from torch.optim import Adam, SGD, RMSprop
from torchvision import transforms
import torch
import torch.nn as nn
import albumentations as A
from albumentations.pytorch.transforms import ToTensorV2
from sklearn.metrics import accuracy_score
import time

In [None]:
batch_size = 512  # batch size of the inference DataLoader
num_class = 2     # output size of each classifier head
SEED = 42


def seed_everything(seed):
    """Seed every random number generator used by this notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices)
    and puts cuDNN into deterministic mode.

    :param seed: integer seed shared by all generators.
    """
    random.seed(seed)
    # Only affects interpreters started after this point (e.g. worker processes).
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seeds every visible GPU; a no-op when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick different kernels from run
    # to run, which would undo the deterministic setting above.
    torch.backends.cudnn.benchmark = False


seed_everything(SEED)

In [None]:
def preprecess(df: pd.DataFrame) -> pd.DataFrame:
    """
    Split the string ``bbox`` column into float ``x``, ``y``, ``w``, ``h`` columns.

    Adapted from
    https://www.kaggle.com/pestipeti/pytorch-starter-fasterrcnn-inference

    :param df: frame with a ``bbox`` column of strings holding four numbers.
        The frame is modified in place: the four columns are added and
        ``bbox`` is dropped.
    :return: the same ``df`` object. Rows whose ``bbox`` contains no number
        get ``-1`` in all four columns.
    """
    def expand_bbox(x):
        # Pull every (possibly decimal) number out of the bbox string.
        r = re.findall("([0-9]+[.]?[0-9]*)", x)
        if len(r) == 0:
            r = [-1, -1, -1, -1]
        return [float(v) for v in r]

    # Build the float matrix directly (np.float no longer exists in NumPy >= 1.24).
    # reshape keeps an (n, 4) layout even for an empty frame, where np.stack raises.
    coords = np.array([expand_bbox(b) for b in df['bbox']], dtype=float).reshape(-1, 4)
    for i, col in enumerate(['x', 'y', 'w', 'h']):
        df[col] = coords[:, i]
    df.drop(columns=['bbox'], inplace=True)

    return df

# Directory holding the training JPEGs.
data_dir = "../input/global-wheat-detection/train/"
# Annotation table with the bbox strings expanded into numeric x / y / w / h.
train = preprecess(
    pd.read_csv("../input/global-wheat-detection/train.csv")
)

In [None]:
# Drop boxes that are less than 10 px wide or tall.
# The index is deliberately NOT reset here: the next step drops the hard-coded
# row labels in `large_box_index`, so the labels must stay unchanged until then.
# (The original `train.reset_index(drop=True)` discarded its result and was a no-op.)
too_small_box_df = train[(train['h'] < 10) | (train['w'] < 10)]
print('before remove too samll bboxes:', len(train))
train = train.drop(index=too_small_box_df.index.values)
print('after remove too samll bboxes:', len(train))

In [None]:
# Hand-picked row labels of oversized boxes, dropped from `train` in the next cell.
# NOTE(review): these are index labels, presumably those of the unmodified
# train.csv row order — confirm before changing any earlier filtering step.
large_box_index = [3687,117344,173,113947,52868,2159,2169,121633,121634,147504,118211, 147552, 86917, 4412]

In [None]:
# Drop the oversized boxes by label, then renumber rows 0..n-1 so that row
# positions and index labels coincide from here on.
print('before remove too large bboxes:', len(train))
train = train.drop(index=large_box_index).reset_index(drop=True)
print('after remove too large bboxes:', len(train))

## Cut Wheat

In [None]:
def bbox_ioa(box1, box2):
    """Return the intersection of ``box1`` with each ``box2`` row, divided by that row's area.

    ``box1`` holds 4 values and ``box2`` is n x 4; all boxes are x1, y1, x2, y2.
    The result has one entry per row of ``box2``.
    """
    cols = box2.transpose()  # one row per coordinate: x1, y1, x2, y2

    # Overlap extent along each axis, floored at zero for disjoint boxes.
    overlap_w = (np.minimum(box1[2], cols[2]) - np.maximum(box1[0], cols[0])).clip(0)
    overlap_h = (np.minimum(box1[3], cols[3]) - np.maximum(box1[1], cols[1])).clip(0)

    # The small epsilon keeps the division defined for zero-area boxes.
    area2 = (cols[2] - cols[0]) * (cols[3] - cols[1]) + 1e-16

    return overlap_w * overlap_h / area2

In [None]:
def _sample_crop(h, w, scales):
    """Draw one random candidate crop inside an ``h`` x ``w`` image; returns (x1, y1, x2, y2)."""
    s = random.choice(scales)
    y = random.randint(0, h)
    x = random.randint(0, w)

    y1 = np.clip(y - h*s // 2, 0, h)
    y2 = np.clip(y1 + h*s, 0, h)
    x1 = np.clip(x - w*s // 2, 0, w)
    x2 = np.clip(x1 + w*s, 0, w)
    return x1, y1, x2, y2


def _show_patch(img):
    """Display a single patch in a small figure."""
    fig, ax = plt.subplots(1, 1, figsize=(2, 2))
    ax.imshow(img)
    plt.show()


def create_data(df, data_dir, debug=True):
    """Cut 32x32 wheat (label 1) and background (label 0) patches out of the training images.

    For every annotated box one wheat patch is cropped, and for every box one
    random background crop is drawn from the same image.

    :param df: frame with ``image_id`` and numeric ``x``, ``y``, ``w``, ``h`` columns.
    :param data_dir: directory containing ``<image_id>.jpg`` files.
    :param debug: when true, display the first wheat patch and the first
        background patch.
    :return: ``(images, labels)``. ``images`` stacks all wheat patches followed
        by all background patches (float32 RGB scaled by 1/255, resized to
        32x32); ``labels`` holds the matching 1s then 0s. Wheat patches follow
        the order in which images first appear in ``df`` and, within an image,
        the row order of ``df``.
    :raises FileNotFoundError: if an image cannot be read.
    """
    wheat_imgs = []
    wheat_labels = []
    wheat_imgs_2 = []
    wheat_labels_2 = []

    scale = [0.25, 0.125, 0.0625]
    # Separate flags: a single shared flag would be cleared by the first wheat
    # patch and the background preview would never be shown.
    show_wheat = show_background = debug

    # sort=False keeps first-appearance order; one grouping pass replaces a
    # full boolean mask per image.
    for img_id, records in tqdm(df.groupby("image_id", sort=False)):

        path = f'{data_dir}/{img_id}.jpg'
        image = cv2.imread(path, cv2.IMREAD_COLOR)
        if image is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise FileNotFoundError(f'could not read image: {path}')
        h, w = image.shape[:2]
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB).astype(np.float32)
        image /= 255.0

        # Convert x, y, w, h to x1, y1, x2, y2 on a private copy.
        boxes = records[['x', 'y', 'w', 'h']].to_numpy(copy=True)
        boxes[:, 2] = boxes[:, 0] + boxes[:, 2]
        boxes[:, 3] = boxes[:, 1] + boxes[:, 3]

        for box in boxes:
            # create true image, wheat img and label == 1
            img = image[int(box[1]):int(box[3]), int(box[0]):int(box[2]), :]
            img = cv2.resize(img, (32, 32))
            wheat_imgs.append(img)
            wheat_labels.append(1)
            if show_wheat:
                _show_patch(img)
                show_wheat = False

        for _ in range(len(boxes)):
            # NOTE(review): the first draw is accepted only if it touches no
            # box at all; every re-draw is accepted while it covers at most
            # 10% of each box. Kept as in the original — confirm the asymmetry
            # is intended. The loop runs until a crop is accepted.
            threshold = 0.00
            while True:
                x1, y1, x2, y2 = _sample_crop(h, w, scale)
                cutout_box = np.array([x1, y1, x2, y2], dtype=np.float32)
                # bbox_ioa: fraction of each annotated box covered by the crop
                if not np.any(bbox_ioa(cutout_box, boxes) > threshold):
                    break
                threshold = 0.1

            img = image[int(y1):int(y2), int(x1):int(x2)]
            img = cv2.resize(img, (32, 32))
            wheat_imgs_2.append(img)
            wheat_labels_2.append(0)
            if show_background:
                _show_patch(img)
                show_background = False

    return  np.concatenate((np.array(wheat_imgs), np.array(wheat_imgs_2)), 0), np.array(wheat_labels + wheat_labels_2)

In [None]:
# Build the patch dataset; debug is left at its default (True), so a sample patch is displayed.
wheat_imgs, wheat_labels = create_data(df=train, data_dir=data_dir)

In [None]:
class WheatDataset(Dataset):
    """Indexable dataset of (image, label) pairs with an optional transform.

    ``transform`` is called as ``transform(image=image)`` and must return a
    mapping with an ``'image'`` entry; a falsy ``transform`` is skipped.
    """

    def __init__(self, imgs, labels, transform=None):
        self.imgs, self.labels, self.transform = imgs, labels, transform

    def __len__(self):
        return len(self.imgs)

    def __getitem__(self, idx):
        label = self.labels[idx]
        image = self.imgs[idx]

        # Guard clause: without a (truthy) transform the raw image is returned.
        if not self.transform:
            return image, label

        return self.transform(image=image)['image'], label

In [None]:
# Inference-time pipeline: tensor conversion only, no augmentation.
val_transform = A.Compose([ToTensorV2(p=1.0)], p=1.0)

valset = WheatDataset(imgs=wheat_imgs, labels=wheat_labels, transform=val_transform)

# shuffle=False: batches must arrive in dataset order, because predictions are
# later written back by position.
val_loader = torch.utils.data.DataLoader(
    dataset=valset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=4,
)


In [None]:
# Removes only the notebook-level name: `valset.imgs` still references the same
# array, so no memory is released while `valset` is alive. After this cell the
# cell that builds `valset` cannot be re-run without re-creating `wheat_imgs`.
del wheat_imgs

In [None]:
# Preview the first patches of the dataset, titled with their labels.
max_images = 16
grid_width = 16
grid_height = -(-max_images // grid_width)  # ceiling division: enough rows for every image
# squeeze=False always yields a 2-D axes array, so flattening works for one
# row as well as for several.
fig, axs = plt.subplots(grid_height, grid_width, figsize=(grid_width, grid_height), squeeze=False)
flat_axes = axs.ravel()
n_shown = min(max_images, len(valset))

for i, ax in zip(range(n_shown), flat_axes):
    image, label = valset[i]
    image = image.permute(1,2,0).cpu().numpy()  # CHW tensor -> HWC array for imshow
    ax.imshow(image)
    ax.set_title(label)
    ax.axis('off')

# Hide any grid cells that received no image.
for ax in flat_axes[n_shown:]:
    ax.axis('off')

In [None]:
def get_model(path):
    """Build an EfficientNet-B0 classifier, load its weights from ``path`` and move it to the GPU.

    The final fully connected layer is replaced by a ``num_class``-way linear
    layer before the state dict is loaded.

    :param path: file containing the saved state dict.
    :return: the model, on the default CUDA device.
    """
    model = EfficientNet.from_name('efficientnet-b0')
    in_features = model._fc.in_features
    model._fc   = nn.Linear(in_features, num_class)
    # Deserialize onto the CPU first: without map_location, torch.load restores
    # tensors to the device they were saved from, which fails if that device
    # is not present here.
    model.load_state_dict(torch.load(path, map_location='cpu'))
    model.cuda()

    return model

# One classifier per checkpoint; their predictions are combined later.
models = [
    get_model(weight_path)
    for weight_path in (
        "../input/wheat-confident-learning/weight_acc_best_0.pt",
        "../input/wheat-confident-learning/weight_acc_best_1.pt",
        "../input/wheat-confident-learning/weight_acc_best_2.pt",
        "../input/wheat-confident-learning/weight_acc_best_3.pt",
        "../input/wheat-confident-learning/weight_acc_best_4.pt",
    )
]

In [None]:
def inference_model():
    """Run every model in ``models`` over ``val_loader`` and sum their softmax outputs.

    Reads the notebook-level ``models``, ``val_loader``, ``valset`` and
    ``num_class``. Relies on ``val_loader`` yielding batches in dataset order.

    :return: array of shape ``(len(valset), num_class)`` holding, per sample,
        the SUM of the class probabilities over all models (not the mean).
    """
    predicts = np.zeros((len(valset), num_class))

    # Switch to eval mode once instead of once per batch.
    for model in models:
        model.eval()

    with torch.no_grad():
        start = 0
        # Wrapping the loader itself lets tqdm show the total number of batches.
        for imgs, _ in tqdm(val_loader):
            batch = imgs.float().cuda()
            end = start + len(imgs)

            for model in models:
                probs = torch.softmax(model(batch), dim=1)
                predicts[start:end, :] += probs.cpu().numpy()

            start = end

    return predicts

In [None]:
# Per-sample class probabilities summed over all models (averaged in the next cell).
predicts = inference_model()

In [None]:
# Turn the summed softmax outputs into the ensemble mean. Every model adds a
# row that sums to 1, so each row sum equals the number of models: dividing by
# it is the same as dividing by the model count, without hard-coding that
# count, and re-running this cell leaves already-normalised rows unchanged.
predicts /= predicts.sum(axis=1, keepdims=True)

In [None]:
# Display the ensemble class probabilities (one row per patch).
predicts

## Method: Prune by Class (PBC).

Testing PBC and plotting results

In [None]:
#Prune by Class (PBC)
# cleanlab 2.x moved this routine to cleanlab.filter.find_label_issues; older
# releases expose it as cleanlab.pruning.get_noise_indices. Support both,
# because the pip install above is not pinned to a version.
try:
    from cleanlab.filter import find_label_issues
except ImportError:
    find_label_issues = None

if find_label_issues is not None:
    baseline_cl_pbc = find_label_issues(labels=wheat_labels, pred_probs=predicts, filter_by='prune_by_class')
else:
    from cleanlab.pruning import get_noise_indices
    baseline_cl_pbc = get_noise_indices(wheat_labels, predicts, prune_method='prune_by_class')

# Only the first len(train) entries are annotated boxes; the background crops
# come after them in wheat_labels.
# NOTE(review): position i matches row i of `train` only if each image's rows
# are contiguous in `train` — confirm.
index_pbc = np.flatnonzero(baseline_cl_pbc[:len(train)]).tolist()
print(len(index_pbc))
print(index_pbc)

In [None]:
# index_pbc holds row positions, so select positionally. This gives the same
# rows as label lookup while the index is 0..n-1, and stays correct otherwise.
pf_df = train.iloc[index_pbc]

In [None]:
# Images containing at least one flagged box, in order of first appearance.
pf_img_ids = pf_df['image_id'].unique()

In [None]:
PAGE_SIZE = 12  # images per page
N_PAGES = 7     # number of pages to display


def show_flagged_boxes(img_id, flagged_df, all_df, image_dir):
    """Plot one image with all its boxes in green and the flagged boxes in red.

    :param img_id: image identifier (file name without the ``.jpg`` extension).
    :param flagged_df: rows flagged as label issues; needs ``image_id``,
        ``x``, ``y``, ``w``, ``h`` columns.
    :param all_df: every annotated row, with the same columns.
    :param image_dir: directory containing ``<img_id>.jpg``.
    :raises FileNotFoundError: if the image cannot be read.
    """
    path = f'{image_dir}/{img_id}.jpg'
    image = cv2.imread(path, cv2.IMREAD_COLOR)
    if image is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise FileNotFoundError(f'could not read image: {path}')
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB).astype(np.float32)
    image /= 255.0

    def draw(frame, color):
        rows = frame.loc[frame['image_id'] == img_id, ['x', 'y', 'w', 'h']].values
        for x, y, w, h in rows:
            cv2.rectangle(image, (int(x), int(y)), (int(x + w), int(y + h)), color, 2)

    draw(all_df, (0, 1, 0))      # all boxes first, in green
    draw(flagged_df, (1, 0, 0))  # flagged boxes drawn on top, in red

    fig, ax = plt.subplots(1, 1, figsize=(16, 8))
    ax.set_axis_off()
    ax.set_title(img_id)
    ax.imshow(image)
    plt.show()
    # Release the figure right away so the loop never keeps many open at once.
    plt.close(fig)


# Show the first PAGE_SIZE * N_PAGES flagged images.
for img_id in pf_img_ids[:PAGE_SIZE * N_PAGES]:
    show_flagged_boxes(img_id, pf_df, train, data_dir)