
![image](https://raw.githubusercontent.com/cgnorthcutt/cleanlab/master/img/cleanlab_logo.png)
 

``cleanlab`` is a machine learning Python package for **learning with noisy labels** and **finding label errors in datasets**. ``cleanlab`` CLEANs LABels. It is powered by the theory of **confident learning**, described in this [paper](https://arxiv.org/abs/1911.00068) and this [blog post](https://l7.curtisnorthcutt.com/confident-learning).

GitHub: https://github.com/cgnorthcutt/cleanlab  
arXiv: https://arxiv.org/abs/1911.00068  


In [None]:
import sys

# Extra import locations appended to the module search path, in this order
# (presumably offline copies of timm and cleanlab attached as datasets).
sys.path.extend([
    "../input/pytorch-image-models/pytorch-image-models-master",
    "../input/cleanlab/",
])

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cv2
from PIL import Image
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score
from sklearn.base import BaseEstimator
import random
import os
import gc
import copy
import time

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torchvision
from torchvision import transforms as T
import torch.optim as optim
from torchvision import datasets, transforms
import timm
from tqdm import tqdm_notebook as tqdm

from cleanlab.classification import LearningWithNoisyLabels

import warnings
warnings.filterwarnings("ignore")

import albumentations as A
from albumentations.pytorch import ToTensorV2

In [None]:
# Root directory of the competition data; the trailing slash matters because
# other cells build file paths by plain string concatenation with PATH.
PATH = "../input/cassava-leaf-disease-classification/"

# Training table and the sample submission file, both read from PATH.
train = pd.read_csv(os.path.join(PATH, "train.csv"))
sample_submission = pd.read_csv(os.path.join(PATH, "sample_submission.csv"))

In [None]:
def seed_everything(seed=42):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and switches cuDNN to deterministic mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Exported for any child processes; the hash seed of the already running
    # interpreter is not changed by this assignment.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current device.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner can pick different kernels from run to run, so it is
    # pinned off explicitly to keep results reproducible.
    torch.backends.cudnn.benchmark = False


seed_everything(seed=42)

In [None]:
# Run configuration shared by the model, the transforms and the CV loop.
params = dict(
    n_folds=5,                          # number of StratifiedKFold splits
    batch_size=32,                      # DataLoader batch size (train and inference)
    epochs=5,                           # training epochs per fit() call
    model_name="tf_efficientnet_b0_ns",  # architecture name passed to timm
    lr=1e-4,                            # Adam learning rate
    weight_decay=1e-6,                  # Adam weight decay
    width=512,                          # image width produced by the transforms
    height=512,                         # image height produced by the transforms
    debug=False,                        # True -> use only the first 500 training rows
)

In [None]:
class CassavaImgClassifier(nn.Module):
    """timm backbone with its classification head replaced by a new linear layer.

    Args:
        model_name: Architecture name passed to ``timm.create_model``.
        pretrained: Whether timm should load pretrained weights.
        num_classes: Number of output logits of the new head (default 5,
            the value that used to be hard-coded).

    Note:
        The backbone must expose its head as ``.classifier`` with an
        ``in_features`` attribute, since that attribute is read and replaced
        here. Other timm architectures may name the head differently.
    """

    def __init__(self, model_name, pretrained=True, num_classes=5):
        super().__init__()
        self.model = timm.create_model(model_name, pretrained=pretrained)
        # Keep the backbone, swap only the final layer for the requested classes.
        n_features = self.model.classifier.in_features
        self.model.classifier = nn.Linear(n_features, num_classes)

    def forward(self, x):
        """Return the raw (un-normalized) class logits for a batch of images."""
        return self.model(x)

In [None]:
def get_train_transforms(params):
    """Build the albumentations pipeline applied to training images.

    Steps, in order: random resized crop to ``params["height"]`` x
    ``params["width"]``, horizontal flip with probability 0.5, channel-wise
    normalization, conversion to a tensor.

    Args:
        params: Configuration dict; only ``"height"`` and ``"width"`` are read.

    Returns:
        An ``A.Compose`` object; call it as ``pipeline(image=img)["image"]``.
    """
    # Heavier augmentations (transpose, vertical flip, shift/scale/rotate,
    # colour jitter, coarse dropout / cutout) are intentionally not part of
    # this pipeline.
    random_crop = A.RandomResizedCrop(params["height"], params["width"])
    h_flip = A.HorizontalFlip(p=0.5)
    # Mean / std are the standard ImageNet channel statistics.
    normalize = A.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    )
    to_tensor = ToTensorV2(p=1.0)
    return A.Compose([random_crop, h_flip, normalize, to_tensor])



def get_test_transforms(params):
    """Build the deterministic albumentations pipeline used for inference.

    Steps, in order: resize to ``params["height"]`` x ``params["width"]``,
    channel-wise normalization, conversion to a tensor. No random
    augmentation is applied.

    Args:
        params: Configuration dict; only ``"height"`` and ``"width"`` are read.

    Returns:
        An ``A.Compose`` object; call it as ``pipeline(image=img)["image"]``.
    """
    steps = [
        A.Resize(params["height"], params["width"], p=1.0),
        # Mean / std are the standard ImageNet channel statistics.
        A.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
        ToTensorV2(p=1.0),
    ]
    return A.Compose(steps)


In [None]:
class CassavaDataset(Dataset):
    """Dataset that loads images by file name from ``PATH/<phase>_images/``.

    Args:
        image_id: Indexable sequence of image file names.
        label: Optional indexable sequence of labels aligned with ``image_id``.
            When omitted, items carry no ``"label"`` entry.
        phase: Prefix of the image directory (``"train"`` -> ``train_images/``).
        transform: Optional callable invoked as ``transform(image=img)`` whose
            result is indexed with ``["image"]`` (albumentations style).
    """

    def __init__(self, image_id, label=None, phase="train", transform=None):
        self.image_id = image_id
        self.label = label
        self.transform = transform
        self.phase = phase

    def __len__(self):
        return len(self.image_id)

    def __getitem__(self, idx):
        """Return ``{"image": img}`` plus ``"label"`` when labels were given.

        Raises:
            FileNotFoundError: If the image file cannot be read.
        """
        img_path = PATH + self.phase + "_images/" + self.image_id[idx]
        img = cv2.imread(img_path)
        # cv2.imread signals failure by returning None instead of raising.
        if img is None:
            raise FileNotFoundError(f"Could not read image: {img_path}")
        # OpenCV loads BGR; convert to RGB. cvtColor returns a contiguous
        # array, unlike a reversed-slice view whose negative strides cannot be
        # converted to a tensor when no transform is applied.
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        if self.transform is not None:
            img = self.transform(image=img)["image"]
        if self.label is None:
            return {"image": img}
        return {"image": img, "label": self.label[idx]}

In [None]:
class Classifier(BaseEstimator):
    """scikit-learn style wrapper (fit / predict_proba / predict / score) around a PyTorch model.

    "X" inputs are indexable sequences of image file names and "y" inputs are
    the matching labels; images are loaded through ``CassavaDataset``.

    Args:
        model: ``nn.Module`` producing class logits; moved to the GPU when one
            is available, otherwise to the CPU.
        params: Configuration dict. Keys read by this class: ``"batch_size"``,
            ``"epochs"``, ``"lr"``, ``"weight_decay"``; the transform builders
            additionally read ``"height"`` and ``"width"``.

    Attributes:
        best_model: Deep copy of the model taken at the epoch with the highest
            validation accuracy, or ``None`` when ``fit`` was never given
            validation data.
    """

    def __init__(self, model, params):
        self.model = model
        self.params = params
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(self.device)
        self.best_model = None

    def fit(self, img_idx, label, val_img_idx=None, val_label=None):
        """Train the model and print per-epoch metrics.

        Args:
            img_idx: Training image file names.
            label: Training labels aligned with ``img_idx``.
            val_img_idx: Optional validation image file names. When given,
                validation accuracy is computed after every epoch and the best
                model so far is stored in ``self.best_model``.
            val_label: Validation labels aligned with ``val_img_idx``.

        Returns:
            ``self``, following the scikit-learn estimator convention.
        """
        transform = get_train_transforms(self.params)
        dataset = CassavaDataset(img_idx, label, transform=transform)
        dataloader = DataLoader(dataset, batch_size=self.params["batch_size"], num_workers=4, pin_memory=True, shuffle=True)

        loss_fn = nn.CrossEntropyLoss()
        optimizer = optim.Adam(self.model.parameters(), lr=self.params["lr"], weight_decay=self.params["weight_decay"])

        # Mixed precision only makes sense on CUDA; on CPU both helpers are
        # created in disabled mode so the same code path runs in full precision.
        use_amp = self.device == "cuda"
        scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

        best_score = float("-inf")
        for epoch in range(self.params["epochs"]):
            # predict() below switches to eval mode, so re-enable training
            # mode at the start of every epoch.
            self.model.train()
            loss_sum = 0.0
            n_correct = 0
            n_seen = 0
            t0 = time.time()
            for data in dataloader:
                x = data["image"].to(self.device)
                y = data["label"].to(self.device)
                with torch.cuda.amp.autocast(enabled=use_amp):
                    pred = self.model(x)
                    loss = loss_fn(pred, y)

                optimizer.zero_grad()
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()

                loss_sum += loss.item()
                # Count hits per sample so a short final batch does not skew
                # the epoch accuracy (argmax of logits == argmax of softmax).
                n_correct += (pred.argmax(dim=1) == y).sum().item()
                n_seen += y.size(0)

            train_loss = loss_sum / max(len(dataloader), 1)
            train_score = n_correct / max(n_seen, 1)
            if val_img_idx is not None:
                val_pred = self.predict(val_img_idx)
                val_score = accuracy_score(val_label, val_pred)
                if best_score < val_score:
                    best_score = val_score
                    # Deep copy: a shallow copy would share parameters with the
                    # live model and keep changing as training continues.
                    self.best_model = copy.deepcopy(self.model)
                print(f"{epoch} epoch | train loss: {train_loss:.4f} | train acc: {train_score:.4f} | val acc: {val_score:.4f} | {time.time() - t0:.1f}s")
            else:
                print(f"{epoch} epoch | train loss: {train_loss:.4f} | train acc: {train_score:.4f} | {time.time() - t0:.1f}s")
        return self

    def predict_proba(self, img_idx, phase="train"):
        """Return softmax class probabilities for the given images.

        Args:
            img_idx: Image file names.
            phase: Image directory prefix forwarded to ``CassavaDataset``.

        Returns:
            ``np.ndarray`` with one row per image and one column per class.
        """
        transform = get_test_transforms(self.params)
        dataset = CassavaDataset(img_idx, phase=phase, transform=transform)
        dataloader = DataLoader(dataset, batch_size=self.params["batch_size"], num_workers=4, pin_memory=True)

        use_amp = self.device == "cuda"
        self.model.eval()
        preds = []
        with torch.no_grad():
            for data in dataloader:
                x = data["image"].to(self.device)
                with torch.cuda.amp.autocast(enabled=use_amp):
                    pred = self.model(x)
                # Logits may be half precision under autocast; compute the
                # softmax in float32 so the probabilities keep full precision.
                preds.append(pred.float().softmax(dim=1).cpu().numpy())
        prob = np.concatenate(preds)
        return prob

    def predict(self, img_idx, phase="train"):
        """Return the index of the most probable class for each image."""
        prob = self.predict_proba(img_idx, phase=phase)
        return np.argmax(prob, axis=1)

    def score(self, img_idx, label, phase="train"):
        """Return the accuracy of ``predict`` against ``label``."""
        preds = self.predict(img_idx, phase=phase)
        return accuracy_score(label, preds)

In [None]:
# In debug mode only the first 500 rows are used so a full run finishes quickly.
train_df = train.head(500) if params["debug"] else train

# Plain arrays of file names and labels, aligned row by row.
train_image_id, train_label = (
    train_df[column].values for column in ("image_id", "label")
)


In [None]:
# Out-of-fold class probabilities (5 classes); rows belonging to folds that
# are never run stay all-zero.
val_preds = np.zeros((train_label.shape[0], 5))
# random_state only has an effect together with shuffle=True; newer
# scikit-learn versions raise a ValueError when it is set without shuffling.
kfold = StratifiedKFold(n_splits=params["n_folds"], shuffle=True, random_state=0)
seed_everything(0)
for fold, (train_idx, val_idx) in enumerate(kfold.split(train_image_id, train_label)):
    X_train, y_train = train_image_id[train_idx], train_label[train_idx]
    X_val, y_val = train_image_id[val_idx], train_label[val_idx]

    # Fresh backbone and wrapper for every fold so no weights leak across folds.
    cassava_model = CassavaImgClassifier(params["model_name"])
    model = Classifier(cassava_model, params)

    # cleanlab wraps the classifier for training with noisy labels. The object
    # returned by fit() is used below as the fitted classifier: it must expose
    # predict_proba() and the underlying network as `.model`.
    lnl = LearningWithNoisyLabels(clf=model, seed=0, n_jobs=os.cpu_count())
    clf = lnl.fit(X_train, y_train)

    fold_probs = clf.predict_proba(X_val)
    val_preds[val_idx, :] = fold_probs
    print(accuracy_score(y_val, np.argmax(fold_probs, axis=1)))
    torch.save(clf.model.state_dict(), f"model{fold}.pth")
    print(f"-------- {fold} finished --------")

    # Only the first fold is trained; remove this `break` to run all folds
    # and fill the remaining rows of val_preds.
    break

In [None]:
# val_preds starts as zeros and is only filled for the folds that were run.
# Softmax rows sum to 1, so a zero row marks a sample without a prediction;
# scoring those rows would count them all as class-0 predictions.
predicted_rows = val_preds.sum(axis=1) > 0
score = accuracy_score(train_label[predicted_rows], np.argmax(val_preds[predicted_rows], axis=1))
print(f"CV: {score}")

In [None]:
# np.save("oof", val_preds)