In [10]:
import pandas as pd
import numpy as np
import cv2
from tqdm import tqdm_notebook as tqdm

from sklearn.model_selection import train_test_split
from RPCC_metric_utils_for_participants import contest_metric, sive_diam_pan, calc_chi_square_metric

In [12]:
train = pd.read_csv("data/RPCC_labels.csv")

In [13]:
train_df, test_df = train_test_split(train, test_size=0.2, random_state=42)

In [14]:
train_cnt = train_df[~train_df.prop_count.isnull()]
train_dist = train_df[~train_df.pan.isnull()]

valid_cnt = test_df[~test_df.prop_count.isnull()]
valid_dist = test_df[~test_df.pan.isnull()]

# Count props part

In [20]:
import torch
import torch.nn as nn
from torch.optim import Adam

import albumentations as A
from torchvision.models import mobilenet_v2
from torch.utils.data import DataLoader, Dataset

In [21]:
import os


class CntDataset(Dataset):
    def __init__(self, path, df, transforms):
        self.path = path
        self.df = df
        self.transforms = transforms
        
    def __getitem__(self, item):
        path = os.path.join(self.path, f"{self.df.ImageId.iloc[item]}.jpg")
        label = torch.Tensor([self.df.prop_count.iloc[item]])
        img = cv2.cvtColor(cv2.imread(path), cv2.COLOR_BGR2RGB)
        img = self.transforms(image=img)['image']
        img = torch.from_numpy(img)
        return img.permute(2, 0, 1), label
    
    def __len__(self):
        return len(self.df)

In [22]:
max_cnt = train_cnt.prop_count.max()
min_cnt = train_cnt.prop_count.min()
print(min_cnt, max_cnt)

norm = lambda cnt: (cnt - min_cnt) / (max_cnt - min_cnt)
inorm = lambda cnt: cnt * (max_cnt - min_cnt) + min_cnt

assert inorm(norm(1500)) == 1500

688.0 3029.0


In [23]:
train_cnt.prop_count = train_cnt.prop_count.apply(norm)
valid_cnt.prop_count = valid_cnt.prop_count.apply(norm)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [24]:
model = mobilenet_v2(True)
model.classifier[1] = nn.Linear(1280, 1, True)
model = model.cuda()

optimizer = Adam(model.parameters(), 1e-4)
criterion = nn.MSELoss()

Downloading: "https://download.pytorch.org/models/mobilenet_v2-b0353104.pth" to /Users/skrrydg/.cache/torch/hub/checkpoints/mobilenet_v2-b0353104.pth


HBox(children=(FloatProgress(value=0.0, max=14212972.0), HTML(value='')))




AssertionError: Torch not compiled with CUDA enabled

In [None]:
train_ds = CntDataset(
    "RPCC_train/train/", 
    train_cnt, 
    A.Compose([
        A.Normalize(),
        A.HorizontalFlip(p=0.5),
        A.VerticalFlip(p=0.5),
        A.Resize(512, 512),
    ]),
)
train_loader = DataLoader(train_ds, 16, shuffle=True)

valid_ds = CntDataset(
    "RPCC_train/train/", 
    valid_cnt, 
    A.Compose([
        A.Normalize(),
        A.Resize(512, 512),
    ]),
)

valid_loader = DataLoader(valid_ds, 16, shuffle=False)


test_ds = CntDataset(
    "RPCC_train/train/", 
    test_df, 
    A.Compose([
        A.Normalize(),
        A.Resize(512, 512),
    ]),
)

test_loader = DataLoader(test_ds, 16, shuffle=False)

In [None]:
def train_epoch(num, loader):
    model.train()
    running_loss = 0.
    for i, (batch, labels) in enumerate(loader):
        batch = batch.cuda()
        labels = labels.cuda()

        optimizer.zero_grad()

        outputs = model(batch)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print("Train Epoch: ", num + 1, "Loss: ", running_loss / (i+1))

    
def predict(loader):
    model.eval()
    outputs = []
    with torch.no_grad():
        for i, (batch, _) in enumerate(loader):
            batch = batch.cuda()
            outputs.extend(model(batch).cpu().detach().numpy().squeeze().tolist())
    return outputs
    
    
def eval_epoch(num, loader):
    outputs = predict(loader)
    pred_cnts = np.array([inorm(x) for x in outputs])
    gt_cnts = np.array([inorm(x) for x in loader.dataset.df.prop_count])
    print("Eval Epoch: ", num + 1, "MAPE: ", np.mean(np.abs(pred_cnts - gt_cnts) / gt_cnts))
    

def train_and_eval():
    for epoch in range(5):
        train_epoch(epoch, train_loader)
        eval_epoch(epoch, valid_loader)
        print("=" * 10)


train_and_eval()

In [25]:
outputs = predict(test_loader)
cnt_preds = [inorm(x) for x in outputs]

NameError: name 'predict' is not defined

In [None]:
torch.save(model.cpu(), "model_cnts.pth")

# Count distr part

In [None]:
import os


class DistDataset(Dataset):
    def __init__(self, path, df, transforms):
        self.path = path
        self.df = df
        self.transforms = transforms
        
    def __getitem__(self, item):
        path = os.path.join(self.path, f"{self.df.ImageId.iloc[item]}.jpg")
        label = torch.from_numpy(self.df.iloc[item, 1:-2].values.astype(np.float32))
        img = cv2.cvtColor(cv2.imread(path), cv2.COLOR_BGR2RGB)
        img = self.transforms(image=img)['image']
        img = torch.from_numpy(img)
        return img.permute(2, 0, 1), label
    
    def __len__(self):
        return len(self.df)

In [None]:
model = mobilenet_v2(True)
model.classifier[1] = nn.Linear(1280, 20, True)
model = model.cuda()

optimizer = Adam(model.parameters(), 1e-4)
criterion = nn.MSELoss()

In [None]:
train_ds = DistDataset(
    "RPCC_train/train/", 
    train_dist, 
    A.Compose([
        A.Normalize(),
        A.HorizontalFlip(p=0.5),
        A.VerticalFlip(p=0.5),
        A.Resize(512, 512),
    ]),
)
train_loader = DataLoader(train_ds, 16, shuffle=True)

valid_ds = DistDataset(
    "RPCC_train/train/", 
    valid_dist, 
    A.Compose([
        A.Normalize(),
        A.Resize(512, 512),
    ]),
)

valid_loader = DataLoader(valid_ds, 16, shuffle=False)


test_ds = DistDataset(
    "RPCC_train/train/", 
    test_df, 
    A.Compose([
        A.Normalize(),
        A.Resize(512, 512),
    ]),
)

test_loader = DataLoader(test_ds, 16, shuffle=False)

In [None]:
def train_epoch(num, loader):
    model.train()
    running_loss = 0.
    for i, (batch, labels) in tqdm(enumerate(loader), total=len(loader)):
        batch = batch.cuda()
        labels = labels.cuda()

        optimizer.zero_grad()

        outputs = model(batch)
        loss = criterion(outputs.softmax(dim=1), labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print("Train Epoch: ", num + 1, "Loss: ", running_loss / (i+1))

    
def predict(loader):
    model.eval()
    outputs = []
    with torch.no_grad():
        for i, (batch, _) in enumerate(loader):
            batch = batch.cuda()
            outputs.extend(model(batch).softmax(dim=1).cpu().detach().numpy().squeeze().tolist())
    return outputs
    
    
def eval_epoch(num, loader):
    outputs = predict(loader)
    gt_hists = loader.dataset.df.iloc[:, 1:-2].values
    gt_fracts = loader.dataset.df["fraction"].values
    res = []
    for i, (hist, fracts) in enumerate(zip(gt_hists, gt_fracts)):
        res.append(calc_chi_square_metric(hist, outputs[i], fracts))

    print("Eval Epoch: ", num + 1, "CHI2: ", np.mean(res))    

def train_and_eval():
    for epoch in range(3):
        train_epoch(epoch, train_loader)
        eval_epoch(epoch, valid_loader)
        print("=" * 10)


train_and_eval()

In [None]:
dist_preds = predict(test_loader)

In [None]:
torch.save(model.cpu(), "model_dists.pth")

In [None]:
def get_submit(cnt_preds, dist_preds, indices):
    submit = []
    for idx, cnt, dist in zip(indices, cnt_preds, dist_preds):
        cnt = int(cnt)
        sizes = np.random.choice(sive_diam_pan, size=cnt, p=dist / np.sum(dist))
        submit.extend([{
            "ImageId": idx,
            "prop_size": sizes[i]
        } for i in range(cnt)])
    return pd.DataFrame.from_records(submit)

In [None]:
predictions = get_submit(cnt_preds, dist_preds, test_loader.dataset.df.ImageId.values)

In [None]:
%%time
contest_metric(test_df, predictions)