In [None]:
import pandas as pd
import numpy as np
import cv2 as cv
import os

import torch
from torch import nn
import torch.nn.functional as F
from torch import optim
import torchvision
from torchvision import transforms
from torch.utils.data import TensorDataset, DataLoader, Dataset, random_split
from torch.utils.data.sampler import SubsetRandomSampler

from matplotlib import pyplot as plt
from matplotlib import patches
plt.style.use("ggplot")
import seaborn as sns

import sklearn
from sklearn.utils import shuffle
from sklearn.metrics import roc_auc_score, accuracy_score

from PIL import Image

from tqdm import tqdm

In [None]:
# Load the training labels: one row per image, holding its `id` (file name
# without extension) and its binary `label`.
# The absolute Kaggle path matches the image-directory and sample-submission
# paths used elsewhere in this notebook, so the read does not depend on the
# kernel's working directory.
oznake = pd.read_csv("/kaggle/input/histopathologic-cancer-detection/train_labels.csv")
oznake.head()

In [None]:
# Class balance of the training labels, drawn as a bar chart.
# `labelsCount` is also consumed by the pie-chart cell that follows.
labelsCount = oznake["label"].value_counts()

brojNegativnih = (oznake.label == 0).sum()
brojPozitivnih = (oznake.label == 1).sum()

sns.countplot(x="label", data=oznake)
plt.xticks(
    [0, 1],
    [
        "Negativni ({})".format(brojNegativnih),
        "Pozitivni ({})".format(brojPozitivnih),
    ],
)
plt.ylabel("Broj primjera");

Prikaz odnosa pozitivnih i negativnih primjera iz train seta. Gore je prikaz countplotom, a dolje pie chartom.

In [None]:
# Pie chart of the class balance.
# `value_counts()` orders its rows by frequency, not by label value, so the
# counts are re-ordered to [0, 1] explicitly. That keeps every wedge paired with
# the correct hard-coded label and colour regardless of which class is larger.
plt.pie(labelsCount.reindex([0, 1], fill_value=0), labels=['Negativno', 'Pozitivno'], startangle=180,
        autopct='%1.1f', colors=['#FF96A7', '#00ff99'], shadow=True);

In [None]:
trainPath = "/kaggle/input/histopathologic-cancer-detection/train/"
testPath = "/kaggle/input/histopathologic-cancer-detection/test/"

STUPACA = 10  # images per row of the preview grid


def ucitajRGB(idSlike, dataPath=trainPath):
    """Read `<dataPath>/<idSlike>.tif` and return it as an RGB array.

    `cv.imread` yields channels in BGR order while matplotlib's `imshow`
    interprets a 3-channel array as RGB, so the channels are swapped here;
    otherwise the preview is drawn with red and blue exchanged.

    Raises:
        FileNotFoundError: if OpenCV cannot read the file (`cv.imread`
            signals that by returning `None` instead of raising).
    """
    path = os.path.join(dataPath, idSlike + ".tif")
    slika = cv.imread(path)
    if slika is None:
        raise FileNotFoundError("Could not read image: {}".format(path))
    return cv.cvtColor(slika, cv.COLOR_BGR2RGB)


def nacrtajUzorke(axis, slike, pocetniRed, naslov, bojaOkvira):
    """Draw `slike` into `axis`, STUPACA per row, starting at row `pocetniRed`.

    Every image gets the title `naslov` and a dotted 32x32 rectangle whose
    corner is at (32, 32), drawn in `bojaOkvira`.
    """
    for i, elem in enumerate(slike):
        red, stupac = divmod(i, STUPACA)
        ax = axis[pocetniRed + red, stupac]
        ax.imshow(elem)
        okvir = patches.Rectangle((32, 32), 32, 32, linewidth=3, edgecolor=bojaOkvira,
                                  facecolor="none", linestyle=":", capstyle="round")
        ax.add_patch(okvir)
        ax.set_title(naslov)
        ax.axis("off")


# 20 random examples of each class.
pozitivniUzorci = oznake.loc[oznake["label"] == 1].sample(20)
negativniUzorci = oznake.loc[oznake["label"] == 0].sample(20)

slikePozitivnih = [ucitajRGB(i) for i in pozitivniUzorci["id"]]
slikeNegativnih = [ucitajRGB(i) for i in negativniUzorci["id"]]

fig, axis = plt.subplots(4, 10, figsize=(20, 10), dpi=150)
fig.suptitle("Primjeri slika iz dataseta", fontsize=20)

# Rows 0-1 hold the positive examples, rows 2-3 the negative ones.
nacrtajUzorke(axis, slikePozitivnih, 0, "Pozitivno", "lime")
nacrtajUzorke(axis, slikeNegativnih, 2, "Negativno", "r")

plt.show()

Prikazano je po 20 pozitivno i 20 negativno označenih primjera iz train seta. Pozitivan primjer je onaj koji ima bar jedan pixel tkiva tumora u centralnom području veličine 32x32 (posebno označenom na slikama).

In [None]:
negativni = oznake.loc[oznake["label"] == 0].sample(50000)
pozitivni = oznake.loc[oznake["label"] == 1].sample(50000)


def ucitajSlikeRGB(idovi, opis):
    """Load every image id in `idovi` from `trainPath` into one RGB array.

    `cv.imread` returns BGR, so each image is converted to RGB. Without that
    step channel 0 is blue, not red, and the "Red"/"Blue" histogram rows below
    would be labelled the wrong way round.

    Args:
        idovi: iterable of image ids (file names without the `.tif` suffix).
        opis: text shown next to the progress bar.

    Returns:
        `np.array` built from the list of loaded images.

    Raises:
        FileNotFoundError: if OpenCV cannot read a file (`cv.imread` returns
            `None` in that case).
    """
    slike = []
    for i in tqdm(idovi, desc=opis):
        path = os.path.join(trainPath, i + ".tif")
        slika = cv.imread(path)
        if slika is None:
            raise FileNotFoundError("Could not read image: {}".format(path))
        slike.append(cv.cvtColor(slika, cv.COLOR_BGR2RGB))
    return np.array(slike)


slikeP = ucitajSlikeRGB(pozitivni["id"], "Pozitivni")
slikeN = ucitajSlikeRGB(negativni["id"], "Negativni")

bins = 256

fig, axis = plt.subplots(4, 2, sharey=True, figsize=(8, 8), dpi=150);

# One row per colour channel: negatives in the left column, positives in the
# right one. The channel name is attached to the right-hand plot of each row.
for kanal, naziv in enumerate(["Red", "Green", "Blue"]):
    axis[kanal, 0].hist(slikeN[:, :, :, kanal].ravel(), bins=bins, density=True);
    axis[kanal, 1].hist(slikeP[:, :, :, kanal].ravel(), bins=bins, density=True);
    axis[kanal, 1].set_ylabel(naziv, rotation="horizontal", labelpad=23, fontsize=12)

# Last row: all channels pooled together.
axis[3, 0].hist(slikeN.ravel(), bins=bins, density=True);
axis[3, 1].hist(slikeP.ravel(), bins=bins, density=True);
axis[3, 1].set_ylabel("Svi", rotation="horizontal", labelpad=23, fontsize=12)

# Titles and axis descriptions.
axis[0, 0].set_title("Negativni")
axis[0, 1].set_title("Pozitivni")

for i in range(4):
    axis[i, 0].set_ylabel("Relativna frekvencija", fontsize=8)
axis[3, 0].set_xlabel("Pixel");
axis[3, 1].set_xlabel("Pixel");

Raspodjela vrijednosti pixela za svaki kanal zasebno (R, G, B) i zajednički prikaz. U zelenom kanalu i pozitivni i negativni primjeri imaju tamne pixele, dok ih u crvenom i plavom kanalu nemaju. Negativni primjeri općenito imaju više svjetlijih pixela od pozitivnih. Frekvencija pojavljivanja vrijednosti 255 jako je velika, što znači da je na slikama velik udio bijele boje.

In [None]:
# Distribution of the mean pixel value of each image, one panel per class.
bins = 256  # histogram resolution
fig, axis = plt.subplots(1, 2, sharey=True, sharex=True, figsize=(8, 2), dpi=150)

paneli = zip(axis, (slikeN, slikeP), ("Negativni", "Pozitivni"))
for ax, slike, naslov in paneli:
    # Averaging over axes 1, 2 and 3 leaves a single value per image.
    ax.hist(np.mean(slike, axis=(1, 2, 3)), bins=bins, density=True)
    ax.set_title(naslov)
    ax.set_xlabel("Svjetlina slike")
    ax.set_ylabel("Relativna frekvencija")

Velika razlika u distribuciji za pozitivne i negativne primjere. Pozitivni poprimaju oblik normalne distribucije oko vrijednosti 150, a negativni prate oblik bimodalne distribucije s vršnim vrijednostima oko 140 i 220.

In [None]:
# Shuffle the label rows with a fixed seed. The row order feeds the
# train/validation/test split, and `best_model.pt` survives kernel restarts, so
# an unseeded shuffle could let rows of a later "held-out" split overlap with
# the data a reloaded checkpoint was trained on.
train = shuffle(oznake, random_state=42)

In [None]:
class Kreiraj(Dataset):
    """Dataset that reads `.tif` images listed in a DataFrame on demand.

    Args:
        data: DataFrame whose rows unpack into exactly `(image id, label)`,
            i.e. two columns with the id first.
        dataPath: directory that contains `<image id>.tif`.
        transform: optional callable applied to the loaded image array.
    """

    def __init__(self, data, dataPath="./", transform=None):
        super().__init__()
        self.df = data
        self.dataPath = dataPath
        self.transform = transform

    def __len__(self):
        """Number of rows in the backing DataFrame."""
        return len(self.df)

    def __getitem__(self, index):
        """Return `(image, label)` for the row at position `index`.

        The image is read with `cv.imread` (BGR channel order) and passed
        through `transform` when one was given.

        Raises:
            FileNotFoundError: if OpenCV cannot read the file. `cv.imread`
                returns `None` rather than raising, which would otherwise only
                surface later as an obscure error inside the transform.
        """
        slikaIme, oznaka = self.df.iloc[index]
        slikaPath = os.path.join(self.dataPath, slikaIme + ".tif")
        slika = cv.imread(slikaPath)
        if slika is None:
            raise FileNotFoundError("Could not read image: {}".format(slikaPath))
        if self.transform is not None:
            slika = self.transform(slika)
        return slika, oznaka

Na dijelu slika koje se koriste za treniranje modela (ne i za validaciju) provode se transformacije kako bi se slike izmijenile i smanjila prenaučenost.

In [None]:
# Augmenting pipeline for training: random flips and a random rotation of up to
# 45 degrees, applied between the PIL conversion and the tensor conversion.
transformTrain = transforms.Compose(
    [
        transforms.ToPILImage(),
        transforms.RandomHorizontalFlip(p=0.7),
        transforms.RandomVerticalFlip(p=0.7),
        transforms.RandomRotation(45),
        transforms.ToTensor(),
    ]
)

# Plain pipeline without augmentation; both names refer to the same object.
transformStart = transforms.Compose([transforms.ToPILImage(), transforms.ToTensor()])
transformTest = transformStart

# Dataset over all shuffled training rows, using the plain pipeline.
pocetniData = Kreiraj(data=train, dataPath=trainPath, transform=transformStart)

In [None]:
# Sanity check on one item: tensor shape and value range after the transform.
primjer = pocetniData[10]
img, label = primjer[0], primjer[1]
print(img.shape, img.min(), img.max())

In [None]:
batch = 64

valSize = 0.2   # share of all rows used for validation
testSize = 0.1  # share of the remaining rows used as the held-out test split

fullLen = len(pocetniData)
valLen = int(valSize * fullLen)
helpLen = fullLen - valLen

testLen = int(testSize * helpLen)
trainLen = helpLen - testLen

# A seeded generator makes the split repeatable for a given row order of
# `pocetniData`.
splitGen = torch.Generator().manual_seed(42)
helpSet, valSet = random_split(pocetniData, [helpLen, valLen], generator=splitGen)
trainSet, testSet = random_split(helpSet, [trainLen, testLen], generator=splitGen)

# `random_split` returns `Subset` views that read items from the wrapped
# dataset, so assigning `.transform` on a subset has no effect: every split
# would keep using `pocetniData`'s un-augmented pipeline. To really augment the
# training data, the training indices are re-applied to a second dataset over
# the same rows that carries `transformTrain`.
# `trainSet.indices` are positions inside `helpSet`, hence the mapping back to
# positions in the full dataset.
augData = Kreiraj(data=pocetniData.df, dataPath=pocetniData.dataPath, transform=transformTrain)
trainIdx = [helpSet.indices[j] for j in trainSet.indices]
trainSet = torch.utils.data.Subset(augData, trainIdx)
# `valSet` and `testSet` still wrap `pocetniData`, i.e. the plain
# `transformStart` pipeline, which is what evaluation needs.

trainLoad = DataLoader(trainSet, batch_size=batch, shuffle=True)
valLoad = DataLoader(valSet, batch_size=batch, shuffle=False)
testLoadF = DataLoader(testSet, batch_size=batch, shuffle=False)


print("Velicina training seta je {}.".format(trainLen))
print("Velicina validation seta je {}.".format(valLen))
print("Velicina test seta je {}.".format(testLen))

In [None]:
# Competition test images: the sample submission supplies the ids (and a
# placeholder label column) in the order the predictions must be written back.
sample = pd.read_csv("/kaggle/input/histopathologic-cancer-detection/sample_submission.csv")

testData = Kreiraj(
    data=sample,
    dataPath=testPath,
    transform=transformTest,
)
# No shuffling, so predictions stay aligned with the rows of `sample`.
testLoad = DataLoader(testData, batch_size=batch, shuffle=False)

In [None]:
class Model(nn.Module):
    """Five-block CNN ending in a sigmoid, for binary classification.

    Every block is Conv2d -> BatchNorm2d -> ReLU -> 2x2 max-pool, with the
    channel count growing 3 -> 32 -> 64 -> 128 -> 256 -> 512. The flattened
    features go through a three-layer fully connected head that emits one
    value in [0, 1] per sample.

    The head expects 512*3*3 features, which is what the five blocks produce
    for a 96x96 input (assumes the images are 96x96 - TODO confirm).
    """

    @staticmethod
    def _blok(ulaz, izlaz, kernel, padding):
        """Build one Conv2d -> BatchNorm2d -> ReLU -> MaxPool2d(2, 2) block."""
        return nn.Sequential(
            nn.Conv2d(in_channels=ulaz, out_channels=izlaz, kernel_size=kernel,
                      stride=1, padding=padding),
            nn.BatchNorm2d(izlaz),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2),
        )

    def __init__(self):
        super().__init__()
        # Blocks are created in the same order as their names, so parameter
        # initialisation consumes the RNG in a fixed sequence.
        self.conv1 = self._blok(3, 32, kernel=3, padding=0)
        self.conv2 = self._blok(32, 64, kernel=2, padding=1)
        self.conv3 = self._blok(64, 128, kernel=3, padding=1)
        self.conv4 = self._blok(128, 256, kernel=3, padding=1)
        self.conv5 = self._blok(256, 512, kernel=3, padding=1)

        # NOTE(review): registered but never called in `forward`.
        self.dropout2d = nn.Dropout2d()

        # NOTE(review): there is no activation between the second and third
        # Linear layers - confirm this is intended.
        self.fc = nn.Sequential(
            nn.Linear(512 * 3 * 3, 1024),
            nn.ReLU(inplace=True),
            nn.Dropout(0.3),
            nn.Linear(1024, 512),
            nn.Dropout(0.3),
            nn.Linear(512, 1),
            nn.Sigmoid(),
        )

    def forward(self, x):
        """Run the conv blocks, flatten per sample, and apply the head."""
        for blok in (self.conv1, self.conv2, self.conv3, self.conv4, self.conv5):
            x = blok(x)
        ravno = x.view(x.shape[0], -1)
        return self.fc(ravno)

In [None]:
# Remember whether a CUDA device is available and report the choice.
train_on_gpu = torch.cuda.is_available()

poruka = 'CUDA dostupan -> GPU' if train_on_gpu else 'CUDA nedostupan -> CPU'
print(poruka)

In [None]:
# Build the network, show its layer structure, and move it to the GPU when one
# was detected.
model = Model()
print(model)

if train_on_gpu:
    model.cuda()

In [None]:
# Count the scalar weights that the optimiser will update.
ukupnoParametara = 0
for elem in model.parameters():
    if elem.requires_grad:
        ukupnoParametara += elem.numel()
print("Parametri za treniranje: {}".format(ukupnoParametara))

In [None]:
# Binary cross-entropy on probabilities (the model already ends in a sigmoid),
# optimised with Adam.
lossFunc = nn.BCELoss()
opt = optim.Adam(params=model.parameters(), lr=1.5e-4)

In [None]:
epohe = 20
minLossVal = np.inf
trainLos, valLos, aucEp = [], [], []

# Use whatever device the model lives on, so the loop also runs on CPU.
device = next(model.parameters()).device

for i in range(epohe):
    trainL = 0
    valL = 0

    # ---- training pass ----
    model.train()
    for data, oznaka in tqdm(trainLoad, desc="Training {}".format(i+1)):
        data = data.to(device)
        # BCELoss needs float targets shaped like the model output (N, 1).
        # The cast has to happen on every device, not only on the GPU.
        oznaka = oznaka.to(device).float().view(-1, 1)
        opt.zero_grad()
        izlaz = model(data)
        loss = lossFunc(izlaz, oznaka)
        loss.backward()
        opt.step()
        # `loss` is a batch mean; weight it by batch size so the epoch average
        # is exact even when the last batch is smaller.
        trainL += loss.item()*data.size(0)

    # ---- validation pass ----
    model.eval()
    yTocan, yDobiven = [], []  # reset every epoch
    with torch.no_grad():
        for data, oznaka in tqdm(valLoad, desc="Validation {}".format(i+1)):
            data = data.to(device)
            oznaka = oznaka.to(device).float().view(-1, 1)
            izlaz = model(data)
            loss = lossFunc(izlaz, oznaka)
            valL += loss.item()*data.size(0)
            yTocan.append(oznaka.cpu().numpy().ravel())
            yDobiven.append(izlaz.cpu().numpy().ravel())

    trainL /= len(trainLoad.sampler)
    valL /= len(valLoad.sampler)
    # AUC is computed once over the whole validation set of this epoch.
    # Averaging per-batch AUCs is not the same quantity, raises when a batch
    # holds a single class, and must not carry batches over from earlier epochs.
    valAucElem = roc_auc_score(np.concatenate(yTocan), np.concatenate(yDobiven))
    aucEp.append(valAucElem)

    trainLos.append(trainL)
    valLos.append(valL)

    print("Epoha: {}, Training Loss: {}, Validation Loss: {}, Validation AUC: {}".format(i+1, trainL, valL, valAucElem))

    # Keep the weights of the epoch with the lowest validation loss.
    if valL <= minLossVal:
        print("Smanjen validation loss: {} -> {}.".format(minLossVal, valL))
        torch.save(model.state_dict(), "best_model.pt")
        minLossVal = valL

In [None]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

plt.plot(np.arange(1,epohe+1),trainLos, label='Training loss')
plt.plot(np.arange(1,epohe+1),valLos, label='Validation loss')
plt.xticks(np.arange(1,epohe+1, 1.0))
plt.xlabel("Epoha")
plt.ylabel("Loss")
plt.legend(frameon=False);

In [None]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

plt.plot(np.arange(1,epohe+1),aucEp)
plt.xticks(np.arange(1,epohe+1, 1.0))
plt.legend("")
plt.xlabel("Epoha")
plt.ylabel("AUC")
plt.legend(frameon=False);

Test

In [None]:
# Restore the checkpoint written by the training loop (the epoch with the lowest
# validation loss). `map_location` remaps the stored tensors onto the device the
# model currently lives on, so a checkpoint saved on GPU also loads on a
# CPU-only machine.
model.load_state_dict(torch.load('best_model.pt', map_location=next(model.parameters()).device))

In [None]:
model.eval()

# Run on whatever device the model lives on instead of assuming CUDA.
device = next(model.parameters()).device

predikcijaT = []
with torch.no_grad():  # inference only, no autograd graph needed
    # The label column of the submission loader is a placeholder and is ignored.
    for data, _ in tqdm(testLoad):
        izlaz = model(data.to(device))
        # Keep the sigmoid outputs as probabilities. `int(p)` truncated every
        # value below 1.0 to 0, which produced an (almost) all-zero submission
        # and discarded the ranking that an AUC-style score relies on.
        predikcijaT.extend(izlaz.view(-1).cpu().tolist())

sample["label"] = predikcijaT

In [None]:
# Write the ids and predictions to the submission file, without the index column.
sample.to_csv(path_or_buf="./submission.csv", index=False)

Procjena za naš "test set":

In [None]:
# Reload the best checkpoint so this evaluation section can be run on its own.
# `map_location` remaps the stored tensors onto the model's current device, so a
# checkpoint saved on GPU also loads on a CPU-only machine.
model.load_state_dict(torch.load('best_model.pt', map_location=next(model.parameters()).device))

In [None]:
model.eval()

# Run on whatever device the model lives on instead of assuming CUDA.
device = next(model.parameters()).device

predikcija = []  # hard 0/1 decisions at the 0.5 threshold
tocno = []       # true labels of the held-out split
with torch.no_grad():  # inference only, no autograd graph needed
    for data, oznaka in tqdm(testLoadF):
        izlaz = model(data.to(device))
        # Flatten the (N, 1) output so each list entry is a plain scalar rather
        # than a one-element array.
        predikcija.extend((izlaz.view(-1) >= 0.5).long().cpu().tolist())
        tocno.extend(oznaka.view(-1).tolist())

In [None]:
# Share of correctly classified images in the held-out split, as a percentage.
tocnost = accuracy_score(tocno, predikcija)
print("Preciznost: {:.2f}%".format(tocnost * 100))