In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("/kaggle/input/test"))
!pip install efficientnet_pytorch
!pip install torchsummary
from efficientnet_pytorch import EfficientNet
import torchvision
import torch
from torch import nn
import torch.nn.functional as F
import torchvision.models as models
from torchsummary import summary
import torch.optim as optim
import copy
import os
import torch
from tqdm.autonotebook import tqdm
from torch.optim.lr_scheduler import _LRScheduler
import matplotlib.pyplot as plt
from PIL import Image
from torch.utils.data import Dataset
# Any results you write to the current directory are saved as output.

In [None]:
# Load the training labels table and preview its first ten rows.
train_csv = pd.read_csv('/kaggle/input/train.csv')
train_csv.head(10)

In [None]:
import cv2
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import random
from sklearn.utils import shuffle
from tqdm import tqdm_notebook
import numpy as np
# Labels table used by the exploration code in this cell.
data = pd.read_csv('/kaggle/input/train.csv')
# Image folders; training ids are joined onto train_path and given a '.tif' suffix below.
train_path = '/kaggle/input/train/train/'
test_path = '/kaggle/input/test/test'
# quick look at the label stats
# NOTE(review): this reads a 'label' column, while a later cell reads 'has_cactus' from a
# frame loaded from the same CSV path -- confirm which schema the mounted dataset has.
print(data['label'].value_counts())


def readImage(path):
    """Load the image at `path` and return it as an RGB array.

    OpenCV decodes images in BGR channel order, so the channels are reordered
    to RGB for visualization purposes.

    Args:
        path: path of the image file to read.

    Returns:
        The decoded image with its channels in RGB order.

    Raises:
        FileNotFoundError: if OpenCV cannot read the file (``cv2.imread``
            reports this by returning ``None`` instead of raising).
    """
    bgr_img = cv2.imread(path)
    if bgr_img is None:
        # Fail here with the offending path instead of deep inside OpenCV.
        raise FileNotFoundError('Could not read image: {}'.format(path))
    return cv2.cvtColor(bgr_img, cv2.COLOR_BGR2RGB)



## time to plot

# random sampling
shuffled_data = shuffle(data)

fig, ax = plt.subplots(2,5, figsize=(20,8))
fig.suptitle('Histopathologic scans of lymph node sections',fontsize=20)
# One row of five examples per class: (axes row, label value, box colour, row caption)
for row, label_value, box_color, caption in (
        (0, 0, 'b', 'Negative samples'),
        (1, 1, 'r', 'Tumor tissue samples')):
    sample_ids = shuffled_data[shuffled_data['label'] == label_value]['id'][:5]
    for col, image_id in enumerate(sample_ids):
        image_path = os.path.join(train_path, image_id) + '.tif'
        ax[row,col].imshow(readImage(image_path))
        # Dotted 32x32 rectangle patch anchored at (32,32)
        marker = patches.Rectangle((32,32),32,32,linewidth=4,edgecolor=box_color,facecolor='none', linestyle=':', capstyle='round')
        ax[row,col].add_patch(marker)
    ax[row,0].set_ylabel(caption, size='large')







## data augmentation
import random
ORIGINAL_SIZE = 96      # original size of the images - do not change

# AUGMENTATION VARIABLES (read by readCroppedImage)
CROP_SIZE = 90          # final side length after the crop
RANDOM_ROTATION = 3    # max random rotation in either direction, range (0-180); 180 allows all rotation variations, 0=no change
RANDOM_SHIFT = 2        # max crop shift along each axis, 0=no change. This cannot be more than (ORIGINAL_SIZE - CROP_SIZE)//2
RANDOM_BRIGHTNESS = 7  # range (0-100), 0=no change; divided by 100 and added to the 0-1 scaled pixels
RANDOM_CONTRAST = 5    # range (0-100), 0=no change; divided by 100 to form a multiplier around 1.0
RANDOM_90_DEG_TURN = 1  # 0 or 1; when 1, an extra -90, 0 or +90 turn is added to the rotation

def readCroppedImage(path, augmentations = True):
    """Load an image as RGB scaled to the 0-1 range, optionally augmented and cropped.

    Args:
        path: path of the image file to read.
        augmentations: when False the whole image is returned unmodified apart
            from the 0-1 scaling (used for counting statistics from images,
            where we don't want augmentations). When True, a random rotation,
            a randomly shifted centre crop to CROP_SIZE, random flips and
            random brightness/contrast changes are applied, driven by the
            module-level RANDOM_* constants.

    Returns:
        Float array with values in the 0-1 range.

    Raises:
        FileNotFoundError: if OpenCV cannot read the file.
        ValueError: if RANDOM_SHIFT exceeds the crop margin, which would push
            the crop window past the image border.
    """
    # OpenCV reads the image in bgr format by default; convert to rgb
    bgr_img = cv2.imread(path)
    if bgr_img is None:
        raise FileNotFoundError('Could not read image: {}'.format(path))
    rgb_img = cv2.cvtColor(bgr_img, cv2.COLOR_BGR2RGB)

    if not augmentations:
        return rgb_img / 255

    start_crop = (ORIGINAL_SIZE - CROP_SIZE) // 2
    if RANDOM_SHIFT > start_crop:
        # A larger shift would yield negative slice starts and a silently wrong crop.
        raise ValueError('RANDOM_SHIFT must not exceed (ORIGINAL_SIZE - CROP_SIZE) // 2')

    # random rotation
    rotation = random.randint(-RANDOM_ROTATION, RANDOM_ROTATION)
    if RANDOM_90_DEG_TURN == 1:
        rotation += random.randint(-1, 1) * 90
    # the center point is the rotation anchor
    center = (ORIGINAL_SIZE // 2, ORIGINAL_SIZE // 2)
    M = cv2.getRotationMatrix2D(center, rotation, 1)
    rgb_img = cv2.warpAffine(rgb_img, M, (ORIGINAL_SIZE, ORIGINAL_SIZE))

    # random shift of the crop window along each array axis
    row_shift = random.randint(-RANDOM_SHIFT, RANDOM_SHIFT)
    col_shift = random.randint(-RANDOM_SHIFT, RANDOM_SHIFT)

    # crop to center and normalize to 0-1 range
    end_crop = start_crop + CROP_SIZE
    rgb_img = rgb_img[(start_crop + row_shift):(end_crop + row_shift),
                      (start_crop + col_shift):(end_crop + col_shift)] / 255

    # Random flip (both random draws happen before either flip is applied)
    flip_hor = bool(random.getrandbits(1))
    flip_ver = bool(random.getrandbits(1))
    if flip_hor:
        rgb_img = rgb_img[:, ::-1]
    if flip_ver:
        rgb_img = rgb_img[::-1, :]

    # Random brightness
    br = random.randint(-RANDOM_BRIGHTNESS, RANDOM_BRIGHTNESS) / 100.
    rgb_img = rgb_img + br

    # Random contrast
    cr = 1.0 + random.randint(-RANDOM_CONTRAST, RANDOM_CONTRAST) / 100.
    rgb_img = rgb_img * cr

    # clip values to 0-1 range
    return np.clip(rgb_img, 0, 1.0)










fig, ax = plt.subplots(2,5, figsize=(20,8))
fig.suptitle('Cropped histopathologic scans of lymph node sections',fontsize=20)
# One row of five augmented examples per class: (axes row, label value, row caption)
for row, label_value, caption in (
        (0, 0, 'Negative samples'),
        (1, 1, 'Tumor tissue samples')):
    sample_ids = shuffled_data[shuffled_data['label'] == label_value]['id'][:5]
    for col, image_id in enumerate(sample_ids):
        image_path = os.path.join(train_path, image_id) + '.tif'
        ax[row,col].imshow(readCroppedImage(image_path))
    ax[row,0].set_ylabel(caption, size='large')



## here comes the main part


# As we count the statistics, we can check if there are any completely black or white images
dark_th = 10 / 255      # If no pixel reaches this threshold, image is considered too dark
bright_th = 245 / 255   # If no pixel is under this threshold, image is considered too bright
too_dark_idx = []
too_bright_idx = []

# Running sums of the per-image channel means and mean squares.
x_tot = np.zeros(3)
x2_tot = np.zeros(3)
counted_ones = 0
# total= lets the progress bar show real progress instead of a hard-coded count in the label.
for idx in tqdm_notebook(shuffled_data['id'], desc='computing statistics...', total=len(shuffled_data)):
    path = os.path.join(train_path, idx)
    imagearray = readCroppedImage(path + '.tif', augmentations = False).reshape(-1,3)
    # is this too dark
    if(imagearray.max() < dark_th):
        too_dark_idx.append(idx)
        continue # do not include in statistics
    # is this too bright
    if(imagearray.min() > bright_th):
        too_bright_idx.append(idx)
        continue # do not include in statistics
    x_tot += imagearray.mean(axis=0)
    x2_tot += (imagearray**2).mean(axis=0)
    counted_ones += 1

if counted_ones == 0:
    # Dividing by zero below would only produce NaN statistics with a warning.
    raise ValueError('No images were left to compute channel statistics from')

channel_avr = x_tot/counted_ones
# std from E[x^2] - E[x]^2, per channel
channel_std = np.sqrt(x2_tot/counted_ones - channel_avr**2)
# A bare expression in the middle of a cell is not displayed, so print explicitly.
print('channel mean:', channel_avr)
print('channel std:', channel_std)

# Report the outliers found while counting statistics, then show up to six of each kind.
print('There was {0} extremely dark image'.format(len(too_dark_idx)))
print('and {0} extremely bright images'.format(len(too_bright_idx)))
print('Dark one:')
print(too_dark_idx)
print('Bright ones:')
print(too_bright_idx)
fig, ax = plt.subplots(2,6, figsize=(25,9))
fig.suptitle('Almost completely black or white images',fontsize=20)
# (axes row, ids to show, row caption)
for row, outlier_ids, caption in (
        (0, too_dark_idx, 'Extremely dark images'),
        (1, too_bright_idx, 'Extremely bright images')):
    shown_ids = outlier_ids[:6]
    for col, image_id in enumerate(shown_ids):
        lbl = shuffled_data[shuffled_data['id'] == image_id]['label'].values[0]
        image_path = os.path.join(train_path, image_id) + '.tif'
        ax[row,col].imshow(readCroppedImage(image_path, augmentations = False))
        ax[row,col].set_title(image_id + '\n label=' + str(lbl), fontsize = 8)
    ax[row,0].set_ylabel(caption, size='large')
    for unused_col in range(len(shown_ids), 6):
        ax[row,unused_col].axis('off') # hide axes if there are less than 6

In [None]:
# Distinct values found in the 'has_cactus' label column.
classes = train_csv['has_cactus'].unique()
# Human-readable name for each label value.
encoder = {0:'no cactus',1:'has cactus'}

In [None]:
from sklearn.model_selection import train_test_split
# Split the labels table, then renumber the rows of both parts from 0.
# NOTE(review): test_size = 0 asks for an empty validation part; some scikit-learn
# releases may reject this value -- confirm against the installed version.
train_df,val_df = train_test_split(train_csv,test_size = 0)
val_df = val_df.reset_index(drop = True)
train_df = train_df.reset_index(drop = True)

In [None]:
class cactus_dataset(Dataset):
  """Dataset yielding (image, label) pairs listed in a labels table.

  Args:
    image_dir: directory that contains the image files.
    train_csv: DataFrame whose first column holds the image file names and
      whose second column holds the labels.
    transform: optional callable applied to every loaded image.
  """
  def __init__(self,image_dir,train_csv,transform = None):
    self.img_dir = image_dir
    self.transform = transform
    self.id = train_csv.iloc[:,0]
    self.classes =  train_csv.iloc[:,1]

  def __len__(self):
    """Number of rows in the labels table."""
    return len(self.id)

  def __getitem__(self,idx):
    """Return (image, label) for the row at position `idx`.

    Raises:
      FileNotFoundError: if OpenCV cannot read the image file.
    """
    # .iloc keeps the lookup positional, so a frame whose index is not
    # 0..n-1 (e.g. straight out of a shuffle or split) still works.
    img_name = os.path.join(self.img_dir, self.id.iloc[idx])
    # NOTE(review): OpenCV decodes to BGR channel order; if the transform
    # assumes RGB input, a conversion is needed here and in the test-time
    # dataset together -- confirm.
    image = cv2.imread(img_name)
    if image is None:
      # cv2.imread signals an unreadable file with None instead of raising.
      raise FileNotFoundError('Could not read image: {}'.format(img_name))
    if self.transform:
      image = self.transform(image)
    label = self.classes.iloc[idx]
    return image,label


In [None]:
# Batch size later handed to train_model.
batch_size = 4
import cv2
from torchvision import transforms
from torch.utils.data import DataLoader


In [None]:
# Per-channel normalisation (means, then stds) shared by both pipelines.
_normalize = transforms.Normalize([0.485, 0.456, 0.406],
                                  [0.229, 0.224, 0.225])

# Training pipeline: random resized crop to 224 plus random horizontal flip.
train_transforms = transforms.Compose([
    transforms.ToPILImage(),
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    _normalize,
])

# Evaluation pipeline: resize to 256 followed by a 224 centre crop, no randomness.
test_transforms = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    _normalize,
])

# Training dataset over the labelled images, using the augmenting pipeline.
train_data = cactus_dataset('/kaggle/input/train/train',train_df,transform = train_transforms)


In [None]:
# Validation dataset; no loader is created for it below.
val_data = cactus_dataset('/kaggle/input/train/train',val_df,transform = test_transforms)
# Reuse the notebook-level batch_size instead of repeating the literal, so the
# loader and the value handed to train_model cannot drift apart.
train_loader = DataLoader(train_data, batch_size=batch_size,
                        shuffle=True, num_workers=0)

#val_loader = DataLoader(val_data, batch_size=4,shuffle=True, num_workers=0)
# Phase name -> loader mapping consumed by train_model.
dataloaders = {'train':train_loader}

In [None]:
from efficientnet_pytorch import EfficientNet
import torchvision
import torch
from torch import nn
import torch.nn.functional as F
import torchvision.models as models
from torchsummary import summary
import torch.optim as optim
import copy
import os
import torch
from tqdm.autonotebook import tqdm

import matplotlib.pyplot as plt

class classifie(nn.Module):
    """DenseNet-161 followed by a linear layer producing the class logits.

    Args:
        n_classes: number of logits produced by the final linear layer.
        pretrained: forwarded to ``models.densenet161``.
    """
    def __init__(self,n_classes,pretrained = True ):
        super(classifie, self).__init__()
        # Honour the caller's choice instead of always requesting pretrained weights.
        self.cnn_arch = models.densenet161(pretrained = pretrained)
        # Maps the 1000 outputs of cnn_arch to n_classes logits.
        self.linear = nn.Linear(1000, n_classes)
        # The layers below are not used by forward(); they are kept so the
        # module's attributes and state_dict keys stay as they were.
        self.bn = nn.BatchNorm1d(16)
        self.dropout = nn.Dropout(0.2)
        self.elu = nn.ELU()
        self.out = nn.Linear(16, 2)

    def forward(self, input):
        """Return the logits for a batch of images."""
        out = self.cnn_arch(input)
        return self.linear(out)

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Two-class model, moved to the selected device.
classifier = classifie(n_classes = 2).to(device)

In [None]:
from torchsummary import summary
# Summarise the model for a 3x224x224 input, the size produced by the transforms.
summary(classifier,(3,224,224))

In [None]:
import torch.optim as optim
import matplotlib.pyplot as plt
import random
from torch.autograd import Variable
import numpy as np
import torch
from torch import nn
import sys
def train(model,dataloaders,device,num_epochs,lr,batch_size,patience):
    """Run `num_epochs` epochs over every phase in `dataloaders`.

    Args:
        model: network returning class logits for a batch.
        dataloaders: dict mapping a phase name to an iterable of
            (data, target) batches. Weights are only updated for the phase
            named 'train'; every other phase is evaluated without gradients.
        device: torch device the batches are moved to.
        num_epochs: number of epochs to run.
        lr: Adam learning rate for the first epoch; multiplied by 0.9 after
            each epoch.
        batch_size: unused, kept for call compatibility.
        patience: unused, kept for call compatibility.

    Returns:
        (losses, acc): two lists with one entry per epoch and phase, holding
        the mean batch loss and the mean batch accuracy in percent.
    """
    criterion = nn.CrossEntropyLoss()
    # Built once so Adam's running moment estimates survive across epochs;
    # the per-epoch decay is applied through param_groups below.
    optimizer = optim.Adam(model.parameters(), lr=lr,weight_decay = 1e-6)
    losses = list()
    acc = list()
    for epoch in range(num_epochs):
        print('Epoch:',epoch)
        for group in optimizer.param_groups:
            group['lr'] = lr
        lr = lr*0.9
        for phase in dataloaders.keys():
            is_train = phase == 'train'
            # Switches dropout / batch-norm between training and inference behaviour.
            model.train(is_train)
            epoch_metrics = {"loss": [], "acc": []}
            for batch_idx, (data, target) in enumerate(dataloaders[phase]):
                data = data.to(device=device, dtype=torch.float32)
                target = target.to(device=device, dtype=torch.long)

                optimizer.zero_grad()
                # No autograd graph is built for non-training phases.
                with torch.set_grad_enabled(is_train):
                    output = model(data)
                    loss = criterion(output, target)
                    if is_train:
                        loss.backward()
                        optimizer.step()

                batch_acc = 100 * (output.detach().argmax(1) == target).cpu().numpy().mean()
                epoch_metrics["loss"].append(loss.item())
                epoch_metrics["acc"].append(batch_acc)
                sys.stdout.write(
                "\r[Epoch %d/%d] [Batch %d/%d] [Loss: %f (%f), Acc: %.2f%% (%.2f%%)]"
                % (
                    epoch,
                    num_epochs,
                    batch_idx,
                    len(dataloaders[phase]),
                    loss.item(),
                    np.mean(epoch_metrics["loss"]),
                    batch_acc,
                    np.mean(epoch_metrics["acc"]),
                    )
                )

            epoch_acc = float(np.mean(epoch_metrics["acc"]))
            epoch_loss = float(np.mean(epoch_metrics["loss"]))
            losses.append(epoch_loss)
            acc.append(epoch_acc)
            # Reported per phase, so each phase's accuracy is shown.
            print('')
            print('{} Accuracy: {}'.format(phase,epoch_acc))
    return losses,acc

def train_model(model,dataloaders,encoder,lr_scheduler = None,inv_normalize = None,num_epochs=10,lr=0.0001,batch_size=8,patience = None,classes = None):
    """Train `model` on every phase of `dataloaders` except 'test'.

    Args:
        model: network to train.
        dataloaders: dict mapping phase names to data loaders; an entry named
            'test' is left out of training.
        encoder, lr_scheduler, inv_normalize, classes: unused, kept for call
            compatibility.
        num_epochs, lr, batch_size, patience: forwarded to ``train``.

    Returns:
        The (losses, accuracy) pair returned by ``train``.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Everything except a 'test' loader takes part in training.
    dataloader_train = {
        phase: loader for phase, loader in dataloaders.items() if phase != 'test'
    }
    losses,accuracy = train(model,dataloader_train,device,num_epochs,lr,batch_size,patience)
    return losses,accuracy

In [None]:
import cv2
# First training run: 4 epochs starting from the higher learning rate.
lr = 0.001
train_model(classifier,dataloaders,encoder,inv_normalize = None,num_epochs=4,lr = lr,batch_size = batch_size,patience = None,classes = classes)

In [None]:
# Second run on the same classifier object: 4 more epochs at a ten times smaller learning rate.
lr = 0.0001
train_model(classifier,dataloaders,encoder,inv_normalize = None,num_epochs=4,lr = lr,batch_size = batch_size,patience = None,classes = classes)

In [None]:
class cactus_dataset_test(Dataset):
  """Unlabelled dataset yielding (file name, image) for every entry of a directory.

  Args:
    image_dir: directory whose entries are treated as image files.
    transform: optional callable applied to every loaded image.
  """
  def __init__(self,image_dir,transform = None):
    self.img_dir = image_dir
    self.transform = transform
    # os.listdir returns entries in arbitrary order; sort for a stable,
    # reproducible position -> file mapping.
    self.id = sorted(os.listdir(image_dir))

  def __len__(self):
    """Number of entries found in the directory."""
    return len(self.id)

  def __getitem__(self,idx):
    """Return (file name, image) for the entry at position `idx`.

    Raises:
      FileNotFoundError: if OpenCV cannot read the file as an image.
    """
    img_name = os.path.join(self.img_dir, self.id[idx])
    image = cv2.imread(img_name)
    if image is None:
      # cv2.imread signals an unreadable file with None instead of raising.
      raise FileNotFoundError('Could not read image: {}'.format(img_name))
    if self.transform:
      image = self.transform(image)
    return (self.id[idx],image)

In [None]:
# Test-set dataset using the non-random evaluation transforms.
test1 = cactus_dataset_test('/kaggle/input/test/test',test_transforms)

In [None]:
# Inference gains nothing from shuffling; a fixed order keeps the submission rows reproducible.
test_loader = DataLoader(test1, batch_size =32, shuffle = False)

In [None]:
def test(model,dataloader,device,batch_size):
    running_corrects = 0
    running_loss=0
    pred = []
    id = list()
    sm = nn.Softmax(dim = 1)
    criterion = nn.CrossEntropyLoss()
    for batch_idx, (id_1,data) in enumerate(dataloader):
        data = Variable(data)
        data = data.type(torch.FloatTensor).to(device)
        model.eval()
        output = model(data)
        output = sm(output)
        _, preds = torch.max(output, 1)
        preds = preds.cpu().numpy()
        preds = np.reshape(preds,(len(preds),1))
        
        for i in range(len(preds)):
            pred.append(preds[i])
            id.append(id_1[i])
    return id,pred

In [None]:
import torch.optim as optim
import matplotlib.pyplot as plt
from torch.autograd import Variable
import numpy as np
import torch
from torch import nn
# Use the device selected when the classifier was created rather than a hard-coded
# 'cuda', so the cell also runs on CPU-only sessions.
# NOTE: `id` shadows the builtin; the name is kept because later cells read it.
id,pred = test(classifier,test_loader,device,32)

In [None]:
# Unwrap each length-1 prediction array into its single class index.
a = [row[0] for row in pred]

In [None]:
# Turn the list of predicted class indices into an array.
a = np.asarray(a)

In [None]:
# Make it a single column so it can be placed next to the ids.
a = np.reshape(a,(-1,1))

In [None]:
# Ids returned by test(), as an array.
b = np.asarray(id)

In [None]:
# Single column of ids, matching the shape of the predictions column.
b = np.reshape(b,(-1,1))

In [None]:
# Two-column array: ids first, predicted class second.
# NOTE(review): joining string ids with integer predictions presumably yields an
# all-string array -- confirm the written CSV has the expected values.
sub = np.concatenate((b,a),axis = 1)

In [None]:
# Wrap the submission rows in a DataFrame.
sub_df = pd.DataFrame(sub)

In [None]:
# Name the two columns of the submission table.
sub_df.columns = ['id','has_cactus']

In [None]:
# Preview the first ten submission rows.
sub_df.head(10)

In [None]:
# Write the submission file without the DataFrame index column.
sub_df.to_csv("/kaggle/working/submission.csv", index=False)