# Visual Transformer(part 1)
https://paperswithcode.com/paper/an-image-is-worth-16x16-words-transformers-1

Github links:

* [google-research/vision_transformer](https://github.com/google-research/vision_transformer)
*  [lucidrains/vit-pytorch](https://github.com/lucidrains/vit-pytorch)
* [lukemelas/PyTorch-Pretrained-ViT](https://github.com/lukemelas/PyTorch-Pretrained-ViT)

## Задание
1. Выбрать удобный для вас датасет, например, любое соревнование по CV. Пример приведен на датасете Dogs vs Cats Data
2. Взять любую предобученную модель VIT и затем сделать finetuning на вашем датасете
3. Сравнить по качеству с любой CNN моделью

 * Histopathologic Cancer Detection - https://www.kaggle.com/c/histopathologic-cancer-detection  
 
 
 "In this competition, you must create an algorithm to identify metastatic cancer in small image patches taken from larger digital pathology scans. The data for this competition is a slightly modified version of the PatchCamelyon (PCam) benchmark dataset (the original PCam dataset contains duplicate images due to its probabilistic sampling, however, the version presented on Kaggle does not contain duplicates)."
 
 


In [1]:
!pip -q install vit_pytorch linformer

## Import Libraries

In [2]:
from __future__ import print_function

import glob
from itertools import chain
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from PIL import Image
from sklearn.model_selection import train_test_split
from torch.optim.lr_scheduler import StepLR
from torch.utils.data import DataLoader, Dataset
from torchvision import datasets, transforms
from tqdm.notebook import tqdm
import random



from torchvision import datasets, transforms
import cv2

In [3]:
print(f"Torch: {torch.__version__}")

Torch: 1.8.0+cu101


In [4]:
seed = 42
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True

seed_everything(seed)

In [5]:
device = 'cuda'

## From kaggle to colab

In [6]:
# !pip install -q kaggle
# !pip install --upgrade --force-reinstall --no-deps kaggle
#  # https://www.kaggle.com/Robohant/account
#  !get kaggle.json - CREATE NEW API TOKEN(your profile)
#  from google.colab import files 
#  files.upload()

In [7]:
!ls

sample_data


In [8]:
# !rm -r ~/.kaggle
# ! mkdir ~/.kaggle
# ! cp kaggle.json ~/.kaggle/
# ! chmod 600 ~/.kaggle/kaggle.json
# #! kaggle datasets list

In [9]:
# !kaggle competitions download -c histopathologic-cancer-detection
# !unzip histopathologic-cancer-detection.zip  
# !unzip -q train.zip
# !unzip -q test.zip


## Load Data

In [10]:

train_path = 'train'
test_path = 'test'
labels = pd.read_csv('train_labels.csv')


FileNotFoundError: ignored

In [None]:

class MyDataset(Dataset):
    def __init__(self, df_data, data_dir = './', transform=None, pathe = None):
        super().__init__()
        self.df = df_data.values
        self.data_dir = data_dir
        self.transform = transform
        self.pathe = os.path.join(self.data_dir,(self.df[0][0])+'.tif')
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        img_name,label = self.df[index]
        img_path = os.path.join(self.data_dir, img_name+'.tif')
        image = cv2.imread(img_path)
        if self.transform is not None:
            image = self.transform(image)
        return image, label

## Image Augumentation

In [None]:
# Random indexes for tests
train_indexes = random.choices(np.arange(0,len(labels)), k=len(labels)) 

# Augmentation
trans_train = transforms.Compose([transforms.ToPILImage(),
                                  transforms.Resize((224, 224)),
                                  transforms.RandomHorizontalFlip(), 
                                  transforms.RandomVerticalFlip(),
                                  transforms.RandomRotation(20), 
                                  transforms.ToTensor(),
                                  transforms.Normalize(mean=[0.5, 0.5, 0.5],std=[0.5, 0.5, 0.5])])

trans_valid = transforms.Compose([transforms.ToPILImage(),
                                  transforms.Resize((224, 224)),
                                  transforms.ToTensor(),
                                  transforms.Normalize(mean=[0.5, 0.5, 0.5],std=[0.5, 0.5, 0.5])])

## Split

In [None]:

batch_size = 20

# Разделяем на обучение\валидацию\тест
train, val = train_test_split(labels.iloc[train_indexes], stratify=[labels.iloc[i][1] for i in train_indexes], test_size=0.2)
test,val = train_test_split(val, stratify=val.label, test_size=0.5)




train_data = MyDataset(df_data=train, data_dir=train_path, transform=trans_train)
valid_data = MyDataset(df_data=val, data_dir=train_path, transform=trans_valid)
test_data = MyDataset(df_data=test, data_dir=train_path, transform=trans_valid)


#Loaders 

train_loader = DataLoader(dataset = train_data, batch_size=batch_size, shuffle=True, num_workers=0)
valid_loader = DataLoader(dataset = valid_data, batch_size=batch_size, shuffle=False, num_workers=0)
test_loader = DataLoader(dataset = test_data, batch_size=batch_size, shuffle=False, num_workers=0)


## Random Plots

In [None]:
# Смотрим на данные
random_idx = np.random.randint(1, len(labels.iloc[train_indexes].id), size=9)
fig, axes = plt.subplots(3, 3, figsize=(20, 16))
for idx, ax in enumerate(axes.ravel()):
    
    img_path =('train\\'+ str(labels.id[idx])+'.tif')
    #print(img_path)
    img = Image.open(img_path)
    ax.set_title(labels.label[idx])
    ax.axis('off')
    ax.imshow(img)

In [None]:
print('Train: ',len(train_data), len(train_loader))
print('Test: ',len(test_data), len(test_loader))
print('Val: ',len(valid_data), len(valid_loader))
 

# Visual Transformer(part 2)

## Pretrained VIT

### Возьмем стандартную предобученную модель (которой в последующем окажется достаточно)

In [None]:
!pip install pytorch-pretrained-vit

In [None]:
import pytorch_pretrained_vit
import torch
from vit_pytorch import ViT
# model = ViT(
#     image_size = 96,
#     patch_size = 16,
#     num_classes = 2,
#     dim = 1024,
#     depth = 24,
#     heads = 16,
#     mlp_dim =4096,
#     dropout = 0.2,
#     emb_dropout = 0.2
# )

model_name = 'B_16_imagenet1k'
model = pytorch_pretrained_vit.ViT(model_name, image_size=224, pretrained=True)

# model = pytorch_pretrained_vit.ViT(model_name,dim = 2048, ff_dim= 1576,num_heads= 1 , patches = 8, num_layers=4,attention_dropout_rate = 0.3, 
#                                    dropout_rate = 0.5, image_size=90, pretrained=True)

In [None]:
from termcolor import colored

colors = ['red', 'green', 'blue', 'yellow']

def model_structure(layer, margin=0, item_color=0, deep=0, max_deep=2):
    for name, next_layer in layer.named_children():
        if deep > max_deep:
            return
        next = (0 if not list(next_layer.named_children()) else 1)
        print(colored(' ' * margin + name, colors[item_color]) + ':' * next)
        model_structure(next_layer, margin + len(name) + 2, (item_color + 1) % 4, deep + 1, max_deep=max_deep)

model_structure(model, max_deep=2)

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'The model has {count_parameters(model):,} trainable parameters')

In [None]:
fig, axs = plt.subplots(1, 3, figsize=(14, 6))
for i in range(3):
    axs[i].pcolormesh(model.patch_embedding.weight.data.cpu().detach().numpy()[0, i])
plt.show()

In [None]:
# change ouput layer
# firstly you can train only the last fc layer then finetune all layers

#Так же два класса на выходе

model.fc = nn.Linear(768, 2)
model = model.to(device)

In [None]:
plt.title("Learnable embedding for [CLS]")
plt.plot(model.class_token[0, 0].cpu().detach().numpy())
plt.show()

In [None]:
plt.title('Positional embeddings')
plt.ylabel('[CLS], Number of the patch')
plt.xlabel('Position in vector')
plt.pcolormesh(model.positional_embedding.pos_embedding.data[0].cpu().detach().numpy()[:20, :30])
plt.show()

### Attention до обучения

In [None]:

# Подготовка...

img_path =('train\\'+ str(labels.id[5])+'.tif')# train_data.pathe
img = Image.open(img_path)
trans1 = transforms.ToTensor()
img_transformed = train_data.transform(trans1(img))
# our forward
x = img_transformed.unsqueeze(0)
print(x.shape)
model = model.to(device)
model.eval()
x = model.patch_embedding(x.to(device))
print(x.shape)
x = x.flatten(2).transpose(1, 2)
print(x.shape)
x = torch.cat((model.class_token.expand(1, -1, -1), x), dim=1)
print(x.shape)
x = model.transformer(x)
print(x.shape)
att_mat = []
for block in model.transformer.blocks:
    att_mat.append(block.attn.scores.squeeze(0))
    
# Average the attention weights across all heads.
att_mat = torch.mean(torch.stack(att_mat), dim=1)
# # To account for residual connections, we add an identity matrix to the
# # attention matrix and re-normalize the weights.
residual_att = torch.eye(att_mat.size(1)).to(device)

aug_att_mat = att_mat + residual_att
aug_att_mat = aug_att_mat / aug_att_mat.sum(dim=-1, keepdim=True)
#aug_att_mat.shape
# Recursively multiply the weight matrices
joint_attentions = torch.zeros_like(aug_att_mat).to(device)
# copy first layer
joint_attentions[0] = aug_att_mat[0]
for n in range(1, aug_att_mat.size(0)):
    joint_attentions[n] = aug_att_mat[n] @ joint_attentions[n - 1]
    
# Attention from the output token to the input space, last layer
v = joint_attentions[-1]
v.shape


In [None]:
#Все еще подготовка...

att_mat = []
for block in model.transformer.blocks:
    att_mat.append(block.attn.scores.squeeze(0))
    
# Average the attention weights across all heads.
att_mat = torch.mean(torch.stack(att_mat), dim=1)
# # To account for residual connections, we add an identity matrix to the
# # attention matrix and re-normalize the weights.
residual_att = torch.eye(att_mat.size(1)).to(device)

aug_att_mat = att_mat + residual_att
aug_att_mat = aug_att_mat / aug_att_mat.sum(dim=-1, keepdim=True)
#aug_att_mat.shape
# Recursively multiply the weight matrices
joint_attentions = torch.zeros_like(aug_att_mat).to(device)
# copy first layer
joint_attentions[0] = aug_att_mat[0]
for n in range(1, aug_att_mat.size(0)):
    joint_attentions[n] = aug_att_mat[n] @ joint_attentions[n - 1]
    
# Attention from the output token to the input space, last layer
v = joint_attentions[-1]
v.shape

In [None]:
# Распределение по головам 
grid_size = int(np.sqrt(aug_att_mat.size(-1)))
mask = v[0, 1:].reshape(grid_size, grid_size).cpu().detach().numpy()
mask = cv2.resize(mask / mask.max(), img.size)[..., np.newaxis]



for i, v in enumerate(joint_attentions):
    # Attention from the output token to the input space.
    mask = v.cpu()[0, 1:].reshape(grid_size, grid_size).detach().numpy()
    mask = cv2.resize(mask / mask.max(), img.size)[..., np.newaxis]
    result = (mask * img).astype("uint8")

    fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(16, 16))
    ax1.set_title('Original')
    ax2.set_title('Attention Map_%d Layer' % (i+1))
    _ = ax1.imshow(img)
    _ = ax2.imshow(result)


# result = (mask * img).astype("uint8")
# fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(16, 16))

# ax1.set_title('Original')
# ax2.set_title('Attention Map')
# _ = ax1.imshow(img)
# _ = ax2.imshow(result)

### Training

In [None]:
# Задаем параметры нейросети
params = [param for name, param in model.named_parameters()]
# for param in params[:-2]:
#      param.requires_grad=True
optimizer = torch.optim.Adamax(model.parameters(), lr=0.001)
#optimizer = optim.SGD(params, lr=1, momentum=0.9)
scheduler = StepLR(optimizer, step_size=1, gamma=0.7)
criterion = nn.CrossEntropyLoss()

In [None]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

In [None]:
def train(model, iterator, optimizer, scheduler, criterion, clip, train_history=None, valid_history=None):
    model.train()
    
    epoch_loss = 0
    history = []
    i = 0
    for data, label in tqdm(iterator):
        data = data.to(device)
        label = label.to(device)
        
        output = model(data)
        loss = criterion(output, label)
        
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        
        epoch_loss += loss.item()
        i += 1
        
        history.append(loss.cpu().data.numpy())
        if (i+1)%20==0:
            fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 8))

            clear_output(True)
            ax[0].plot(history, label='train loss')
            ax[0].set_xlabel('Batch')
            ax[0].set_title('Train loss')
            if train_history is not None:
                ax[1].plot(train_history, label='general train history')
                ax[1].set_xlabel('Epoch')
            if valid_history is not None:
                ax[1].plot(valid_history, label='general valid history')
            plt.legend()
            
            plt.show()

    scheduler.step()
    
    return epoch_loss / len(iterator)

def evaluate(model, iterator, criterion):
    
    model.eval()
    epoch_loss = 0
    history = []
    with torch.no_grad():
    
        for data, label in iterator:
            data = data.to(device)
            label = label.to(device)

            output = model(data)
            loss = criterion(output, label)
            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [None]:
import time
import math
import matplotlib
matplotlib.rcParams.update({'figure.figsize': (16, 12), 'font.size': 14})
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import clear_output

### Графики не сохранились, только веса

In [None]:
# Обучение

train_history = []
valid_history = []

N_EPOCHS = 10
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, train_loader, optimizer, scheduler, criterion, CLIP, train_history, valid_history)
    valid_loss = evaluate(model, valid_loader, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'best-val-model.pt')
    
    train_history.append(train_loss)
    valid_history.append(valid_loss)
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

In [None]:
# submit = pd.read_csv('./sample_submission.csv')
# submit.head()

#### Смотрим качество

In [None]:
model.load_state_dict(torch.load('ViT_cancer_model'))       #!!!!!
best_model = model
best_model.eval()


pred_labels = []
true_labels = []
epoch_loss = 0

with torch.no_grad():
     for data, label in tqdm(valid_loader):
        data = data.to(device)
        true_labels.append(label.numpy())

        output = model(data)
        loss = criterion(output, label.to(device))
        epoch_loss += loss.item()
        pred_labels.append(output.argmax(dim=1).cpu().numpy())

In [None]:
from sklearn.metrics import accuracy_score

true_labels = np.concatenate(true_labels, axis=0)
pred_labels = np.concatenate(pred_labels, axis=0)
print(f'Accuracy score: {accuracy_score(true_labels, pred_labels)}')
print(f"Loss: {epoch_loss / len(valid_loader)}")
# .9674589828659729

### Valid accuracy: 0.97

In [None]:

best_model = model
best_model.eval()


prob = []
true_labels = []

with torch.no_grad():
     for data, label in tqdm(test_loader):
        data = data.to(device)
        true_labels.append(label.numpy())
        output = model(data)
        prob.append(output.argmax(dim=1).cpu().numpy())

In [None]:
acc = 0
for i in range(len(prob)):
  acc= acc + accuracy_score(true_labels[i],prob[i])

print (acc/len(prob))

### Test accuracy: 0.96

In [None]:
#torch.save(model.state_dict(), "C:\\Users\\safiu\\ViT_cancer_model")

In [None]:
plt.title("Learnable embedding for [CLS]")
plt.plot(model.class_token[0, 0].cpu().detach().numpy())
plt.show()

In [None]:
for batch in train_loader:
    break

In [None]:
# our forward
print(batch[0].shape)
model.eval()
x = model.patch_embedding(batch[0].to(device))
print(x.shape)
x = x.flatten(2).transpose(1, 2)
print(x.shape)
x = torch.cat((model.class_token.expand(batch_size, -1, -1), x), dim=1)
print(x.shape)

In [None]:
model_old = pytorch_pretrained_vit.ViT(model_name,image_size=224,num_classes = 2, pretrained=True).to(device)
model_old.eval()
pass

In [None]:
plt.title("Difference between old and new Learnable embedding for [CLS]")
plt.plot((model.class_token[0, 0] - model_old.class_token[0, 0]).cpu().detach().numpy())
plt.show()

In [None]:
model_old.positional_embedding.pos_embedding.data.shape
model.positional_embedding.pos_embedding.data.shape

In [None]:
plt.title('Difference Positional embeddings')
plt.ylabel('[CLS], Number of the patch')
plt.xlabel('Position in vector')
plt.pcolormesh((model_old.positional_embedding.pos_embedding.data - model.positional_embedding.pos_embedding.data).cpu()[0].detach().numpy()[:20, :30])
plt.show()

## Смотрим как поменялся attention после обучения

In [None]:
model_old = pytorch_pretrained_vit.ViT(model_name,image_size=224, pretrained=True, num_classes = 2).to(device)
model_old.eval()
pass
img_path = ('train\\'+ str(labels.id[114])+'.tif')
img = Image.open(img_path)
trans1 = transforms.ToTensor()
img_transformed = train_data.transform(trans1(img))
# our forward
x = img_transformed.unsqueeze(0)
print(x.shape)
model_old = model_old.to(device)
model_old.eval()
x = model_old.patch_embedding(x.to(device))
print(x.shape)
x = x.flatten(2).transpose(1, 2)
print(x.shape)
x = torch.cat((model_old.class_token.expand(1, -1, -1), x), dim=1)
print(x.shape)
x = model_old.transformer(x)
print(x.shape)
att_mat = []
for block in model_old.transformer.blocks:
    att_mat.append(block.attn.scores.squeeze(0))
    
# Average the attention weights across all heads.
att_mat = torch.mean(torch.stack(att_mat), dim=1)
# # To account for residual connections, we add an identity matrix to the
# # attention matrix and re-normalize the weights.
residual_att = torch.eye(att_mat.size(1)).to(device)

aug_att_mat = att_mat + residual_att
aug_att_mat = aug_att_mat / aug_att_mat.sum(dim=-1, keepdim=True)
#aug_att_mat.shape
# Recursively multiply the weight matrices
joint_attentions = torch.zeros_like(aug_att_mat).to(device)
# copy first layer
joint_attentions[0] = aug_att_mat[0]
for n in range(1, aug_att_mat.size(0)):
    joint_attentions[n] = aug_att_mat[n] @ joint_attentions[n - 1]
    
# Attention from the output token to the input space, last layer
v = joint_attentions[-1]
v.shape
grid_size = int(np.sqrt(aug_att_mat.size(-1)))
mask = v[0, 1:].reshape(grid_size, grid_size).cpu().detach().numpy()
mask = cv2.resize(mask / mask.max(), img.size)[..., np.newaxis]
result = (mask * img).astype("uint8")
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(16, 16))

ax1.set_title('Original')
ax2.set_title('Attention Map_pretrained')
_ = ax1.imshow(img)
_ = ax2.imshow(result)

model = pytorch_pretrained_vit.ViT(model_name,image_size=224, pretrained=True, num_classes =2 ).to(device)

model.load_state_dict(torch.load('ViT_cancer_model'))


#img_path = ('train\\'+ str(labels.id[95])+'.tif')
img = Image.open(img_path)
trans1 = transforms.ToTensor()
img_transformed = train_data.transform(trans1(img))
# our forward
x = img_transformed.unsqueeze(0)
print(x.shape)
model = model.to(device)
model.eval()
x = model.patch_embedding(x.to(device))
print(x.shape)
x = x.flatten(2).transpose(1, 2)
print(x.shape)
x = torch.cat((model.class_token.expand(1, -1, -1), x), dim=1)
print(x.shape)
x = model.transformer(x)
print(x.shape)
att_mat = []
for block in model.transformer.blocks:
    att_mat.append(block.attn.scores.squeeze(0))
    
# Average the attention weights across all heads.
att_mat = torch.mean(torch.stack(att_mat), dim=1)
# # To account for residual connections, we add an identity matrix to the
# # attention matrix and re-normalize the weights.
residual_att = torch.eye(att_mat.size(1)).to(device)

aug_att_mat = att_mat + residual_att
aug_att_mat = aug_att_mat / aug_att_mat.sum(dim=-1, keepdim=True)
#aug_att_mat.shape
# Recursively multiply the weight matrices
joint_attentions = torch.zeros_like(aug_att_mat).to(device)
# copy first layer
joint_attentions[0] = aug_att_mat[0]
for n in range(1, aug_att_mat.size(0)):
    joint_attentions[n] = aug_att_mat[n] @ joint_attentions[n - 1]
    
# Attention from the output token to the input space, last layer
v = joint_attentions[-1]
v.shape
grid_size = int(np.sqrt(aug_att_mat.size(-1)))
mask = v[0, 1:].reshape(grid_size, grid_size).cpu().detach().numpy()
mask = cv2.resize(mask / mask.max(), img.size)[..., np.newaxis]
result1 = (mask * img).astype("uint8")
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(16, 16))

ax1.set_title('Original')
ax2.set_title('Attention Map_transferL')
_ = ax1.imshow(img)
_ = ax2.imshow(result1)

### Я конечно не биолог, но кажется, что после обучения attention стал обращать внимание на более важные области патологии

# Visual Transformer(part 3)

## CNN test

In [None]:
train_dir = 'train'
test_dir = 'test'

train_indexes = random.choices(np.arange(0,len(labels)), k=len(labels))
trainD, val = train_test_split(labels.iloc[train_indexes], stratify=[labels.iloc[i][1] for i in train_indexes], test_size=0.2)
test,val = train_test_split(val, stratify=val.label, test_size=0.5)

In [None]:
# Параметры для CNN

num_classes = 2
batch_size = 110
learning_rate = 0.002

# Device configuration
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [None]:
#Повторяем подготовительный процесс

train_data = MyDataset(df_data=trainD, data_dir=train_path, transform=trans_train)
test_data = MyDataset(df_data=test, data_dir=train_path, transform=trans_valid)
valid_data = MyDataset(df_data=val, data_dir=train_path, transform=trans_valid)




train_loader = DataLoader(dataset = train_data, batch_size=batch_size, shuffle=True, num_workers=0)
test_loader = DataLoader(dataset = test_data, batch_size=batch_size, shuffle=True, num_workers=0)
valid_loader = DataLoader(dataset = valid_data, batch_size=batch_size, shuffle=False, num_workers=0)



class SimpleCNN(nn.Module):
    def __init__(self):
        super(SimpleCNN, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3, padding=2)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=2)
        self.conv3 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, padding=2)
        self.conv4 = nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, padding=2)
        self.conv5 = nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, padding=2)
        self.bn1 = nn.BatchNorm2d(32)
        self.bn2 = nn.BatchNorm2d(64)
        self.bn3 = nn.BatchNorm2d(128)
        self.bn4 = nn.BatchNorm2d(256)
        self.bn5 = nn.BatchNorm2d(512)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.avg = nn.AvgPool2d(8)
        self.fc = nn.Linear(512 * 1 * 1, 2) # !!!
    def forward(self, x):
        #print (x.shape)
        x = self.pool(F.leaky_relu(self.bn1(self.conv1(x)))) 
        x = self.pool(F.leaky_relu(self.bn3(self.conv3(x))))
        x = self.pool(F.leaky_relu(self.bn4(self.conv4(x))))
        x = self.pool(F.leaky_relu(self.bn5(self.conv5(x))))
        x = self.avg(x)
        x = x.view(-1, 512 * 1 * 1) # !!!
        x = self.fc(x)
        return x

model = SimpleCNN().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adamax(model.parameters(), lr=0.001)

In [None]:
train_history = []
valid_history = []
import time
import math
import matplotlib
matplotlib.rcParams.update({'figure.figsize': (16, 12), 'font.size': 14})
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import clear_output
N_EPOCHS = 10
CLIP = 1
scheduler = StepLR(optimizer, step_size=1, gamma=0.7)
import cv2

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, train_loader, optimizer, scheduler, criterion, CLIP, train_history, valid_history)
    valid_loss = evaluate(model, valid_loader, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'best-val-model_CNN.pt')
    
    train_history.append(train_loss)
    valid_history.append(valid_loss)
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

In [None]:
# Картинка с обучения
from IPython.display import Image
from IPython.core.display import HTML 
Image(url= "https://sun9-64.userapi.com/impg/MlzY0Rjlv63SmwOH0qegh0t2iiC-HrrhTthHrA/KBIsALrXMP4.jpg?size=719x496&quality=96&proxy=1&sign=8bbeb100f5eb36802c9f22afe9e103c3&type=album")


In [None]:
best_model = model
best_model.load_state_dict(torch.load('CNN_cancer_model11'))           #!!!!

best_model.eval()


pred_labels = []
true_labels = []
epoch_loss = 0

with torch.no_grad():
     for data, label in tqdm(valid_loader):
        data = data.to(device)
        true_labels.append(label.numpy())

        output = model(data)
        loss = criterion(output, label.to(device))
        epoch_loss += loss.item()
        pred_labels.append(output.argmax(dim=1).cpu().numpy())

In [None]:
from sklearn.metrics import accuracy_score

true_labels = np.concatenate(true_labels, axis=0)
pred_labels = np.concatenate(pred_labels, axis=0)
print(f'Accuracy score: {accuracy_score(true_labels, pred_labels)}')
print(f"Loss: {epoch_loss / len(valid_loader)}")

### Valid accuracy: 0.95

In [None]:

best_model = model
best_model.eval()


prob = []
true_labels = []

with torch.no_grad():
     for data, label in tqdm(test_loader):
        data = data.to(device)
        true_labels.append(label.numpy())
        output = model(data)
        prob.append(output.argmax(dim=1).cpu().numpy())

In [None]:
acc = 0
for i in range(len(prob)):
    acc= acc + accuracy_score(true_labels[i],prob[i])

print (acc/len(prob))

### Test accuracy: 0.94

In [None]:
#torch.save(model.state_dict(), "C:\\Users\\safiu\\CNN_cancer_model")

### Таким образом получаем, что наш не сильно большой и отлаженный ViT получился лучше неплохой CNN