In [1]:
import keras
keras.__version__

Using TensorFlow backend.


'2.3.1'

In [2]:
from keras.applications import VGG16

conv_base = VGG16(weights='imagenet',
                  include_top=False,
                  input_shape=(150, 150, 3))

conv_base.summary()


Model: "vgg16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 150, 150, 3)       0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 150, 150, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 150, 150, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 75, 75, 64)        0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 75, 75, 128)       73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 75, 75, 128)       147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 37, 37, 128)       0    

In [3]:
import os
import numpy as np
from keras.preprocessing.image import ImageDataGenerator

#base_dir = '/Users/fchollet/Downloads/cats_and_dogs_small'
root_dir = r'C:\Temp'

train_dir = os.path.join(root_dir, 'train')
validation_dir = os.path.join(root_dir, 'val')
test_dir = os.path.join(base_dir, 'test')

datagen = ImageDataGenerator()
batch_size = 20

def extract_features(directory, sample_count):
    features = np.zeros(shape=(sample_count, 4, 4, 512))
    labels = np.zeros(shape=(sample_count))

    generator = datagen.flow_from_directory(
        directory,
        target_size=(150, 150),
        batch_size=batch_size,
        class_mode='binary')

    i = 0
    for inputs_batch, labels_batch in generator:
        features_batch = conv_base.predict(inputs_batch)
        features[i * batch_size : (i + 1) * batch_size] = features_batch
        labels[i * batch_size : (i + 1) * batch_size] = labels_batch
        i += 1
        if i * batch_size >= sample_count:
            # Note that since generators yield data indefinitely in a loop,
            # we must `break` after every image has been seen once.
            break
    return features, labels

train_features, train_labels = extract_features(train_dir, 2000)
# validation_features, validation_labels = extract_features(validation_dir, 1000)
# test_features, test_labels = extract_features(test_dir, 1000)

NameError: name 'base_dir' is not defined

In [None]:
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import os
from os.path import join
import shutil

from tqdm import tqdm   # Progress bar
from IPython.display import Audio, display

import torch
import torchvision
import torch.nn.functional as T
from torchvision import transforms, models
from torch.utils.data import DataLoader, Dataset


random.seed(6)
np.random.seed(6)
torch.manual_seed(6)
torch.cuda.manual_seed(6)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

root_dir = r'C:\Temp'

In [None]:
def allDone():
    display(Audio(url='../../chicken.mp3', autoplay=True))

### Load Labels

In [None]:
train_val_labels = pd.read_csv('../input/train.csv')
train_val_labels.head()

In [None]:
plt.figure(figsize=(3,3))
plt.title('Labels distribution')
sns.countplot(train_val_labels['has_cactus']);

In [None]:
# Вывод сэмплов на экран

def show_sample_images(dataloader, batch_size, images_from_batch=0, denormalize=False, classes=None):
    if denormalize:
        mean = np.array([0.485, 0.456, 0.406])
        std = np.array([0.229, 0.224, 0.225])
    else:
        mean = np.array([0., 0., 0.])
        std = np.array([1., 1., 1.])
    
    if images_from_batch == 0 or images_from_batch > batch_size:
            images_from_batch = batch_size
        
    for images, labels in dataloader:
        plt.figure(figsize=(20, (batch_size // 20 + 1) * 3))

        cols = 12
        rows = batch_size // cols + 1
        for i in range(images_from_batch):
            image = images[i].permute(1, 2, 0).numpy() * std + mean   # Размерность RGB в конец
            plt.subplot(rows, cols, i+1)
            plt.xticks([])
            plt.yticks([])
            plt.grid(False)
            plt.imshow(image.clip(0, 1))
            if classes is not None:
                plt.xlabel(classes[labels[i].numpy()])
        plt.show()
        
        break

### Preparing train and validation datasets

In [None]:
batch_size = 100
# 500 неплохо, попробовать ещё (sched - 25 step)
# 250 - 99,48%
# 50 - 99,08%

train_dir = join(root_dir, 'train')
val_dir = join(root_dir, 'val')

classes = ['No', 'Cactus']

train_transforms = transforms.Compose([
    transforms.Resize(224),
    # transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

val_transforms = transforms.Compose([
    transforms.Resize(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])


train_dataset = torchvision.datasets.ImageFolder(train_dir, train_transforms)
val_dataset = torchvision.datasets.ImageFolder(val_dir, val_transforms)

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, pin_memory=True, num_workers=0)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, pin_memory=True, num_workers=0)


# Протестировали загрузку, узнали размер тензора
for images, labels in train_dataloader:
    print(images.size())
    print(labels.size())
    break

### Let's look at the samples

In [None]:
show_sample_images(train_dataloader, batch_size, 72, denormalize=True)

In [None]:
print(f'Batch size: {batch_size}')
print(f'Train batches: {len(train_dataloader)}, Train samples: {len(train_dataset)}')
print(f'Val batches:   {len(val_dataloader)}, Val samples:    {len(val_dataset)}')

### Define train_model and validate functions

In [None]:
train_batch_loss_history = []
train_batch_accuracy_history = []

train_loss_history = []
train_accuracy_history = []

val_loss_history = []
val_accuracy_history = []

def validate(model, loss, optimizer):
        
    dataloader = val_dataloader
    model.eval()   # Set model to evaluate mode

    sum_loss = 0.
    sum_accuracy = 0.

    for inputs, labels in dataloader:
        inputs = inputs.cuda()
        labels = labels.cuda()

        optimizer.zero_grad()

        with torch.set_grad_enabled(False):
            preds = model(inputs)
            loss_value = loss(preds, labels)
            preds_class = preds.argmax(dim=1)

        sum_loss += loss_value.item()
        sum_accuracy += (preds_class == labels.data).float().mean().cpu().numpy().item()

    val_loss = sum_loss / len(dataloader)
    val_accuracy = sum_accuracy / len(dataloader)

    val_loss_history.append(val_loss)
    val_accuracy_history.append(val_accuracy)
    
    print(f'Validation accuracy {val_accuracy * 100:.2f} %, loss {val_loss:.4f}')
#     if val_accuracy >= 0.99:
#         allDone()
#         input()

    model.train()  # Вернули как было


def train_model(model, loss, optimizer, scheduler, num_epochs):
        
    for epoch in range(num_epochs):
        print(f'\nEpoch {epoch}/{num_epochs-1}: \n', end='')

        dataloader = train_dataloader
        model.train()  # Set model to training mode

        sum_loss = 0.
        sum_accuracy = 0.

        # Прогон по батчам
        for inputs, labels in tqdm(dataloader):
            inputs = inputs.cuda()
            labels = labels.cuda()

            optimizer.zero_grad()

            # forward and backward
            with torch.set_grad_enabled(True):
                preds = model(inputs)
                loss_value = loss(preds, labels)
                preds_class = preds.argmax(dim=1)

                loss_value.backward()
                optimizer.step()
                # scheduler.step()

            batch_loss = loss_value.item()
            batch_accuracy = (preds_class == labels.data).float().mean().cpu().numpy().item()

            sum_loss += batch_loss
            sum_accuracy += batch_accuracy
            
            train_batch_loss_history.append(batch_loss)
            train_batch_accuracy_history.append(batch_accuracy)
            #print(f'\r----- {phase}, batch accuracy {train_batch_accuracy * 100:.2f} %, batch loss {train_batch_loss:.4f}')        
            #validate(model, loss, optimizer)
            
        epoch_loss = sum_loss / len(dataloader)
        epoch_acc = sum_accuracy / len(dataloader)

        train_loss_history.append(epoch_loss)
        train_accuracy_history.append(epoch_acc)
        scheduler.step()

        # Валидация
        print('\n End epoch: ', end='')
        validate(model, loss, optimizer)
        
    return model

### Define Model, Loss and Optimizer

In [None]:
model = models.resnet18(pretrained=False)

#model = CactusNet()
#model = models.mobilenet_v2(pretrained=True)

# Disable grad for all conv layers - замораживаем слои ResNet
# for param in model.parameters():
#     param.requires_grad = False

# Заменяем последний слой на наш (512 входов, 2 выхода) (пересоздаём, поэтому будет разморожен)
#model.fc = torch.nn.Linear(model.fc.in_features, 2)
#model.classifier[1] = torch.nn.Linear(model.classifier[1].in_features, 2)

model = model.cuda()

loss = torch.nn.CrossEntropyLoss() #weight=torch.FloatTensor([1, 1]).cuda())
optimizer = torch.optim.Adam(model.parameters())#, lr=1.0e-3, weight_decay=0.01, amsgrad=True)

#scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=25, gamma=0.1)
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[20, 40], gamma=0.1)  

In [None]:
print(f'Batch size: {batch_size}\nBatches: {len(train_dataloader)}\nAll elements: {len(train_dataset)}')

### Let's train!

In [None]:
# Тренировочка
epochs = int(np.sqrt(batch_size)) + 20  # Чтобы автоматически выбиралось

train_model(model, loss, optimizer, scheduler, num_epochs=epochs)
allDone()

### History graphs

In [None]:
plt.figure(figsize=(20,10))
    
plt.subplot(1, 3, 1)
plt.plot(train_batch_loss_history, label='Train Batch Loss')
plt.plot(train_batch_accuracy_history, label='Train Batch Accuracy')
plt.legend();

plt.subplot(1, 3, 2)
plt.plot(train_accuracy_history, label='Train accuracy')
plt.plot(val_accuracy_history, label='Val accuracy')
plt.legend();
    
plt.subplot(1, 3, 3)
plt.plot(train_loss_history, label='Train Loss')
plt.plot(val_loss_history, label='Val Loss')
plt.legend();

### Preparing test dataset and dataloader

In [None]:
# Создадим новую папку и перенесём туда тестовый датасет

test_dir = join(root_dir, 'test')

#shutil.copytree(os.path.join(data_root, 'test'), os.path.join(test_dir, 'unknown'));

In [None]:
test_transforms = val_transforms

test_dataset = torchvision.datasets.ImageFolder(test_dir, test_transforms)

test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, pin_memory=True, num_workers=0)

# Протестировали загрузку, узнали размер тензора
for images, labels in test_dataloader:
    print(images.size())
    print(labels.size())
    break

In [None]:
show_sample_images(test_dataloader, batch_size, 12, denormalize=True, classes=['Unknown'])

### Predict on test set

In [None]:
model.eval()

test_predictions = []

i = 1
for images, labels in test_dataloader:
    images = images.cuda()
    with torch.set_grad_enabled(False):
        preds = model(images)
    test_predictions.append(T.softmax(preds, dim=1)[:,1].data.cpu().numpy())
    print(f'\r{i}/{len(test_dataloader)}', end='')
    i += 1
    
test_predictions = np.concatenate(test_predictions)  # Соединили предсказания по батчам в единый массив

# Округлили до 0 и 1
test_predictions = (test_predictions >= 0.5).astype('int')

In [None]:
# Получаем список файлов из каталога test/unknown. 
# os.walk возвращает кортеж (каталог, имена подкаталогов, список файлов). Нам нужен список файлов [2]
test_files = next(iter(os.walk(join(test_dir, 'unknown'))))[2]

print(test_files[:10])

### Make submission

In [None]:
submission_df = pd.DataFrame.from_dict({'id': test_files, 'has_cactus': test_predictions})
submission_df.set_index('id', inplace=True)
submission_df.head(25)

In [None]:
submission_df.to_csv('submission.csv')