In [None]:
# Отключим автоскролл
%%javascript
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

<IPython.core.display.Javascript object>

In [None]:
import pickle
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from PIL import Image
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, Dataset
from torchvision import models, transforms
from tqdm.autonotebook import tqdm

In [None]:
# Use the GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs (slowly) on machines without CUDA instead of failing
# at the first `.to(DEVICE)` call.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

DATA MODULE


In [None]:
def sort_files(files_list):
    """Group file paths by the name of their parent directory.

    Args:
        files_list: iterable of path-like objects exposing ``.parent.name``
            (e.g. ``pathlib.Path``).

    Returns:
        dict mapping each parent-directory name to the list of files found
        under it, in the order they appear in ``files_list``.
    """
    files_dict = {}

    for file in files_list:
        files_dict.setdefault(file.parent.name, []).append(file)

    return files_dict


# Alias kept so that code referring to this helper as `dict_from_list`
# keeps working; without it those calls raise NameError on a fresh kernel.
dict_from_list = sort_files

In [None]:
def label_freq(files):
    """Count how many files fall under each label.

    The label of a file is the name of its parent directory.

    Args:
        files: iterable of path-like objects exposing ``.parent.name``.

    Returns:
        dict mapping label -> number of files, keyed in first-seen order.
    """
    counts = {}

    for path in files:
        label = path.parent.name
        counts[label] = counts.get(label, 0) + 1

    return counts

In [None]:
# undersampling
def remove_images(files, sampler):
    """Randomly under-sample each class down to at most its target size.

    Args:
        files: list of path-like objects; the class of a file is the name of
            its parent directory.
        sampler: dict mapping label -> maximum number of files to keep.

    Returns:
        A new list with, for every label in ``sampler``, a random subset of
        that label's files of size ``min(available, sampler[label])``.
        Labels that are not keys of ``sampler`` are not included.

    Notes:
        Uses the global ``np.random`` state; seed it beforehand for a
        reproducible result.
    """
    # Group with `sort_files`, the grouping helper defined in this notebook.
    files_dict = sort_files(files)

    new_files = []
    for label, target in sampler.items():
        # A label with no files contributes nothing instead of raising KeyError.
        candidates = files_dict.get(label, [])
        # Shuffle the indices and keep at most `target` of them. A class that is
        # already at or below its target is kept whole rather than dropped.
        keep = np.random.permutation(len(candidates))[:target]
        new_files.extend(candidates[index] for index in keep)

    return new_files

In [None]:
# oversampling
def add_images(files, new_images, sampler):
    """Over-sample under-represented classes up to their target size.

    For every label in ``sampler`` that has fewer than ``sampler[label]``
    files, all of that label's extra images are appended. If that is still
    not enough, random duplicates (drawn with replacement from the originals
    plus the extras) are appended until the target is reached.

    Args:
        files: list of original files; the class is the parent directory name.
        new_images: list of additional files, grouped by the same convention.
        sampler: dict mapping label -> minimum desired number of files.

    Returns:
        A new list: the original files followed by the added ones.
        ``files`` itself is not modified.

    Notes:
        Uses the global ``np.random`` state; seed it beforehand for a
        reproducible result.
    """
    # Group with `sort_files`, the grouping helper defined in this notebook.
    files_dict = sort_files(files)
    new_images_dict = sort_files(new_images)

    new_files = list(files)

    for label, target in sampler.items():
        # `.get` so that a label with no originals or no extras does not raise KeyError.
        originals = files_dict.get(label, [])
        if len(originals) >= target:
            continue

        extras = new_images_dict.get(label, [])
        all_images = originals + extras
        new_files += extras

        to_add = target - len(all_images)
        # Nothing to draw from when the pool is empty (randint(0, 0) would raise).
        if to_add > 0 and all_images:
            indexes = np.random.randint(0, len(all_images), to_add)
            new_files.extend(all_images[index] for index in indexes)

    return new_files

In [None]:
# augmentation
def resample(files, new_images, sampler):
    """Rebalance ``files`` according to ``sampler``.

    Runs ``add_images`` first and then ``remove_images`` on its result,
    passing the same ``sampler`` to both.

    Args:
        files: list of original files.
        new_images: list of additional files handed to ``add_images``.
        sampler: dict mapping label -> target number of files.

    Returns:
        The list produced by ``remove_images``.
    """
    oversampled = add_images(files, new_images, sampler)
    return remove_images(oversampled, sampler)

In [None]:
# Custom splitter: the split is done per class so that a class with very few
# examples cannot end up entirely in the validation set.
def train_test_split(files, train_size):
    """Split ``files`` into train and test parts independently for each class.

    Args:
        files: list of path-like objects; the class of a file is the name of
            its parent directory.
        train_size: fraction (0..1) of every class that goes to the train part.

    Returns:
        ``(train_files, test_files)``: two lists that together contain every
        input file exactly once.

    Notes:
        Uses the global ``np.random`` state; seed it beforehand for a
        reproducible split.
    """
    # Group with `sort_files`, the grouping helper defined in this notebook.
    files_dict = sort_files(files)

    train_files, test_files = [], []

    for label_files in files_dict.values():
        n_files = len(label_files)
        n_train = int(train_size * n_files)
        # int() truncates, so a tiny class could get zero training samples and
        # land wholly in the test part; keep at least one sample for training.
        if n_train == 0 and train_size > 0:
            n_train = 1

        indexes = np.random.permutation(n_files)
        train_files.extend(label_files[index] for index in indexes[:n_train])
        test_files.extend(label_files[index] for index in indexes[n_train:])

    return train_files, test_files

In [None]:
class SimpsonsDataset(Dataset):
    """Image dataset whose labels are the parent-directory names of the files.

    Args:
        files: iterable of image paths; they are stored sorted.
        mode: ``"test"`` makes ``__getitem__`` return only the image tensor;
            any other value makes it return ``(image, encoded_label)``.

    Side effects:
        For every mode other than ``"test"`` a ``LabelEncoder`` is fitted on
        this dataset's labels and written to ``label_encoder.pkl`` in the
        current working directory, overwriting any previous file.
    """

    # Size every image is resized to before conversion to a tensor.
    IMAGE_SIZE = (224, 224)

    def __init__(self, files, mode):
        super().__init__()
        self.files = sorted(files)
        self.labels = [path.parent.name for path in self.files]
        self.mode = mode
        self.len_ = len(self.files)
        self.label_encoder = LabelEncoder()
        # Built once here instead of on every __getitem__ call. The mean/std
        # are the per-channel statistics torchvision documents for its
        # ImageNet-pretrained models.
        self.transform = transforms.Compose(
            [transforms.ToTensor(), transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])]
        )

        if self.mode != "test":
            self.label_encoder.fit(self.labels)
            with open("label_encoder.pkl", "wb") as pickle_file:
                pickle.dump(self.label_encoder, pickle_file)

    def __len__(self):
        """Return the number of files in the dataset."""
        return self.len_

    def __getitem__(self, index):
        """Load, resize and normalise the image at ``index``.

        Returns:
            The image tensor in ``"test"`` mode, otherwise a tuple
            ``(image_tensor, label_index)`` with ``label_index`` a Python int.
        """
        # The context manager closes the file handle; convert("RGB") guarantees
        # three channels so grayscale / RGBA / palette files match the
        # three-channel Normalize above instead of failing.
        with Image.open(self.files[index]) as raw_image:
            image = raw_image.convert("RGB").resize(self.IMAGE_SIZE)

        x = np.array(image)
        # Scale to [0, 1] by hand: ToTensor does not rescale float arrays.
        x = np.array(x / 255, dtype="float32")
        x = self.transform(x)

        if self.mode == "test":
            return x

        label = self.labels[index]
        encoded_label = self.label_encoder.transform([label])
        y = encoded_label.item()

        return x, y

ML MODULE

In [None]:
def train_epoch(model, criterion, optimizer, train_loader):
    """Train ``model`` for one pass over ``train_loader``.

    Args:
        model: network to optimise; switched to train mode.
        criterion: loss called as ``criterion(logits, targets)``.
        optimizer: optimiser stepping the model parameters.
        train_loader: iterable yielding ``(x_batch, y_batch)`` tensors.

    Returns:
        ``(epoch_loss, epoch_acc)`` as Python floats: the sample-weighted mean
        loss and the fraction of correctly classified samples.

    Raises:
        ValueError: if ``train_loader`` yields no samples.
    """
    model.train()

    epoch_loss = 0.0
    epoch_corrects = 0
    processed_data = 0

    for x_batch, y_batch in tqdm(train_loader, leave=False, desc="batch train:"):
        x_batch = x_batch.to(DEVICE)
        y_batch = y_batch.to(DEVICE)
        optimizer.zero_grad()

        y_logits = model(x_batch)
        loss = criterion(y_logits, y_batch)
        loss.backward()
        optimizer.step()

        n_samples = x_batch.size(0)
        # criterion returns a batch mean; weight it by the batch size so the
        # final division gives a per-sample average.
        epoch_loss += loss.item() * n_samples
        # .item() keeps the counter a plain int, so no tensor is accumulated
        # on the device and the result needs no .cpu() conversion.
        epoch_corrects += (torch.argmax(y_logits, 1) == y_batch).sum().item()
        processed_data += n_samples

    if processed_data == 0:
        raise ValueError("train_loader produced no samples")

    epoch_loss /= processed_data
    epoch_acc = epoch_corrects / processed_data
    return epoch_loss, epoch_acc

In [None]:
def val_epoch(model, criterion, val_loader):
    """Evaluate ``model`` on ``val_loader`` without updating it.

    Args:
        model: network to evaluate; switched to eval mode.
        criterion: loss called as ``criterion(logits, targets)``.
        val_loader: iterable yielding ``(x_batch, y_batch)`` tensors.

    Returns:
        ``(val_loss, val_acc)`` as Python floats: the sample-weighted mean loss
        and the fraction of correctly classified samples.

    Raises:
        ValueError: if ``val_loader`` yields no samples.
    """
    model.eval()

    val_loss = 0.0
    val_corrects = 0
    processed_data = 0

    with torch.no_grad():
        for x_batch, y_batch in tqdm(val_loader, leave=False, desc="batch val:"):
            x_batch = x_batch.to(DEVICE)
            y_batch = y_batch.to(DEVICE)

            y_logits = model(x_batch)
            loss = criterion(y_logits, y_batch)

            n_samples = x_batch.size(0)
            val_loss += loss.item() * n_samples
            # Plain int counter: the returned accuracy is then a float rather
            # than a tensor living on DEVICE.
            val_corrects += (torch.argmax(y_logits, 1) == y_batch).sum().item()
            processed_data += n_samples

    if processed_data == 0:
        raise ValueError("val_loader produced no samples")

    val_loss /= processed_data
    val_acc = val_corrects / processed_data
    return val_loss, val_acc

In [None]:
def train(model, criterion, optimizer, epochs, batch_size):
    """Run the train/validate loop for ``epochs`` epochs.

    Reads the module-level ``train_dataset`` and ``val_dataset``.

    Args:
        model: network to train.
        criterion: loss function passed to the epoch helpers.
        optimizer: optimiser passed to ``train_epoch``.
        epochs: number of passes over the training data.
        batch_size: batch size for both loaders.

    Returns:
        list with one ``(train_loss, train_acc, val_loss, val_acc)`` tuple per
        epoch.
    """
    log = []
    log_template = (
        "\nEpoch {ep:03d} train_loss: {t_loss:0.4f} val_loss {v_loss:0.4f} train_acc {t_acc:0.4f} val_acc {v_acc:0.4f}"
    )

    # Training drops the last incomplete batch so every step sees a full batch.
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
    # Validation must see every sample: dropping the tail batch would silently
    # exclude images from the reported metrics.
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, drop_last=False)

    for epoch in tqdm(range(epochs), desc="epoch:"):
        train_loss, train_acc = train_epoch(model, criterion, optimizer, train_loader)
        val_loss, val_acc = val_epoch(model, criterion, val_loader)
        log.append((train_loss, train_acc, val_loss, val_acc))
        tqdm.write(
            log_template.format(ep=epoch + 1, t_loss=train_loss, v_loss=val_loss, t_acc=train_acc, v_acc=val_acc)
        )

    return log

In [None]:
def predict_proba(model, test_loader):
    """Return class probabilities for every sample yielded by ``test_loader``.

    Args:
        model: network producing one row of logits per input sample; it is
            switched to eval mode.
        test_loader: iterable yielding input batches (no labels).

    Returns:
        numpy array holding the softmax over the last dimension of the
        concatenated logits.
    """
    model.eval()
    collected = []

    with torch.no_grad():
        for batch in tqdm(test_loader):
            batch = batch.to(DEVICE)
            # Move the logits back to the CPU so they can be concatenated and
            # converted to numpy afterwards.
            collected.append(model(batch).cpu())

    stacked = torch.cat(collected)
    return nn.functional.softmax(stacked, dim=-1).numpy()

In [None]:
def plot_loss(train_loss, val_loss):
    """Plot the training and validation loss curves on one figure.

    Args:
        train_loss: sequence of per-epoch training losses.
        val_loss: sequence of per-epoch validation losses.
    """
    plt.figure(figsize=(15, 9))
    plt.plot(train_loss, label="train_loss")
    plt.plot(val_loss, label="val_loss")
    plt.xlabel("epochs")
    plt.ylabel("loss")
    # Without a legend the `label=` values above are never drawn and the two
    # curves cannot be told apart.
    plt.legend()
    plt.show()

In [None]:
def create_predictions_file(model, filename):
    """Predict labels for the test set and write them to a CSV file.

    Reads the module-level ``test_dataset`` and the ``label_encoder.pkl`` file
    from the current working directory.

    Args:
        model: trained network passed to ``predict_proba``.
        filename: destination path of the CSV; it gets the columns ``Id``
            (file name) and ``Expected`` (decoded label).
    """
    test_loader = DataLoader(test_dataset, shuffle=False, batch_size=64)

    # Open through a context manager so the file handle is closed.
    # NOTE: pickle can execute arbitrary code while loading; only load an
    # encoder file that this notebook produced itself.
    with open("label_encoder.pkl", "rb") as pickle_file:
        label_encoder = pickle.load(pickle_file)

    y_prob = predict_proba(model, test_loader)
    y_pred = label_encoder.inverse_transform(np.argmax(y_prob, axis=1))
    test_filenames = [path.name for path in test_dataset.files]
    predictions_df = pd.DataFrame({"Id": test_filenames, "Expected": y_pred})
    predictions_df.to_csv(filename, index=False)

DATA PREPARATION

In [None]:
!unzip -q /content/drive/My\ Drive/journey-springfield.zip
!unzip -q /content/drive/My\ Drive/horizontal_flip_images.zip

In [None]:
# Dataset locations, relative to the current working directory.
train_dir = Path("train/simpsons_dataset")
test_dir = Path("testset/testset")
new_images_dir = Path("horizontal_flip_images")

# Recursively collect every *.jpg under each directory, sorted by path.
all_train_images_files = sorted(train_dir.rglob("*.jpg"))
test_images_files = sorted(test_dir.rglob("*.jpg"))
new_images_files = sorted(new_images_dir.rglob("*.jpg"))

# Class names are the parent-directory names of the training images;
# np.unique returns them sorted and de-duplicated.
labels = np.unique([image_path.parent.name for image_path in all_train_images_files])

In [None]:
# Per-class image counts of the original training data.
original_sampler = label_freq(all_train_images_files)
# Every class targeted at exactly 500 images.
uniform_sampler = dict.fromkeys(labels, 500)
# Keep each class at its original size, but raise small classes to at least 300 images.
sampler_1 = {label: max(original_sampler[label], 300) for label in labels}

In [72]:
# Build the validation set from the original (non-augmented) data.
# Seed NumPy first: the splitter and the resampling helpers draw from the
# global np.random state, so without a seed every run yields a different
# validation set and results cannot be reproduced or compared.
SEED = 42
np.random.seed(SEED)
_, val_images_files = train_test_split(all_train_images_files, train_size=0.8)

In [None]:
# augmentation: rebalance the training files using the sampler_1 targets
all_train_images_files_resampled = resample(
    files=all_train_images_files,
    new_images=new_images_files,
    sampler=sampler_1,
)

In [None]:
# Build the training set from the augmented data.
# The validation set is not taken from the augmented data because the class
# distributions of the original and the test datasets are most likely the same.
# Membership is tested against a set: looking every file up in the
# validation *list* is quadratic over tens of thousands of paths.
# NOTE(review): filtering is by exact path, so extra images stored under a
# different path (e.g. the flipped copies) are not excluded even if they are
# derived from validation images - confirm whether that is intended.
val_images_lookup = set(val_images_files)
train_images_files_resampled = [
    image_file for image_file in tqdm(all_train_images_files_resampled) if image_file not in val_images_lookup
]

  0%|          | 0/26202 [00:00<?, ?it/s]

In [None]:
# One dataset per split; only the "test" dataset returns images without labels.
train_dataset = SimpsonsDataset(files=train_images_files_resampled, mode="train")
val_dataset = SimpsonsDataset(files=val_images_files, mode="val")
test_dataset = SimpsonsDataset(files=test_images_files, mode="test")

SIMPLE CNN WITH BATCHNORM - 93.3%

In [None]:
class SimpleCnn(nn.Module):
    """Five-stage convolutional classifier.

    Every stage is Conv2d(3x3) -> ReLU -> MaxPool2d(2) -> BatchNorm2d, with
    channel widths 3 -> 8 -> 16 -> 32 -> 64 -> 96, followed by a single linear
    layer. The linear layer expects 96 * 5 * 5 features, which is what the
    five stages produce for a 224x224 input.

    Args:
        n_classes: number of output logits.
    """

    @staticmethod
    def _conv_block(in_channels, out_channels):
        """Build one Conv -> ReLU -> MaxPool -> BatchNorm stage."""
        return nn.Sequential(
            nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=3),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
            nn.BatchNorm2d(out_channels),
        )

    def __init__(self, n_classes):
        super().__init__()
        # Attribute names and creation order are kept so that state_dict keys
        # and parameter initialisation stay the same.
        self.conv1 = self._conv_block(3, 8)
        self.conv2 = self._conv_block(8, 16)
        self.conv3 = self._conv_block(16, 32)
        self.conv4 = self._conv_block(32, 64)
        self.conv5 = self._conv_block(64, 96)

        self.out = nn.Sequential(
            nn.Linear(96 * 5 * 5, n_classes),
        )

    def forward(self, x):
        """Return the logits for a batch of images."""
        for stage in (self.conv1, self.conv2, self.conv3, self.conv4, self.conv5):
            x = stage(x)

        # Flatten everything except the batch dimension for the linear head.
        flat = x.view(x.shape[0], -1)
        return self.out(flat)

In [None]:
# Model, loss and optimiser for the from-scratch CNN experiment.
simple_cnn = SimpleCnn(n_classes=len(labels)).to(device=DEVICE)
loss_func = nn.CrossEntropyLoss()
optim_func = torch.optim.AdamW(params=simple_cnn.parameters(), lr=1e-4)

In [None]:
# Train the simple CNN; the returned log holds one metrics tuple per epoch.
simple_cnn_log = train(
    model=simple_cnn,
    criterion=loss_func,
    optimizer=optim_func,
    epochs=5,
    batch_size=4,
)

In [None]:
# Persist the trained weights (state_dict only) next to the notebook.
torch.save(obj=simple_cnn.state_dict(), f="simple_cnn_resampled_e5_b4.pth")

In [None]:
# Write the simple CNN's test-set predictions to a CSV file.
create_predictions_file(model=simple_cnn, filename="simple_cnn_resampled_e5_b4.csv")

  0%|          | 0/16 [00:00<?, ?it/s]

RESNET FINE-TUNING - 98.5%

In [None]:
# Load an ImageNet-pretrained ResNet-50.
# Newer torchvision releases deprecate `pretrained=True` in favour of the
# `weights=` argument; use it when the weights enum exists and fall back to
# the legacy flag on older releases. IMAGENET1K_V1 is the checkpoint that
# `pretrained=True` refers to.
_resnet50_weights = getattr(models, "ResNet50_Weights", None)
if _resnet50_weights is not None:
    resnet_model = models.resnet50(weights=_resnet50_weights.IMAGENET1K_V1)
else:
    resnet_model = models.resnet50(pretrained=True)

In [None]:
# Freeze every parameter of the network, then unfreeze layer4 so that only it
# (and the freshly created classification head below) gets fine-tuned.
resnet_model.requires_grad_(False)
resnet_model.layer4.requires_grad_(True)
# Replace the classifier with a new linear head sized for our classes;
# a newly created layer is trainable by default.
resnet_model.fc = nn.Linear(in_features=2048, out_features=len(labels))

In [None]:
# Move the fine-tuning model to the compute device.
resnet_model = resnet_model.to(device=DEVICE)

In [None]:
loss_func = nn.CrossEntropyLoss()
# Optimise only the parts that were left trainable: layer4 and the new fc head.
trainable_params = [*resnet_model.layer4.parameters(), *resnet_model.fc.parameters()]
optim_func = torch.optim.AdamW(params=trainable_params, lr=1e-5)

In [None]:
# This cell was run twice:
# first with lr=1e-4, then the training and validation samples were refreshed
# to train on new data (in the spirit of cross-validation folds);
# the second time lr was set to 1e-5 so that the model would not overfit.
# NOTE(review): if the split was redrawn between the two runs, second-run
# validation images may have been trained on during the first run, which
# would make val_acc optimistic - confirm.
resnet_model_log = train(
    model=resnet_model,
    criterion=loss_func,
    optimizer=optim_func,
    epochs=1,
    batch_size=4,
)

epoch::   0%|          | 0/1 [00:00<?, ?it/s]

batch train::   0%|          | 0/5400 [00:00<?, ?it/s]

batch val::   0%|          | 0/1051 [00:00<?, ?it/s]


Epoch 001 train_loss: 0.0830 val_loss 0.0397 train_acc 0.9817 val_acc 0.9914


In [None]:
# Persist the fine-tuned weights (state_dict only) next to the notebook.
torch.save(obj=resnet_model.state_dict(), f="resnet_resampled_e2_b4.pth")

In [None]:
# Write the fine-tuned ResNet's test-set predictions to a CSV file.
create_predictions_file(model=resnet_model, filename="drive/MyDrive/resnet_resampled_e2_b4.csv")

  0%|          | 0/16 [00:00<?, ?it/s]