In [None]:
!pip install -q efficientnet_pytorch
!pip install -q albumentations==0.5.2

In [None]:
# NOTE(review): this is a bare string literal, not a shell command. Nothing is executed here;
# the cell only echoes the text. Prefix it with `!` to actually run it, or delete the cell.
'/opt/conda/bin/python3.7 -m pip install --upgrade pip'

In [None]:
!pip install opencv-python-headless==4.1.2.30
'/opt/conda/bin/python3.7 -m pip install --upgrade pip'

In [None]:
from efficientnet_pytorch import EfficientNet
from albumentations.pytorch import ToTensorV2
import albumentations as A
import os
import torch
import pandas as pd
import numpy as np
import random
import torch.nn as nn
import matplotlib.pyplot as plt

from glob import glob
import torchvision
from torch.utils.data import Dataset
import time
from tqdm.notebook import tqdm
#from tqdm import tqdm
from sklearn import metrics
import cv2
import gc
import torch.nn.functional as F

SEED = 42

def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices), exports
    ``PYTHONHASHSEED`` and configures cuDNN for deterministic behaviour.

    Args:
        seed: integer seed applied to all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Also covers multi-GPU runs; a no-op when CUDA is not available.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick different kernels from run to run,
    # which undoes the determinism requested above, so it must be off.
    torch.backends.cudnn.benchmark = False

seed_everything(SEED)

Сборка данных в Датафрейм


In [None]:
data_dir = '../input/alaska2-image-steganalysis'
sample_size = 15000  # images taken per folder (small value for testing); can be set to 75000 if desired
val_size = int(sample_size*0.2)

train_fn, val_fn = [], []
train_labels, val_labels = [], []

# The position of a folder in this list is its class label: Cover -> 0, the rest -> 1, 2, 3.
folder_names = ['Cover/','JMiPOD/', 'JUNIWARD/', 'UERD/']
for label, folder in enumerate(folder_names):
    # Sorted listing, truncated to the first `sample_size` files of the folder.
    train_filenames = sorted(glob(f"{data_dir}/{folder}/*.jpg"))[:sample_size]
    # np.random.shuffle(train_filenames)  # can be enabled later to shuffle the files

    # The first `val_size` files go to validation, the remainder to training.
    val_part = train_filenames[:val_size]
    train_part = train_filenames[val_size:]

    train_fn.extend(train_part)
    train_labels.extend(np.full(len(train_part), label, dtype=float))

    val_fn.extend(val_part)
    val_labels.extend(np.full(len(val_part), label, dtype=float))

assert len(train_labels) == len(train_fn), "неверное заполнение labels"
assert len(val_labels) == len(val_fn), "неверное заполнение labels"


def _to_label_frame(filenames, labels):
    """Build a two-column frame (ImageFileName, Label) with integer labels."""
    frame = pd.DataFrame({'ImageFileName': filenames, 'Label': labels}, columns=['ImageFileName', 'Label'])
    frame['Label'] = frame['Label'].astype(int)
    return frame


train_df = _to_label_frame(train_fn, train_labels)
val_df = _to_label_frame(val_fn, val_labels)

print(train_df.head())
print(val_df.head())

Класс преобразования фото в тензор

In [None]:
class Alaska2_Dataset(Dataset):
    """Dataset that reads images from disk given a frame of (file name, label) rows.

    Args:
        df: DataFrame whose rows unpack into ``(image_path, label)``, i.e. exactly two
            columns in that order.
        augmentations: optional callable invoked as ``augmentations(image=rgb_array)``.
            Its return value is handed back to the caller unchanged.
    """

    def __init__(self, df, augmentations=None):

        self.data = df
        self.augment = augmentations

    def __len__(self):
        """Number of rows (images) in the underlying frame."""
        return len(self.data)

    def get_labels(self):
        """Return the labels (second column of the frame) as a plain list, in row order."""
        return self.data.iloc[:, 1].tolist()

    def __getitem__(self, idx):
        """Load the ``idx``-th image (positional index) and return ``(image, label)``.

        Without augmentations ``image`` is an RGB array; with augmentations it is whatever
        the augmentation callable returns.

        Raises:
            FileNotFoundError: if OpenCV cannot read the file.
        """
        # Positional lookup: stays correct even if the frame was filtered or shuffled
        # and no longer has a 0..n-1 index.
        fn, label = self.data.iloc[idx]
        im = cv2.imread(fn)
        if im is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise FileNotFoundError(f"cv2.imread could not read image: {fn}")
        # BGR -> RGB as a real contiguous array (slicing with ::-1 yields a negative-stride view).
        im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
        if self.augment:
            im = self.augment(image=im)
        return im, label


img_size = 512


def _to_tensor_steps():
    """Final steps shared by both pipelines: scale by 1/255 to float, then convert to a tensor."""
    return [A.ToFloat(max_value=255), ToTensorV2()]


# Training pipeline: resize, random flips, random JPEG re-compression, tensor conversion.
# NOTE(review): re-compressing JPEGs may weaken or erase the steganographic signal the model
# is supposed to detect — confirm this augmentation is intended for this task.
AUGMENTATIONS_TRAIN = A.Compose(
    [
        A.Resize(img_size, img_size, p=1.0),
        A.VerticalFlip(p=0.5),
        A.HorizontalFlip(p=0.5),
        A.JpegCompression(quality_lower=75, quality_upper=100, p=0.5),
        *_to_tensor_steps(),
    ],
    p=1.0,
)


# Evaluation pipeline: resize and tensor conversion only, with no random transforms.
AUGMENTATIONS_TEST = A.Compose(
    [
        A.Resize(img_size, img_size, p=1.0),
        *_to_tensor_steps(),
    ],
    p=1.0,
)

Создание датасетов (Dataset) для обучения и валидации

In [None]:
# Training data goes through the augmenting pipeline, validation data through the plain one.
train_dataset = Alaska2_Dataset(df=train_df, augmentations=AUGMENTATIONS_TRAIN)
valid_dataset = Alaska2_Dataset(df=val_df, augmentations=AUGMENTATIONS_TEST)

Проверка фото

In [None]:
# Sanity check: pull one validation sample and display it together with its label.
item, label = valid_dataset[50]
# The dataset returns the augmentation output dict; the tensor lives under 'image'.
# permute(1, 2, 0) moves the first axis last, which is the layout matplotlib expects for images.
image = item['image'].permute(1, 2, 0).cpu().numpy()
fig, ax = plt.subplots()
ax.imshow(image)
ax.set_title(f"valid_dataset[50], label={label}")
ax.axis('off');

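Далее идёт создание модели `model`:

1.   Предобученная сеть EfficientNet используется для извлечения признаков.
2.   Признаки усредняются по пространству и подаются на линейный слой с 4 выходами (по числу классов).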



> пример модели efficientnet-b2:


In [None]:
class Net(nn.Module):
    """EfficientNet backbone followed by a linear classification head.

    Args:
        model_name: name of the pretrained EfficientNet variant to load.
        num_classes: number of output logits.
    """

    def __init__(self, model_name='efficientnet-b2', num_classes=4):
        super().__init__()
        self.model = EfficientNet.from_pretrained(model_name)
        # The channel count of the backbone's feature map depends on the variant (it is not
        # 1280 for every model), so read it from the backbone's own classifier layer
        # instead of hard-coding it.
        feature_dim = self.model._fc.in_features
        self.dense_output = nn.Linear(feature_dim, num_classes)

    def forward(self, x):
        """Return class logits of shape (batch, num_classes) for an image batch ``x``."""
        feat = self.model.extract_features(x)
        # Global average pooling over the spatial axes, then flatten to (batch, channels).
        # The batch axis must stay first: the previous reshape(1280, -1) put features there.
        feat = F.adaptive_avg_pool2d(feat, 1).flatten(1)
        return self.dense_output(feat)

In [None]:
batch_size = 8 # hyperparameter
num_workers = 8 # hyperparameter

# Training batches are reshuffled every epoch.
train_loader = torch.utils.data.DataLoader(train_dataset,
                                           batch_size=batch_size,
                                           num_workers=num_workers,
                                           shuffle=True)

# Validation keeps a fixed order and uses a batch twice as large as training.
valid_loader = torch.utils.data.DataLoader(valid_dataset,
                                           batch_size=batch_size*2,
                                           num_workers=num_workers,
                                           shuffle=False)

# Use the GPU when one is present; otherwise fall back to CPU instead of crashing.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = Net().to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4) # the learning rate is worth tuning
criterion = torch.nn.CrossEntropyLoss()

In [None]:
def alaska_weighted_auc(y_true, y_valid):
    """Weighted area under the ROC curve, with the TPR axis split into weighted bands.

    The TPR range is cut at 0.0 / 0.4 / 1.0; the lower band has weight 2 and the upper
    band weight 1. The weighted sum of band areas is divided by the weighted sum of band
    heights, so the result lies between 0 and 1.

    Args:
        y_true: binary ground-truth labels (1 is the positive class).
        y_valid: scores for the positive class.

    Returns:
        The normalized weighted AUC.
    """
    band_edges = [0.0, 0.4, 1.0]
    band_weights = [2, 1]

    fpr, tpr, _ = metrics.roc_curve(y_true, y_valid, pos_label=1)

    # Height of each TPR band, and the best achievable weighted total used for normalization.
    band_heights = np.array(band_edges[1:]) - np.array(band_edges[:-1])
    normalization = np.dot(band_heights, band_weights)

    competition_metric = 0
    for lower, upper, weight in zip(band_edges[:-1], band_edges[1:], band_weights):
        # ROC points strictly inside the current band.
        in_band = (lower < tpr) & (tpr < upper)
        band_fpr = fpr[in_band]
        band_tpr = tpr[in_band]

        # Extend the curve horizontally at the band's top from the last in-band FPR up to FPR = 1.
        tail_fpr = np.linspace(band_fpr[-1], 1, 100)

        x = np.concatenate([band_fpr, tail_fpr])
        y = np.concatenate([band_tpr, [upper] * len(tail_fpr)])
        y = y - lower  # shift so the band starts at y = 0
        competition_metric += metrics.auc(x, y) * weight

    return competition_metric / normalization

Model 1 : Baseline Model Configuration

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

In [None]:
num_epochs = 5
train_loss, val_loss = [], []

for epoch in range(num_epochs):
    print('Epoch {}/{}'.format(epoch, num_epochs - 1))
    print('-' * 10)

    # ---- training pass ----
    model.train()
    running_loss = 0.0
    tk0 = tqdm(train_loader, total=len(train_loader))
    for im, labels in tk0:
        # The dataset yields the augmentation output dict; the image tensor is under "image".
        inputs = im["image"].to(device, dtype=torch.float)
        labels = labels.to(device, dtype=torch.long)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        # criterion returns the mean over the batch; weight it by the batch size so the
        # epoch figure is a true per-sample average even when the last batch is smaller.
        running_loss += loss.item() * inputs.size(0)
        tk0.set_postfix(loss=(loss.item()))

    epoch_loss = running_loss / len(train_loader.dataset)
    train_loss.append(epoch_loss)
    print('Training Loss: {:.8f}'.format(epoch_loss))

    # ---- validation pass ----
    tk1 = tqdm(valid_loader, total=len(valid_loader))
    model.eval()
    running_loss = 0.0
    y, preds = [], []
    with torch.no_grad():
        for (im, labels) in tk1:
            inputs = im["image"].to(device, dtype=torch.float)
            labels = labels.to(device, dtype=torch.long)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            y.extend(labels.cpu().numpy().astype(int))
            preds.extend(F.softmax(outputs, 1).cpu().numpy())
            running_loss += loss.item() * inputs.size(0)
            tk1.set_postfix(loss=(loss.item()))

    epoch_loss = running_loss / len(valid_loader.dataset)
    val_loss.append(epoch_loss)

    preds = np.array(preds)
    y = np.array(y)

    # Multi-class accuracy over the original labels.
    pred_labels = preds.argmax(1)
    acc = (pred_labels == y).mean()*100

    # Binary score for the AUC: total probability of the non-zero classes. The same
    # quantity is used for every sample, so samples predicted as class 0 get a low score
    # (the previous code assigned them the class-0 probability, which inverted their ranking).
    new_preds = preds[:, 1:].sum(1)
    y_binary = (y != 0).astype(int)
    auc_score = alaska_weighted_auc(y_binary, new_preds)
    print(f'Val Loss: {epoch_loss:.3}, Weighted AUC:{auc_score:.3}, Acc: {acc:.3}')

    # NOTE(review): checkpoints are numbered epoch+4 although the loop starts at 0 — this
    # looks like a leftover from a resumed run; confirm the intended numbering.
    torch.save(model.state_dict(),f"epoch_{epoch+4}_val_loss_{epoch_loss:.3}_auc_{auc_score:.3}.pth")

In [None]:
# NOTE(review): this relies on the dataset object exposing a get_labels() method — confirm that
# the Alaska2_Dataset class in use defines it, otherwise this cell raises AttributeError.
train_dataset.get_labels()

In [None]:
# NOTE(review): no definition of run_training is visible in this notebook, and training is
# already performed by the inline epoch loop above. This cell looks like a leftover and will
# raise NameError on a fresh kernel — confirm and remove, or define the function.
run_training()

# **Проверяем модель на тестовых данных**

In [None]:
# Write the submission file: copy the submission template, fill in the predicted labels, save as CSV.
# NOTE(review): neither `sample` nor `pred` is defined anywhere visible in this notebook.
# Presumably `sample` is the competition's sample-submission frame and `pred` holds the
# test-set predictions — confirm; the cells that load it and run inference appear to be missing.
my_sample = sample.copy()
my_sample["Label"] = pred
my_sample.to_csv("my_sample.csv", index=False)
my_sample.head()

In [None]:
#my_sample.to_csv('1_sub.csv', index=False)

In [None]:
#!kaggle competitions submit -c alaska2-image-steganalysis -f 1_sub.csv -m "EfficientNet"
