In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file below it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install -q efficientnet_pytorch
from efficientnet_pytorch import EfficientNet

In [None]:
import torch
import random
from skimage import io
from torch import nn
from torch.utils.data import Dataset,DataLoader
from torch.utils.data.sampler import SequentialSampler, RandomSampler
import sklearn


SEED = 42


def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global RNG and PyTorch (CPU and
    every CUDA device), exports ``PYTHONHASHSEED`` and puts cuDNN into
    deterministic mode.

    Args:
        seed: Integer seed applied to all generators.

    NOTE(review): no call to this function is visible in the notebook; it has
    to be invoked (e.g. ``seed_everything(SEED)``) for the seeding to apply.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Also seed the generators of any additional GPUs, not only the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode lets cuDNN auto-tune and pick possibly different kernels
    # from run to run, which defeats the deterministic flag above.
    torch.backends.cudnn.benchmark = False

In [None]:
from albumentations import (
    HorizontalFlip,Rotate,  RandomRotate90, VerticalFlip,
   Normalize,ToFloat, Compose
)

from albumentations.pytorch import ToTensor

In [None]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import cv2
from torch.utils.data import Dataset
import gc
import torch.nn.functional as F
from tqdm import tqdm
from sklearn import metrics
import torchvision

In [None]:
# Root directory of the ALASKA2 competition data. The cells below read the
# Cover, JMiPOD, JUNIWARD, UERD and Test sub-folders and sample_submission.csv
# relative to this path.
dir_img='../input/alaska2-image-steganalysis'

In [None]:

# File names of the cover (training) and test images, in ascending name order
# and indexed 0..n-1.
train_img_ids = pd.Series(sorted(os.listdir(dir_img + '/Cover')))
test_img_ids = pd.Series(sorted(os.listdir(dir_img + '/Test')))

In [None]:
def _prefixed_sorted(folder, file_names):
    """Prepend ``dir_img + folder`` to every file name and sort the paths ascending."""
    return pd.Series(dir_img + folder + file_names).sort_values(ascending=True)


# One path series per image folder; each stego folder reuses the cover file names.
cover_img_path = _prefixed_sorted('/Cover/', train_img_ids)
JMIPOD_img_path = _prefixed_sorted('/JMiPOD/', train_img_ids)
JUNIWARD_img_path = _prefixed_sorted('/JUNIWARD/', train_img_ids)
UERD_img_path = _prefixed_sorted('/UERD/', train_img_ids)
test_img_path = _prefixed_sorted('/Test/', test_img_ids)
ss = pd.read_csv(f'{dir_img}/sample_submission.csv')

In [None]:
# Show the first four cover images in a 2x2 grid.
f, axs = plt.subplots(nrows=2, ncols=2, figsize=(30, 20))
for k, ax in enumerate(axs.flat):
    # cv2.imread returns channels in BGR order; reverse them to RGB so that
    # matplotlib renders the true colours (same conversion the dataset classes use).
    img = cv2.imread(cover_img_path[k])[:, :, ::-1]
    ax.imshow(img)
    ax.set_title(cover_img_path[k])
plt.suptitle('Samples from Cover Images', fontsize=14)
plt.show()

In [None]:
# For the first three images, show the cover next to its three stego versions.
fig, axs = plt.subplots(nrows=3, ncols=4, figsize=(30, 20))
for i in range(3):
    # cv2.imread returns BGR; reverse the channel axis to RGB for matplotlib.
    cv_img = cv2.imread(cover_img_path[i])[:, :, ::-1]
    uni_img = cv2.imread(JUNIWARD_img_path[i])[:, :, ::-1]
    jpod_img = cv2.imread(JMIPOD_img_path[i])[:, :, ::-1]
    uerd_img = cv2.imread(UERD_img_path[i])[:, :, ::-1]

    axs[i,0].imshow(cv_img)
    axs[i,0].set_title('Cover_IMG'+train_img_ids[i])
    axs[i,1].imshow(uni_img)
    # Title corrected: the images in this column come from the JUNIWARD folder.
    axs[i,1].set_title('JUNIWARD_IMG'+train_img_ids[i])
    axs[i,2].imshow(jpod_img)
    axs[i,2].set_title('JMiPOD_IMG'+train_img_ids[i])
    axs[i,3].imshow(uerd_img)
    axs[i,3].set_title('UERD_IMG'+train_img_ids[i])

In [None]:
! git clone https://github.com/dwgoon/jpegio

In [None]:
!pip install jpegio/.
import jpegio as jio

In [None]:
import numpy as np
# Zero-filled 512x512x3 buffers, one per image variant, to hold DCT coefficients.
coverDCT, stego_juni_DCT, stego_uerd_DCT, stego_jmpd_DCT = (
    np.zeros([512, 512, 3]) for _ in range(4)
)
# Read the JPEG structures of image #1 from the cover folder and each stego folder.
jpeg, stego_juniward, stego_uerd, stego_jmpd = (
    jio.read(paths[1])
    for paths in (cover_img_path, JUNIWARD_img_path, UERD_img_path, JMIPOD_img_path)
)

In [None]:
# Copy the coefficient array of each of the three channels into the matching
# plane of the corresponding buffer.
for dct_buffer, jpeg_struct in (
    (coverDCT, jpeg),
    (stego_juni_DCT, stego_juniward),
    (stego_uerd_DCT, stego_uerd),
    (stego_jmpd_DCT, stego_jmpd),
):
    for channel in range(3):
        dct_buffer[:, :, channel] = jpeg_struct.coef_arrays[channel]



In [None]:
# Coefficient-wise difference between the cover and each stego variant.
DCT_diff1 = coverDCT - stego_juni_DCT
DCT_diff2 = coverDCT - stego_uerd_DCT
DCT_diff3 = coverDCT - stego_jmpd_DCT
# The stego files are not identical to the cover, so the differences are
# expected to contain non-zero entries.
fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(20, 20))
for position, panel_title, dct_diff in (
    (131, 'juniward difference', DCT_diff1),
    (132, 'uerd difference', DCT_diff2),
    (133, 'jmipod difference', DCT_diff3),
):
    # Number of changed coefficients, then the distinct change values.
    print(np.count_nonzero(dct_diff))
    print(np.unique(dct_diff))
    plt.subplot(position)
    plt.title(panel_title)
    plt.imshow(abs(dct_diff))
plt.show()

So there is a clearly visible difference between the images.

*Taken from the notebook of the competition's author, Reni.*

In [None]:
#This code extract YCbCr channels from a jpeg object
def JPEGdecompressYCbCr(jpegStruct):
    """Decompress a JPEG structure into un-rounded spatial-domain channel planes.

    Every 8x8 block of coefficients is multiplied by the quantisation table
    and passed through an inverse 2-D DCT. No level shift, rounding or
    clipping is applied, so the result stays floating point.

    Args:
        jpegStruct: Object exposing ``coef_arrays`` (one 2-D coefficient array
            per channel) and ``quant_tables``. Channel 0 uses table 0, every
            other channel uses table 1.

    Returns:
        Float array of shape ``(H, W, n_channels)`` where ``(H, W)`` is the
        shape of the first coefficient array.
    """
    n_channels = len(jpegStruct.coef_arrays)

    # 8x8 DCT basis: T[u, x] = 0.5 * cos(pi * (2x + 1) * u / 16), with the
    # first (DC) row additionally divided by sqrt(2).
    freq = np.arange(8).reshape(8, 1)
    pos = np.arange(8).reshape(1, 8)
    T = 0.5 * np.cos(np.pi * (2 * pos + 1) * freq / (2 * 8))
    T[0] /= np.sqrt(2)

    height, width = jpegStruct.coef_arrays[0].shape
    imDecompressYCbCr = np.zeros((height, width, n_channels))
    # Only whole 8x8 blocks are processed.
    last_row, last_col = (height // 8) * 8, (width // 8) * 8

    for channel, coefs in enumerate(jpegStruct.coef_arrays):
        quant = jpegStruct.quant_tables[0 if channel == 0 else 1]
        plane = np.zeros((height, width))
        for top in range(0, last_row, 8):
            for left in range(0, last_col, 8):
                block = coefs[top:top + 8, left:left + 8]
                # De-quantise, then apply the inverse transform T^T . X . T.
                plane[top:top + 8, left:left + 8] = np.dot(T.T, np.dot(quant * block, T))
        imDecompressYCbCr[:, :, channel] = plane
    return imDecompressYCbCr


In [None]:
# Reset the four 512x512x3 buffers and re-read the JPEG structures of image #1
# (same statements as the earlier DCT cell).
coverDCT, stego_juni_DCT, stego_uerd_DCT, stego_jmpd_DCT = (
    np.zeros([512, 512, 3]) for _ in range(4)
)
jpeg, stego_juniward, stego_uerd, stego_jmpd = (
    jio.read(paths[1])
    for paths in (cover_img_path, JUNIWARD_img_path, UERD_img_path, JMIPOD_img_path)
)

In [None]:
# Decompress the cover and the three stego structures, in that order.
Y_cover, Y_juniward, Y_uerd, Y_jmpd = (
    JPEGdecompressYCbCr(jpeg_struct)
    for jpeg_struct in (jpeg, stego_juniward, stego_uerd, stego_jmpd)
)

In [None]:
# Difference between the decompressed cover and each decompressed stego variant.
diff1 = Y_cover - Y_juniward
diff2 = Y_cover - Y_uerd
diff3 = Y_cover - Y_jmpd
# The stego files are not identical to the cover, so the differences are
# expected to contain non-zero entries.
fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(20, 20))
# Each panel shows a different channel of its difference (0, 1 and 2).
for position, panel_title, diff, channel in (
    (131, 'juniward difference', diff1, 0),
    (132, 'uerd difference', diff2, 1),
    (133, 'jmipod difference', diff3, 2),
):
    # Count the non-zero entries of this difference itself. The original
    # masked diff2 with the unrelated DCT_diff2 array and so printed the
    # wrong count for the UERD panel.
    print(np.count_nonzero(diff))
    print(np.unique(diff))
    plt.subplot(position)
    plt.title(panel_title)
    plt.imshow(abs(diff[:, :, channel]), cmap='gray')
plt.show()

In [None]:
# Sanity check: subtracting the cover from itself, for comparison with the
# stego differences above.
diff4 = np.subtract(Y_cover, Y_cover)
print(np.count_nonzero(diff4))
print(np.unique(diff4))
plt.figure(figsize=(10,10))
plt.imshow(np.abs(diff4))

From the above we see that there is definitely a large difference between the YCbCr channels of the stego images and those of the cover image.

In [None]:
# Stego sub-folders, and the ten class names: 'Normal' plus every
# algorithm/quality-factor combination.
folder_names = ['JMiPOD/', 'JUNIWARD/', 'UERD/']
class_names = ['Normal'] + [
    f'{algorithm}_{quality}'
    for algorithm in ('JMiPOD', 'JUNIWARD', 'UERD')
    for quality in (75, 90, 95)
]
# Map each class name to its position in class_names.
class_labels = dict(zip(class_names, range(len(class_names))))

In [None]:
# Load the training and validation frames from the add-data input.
train_df, val_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/add-data/alaska2_train_df.csv',
                     '../input/add-data/alaska2_val_df.csv')
)


In [None]:
class Alaska2Dataset(Dataset):
    """Dataset yielding ``(image, label)`` pairs from a two-column frame.

    Args:
        df: DataFrame whose rows unpack into ``(file_name, label)``.
        augmentations: Optional callable invoked as ``augmentations(image=im)``;
            its return value is passed through unchanged.
    """

    def __init__(self, df, augmentations=None):

        self.data = df
        self.augment = augmentations

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``idx``-th sample (by position) as ``(image, label)``.

        Raises:
            FileNotFoundError: If OpenCV cannot read the image file.
        """
        # Positional lookup: works whatever index labels the frame carries
        # (e.g. after sampling without reset_index).
        fn, label = self.data.iloc[idx]
        im = cv2.imread(fn)
        if im is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise FileNotFoundError(f'Could not read image: {fn}')
        # BGR -> RGB. The reversed view has negative strides, which
        # torch.from_numpy rejects, so materialise a contiguous copy.
        im = np.ascontiguousarray(im[:, :, ::-1])
        if self.augment:
            # Apply transformations
            im = self.augment(image=im)
        return im, label

# NOTE(review): img_size is not referenced anywhere else in the visible notebook.
img_size = 512
# Training pipeline: vertical/horizontal flips, 90-degree rotations and a
# Rotate step with limit=20 (each with p=0.5), followed by ToFloat and ToTensor.
# NOTE(review): interpolation=0 / border_mode=0 are presumably
# cv2.INTER_NEAREST / cv2.BORDER_CONSTANT — confirm.
# NOTE(review): p=0.8 is passed to Compose itself, not to a single transform;
# confirm how the installed albumentations version treats ToFloat/ToTensor
# when the pipeline as a whole is skipped.
AUGMENTATIONS_TRAIN = Compose([
    VerticalFlip(p=0.5),
    HorizontalFlip(p=0.5),
    RandomRotate90(p=0.5),
    Rotate(limit=20, interpolation=0, border_mode=0, value=None, mask_value=None, always_apply=False, p=0.5),
    
    ToFloat(max_value=255),
    ToTensor()
], p=0.8)


# Validation/test pipeline: only ToFloat and ToTensor, always applied (p=1).
AUGMENTATIONS_TEST = Compose([
    ToFloat(max_value=255),
    ToTensor()
], p=1)

In [None]:
# Preview: draw 64 random training rows, push them through the training
# augmentations as one batch and show them in a 4x16 grid titled by label.
temp_df = train_df.sample(64).reset_index(drop=True)
train_dataset = Alaska2Dataset(temp_df, augmentations=AUGMENTATIONS_TRAIN)
batch_size = 64
num_workers = 0

temp_loader = torch.utils.data.DataLoader(
    train_dataset,
    shuffle=False,
    batch_size=batch_size,
    num_workers=num_workers,
)

images, labels = next(iter(temp_loader))
# Move the channel axis last so each image can be handed to imshow.
images = images['image'].permute(0, 2, 3, 1)
max_images = 64
grid_width = 16
grid_height = max_images // grid_width
fig, axs = plt.subplots(
    grid_height,
    grid_width,
    figsize=(grid_width+1, grid_height+2),
)

for i, (im, label) in enumerate(zip(images, labels)):
    grid_row, grid_col = divmod(i, grid_width)
    ax = axs[grid_row, grid_col]
    ax.imshow(im.squeeze())
    ax.set_title(str(label.item()))
    ax.axis('off')

plt.suptitle("0: Cover, 1: JMiPOD_75, 2: JMiPOD_90, 3: JMiPOD_95, 4: JUNIWARD_75, 5:JUNIWARD_90,\n 6: JUNIWARD_95, 7:UERD_75, 8:UERD_90, 9:UERD_95")
plt.show()
# Free the preview batch before the real loaders are built.
del images, temp_df
gc.collect()

In [None]:
class Net(nn.Module):
    """Pretrained EfficientNet-B0 feature extractor followed by one linear layer.

    Args:
        num_classes: Number of output logits produced by the linear layer.
    """

    def __init__(self, num_classes):
        super().__init__()
        self.model = EfficientNet.from_pretrained('efficientnet-b0')
        # 1280 is the feature width assumed for this backbone; per the original
        # note it differs for other architectures.
        self.dense_output = nn.Linear(1280, num_classes)

    def forward(self, x):
        """Return class logits for the batch ``x``."""
        feat = self.model.extract_features(x)
        # Average over the whole spatial extent, leaving one value per channel.
        spatial_size = feat.shape[2:]
        pooled = F.avg_pool2d(feat, spatial_size)
        flat = pooled.reshape(-1, 1280)
        return self.dense_output(flat)

In [None]:
batch_size = 8
num_workers = 8

train_dataset = Alaska2Dataset(train_df, augmentations=AUGMENTATIONS_TRAIN)
# Validate on a 1000-row sample for speed. The sample is seeded so that the
# validation scores of different runs are comparable.
valid_dataset = Alaska2Dataset(
    val_df.sample(1000, random_state=SEED).reset_index(drop=True),
    augmentations=AUGMENTATIONS_TEST)

train_loader = torch.utils.data.DataLoader(train_dataset,
                                           batch_size=batch_size,
                                           num_workers=num_workers,
                                           shuffle=True)

valid_loader = torch.utils.data.DataLoader(valid_dataset,
                                           batch_size=batch_size*2,
                                           num_workers=num_workers,
                                           shuffle=False)

# Fall back to the CPU when no GPU is available instead of failing in .to().
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = Net(num_classes=len(class_labels)).to(device)
# Start from weights pretrained offline; this notebook then trains on all
# images for 2 more epochs. map_location lets a checkpoint saved on a GPU be
# loaded on whichever device was selected above.
model.load_state_dict(torch.load('../input/new-data/val_loss_6.08_auc_0.875.pth',
                                 map_location=device))
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-6)

In [None]:
# https://www.kaggle.com/anokas/weighted-auc-metric-updated

def alaska_weighted_auc(y_true, y_valid):
    """Compute the weighted AUC used as the competition metric.

    The ROC curve is split into two true-positive-rate bands, [0, 0.4] with
    weight 2 and [0.4, 1] with weight 1. The area of each band is computed
    separately, weighted, summed and divided by the largest attainable
    weighted area so that the result lies between 0 and 1.

    Args:
        y_true: Binary ground-truth labels (positive class is 1).
        y_valid: Predicted scores for the positive class.

    Returns:
        The normalised weighted AUC as a float.

    Raises:
        IndexError: If no ROC point lies strictly inside one of the bands.
    """
    tpr_thresholds = [0.0, 0.4, 1.0]
    weights = [2, 1]

    fpr, tpr, _ = metrics.roc_curve(y_true, y_valid, pos_label=1)

    # Height of each TPR band.
    areas = np.array(tpr_thresholds[1:]) - np.array(tpr_thresholds[:-1])

    # The total area is normalized by the sum of weights such that the final
    # weighted AUC is between 0 and 1.
    normalization = np.dot(areas, weights)

    competition_metric = 0
    for idx, weight in enumerate(weights):
        y_min = tpr_thresholds[idx]
        y_max = tpr_thresholds[idx + 1]
        mask = (y_min < tpr) & (tpr < y_max)

        # Extend the band's curve horizontally at y_max up to fpr = 1.
        x_padding = np.linspace(fpr[mask][-1], 1, 100)

        x = np.concatenate([fpr[mask], x_padding])
        y = np.concatenate([tpr[mask], [y_max] * len(x_padding)])
        y = y - y_min  # normalize such that curve starts at y=0
        competition_metric += metrics.auc(x, y) * weight

    return competition_metric / normalization

In [None]:
# Fine-tune for num_epochs epochs. After each epoch: validate, report
# per-class accuracy and the weighted AUC, then save a checkpoint.
criterion = torch.nn.CrossEntropyLoss()
num_epochs = 2
train_loss, val_loss = [], []

for epoch in range(num_epochs):
    print('Epoch {}/{}'.format(epoch, num_epochs - 1))
    print('-' * 10)
    model.train()
    running_loss = 0
    tk0 = tqdm(train_loader, total=int(len(train_loader)))
    for im, labels in tk0:
        inputs = im["image"].to(device, dtype=torch.float)
        labels = labels.to(device, dtype=torch.long)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        tk0.set_postfix(loss=(loss.item()))

    # len(loader) is already the number of batches, so the mean batch loss is
    # running_loss / len(loader). The original divided that count by
    # batch_size again, inflating the reported loss by a factor of batch_size.
    epoch_loss = running_loss / len(train_loader)
    train_loss.append(epoch_loss)
    print('Training Loss: {:.8f}'.format(epoch_loss))

    tk1 = tqdm(valid_loader, total=int(len(valid_loader)))
    model.eval()
    running_loss = 0
    y, preds = [], []
    with torch.no_grad():
        for (im, labels) in tk1:
            inputs = im["image"].to(device, dtype=torch.float)
            labels = labels.to(device, dtype=torch.long)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            y.extend(labels.cpu().numpy().astype(int))
            preds.extend(F.softmax(outputs, 1).cpu().numpy())
            running_loss += loss.item()
            tk1.set_postfix(loss=(loss.item()))

        epoch_loss = running_loss / len(valid_loader)
        val_loss.append(epoch_loss)
        preds = np.array(preds)
        y = np.array(y)
        labels = preds.argmax(1)
        for class_label in np.unique(y):
            idx = y == class_label
            # Built-in float: the np.float alias no longer exists in current NumPy.
            acc = (labels[idx] == y[idx]).astype(float).mean()*100
            print('accuracy for class', class_names[class_label], 'is', acc)

        acc = (labels == y).mean()*100
        # Convert the multi-class probabilities into one "is stego" score:
        # total probability of the stego classes, i.e. 1 - P(class 0).
        new_preds = np.zeros((len(preds),))
        temp = preds[labels != 0, 1:]
        new_preds[labels != 0] = temp.sum(1)
        new_preds[labels == 0] = 1 - preds[labels == 0, 0]
        # Convert the multi-class labels to binary (0 = cover, 1 = any stego).
        y[y != 0] = 1
        auc_score = alaska_weighted_auc(y, new_preds)
        print(
            f'Val Loss: {epoch_loss:.3}, Weighted AUC:{auc_score:.3}, Acc: {acc:.3}')

    torch.save(model.state_dict(),
               f"epoch_{epoch}_val_loss_{epoch_loss:.3}_auc_{auc_score:.3}.pth")

In [None]:
import glob
class Alaska2TestDataset(Dataset):
    """Dataset yielding unlabeled images listed in the first column of a frame.

    Args:
        df: DataFrame whose first column holds the image file names.
        augmentations: Optional callable invoked as ``augmentations(image=im)``;
            its return value is passed through unchanged.
    """

    def __init__(self, df, augmentations=None):

        self.data = df
        self.augment = augmentations

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``idx``-th image (by position).

        Raises:
            FileNotFoundError: If OpenCV cannot read the image file.
        """
        # Purely positional access to row idx, first column. The original
        # ``.loc[idx][0]`` depended on pandas falling back from label to
        # positional lookup, which is deprecated.
        fn = self.data.iloc[idx, 0]
        im = cv2.imread(fn)
        if im is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise FileNotFoundError(f'Could not read image: {fn}')
        # BGR -> RGB as a contiguous array (the reversed view has negative
        # strides, which torch.from_numpy rejects).
        im = np.ascontiguousarray(im[:, :, ::-1])

        if self.augment:
            # Apply transformations
            im = self.augment(image=im)

        return im


# One-column frame of test image paths, wrapped in a dataset and a
# sequential (unshuffled, nothing dropped) loader for inference.
test_filenames = test_img_path
test_df = pd.DataFrame(
    {'ImageFileName': test_filenames.tolist()},
    columns=['ImageFileName'],
)

batch_size = 16
num_workers = 4
test_dataset = Alaska2TestDataset(test_df, augmentations=AUGMENTATIONS_TEST)
test_loader = DataLoader(
    test_dataset,
    shuffle=False,
    drop_last=False,
    batch_size=batch_size,
    num_workers=num_workers,
)

In [None]:
# Predict the test set with flip test-time augmentation and write the submission.
model.eval()

preds = []
tk0 = tqdm(test_loader)

with torch.no_grad():
    for i, im in enumerate(tk0):
        inputs = im["image"].to(device)
        # Blend the logits: 0.25 * vertical flip + 0.25 * horizontal flip
        # + 0.5 * original image.
        # flip vertical
        im = inputs.flip(2)
        outputs = model(im)
        # fliplr
        im = inputs.flip(3)
        outputs = (0.25*outputs + 0.25*model(im))
        outputs = (outputs + 0.5*model(inputs))
        preds.extend(F.softmax(outputs, 1).cpu().numpy())

preds = np.array(preds)
labels = preds.argmax(1)
# Collapse the class probabilities into one "is stego" score per image:
# total probability of the stego classes, i.e. 1 - P(class 0).
new_preds = np.zeros((len(preds),))
new_preds[labels != 0] = preds[labels != 0, 1:].sum(1)
new_preds[labels == 0] = 1 - preds[labels == 0, 0]

# Build the submission frame directly from the path list. The original added
# columns to test_df and then dropped 'ImageFileName', so re-running the cell
# raised KeyError. basename also handles the '/'-joined paths without relying
# on os.sep.
test_df = pd.DataFrame({
    'Id': [os.path.basename(path) for path in test_filenames],
    'Label': new_preds,
})
test_df.to_csv('submission_eb0.csv', index=False)
print(test_df.head())