# Introduction
For this sort of (image) classification task, it is worth knowing whether the distribution of the train data is similar to that of the test data. If they are very different, a model fitted to the train data cannot do a good job of classifying the test data. To check whether the distributions of the train and test data are similar, we can perform so-called **adversarial validation**: train a classifier to tell train images apart from test images and see how well it succeeds.

This kernel is largely based on another kernel (https://www.kaggle.com/cdeotte/steel-adversarial-validation), which performed adversarial validation for the steel competition.

# Libraries

In [None]:
import os, glob
import random
from PIL import Image 
import numpy as np
import pandas as pd
import multiprocessing
import keras
import keras.backend as K
from keras.optimizers import Adam
from keras.callbacks import Callback
from keras.applications.densenet import DenseNet169
from keras.layers import Dense, Flatten
from keras.models import Model, load_model
from keras.utils import Sequence
from keras.preprocessing.image import ImageDataGenerator
from keras import layers
from keras.callbacks import LearningRateScheduler
import matplotlib.pyplot as plt, time
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook as tqdm
from numpy.random import seed
seed(1220)
from tensorflow import set_random_seed
set_random_seed(1220)
%matplotlib inline
print("libraries imported!")

In [None]:
# Competition image folders (both paths end with a slash).
train_imgs_folder = "../input/understanding_cloud_organization/train_images/"
test_imgs_folder = "../input/understanding_cloud_organization/test_images/"

# CPU count as reported by multiprocessing.
num_cores = multiprocessing.cpu_count()

# Subsample the train set so that train : test = 1 : 1

In [None]:
# based on https://www.kaggle.com/cdeotte/steel-adversarial-validation/data
# Stage a balanced two-class image set under ../tmp/: a random subset of the
# train images (as many as there are test images) goes to ../tmp/train_images/
# and every test image goes to ../tmp/test_images/.
import shutil

TRAIN_IMG = os.listdir('../input/understanding_cloud_organization/train_images')
TEST_IMG = os.listdir('../input/understanding_cloud_organization/test_images')
print('Original train count =',len(TRAIN_IMG),', Original test count =',len(TEST_IMG))

# Start from empty class folders. Without this, re-running the cell would either
# crash (folder already exists) or, worse, add a second random train sample on
# top of the first one and silently break the 1:1 class balance.
for class_dir in ('../tmp/train_images/', '../tmp/test_images/'):
    shutil.rmtree(class_dir, ignore_errors=True)
    os.makedirs(class_dir)

# Sample without replacement so that no train image is picked twice.
r = np.random.choice(TRAIN_IMG,len(TEST_IMG),replace=False)
for f in r:
    # The context manager closes the source file as soon as the copy is written.
    with Image.open('../input/understanding_cloud_organization/train_images/'+f) as img:
        img.save('../tmp/train_images/'+f)
for f in TEST_IMG:
    with Image.open('../input/understanding_cloud_organization/test_images/'+f) as img:
        img.save('../tmp/test_images/'+f)

In [None]:
# List the staged folders again and report how many files each class holds.
TEST_IMG_AV = os.listdir('../tmp/test_images')
TRAIN_IMG_AV = os.listdir('../tmp/train_images')
print(
    'New train count =', len(TRAIN_IMG_AV),
    ', New test count =', len(TEST_IMG_AV),
)


# Build an Adversarial Classifier
I use a simple DenseNet169 as an adversarial classifier.

In [None]:
def get_model():
    """Build the (uncompiled) adversarial classifier.

    The network is a DenseNet169 loaded with ImageNet weights, without its
    original classification head and with global average pooling, followed
    by a single sigmoid unit. It takes 224x224 RGB inputs.

    Returns:
        A Keras ``Model`` mapping an image batch to one sigmoid output per image.
    """
    # Discard any state left in the Keras backend session before building.
    K.clear_session()

    backbone = DenseNet169(
        include_top=False,
        weights='imagenet',
        pooling='avg',
        input_shape=(224, 224, 3),
    )
    probability = Dense(1, activation='sigmoid')(backbone.output)
    return Model(inputs=backbone.input, outputs=probability)

# Instantiate the classifier and configure it for binary classification:
# Adam at 1e-4, binary cross-entropy loss, accuracy reported as a metric.
model = get_model()
model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(lr=1e-4),
    metrics=['accuracy'],
)

In [None]:
# Location of the staged class folders and the training hyper-parameters.
img_dir = '../tmp/'
img_height = 224
img_width = 224
batch_size = 32
nb_epochs = 8

# A single generator configuration serves both subsets: pixel values are
# multiplied by 1/255, images are randomly flipped horizontally/vertically,
# and 20% of the files are reserved for the validation subset.
train_datagen = ImageDataGenerator(
    validation_split=0.2,
    rescale=1./255,
    horizontal_flip=True,
    vertical_flip=True,
)

# Training subset: binary labels are derived from the sub-folders of img_dir.
train_generator = train_datagen.flow_from_directory(
    directory=img_dir,
    subset='training',
    class_mode='binary',
    target_size=(img_height, img_width),
    batch_size=batch_size,
)

# Validation subset, read from the same directory with the same settings.
validation_generator = train_datagen.flow_from_directory(
    directory=img_dir,
    subset='validation',
    class_mode='binary',
    target_size=(img_height, img_width),
    batch_size=batch_size,
)

# Exponential decay: the learning rate starts at 1e-4 and shrinks by 5% per epoch.
annealer = LearningRateScheduler(lambda epoch: 1e-4 * 0.95 ** epoch)

# Train; step counts use floor division, so a trailing partial batch is not
# counted in either subset.
h = model.fit_generator(
    generator=train_generator,
    steps_per_epoch=train_generator.samples // batch_size,
    epochs=nb_epochs,
    validation_data=validation_generator,
    validation_steps=validation_generator.samples // batch_size,
    callbacks=[annealer],
    verbose=2,
)

# Accuracy
If the distributions of the train and test data are similar, the classifier cannot tell them apart, so its validation accuracy should stay around 0.5 (chance level, since the two classes are balanced 1:1).

In [None]:
# Plot the per-epoch train and validation accuracy recorded by fit_generator.
# The history key for the 'accuracy' metric depends on the Keras version:
# 'acc'/'val_acc' before 2.3, 'accuracy'/'val_accuracy' from 2.3 on.
# Use whichever is present instead of failing with a KeyError.
acc_key = 'acc' if 'acc' in h.history else 'accuracy'

plt.figure(figsize=(15,5))
plt.plot(h.history[acc_key],label='Train ACC')
plt.plot(h.history['val_' + acc_key],label='Val ACC')
plt.title('TRAIN COMPARED WITH TEST. Training History')
plt.legend()
plt.show()