In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

> # Load Data

In [None]:
# Load the RSNA stage-2 training labels table and preview its first rows.
print('Reading train rsna data...')
rsna_labels_csv = "../input/rsna-pneumonia-detection-challenge/stage_2_train_labels.csv"
train = pd.read_csv(rsna_labels_csv)
print(train.shape)
train.head()

In [None]:
# NOTE: despite its name, `test` holds the chest-xray-anomaly-detection
# *train.csv* table. A later cell passes it to get_normal_images() to build
# the data the autoencoder is fitted on.
print('Reading train data...')
anomaly_train_csv = "../input/chest-xray-anomaly-detection/train.csv"
test = pd.read_csv(anomaly_train_csv)
print(test.shape)
test.head()

In [None]:
# `ss` is the sample-submission table; its `fileName` column is used later to
# decide which images get scored and written to the prediction file.
print('Reading sample data...')
sample_submission_csv = "../input/chest-xray-anomaly-detection/sample_submission.csv"
ss = pd.read_csv(sample_submission_csv)
print(ss.shape)
ss.head()

In [None]:
import cv2
from PIL import Image
import pydicom
from skimage.transform import resize

def get_dicom_images(inputdir='../input/rsna-pneumonia-detection-challenge/stage_2_train_images/',
                     limit=2000):
    """Read DICOM files from a directory and return them as resized image arrays.

    Parameters
    ----------
    inputdir : str
        Directory whose entries are read as DICOM files. Defaults to the RSNA
        stage-2 training image folder.
    limit : int or None
        Maximum number of files to read, taken from the start of the sorted
        directory listing. Pass ``None`` to read every file. Defaults to 2000.

    Returns
    -------
    numpy.ndarray
        The images stacked with ``np.array``; each one has been resized to
        ``(32, 32, 3)``. An empty array is returned when no file is read.
    """
    # `os` is not imported at the top of the notebook (the only `import os`
    # is commented out), so import it here to avoid a NameError.
    import os

    # Sort the listing: os.listdir() returns entries in arbitrary order, which
    # would make the selected subset differ between runs/machines.
    file_names = sorted(os.listdir(inputdir))
    if limit is not None:
        file_names = file_names[:limit]

    images = []
    for name in file_names:
        # dcmread is the current name of the deprecated pydicom.read_file.
        ds = pydicom.dcmread(os.path.join(inputdir, name))
        img = resize(ds.pixel_array, (32, 32, 3), mode='reflect')
        images.append(img)
    return np.array(images)


def get_normal_images(filenames, path="../input/chest-xray-anomaly-detection/images/"):
    """Load the images named in ``filenames['fileName']`` and resize them.

    Parameters
    ----------
    filenames : mapping or pandas.DataFrame
        Object whose ``'fileName'`` entry iterates over image file names
        relative to ``path``.
    path : str
        Directory containing the image files. Defaults to the
        chest-xray-anomaly-detection image folder.

    Returns
    -------
    numpy.ndarray
        The images stacked with ``np.array``; each one has been resized to
        ``(32, 32, 3)``. An empty array is returned when there are no names.

    Notes
    -----
    ``skimage.transform.resize`` is called with its default
    ``preserve_range=False``, so integer-typed input images come back as
    floats already scaled to [0, 1].
    """
    import os

    images = []
    for filename in filenames['fileName']:
        # Use a context manager so the underlying file handle is released
        # as soon as the pixel data has been copied into the array.
        with Image.open(os.path.join(path, filename)) as img:
            img_array = np.array(img)
        images.append(resize(img_array, (32, 32, 3), mode='reflect'))
    return np.array(images)

In [None]:
# import the necessary packages
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import Conv2DTranspose
from tensorflow.keras.layers import LeakyReLU
from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Reshape
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Model
from tensorflow.keras import backend as K
import numpy as np
class ConvAutoencoder:
    """Factory for a convolutional autoencoder built from Keras layers."""

    @staticmethod
    def build(width, height, depth, filters=(32, 64), latentDim=16):
        """Create the encoder, the decoder and the chained autoencoder.

        Parameters
        ----------
        width, height, depth : int
            Input image size; the model input shape is (height, width, depth).
        filters : sequence of int
            Number of filters for each stride-2 convolution of the encoder;
            the decoder walks the same values in reverse order.
        latentDim : int
            Size of the dense latent vector produced by the encoder.

        Returns
        -------
        tuple
            ``(encoder, decoder, autoencoder)`` Keras models.
        """
        # Channels-last layout: batch normalisation acts on the last axis.
        channel_axis = -1
        input_shape = (height, width, depth)

        # ---- encoder: stacked CONV => LeakyReLU => BN blocks ----
        inputs = Input(shape=input_shape)
        encoded = inputs
        for num_filters in filters:
            encoded = Conv2D(num_filters, (3, 3), strides=2, padding="same")(encoded)
            encoded = LeakyReLU(alpha=0.2)(encoded)
            encoded = BatchNormalization(axis=channel_axis)(encoded)

        # Remember the feature-map shape before flattening so the decoder
        # can rebuild a volume of the same size.
        volume_shape = K.int_shape(encoded)
        encoded = Flatten()(encoded)
        latent = Dense(latentDim)(encoded)
        encoder = Model(inputs, latent, name="encoder")

        # ---- decoder: takes a latent vector and expands it back ----
        latentInputs = Input(shape=(latentDim,))
        _, vol_rows, vol_cols, vol_channels = volume_shape
        decoded = Dense(np.prod(volume_shape[1:]))(latentInputs)
        decoded = Reshape((vol_rows, vol_cols, vol_channels))(decoded)

        # Mirror the encoder: CONV_TRANSPOSE => LeakyReLU => BN, filters reversed.
        for num_filters in reversed(filters):
            decoded = Conv2DTranspose(num_filters, (3, 3), strides=2,
                                      padding="same")(decoded)
            decoded = LeakyReLU(alpha=0.2)(decoded)
            decoded = BatchNormalization(axis=channel_axis)(decoded)

        # One last transposed convolution restores the original channel
        # count; the sigmoid keeps outputs in (0, 1).
        decoded = Conv2DTranspose(depth, (3, 3), padding="same")(decoded)
        outputs = Activation("sigmoid")(decoded)
        decoder = Model(latentInputs, outputs, name="decoder")

        # The full autoencoder simply chains encoder and decoder.
        autoencoder = Model(inputs, decoder(encoder(inputs)),
                            name="autoencoder")

        return (encoder, decoder, autoencoder)

In [None]:
import matplotlib
# NOTE(review): "Agg" is a non-interactive, file-output backend, presumably
# carried over from a standalone-script version of this code. Later cells call
# plt.imshow()/plt.show(); confirm that figures still render inline in the
# notebook with this backend selected.
matplotlib.use("Agg")
# import the necessary packages
# from pyimagesearch.convautoencoder import ConvAutoencoder
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.datasets import mnist
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
import argparse
import random
import pickle
import cv2

In [None]:
def visualize_predictions(decoded, gt, samples=10):
    """Build a side-by-side comparison image of originals and reconstructions.

    For each of the first ``samples`` items, the ground-truth image (left) and
    its reconstruction (right) are scaled by 255, cast to ``uint8`` and joined
    horizontally; the resulting rows are then stacked vertically.

    Parameters
    ----------
    decoded : indexable of numpy.ndarray
        Reconstructed images.
    gt : indexable of numpy.ndarray
        Ground-truth images, aligned index-by-index with ``decoded``.
    samples : int
        Number of leading items to include.

    Returns
    -------
    numpy.ndarray or None
        The stacked comparison image, or ``None`` when ``samples`` yields no
        rows.
    """
    def _as_uint8(img):
        # Scale to the 0-255 range and truncate to 8-bit.
        return (img * 255).astype("uint8")

    rows = [
        np.hstack([_as_uint8(gt[i]), _as_uint8(decoded[i])])
        for i in range(samples)
    ]

    if not rows:
        return None
    if len(rows) == 1:
        # A single pair is returned as-is, without vertical stacking.
        return rows[0]
    return np.vstack(rows)

In [None]:
# Training hyper-parameters: number of epochs, initial learning rate and
# batch size.
EPOCHS = 20
INIT_LR = 1e-3
BS = 32

# Build the dataset from the images listed in `test` (which holds the
# anomaly-detection train.csv table, despite its name).
images = get_normal_images(test)

# get_normal_images() passes every image through skimage.transform.resize,
# which (with its default preserve_range=False) already returns floats scaled
# to [0, 1] for integer-typed input. Dividing by 255 a second time squashed
# the data into [0, 1/255], which
#   * made the training inputs disagree with the inference cell, where the
#     output of get_normal_images() is fed to the model without that division;
#   * made visualize_predictions(), which multiplies by 255, produce
#     near-black images.
# Only the dtype conversion is kept.
images = images.astype("float32")

# Hold out 20% of the images for validation (fixed seed for reproducibility).
(trainX, testX) = train_test_split(images, test_size=0.2, random_state=42)

In [None]:
# Sanity check: show the shape of the training split created by train_test_split.
print(trainX.shape)

In [None]:
# Imported here so this cell is self-contained.
from tensorflow.keras.optimizers.schedules import InverseTimeDecay

print("[INFO] building autoencoder...")
(encoder, decoder, autoencoder) = ConvAutoencoder.build(32, 32, 3)

# `Adam(lr=..., decay=...)` relies on deprecated keyword names; newer Keras
# optimizers reject `decay`. The schedule below expresses the same rule the
# legacy `decay` argument applied:
#     lr_t = INIT_LR / (1 + (INIT_LR / EPOCHS) * step)
lr_schedule = InverseTimeDecay(
    initial_learning_rate=INIT_LR,
    decay_steps=1,
    decay_rate=INIT_LR / EPOCHS,
)
opt = Adam(learning_rate=lr_schedule)
autoencoder.compile(loss="mse", optimizer=opt)

# Train the autoencoder to reproduce its own input; the held-out split is
# used for validation loss only.
H = autoencoder.fit(
    trainX, trainX,
    validation_data=(testX, testX),
    epochs=EPOCHS,
    batch_size=BS)

# Reconstruct the held-out images; a later cell visualises them.
print("[INFO] making predictions...")
decoded = autoencoder.predict(testX)

In [None]:
# Originals (left) next to reconstructions (right) for the first held-out images.
vis = visualize_predictions(decoded, testX)

# cv2.imwrite() interprets the last axis as B, G, R, whereas the arrays here
# come from PIL-loaded images (R, G, B order). Writing `vis` unchanged and
# reading the file back with PIL swapped the red and blue channels, so reverse
# the channel axis for the file and display the in-memory array directly.
cv2.imwrite("vis.png", np.ascontiguousarray(vis[..., ::-1]))
plt.imshow(vis)

In [None]:
# Plot the training and validation loss recorded for each epoch.
N = np.arange(0, EPOCHS)
plt.style.use("ggplot")
plt.figure()
plt.plot(N, H.history["loss"], label="train_loss")
plt.plot(N, H.history["val_loss"], label="val_loss")
plt.title("Training Loss")
plt.xlabel("Epoch #")
plt.ylabel("Loss")
plt.legend(loc="lower left")
plt.show()

# Serialize the image data to disk. The context manager guarantees the file
# is closed even if pickling fails, and pickle.dump streams straight into the
# file instead of first building the whole byte string in memory.
print("[INFO] saving image data...")
with open("images.pickle", "wb") as f:
    pickle.dump(images, f)

# Serialize the autoencoder model to disk (HDF5 format).
print("[INFO] saving autoencoder...")
autoencoder.save("autoencoder.model", save_format="h5")

In [None]:
# import the necessary packages
from tensorflow.keras.models import load_model
import numpy as np
import argparse
import pickle
import cv2

# Restore the trained autoencoder from disk.
print("[INFO] loading autoencoder and image data...")
autoencoder = load_model("autoencoder.model")

# Load the images named in the sample submission and reconstruct them.
test_images = get_normal_images(ss)
decoded = autoencoder.predict(test_images)

# Reconstruction error per image: mean squared difference between the input
# and what the autoencoder produced for it.
errors = [
    np.mean((image - recon) ** 2)
    for (image, recon) in zip(test_images, decoded)
]

# The 0.89 quantile of the errors is the anomaly threshold: every image whose
# error is at or above it is treated as an outlier.
thresh = np.quantile(errors, 0.89)
error_array = np.array(errors)
idxs = np.where(error_array >= thresh)[0]
print("[INFO] mse threshold: {}".format(thresh))
print("[INFO] {} outliers found".format(len(idxs)))

In [None]:
# Positions (within the scored images) whose reconstruction error reached the threshold.
print(idxs)

In [None]:
# Collect the outlier positions in a set so each membership test is O(1);
# `i in idxs` on a NumPy array scans the whole array for every row, which
# made this loop quadratic in the number of files.
outlier_positions = set(np.asarray(idxs).tolist())

# One [fileName, anomaly] row per submission entry: 1 when the image at that
# position was flagged as an outlier, 0 otherwise.
submission = [
    [filename, 1 if i in outlier_positions else 0]
    for i, filename in enumerate(ss.fileName)
]
df = pd.DataFrame(submission, columns=['fileName', 'anomaly'])
df.to_csv('test_predictions.csv', index=False)

In [None]:
# # initialize the outputs array
# outputs = None

# # loop over the indexes of images with a high mean squared error term
# for i in idxs:
# 	# grab the original image and reconstructed image
# 	original = (images[i] * 255).astype("uint8")
# 	recon = (decoded[i] * 255).astype("uint8")

# 	# stack the original and reconstructed image side-by-side
# 	output = np.hstack([original, recon])

# 	# if the outputs array is empty, initialize it as the current
# 	# side-by-side image display
# 	if outputs is None:
# 		outputs = output

# 	# otherwise, vertically stack the outputs
# 	else:
# 		outputs = np.vstack([outputs, output])

# # show the output visualization
# cv2.imshow("Output", outputs)
# cv2.waitKey(0)