In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import cv2

%matplotlib inline

In [None]:
import os
# Print the full path of every file found anywhere below /kaggle/input.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

In [None]:
# Directory containing the zipped competition files (train.zip, test.zip, ...).
path_zip = '/kaggle/input/denoising-dirty-documents/'
# Directory the archives are extracted into and the images are later read from.
path = '/kaggle/working/'

In [None]:
import zipfile

# Unpack each competition archive from `path_zip` into `path`, one after another.
archive_names = (
    'train.zip',
    'test.zip',
    'train_cleaned.zip',
    'sampleSubmission.csv.zip',
)
for archive_name in archive_names:
    with zipfile.ZipFile(os.path.join(path_zip, archive_name), 'r') as zip_ref:
        zip_ref.extractall(path)

In [None]:
# Alphabetically sorted file names of the three extracted image folders.
train_img, train_cleaned_img, test_img = (
    sorted(os.listdir(os.path.join(path, folder_name)))
    for folder_name in ('train', 'train_cleaned', 'test')
)

In [None]:
# (height, width, channels) that every image is resized/reshaped to; also the model's input shape.
input_shape = (420, 540, 1)


In [None]:
def process_image(path):
    """Load one image file and turn it into a normalised single-channel array.

    The image is read with OpenCV, converted to float32, resized to the
    height/width given by the module-level ``input_shape``, collapsed to
    grayscale, divided by 255 and reshaped to ``input_shape``.

    Args:
        path: Location of the image file on disk.

    Returns:
        A float32 numpy array of shape ``input_shape``.

    Raises:
        OSError: If OpenCV cannot read or decode the file.
    """
    img = cv2.imread(path)
    # cv2.imread signals failure by returning None instead of raising; without this
    # check the None would become a NaN scalar below and fail obscurely in resize.
    if img is None:
        raise OSError('Could not read image: {}'.format(path))

    img = np.asarray(img, dtype="float32")
    # cv2.resize expects the target size as (width, height), hence the swapped indices.
    img = cv2.resize(img, (input_shape[1], input_shape[0]))
    img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    img = img/255.0
    # Restore the explicit channel axis that the grayscale conversion removed.
    img = np.reshape(img, input_shape)

    return img

In [None]:
# Pre-processed images, one list per extracted folder.
train, train_cleaned, test = [], [], []

# Fill each list from its folder, visiting the files in sorted name order.
for images, folder_name in ((train, 'train'), (train_cleaned, 'train_cleaned'), (test, 'test')):
    directory = os.path.join(path, folder_name)
    for f in sorted(os.listdir(directory)):
        images.append(process_image(os.path.join(directory, f)))

In [None]:
# 4x2 grid: each row shows one training image (left) next to its cleaned target (right).
# Indices 0, 2, 4 and 6 are displayed.
train_panels = (
    (train, 'Noise image: {}'),
    (train_cleaned, 'Denoised image: {}'),
)

plt.figure(figsize=(15, 25))
for i in range(0, 8, 2):
    for offset, (images, caption) in enumerate(train_panels, start=1):
        plt.subplot(4, 2, i + offset)
        plt.xticks([])
        plt.yticks([])
        plt.imshow(images[i][:, :, 0], cmap='gray')
        plt.title(caption.format(train_img[i]))

plt.show()

In [None]:
from sklearn.model_selection import train_test_split

# Stack the per-image arrays into single batches.
X_train = np.asarray(train)
y_train = np.asarray(train_cleaned)
X_test = np.asarray(test)

# Hold out 10% of the training pairs for validation. The seed is fixed so that the
# same images land in the validation set on every run, keeping val_loss comparable.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.1, random_state=42
)

In [None]:
# Hyper-parameters. Note: batch_size is reassigned before training further down, and
# kernel_size / latent_dim are not referenced by the cells visible in this notebook.
batch_size = 32
kernel_size = 3
latent_dim = 16

# One entry per convolution of the encoder; the decoder walks the list in reverse.
# (A leading 32-filter layer was previously listed here but disabled.)
layer_filters = [
    dict(filters=n_filters, kernel_size=3)
    for n_filters in (64, 128, 256)
]

In [None]:
from keras import backend as K
from keras.layers import Flatten, Dense, Reshape, Conv2DTranspose
from keras.layers import Conv2D, MaxPooling2D, UpSampling2D, Dropout, BatchNormalization, Input
from tensorflow.keras.models import Model

def model():
    """Build and compile the convolutional denoising autoencoder.

    The network reads the module-level ``input_shape`` and ``layer_filters``:
    a stack of ReLU convolutions (one per ``layer_filters`` entry), batch
    normalisation, a 2x2 max-pool and dropout form the encoder; the decoder
    applies the same convolution specs in reverse order, batch normalisation
    and a 2x2 up-sampling, followed by a single-filter sigmoid convolution.

    Returns:
        The Keras ``Model``, compiled with the Adam optimizer, mean squared
        error loss and MAE as an additional metric.
    """
    def conv_stack(tensor, specs):
        # Apply one 'same'-padded ReLU convolution per spec, in the given order.
        for spec in specs:
            size = spec["kernel_size"]
            tensor = Conv2D(spec["filters"], (size, size),
                            activation='relu', padding='same')(tensor)
        return tensor

    inputs = Input(shape=input_shape)

    # encoding
    encoded = conv_stack(inputs, layer_filters)
    encoded = BatchNormalization()(encoded)
    encoded = MaxPooling2D((2, 2), padding='same')(encoded)
    encoded = Dropout(0.5)(encoded)

    # decoding
    decoded = conv_stack(encoded, layer_filters[::-1])
    decoded = BatchNormalization()(decoded)
    decoded = UpSampling2D((2, 2))(decoded)

    outputs = Conv2D(1, (3, 3), activation='sigmoid', padding='same')(decoded)

    network = Model(inputs=[inputs], outputs=[outputs])
    network.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])
    return network

In [None]:
# Build the compiled autoencoder and print its layer summary.
# NOTE: this rebinds the name `model` from the builder function to the network it
# returns, so re-running this cell without first re-running the cell that defines
# `model()` would call the network object instead of the builder.
model = model()
model.summary()

In [None]:
# File the checkpoint callback writes weights to and that is reloaded before prediction.
model_weights_file_path = "Best_model_weights.hdf5"

In [None]:
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

# Save only the weights, and only when val_loss reaches a new minimum.
checkpoint = ModelCheckpoint(
    filepath=model_weights_file_path,
    monitor="val_loss",
    mode="min",
    save_best_only=True,
    save_weights_only=True,
    verbose=1,
)

# Stop training once val_loss has gone 20 epochs without improving.
es = EarlyStopping(
    monitor="val_loss",
    mode="min",
    patience=20,
    verbose=1,
)

# Halve the learning rate after 10 epochs without a val_loss improvement of at least 1e-4.
lr_reduce = ReduceLROnPlateau(
    monitor='val_loss',
    mode='min',
    factor=0.5,
    patience=10,
    min_delta=0.0001,
    cooldown=0,
    min_lr=0,
    verbose=0,
)

In [None]:
# Callbacks handed to model.fit: checkpointing, early stopping and learning-rate reduction.
callbacks=[checkpoint, es, lr_reduce]

# Maximum number of epochs; the early-stopping callback may end training sooner.
epochs=500
# NOTE: replaces the batch_size = 32 assigned in the earlier hyper-parameter cell;
# 8 is the value actually used for fit and predict.
batch_size=8

In [None]:
# Train on the noisy/clean pairs, evaluating on the held-out validation split each epoch.
fit_options = dict(
    validation_data=(X_val, y_val),
    epochs=epochs,
    batch_size=batch_size,
    verbose=1,
    callbacks=callbacks,
)
history = model.fit(X_train, y_train, **fit_options)
                   

In [None]:
# Per-epoch curves recorded by model.fit.
epoch_loss = history.history['loss']
epoch_val_loss = history.history['val_loss']
epoch_mae = history.history['mae']
epoch_val_mae = history.history['val_mae']

# Left panel: loss; right panel: MAE. Train in blue, validation in red.
history_panels = (
    (epoch_loss, 'Train Loss', epoch_val_loss, 'Val Loss',
     'Evolution of loss on train & validation datasets over epochs'),
    (epoch_mae, 'Train MAE', epoch_val_mae, 'Val MAE',
     'Evolution of MAE on train & validation datasets over epochs'),
)

plt.figure(figsize=(20, 6))
for position, (train_curve, train_label, val_curve, val_label, panel_title) in enumerate(history_panels, start=1):
    plt.subplot(1, 2, position)
    plt.plot(range(0, len(train_curve)), train_curve, 'b-', linewidth=2, label=train_label)
    plt.plot(range(0, len(val_curve)), val_curve, 'r-', linewidth=2, label=val_label)
    plt.title(panel_title)
    plt.legend(loc='best')

plt.show()


In [None]:
# Restore the weights written by the checkpoint callback (saved only on val_loss improvement).
model.load_weights(model_weights_file_path)

In [None]:
# Run the network on the test images. Despite its name, y_test holds the model's
# predictions, not ground-truth targets.
y_test = model.predict(X_test, batch_size=batch_size)

In [None]:
# 4x2 grid: each row shows one test image (left) next to the network's output (right).
# Indices 0, 2, 4 and 6 are displayed.
test_panels = (
    (X_test, 'Noisy image: {}'),
    (y_test, 'Denoised by autoencoder: {}'),
)

plt.figure(figsize=(15, 25))
for i in range(0, 8, 2):
    for offset, (images, caption) in enumerate(test_panels, start=1):
        plt.subplot(4, 2, i + offset)
        plt.xticks([])
        plt.yticks([])
        plt.imshow(images[i][:, :, 0], cmap='gray')
        plt.title(caption.format(test_img[i]))

plt.show()

In [None]:
# Flatten every prediction into (id, value) rows, where id is "<image>_<row>_<col>"
# with 1-based row/column numbers.
ids = []
vals = []
# test_img and y_test are in the same order: both derive from the sorted listing
# of the extracted test folder.
for i, f in enumerate(test_img):
    file = os.path.join(path, 'test', f)
    # Drop the extension whatever its length, rather than assuming three characters.
    imgid = int(os.path.splitext(f)[0])

    # The original file is re-read (as grayscale) only to recover its native size.
    img = cv2.imread(file, 0)
    # cv2.imread returns None on failure instead of raising.
    if img is None:
        raise OSError('Could not read test image: {}'.format(file))
    img_shape = img.shape

    # Bring the prediction back to the native size; cv2.resize takes (width, height).
    preds_reshaped = cv2.resize(y_test[i], (img_shape[1], img_shape[0]))
    # Force a plain (rows, cols) layout in case a channel axis is still present.
    preds_reshaped = preds_reshaped.reshape(img_shape)

    # Row-major order: rows outermost, columns innermost, matching ravel() below.
    ids.extend(
        '{}_{}_{}'.format(imgid, r, c)
        for r in range(1, img_shape[0] + 1)
        for c in range(1, img_shape[1] + 1)
    )
    # One bulk flatten replaces per-pixel indexing from nested Python loops.
    vals.extend(preds_reshaped.ravel())

In [None]:
# Assemble the two submission columns and write them out without the index column.
submission_columns = dict(id=ids, value=vals)
submission = pd.DataFrame(submission_columns)
submission.to_csv('submission.csv', index=False)

In [None]:
# Read the written CSV back and display its first five rows as a sanity check.
my_submission = pd.read_csv('submission.csv')
my_submission.head(5)