In [None]:
import os
import zipfile, cv2
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras import layers, callbacks
import imgaug as ia
from imgaug import augmenters as iaa

# Print the full path of every file found under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# Notebook-wide configuration.
DATA_DIR = '../input/denoising-dirty-documents/'  # folder holding the zipped dataset
path = '/kaggle/working/'                         # folder the archives are extracted into
IMG_SIZE = (420, 540)                             # image size the model works on
BS = 12                                           # batch size used for training

In [None]:
# Unpack every dataset archive into the working directory.
for archive_name in ('train.zip', 'train_cleaned.zip', 'test.zip', 'sampleSubmission.csv.zip'):
    with zipfile.ZipFile(DATA_DIR + archive_name) as zip_file:
        zip_file.extractall(path)

In [None]:
# Sorted file names of each extracted folder; sorting keeps the listing order
# deterministic across the three folders.
train_img, train_cleaned_img, test_img = (
    sorted(os.listdir(path + folder))
    for folder in ('train', 'train_cleaned', 'test')
)

In [None]:
def preprocess_image(path, single_img=False):
    """Load an image file and return it as a normalised single-channel array.

    The image is read with OpenCV, resized to ``IMG_SIZE``, converted to
    grayscale, scaled by 1/255 and given a trailing channel axis.

    Args:
        path: Location of the image file on disk.
        single_img: Kept only for backward compatibility; it no longer changes
            the result. It used to pass ``IMG_SIZE`` to ``cv2.resize``
            unreversed, which produced an image with swapped dimensions that
            the final reshape then scrambled.

    Returns:
        Float array of shape ``(*IMG_SIZE, 1)``.

    Raises:
        FileNotFoundError: If OpenCV cannot read an image from ``path``.
    """
    img = cv2.imread(path)
    if img is None:
        # cv2.imread reports failure by returning None instead of raising.
        raise FileNotFoundError('Could not read image: {}'.format(path))
    # cv2.resize expects (width, height); IMG_SIZE is stored as (height, width).
    img = cv2.resize(img, IMG_SIZE[::-1])
    img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    img = img / 255.0
    return np.reshape(img, (*IMG_SIZE, 1))

# Sanity check: preprocess one training image and display it.
img = preprocess_image(os.path.join(path, 'train', '2.png'))
# The array is single-channel with values in [0, 1], so draw it in gray with a
# fixed range instead of matplotlib's default false-colour, auto-scaled map.
plt.imshow(img.squeeze(axis=2), cmap='gray', vmin=0, vmax=1);

In [None]:
# Preprocess every file of each folder and stack the results into one array
# per folder.
train = np.asarray(
    [preprocess_image(os.path.join(path, 'train', name)) for name in train_img]
)
train_cleaned = np.asarray(
    [preprocess_image(os.path.join(path, 'train_cleaned', name)) for name in train_cleaned_img]
)
test = np.asarray(
    [preprocess_image(os.path.join(path, 'test', name)) for name in test_img]
)

# Show the resulting array shapes as the cell output.
train.shape, train_cleaned.shape, test.shape

In [None]:
### Image augmentation is skipped for now so that a baseline model can be trained quickly

In [None]:
class DenoiseModel(Model):
    """Convolutional encoder/decoder pair used as the denoising autoencoder.

    The encoder applies three 3x3 ReLU convolutions (48, 72, 144 filters),
    batch normalisation, 2x2 max pooling and dropout. The decoder applies the
    mirrored convolutions (144, 72, 48 filters), batch normalisation, 2x
    upsampling and a final single-filter sigmoid convolution.
    """

    def __init__(self):
        super().__init__()

        def conv_stack(filter_counts):
            # One 3x3, 'same'-padded ReLU convolution per requested filter count.
            return [
                layers.Conv2D(n_filters, (3, 3), activation='relu', padding='same')
                for n_filters in filter_counts
            ]

        encoder_layers = [layers.Input(shape=(*IMG_SIZE, 1))]
        encoder_layers += conv_stack((48, 72, 144))
        encoder_layers += [
            layers.BatchNormalization(),
            layers.MaxPooling2D((2, 2), padding='same'),
            layers.Dropout(0.5),
        ]
        self.encoder = tf.keras.Sequential(encoder_layers)

        decoder_layers = conv_stack((144, 72, 48))
        decoder_layers += [
            layers.BatchNormalization(),
            layers.UpSampling2D((2, 2)),
            layers.Conv2D(1, (3, 3), activation='sigmoid', padding='same'),
        ]
        self.decoder = tf.keras.Sequential(decoder_layers)

    def call(self, x):
        """Pass ``x`` through the encoder, then the decoder, and return the result."""
        return self.decoder(self.encoder(x))
    
# Create the model and configure training: Adam optimizer, mean squared error
# as the loss, and the same quantity reported as a metric.
autoencoder = DenoiseModel()
autoencoder.compile(
    optimizer='adam',
    loss='mean_squared_error',
    metrics=['mean_squared_error'],
)

In [None]:
# Early stopping watches the 'loss' value: after 30 epochs without improvement
# training stops and the best weights seen so far are restored.
es = callbacks.EarlyStopping(
    monitor='loss',
    patience=30,
    restore_best_weights=True,
    verbose=1,
)

# Fit noisy inputs against their cleaned targets for at most 200 epochs.
history = autoencoder.fit(
    x=train,
    y=train_cleaned,
    batch_size=BS,
    epochs=200,
    shuffle=True,
    callbacks=[es],
)

In [None]:
# Preprocess one test image and display it before running the model on it.
temp = preprocess_image(os.path.join(path, 'test', '1.png'))
# Single-channel data in [0, 1]: use a gray colormap with a fixed range rather
# than matplotlib's default false-colour, auto-scaled rendering.
plt.imshow(temp.squeeze(axis=2), cmap='gray', vmin=0, vmax=1);

In [None]:
# Run the model on the single image; predict expects a leading batch axis.
temp_out = autoencoder.predict(np.expand_dims(temp, axis=0))
# Drop the batch axis and the channel axis to get a plain 2-D image.
temp_out = temp_out.squeeze(axis=0).squeeze(axis=2)
# The output layer is a sigmoid, so values lie in [0, 1]; draw in gray with a
# fixed range so the picture is not contrast-stretched or false-coloured.
plt.imshow(temp_out, cmap='gray', vmin=0, vmax=1);
# print(temp_out)

### Submission

In [None]:
# Build the submission: one row per pixel, with id '<image>_<row>_<col>'
# (1-based row and column, row-major order) and the predicted pixel value.
frames = []
# Wrap the list itself (not the enumerate object) so tqdm knows the total.
for i, f in enumerate(tqdm(test_img)):
    file = os.path.join(path, 'test', f)
    imgid = int(os.path.splitext(f)[0])

    # The original file is read only to recover its native size.
    img = cv2.imread(file, cv2.IMREAD_GRAYSCALE)
    if img is None:
        # cv2.imread reports failure by returning None instead of raising.
        raise FileNotFoundError('Could not read image: {}'.format(file))
    height, width = img.shape

    # training=False keeps dropout and batch normalisation in inference mode.
    encoded = autoencoder.encoder(test[i:i + 1], training=False)
    decoded_img = np.squeeze(autoencoder.decoder(encoded, training=False).numpy())
    # cv2.resize expects (width, height); scale the prediction back to the
    # size of the original file.
    preds_reshaped = cv2.resize(decoded_img, (width, height))

    # Row and column index of every pixel, flattened in the same row-major
    # order as preds_reshaped.ravel().
    rows, cols = np.indices((height, width)).reshape(2, -1) + 1
    pixel_ids = (
        '{}_'.format(imgid)
        + pd.Series(rows).astype(str)
        + '_'
        + pd.Series(cols).astype(str)
    )
    frames.append(pd.DataFrame({'id': pixel_ids, 'value': preds_reshaped.ravel()}))

if frames:
    submission = pd.concat(frames, ignore_index=True)
else:
    # No test images: still write a file that only contains the header.
    submission = pd.DataFrame({'id': [], 'value': []})

# Keep the original variable names available for any later cell.
ids = submission['id']
vals = submission['value']

print('Length of IDs: {}'.format(len(ids)))
submission.to_csv('submission.csv', index=False)
print('Results saved to submission.csv!')