<div style="padding:20px;color:black;margin:0;font-size:200%;text-align:center;display:fill;border-radius:5px;background-color:#E7B10A;overflow:hidden;font-weight:500">Denoising Dirty Documents 🗎🗎🗎  Using Autoencoders  </div>





<div id='imports'
     style = "display: fill;
              border-radius: 10px;
              background-color: #CB1C8D;">
    <h2 style = "padding: 15px; 
                 color: White;
                 text-align: left;
                 font-family: Trebuchet MS;">Table of Contents
    </h2>
</div>


- [1. Introduction](#1)
- [2. Data Preprocessing](#2)
- [3. Model Building](#3)
- [4. Prediction ](#4)
- [5. Submission  ](#5)

<img src="https://media.giphy.com/media/v1.Y2lkPTc5MGI3NjExNzU0ZDQyODc3MGI2MzgwOGU0YjMyNGM5M2I4Njc4YmIxMTUxZDRjYSZjdD1n/OhLrbM4CqP8dr7DpIo/giphy.gif">

<div id='1'
     style = "display: fill;
              border-radius: 10px;
              background-color: #CB1C8D;">
    <h2 style = "padding: 15px; 
                 color: White;
                 text-align: left;
                 font-family: Trebuchet MS;">Introduction
    </h2>
</div>

<div id='load_data'
     style = "display: fill;
              border-radius: 10px;
              background-color: #425F57;">
    <h3 style = "padding: 15px; 
                 color: White;
                 text-align: left;
                 font-family: Trebuchet MS;">Importing Libraries 
    </h3>
</div>

In [None]:
import os
import shutil
import warnings
import zipfile

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Conv2D, MaxPooling2D, UpSampling2D, Dropout, BatchNormalization, Input
from tensorflow.keras.models import Model
%matplotlib inline
warnings.filterwarnings('ignore')

<div id='load_data'
     style = "display: fill;
              border-radius: 10px;
              background-color: #425F57;">
    <h3 style = "padding: 15px; 
                 color: White;
                 text-align: left;
                 font-family: Trebuchet MS;">Importing Data 
    </h3>
</div>

In [None]:
import os

# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

In [None]:
# Location of the zipped competition data and of the writable working directory
path_zip = '/kaggle/input/denoising-dirty-documents/'
path = '/kaggle/working/'

# Extract every competition archive into the working directory, in order
for archive_name in ('train.zip', 'test.zip', 'train_cleaned.zip', 'sampleSubmission.csv.zip'):
    with zipfile.ZipFile(path_zip + archive_name, 'r') as archive:
        archive.extractall(path)
    

In [None]:
# Keep the sorted file names of each extracted folder; they are used later for
# plot titles and for building the submission ids.
train_img, train_cleaned_img, test_img = (
    sorted(os.listdir(path + folder))
    for folder in ('/train', '/train_cleaned', '/test')
)

<div id='2'
     style = "display: fill;
              border-radius: 10px;
              background-color: #CB1C8D;">
    <h2 style = "padding: 15px; 
                 color: White;
                 text-align: left;
                 font-family: Trebuchet MS;">Data Preprocessing
    </h2>
</div>

In [None]:
# prepare function
def process_image(path):
    """Load one image and convert it to the autoencoder's input format.

    The image is read with OpenCV, converted to float32, resized to
    540x420 (width x height), converted from BGR to grayscale, scaled by
    1/255 and reshaped to a single-channel array.

    Args:
        path: Path of the image file to load.

    Returns:
        A float32 numpy array of shape (420, 540, 1).

    Raises:
        OSError: If OpenCV cannot read the file at ``path``.
    """
    img = cv2.imread(path)
    if img is None:
        # cv2.imread does not raise on a missing/unreadable file, it returns
        # None; without this check the failure only surfaces later as an
        # unrelated error inside cv2.resize.
        raise OSError('Could not read image: {}'.format(path))

    img = np.asarray(img, dtype="float32")
    # cv2.resize expects (width, height), i.e. the reverse of the array shape
    img = cv2.resize(img, (540, 420))
    img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    img = img / 255.0
    # add the trailing channel axis expected by the Conv2D input layer
    img = np.reshape(img, (420, 540, 1))

    return img

In [None]:
# Run every file of each extracted folder through process_image, visiting the
# files in sorted name order so noisy and cleaned images stay aligned by index.
train = [
    process_image(path + 'train/' + name)
    for name in sorted(os.listdir(path + 'train/'))
]

train_cleaned = [
    process_image(path + 'train_cleaned/' + name)
    for name in sorted(os.listdir(path + 'train_cleaned/'))
]

test = [
    process_image(path + 'test/' + name)
    for name in sorted(os.listdir(path + 'test/'))
]

<div id='load_data'
     style = "display: fill;
              border-radius: 10px;
              background-color: #425F57;">
    <h3 style = "padding: 15px; 
                 color: White;
                 text-align: left;
                 font-family: Trebuchet MS;">Sample Images 
    </h3>
</div>

In [None]:
# Show four noisy/clean training pairs side by side (left: noisy, right: clean).
plt.figure(figsize=(15, 25))

for i in range(0, 8, 2):
    # i is both the image index and the offset of the pair in the 4x2 grid
    pair = (
        (train[i], 'Noise image: {}'),
        (train_cleaned[i], 'Denoised image: {}'),
    )
    for offset, (image, caption) in enumerate(pair, start=1):
        plt.subplot(4, 2, i + offset)
        plt.xticks([])
        plt.yticks([])
        plt.imshow(image[:, :, 0], cmap='gray')
        plt.title(caption.format(train_img[i]))

<div id='load_data'
     style = "display: fill;
              border-radius: 10px;
              background-color: #425F57;">
    <h3 style = "padding: 15px; 
                 color: White;
                 text-align: left;
                 font-family: Trebuchet MS;">Splitting Data 
    </h3>
</div>

In [None]:
# Stack the per-image arrays into batched numpy arrays of shape (N, 420, 540, 1)
X_train = np.asarray(train)
Y_train = np.asarray(train_cleaned)
X_test = np.asarray(test)

# Hold out 20% of the training pairs for validation. random_state is fixed so
# the split (and therefore the reported validation metrics) is the same on
# every Restart & Run All instead of changing from run to run.
X_train, X_val, Y_train, Y_val = train_test_split(
    X_train, Y_train, test_size=0.2, random_state=42
)

<div id='3'
     style = "display: fill;
              border-radius: 10px;
              background-color: #CB1C8D;">
    <h2 style = "padding: 15px; 
                 color: White;
                 text-align: left;
                 font-family: Trebuchet MS;">Model Building
    </h2>
</div>

The model is deliberately simple, and the model summary below is a good way to see how the layer sizes change: 420x540 -> 210x270 (encoder, after max-pooling) --> 210x270 -> 420x540 (decoder, after upsampling).

Adam is used as the optimizer (it worked best among the optimizers tried), the loss is the mean squared error, and the mean absolute error is tracked as an additional metric.

In [None]:
def build_model():
    """Build and compile the convolutional denoising autoencoder.

    Architecture (all convolutions are 3x3 with 'same' padding):
      * encoder: Conv(64) -> Conv(128) -> BatchNorm -> MaxPool 2x2
        (420x540 -> 210x270) -> Dropout(0.5)
      * decoder: Conv(128) -> Conv(64) -> BatchNorm -> UpSampling 2x2
        (210x270 -> 420x540) -> Conv(1, sigmoid)

    Returns:
        A Keras ``Model`` mapping a (420, 540, 1) input to a (420, 540, 1)
        output, compiled with the Adam optimizer, mean-squared-error loss
        and mean absolute error as metric.
    """
    input_layer = Input(shape=(420, 540, 1))

    # encoding
    x = Conv2D(64, (3, 3), activation='relu', padding='same')(input_layer)
    x = Conv2D(128, (3, 3), activation='relu', padding='same')(x)
    x = BatchNormalization()(x)
    x = MaxPooling2D((2, 2), padding='same')(x)
    x = Dropout(0.5)(x)

    # decoding
    x = Conv2D(128, (3, 3), activation='relu', padding='same')(x)
    x = Conv2D(64, (3, 3), activation='relu', padding='same')(x)
    x = BatchNormalization()(x)
    x = UpSampling2D((2, 2))(x)

    # sigmoid keeps the output in [0, 1], matching the /255 scaled targets
    output_layer = Conv2D(1, (3, 3), activation='sigmoid', padding='same')(x)

    autoencoder = Model(inputs=[input_layer], outputs=[output_layer])
    autoencoder.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])

    return autoencoder


# The builder has its own name so that assigning the instance to `model` does
# not overwrite the function (previously `model = model()` replaced the builder
# with its result, so a fresh network could not be created without re-running
# the definition).
model = build_model()
model.summary()

In [None]:
# Stop when the validation loss has not improved for 5 consecutive epochs and
# restore the weights of the best epoch. The previous setting (monitor='loss',
# patience=30) could never trigger within 20 epochs and watched the training
# loss, which says nothing about over-fitting.
callback = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
# The number of epochs is kept small for runtime; increase it for a better fit
history = model.fit(
    X_train,
    Y_train,
    validation_data=(X_val, Y_val),
    epochs=20,
    batch_size=24,
    verbose=0,
    callbacks=[callback],
)


<div id='load_data'
     style = "display: fill;
              border-radius: 10px;
              background-color: #425F57;">
    <h3 style = "padding: 15px; 
                 color: White;
                 text-align: left;
                 font-family: Trebuchet MS;">Plot error evolution on epochs 
    </h3>
</div>

In [None]:
# Per-epoch training curves recorded by model.fit
epoch_loss = history.history['loss']
epoch_val_loss = history.history['val_loss']
epoch_mae = history.history['mae']
epoch_val_mae = history.history['val_mae']

# One panel per metric: (train curve, validation curve, train label, val label, title)
panels = (
    (epoch_loss, epoch_val_loss, 'Train Loss', 'Val loss',
     'Evolution of loss on train & validation datasets over epochs'),
    (epoch_mae, epoch_val_mae, 'Train MAE', 'Val MAE',
     'Evolution of MAE on train & validation datasets over epochs'),
)

plt.figure(figsize=(20, 6))
for position, (train_curve, val_curve, train_label, val_label, heading) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.plot(range(0, len(train_curve)), train_curve, 'b-', linewidth=2, label=train_label)
    plt.plot(range(0, len(val_curve)), val_curve, 'r-', linewidth=2, label=val_label)
    plt.title(heading)
    plt.legend(loc='best')

plt.show()


<div id='4'
     style = "display: fill;
              border-radius: 10px;
              background-color: #CB1C8D;">
    <h2 style = "padding: 15px; 
                 color: White;
                 text-align: left;
                 font-family: Trebuchet MS;">Prediction
    </h2>
</div>

In [None]:
# Denoise all test images with the trained autoencoder
Y_test = model.predict(X_test, batch_size=16)

# Show four test images next to their denoised predictions
# (left: network input, right: network output).
plt.figure(figsize=(15, 25))
for i in range(0, 8, 2):
    # i is both the image index and the offset of the pair in the 4x2 grid
    pair = (
        (X_test[i], 'Noisy image: {}'),
        (Y_test[i], 'Denoised by autoencoder: {}'),
    )
    for offset, (image, caption) in enumerate(pair, start=1):
        plt.subplot(4, 2, i + offset)
        plt.xticks([])
        plt.yticks([])
        plt.imshow(image[:, :, 0], cmap='gray')
        plt.title(caption.format(test_img[i]))

plt.show()

<div id='5'
     style = "display: fill;
              border-radius: 10px;
              background-color: #CB1C8D;">
    <h2 style = "padding: 15px; 
                 color: White;
                 text-align: left;
                 font-family: Trebuchet MS;">Submission
    </h2>
</div>

In [None]:
# Build the submission: one row per pixel of every test image, with
# id "<image>_<row>_<col>" (1-based) and value = predicted intensity.
# it will take a while!
ids = []
vals = []
for i, f in enumerate(test_img):
    file = path + 'test/' + f
    # Strip the extension whatever its length (f[:-4] assumed exactly 4 chars)
    imgid = int(os.path.splitext(f)[0])
    img = cv2.imread(file, 0)
    if img is None:
        # cv2.imread returns None instead of raising when a file is unreadable
        raise OSError('Could not read test image: {}'.format(file))
    img_shape = img.shape
    # print('Processing image: {} \tinto size: {}'.format(f, img_shape))    # uncomment to see progress

    # Predictions were made at 420x540; resize them back to the original image
    # size (cv2.resize takes (width, height)).
    preds_reshaped = cv2.resize(Y_test[i], (img_shape[1], img_shape[0]))

    # Row-major ids (row outer, column inner) match the row-major order of the
    # flattened prediction below. Building them in one comprehension avoids
    # millions of individual list.append calls.
    row_labels = [str(r) for r in range(1, img_shape[0] + 1)]
    col_labels = [str(c) for c in range(1, img_shape[1] + 1)]
    prefix = str(imgid) + '_'
    ids.extend(prefix + r + '_' + c for r in row_labels for c in col_labels)
    vals.append(preds_reshaped.reshape(-1))

# Concatenate the per-image value arrays once (guarding the no-image case)
values = np.concatenate(vals) if vals else np.empty(0, dtype='float32')

submission = pd.DataFrame({'id': ids, 'value': values})
submission.to_csv('submission.csv', index=False)

print('Results saved to submission.csv!')

# quick check that the number of ids is OK
# the expected number here is 14230080
print('Length of IDs: {}'.format(len(ids)))

In [None]:
# Read the CSV back from disk and preview its first five rows
my_submission = pd.read_csv('submission.csv')
my_submission.head(n=5)

In [None]:
# cleanup working directory: remove the extracted image folders.
# Folders that are already gone are skipped, so re-running this cell does not
# fail with FileNotFoundError.
for folder in ('train/', 'test/', 'train_cleaned/'):
    extracted_dir = path + folder
    if os.path.isdir(extracted_dir):
        shutil.rmtree(extracted_dir)

<img src="https://media.tenor.com/WrG5Ar-qk5QAAAAC/upvote-give-this-man.gif">