In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input tree and print the full path of every file found in it.
for folder_path, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder_path, file_name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Unzip the archives of the "denoising-dirty-documents" dataset
# (train, train_cleaned and test) so their images can be read from disk.
import zipfile
import os 

# Archive names (without the ".zip" suffix) located under `base_path`.
train_dataset = "train"
clean_dataset = "train_cleaned"
test_dataset = "test"

# Directory that holds the zip archives named above.
base_path = "/kaggle/input/denoising-dirty-documents/"

def extract_files(base_path,dataset):
    """Unpack `<base_path>/<dataset>.zip` into the current working directory.

    Args:
        base_path: directory that contains the archive.
        dataset: archive name without the ".zip" suffix.
    """
    archive_path = os.path.join(base_path, dataset) + ".zip"
    with zipfile.ZipFile(archive_path, mode="r") as archive:
        archive.extractall(path=".")

In [None]:
# Unpack each of the three archives into the current directory.
for archive_name in (train_dataset, clean_dataset, test_dataset):
    extract_files(base_path, archive_name)

In [None]:
from keras.layers import Input, Dense, Conv2D, MaxPooling2D, UpSampling2D, Conv2DTranspose
from keras.models import Model
from keras.preprocessing.image import ImageDataGenerator
import numpy as np

In [None]:
import os 

In [None]:
# flow_from_directory reads images from per-class sub-folders, so the PNGs of
# every image set are moved into a single "text" sub-folder.
base_dir = "/kaggle/working/"
for folder in ["test", "train", "train_cleaned"]:
    mv_dir = os.path.join(base_dir, folder)
    txt_dir = os.path.join(base_dir, folder, "text")
    # exist_ok keeps the cell re-runnable once the folders exist.
    os.makedirs(txt_dir, exist_ok=True)
    # Pure-Python move: no shell interpolation of paths, and a re-run with
    # nothing left to move is a silent no-op instead of an `mv` error.
    for file_name in sorted(os.listdir(mv_dir)):
        if file_name.endswith(".png") and not file_name.startswith("."):
            os.replace(os.path.join(mv_dir, file_name),
                       os.path.join(txt_dir, file_name))

In [None]:
from pathlib import Path

# Count the PNG files sitting in the training "text" folder.
dir_to_scan = "/kaggle/working/train/text"
data_dir = Path(dir_to_scan)
image_count = sum(1 for _png in data_dir.glob('*.png'))
image_count

In [None]:
# Number of images per generator batch.
BATCH_SIZE = 32
# Size every image is resized to when loaded; also the model's input size.
IMG_HEIGHT = 420
IMG_WIDTH = 540

In [None]:
# One generator per image set; each multiplies pixel values by 1/255.
train_datagen, valid_datagen, test_datagen = (
    ImageDataGenerator(rescale=1./255) for _unused in range(3)
)

In [None]:
# Batches of grayscale images from /kaggle/working/train, resized to
# IMG_HEIGHT x IMG_WIDTH.
# NOTE(review): class_mode='input' makes each batch's target identical to its
# input, so this generator yields (noisy, noisy) pairs, not (noisy, cleaned).
train_generator = train_datagen.flow_from_directory(
    directory="/kaggle/working/train",
    target_size=(IMG_HEIGHT, IMG_WIDTH),
    color_mode='grayscale',
    class_mode='input',
    batch_size=BATCH_SIZE,
)

In [None]:
# Batches of grayscale images from /kaggle/working/train_cleaned, resized to
# IMG_HEIGHT x IMG_WIDTH; class_mode='input' makes each target a copy of its
# input image.
validation_generator = valid_datagen.flow_from_directory(
    directory="/kaggle/working/train_cleaned",
    target_size=(IMG_HEIGHT, IMG_WIDTH),
    color_mode='grayscale',
    class_mode='input',
    batch_size=BATCH_SIZE,
)

In [None]:
# Un-shuffled batches of grayscale test images, resized to
# IMG_HEIGHT x IMG_WIDTH. shuffle=False keeps the batch order fixed so that
# predictions can be matched back to files by position.
test_generator = test_datagen.flow_from_directory(
    directory="/kaggle/working/test",
    target_size=(IMG_HEIGHT, IMG_WIDTH),
    color_mode='grayscale',
    class_mode='input',
    batch_size=BATCH_SIZE,
    shuffle=False,
)

In [None]:
def build_encoder():
    """Build the convolutional autoencoder (encoder and decoder, despite the name).

    The input is an (IMG_HEIGHT, IMG_WIDTH, 1) image. Two 3x3 convolution +
    2x2 max-pooling stages form the encoder; two stride-2 transposed
    convolutions followed by a single-filter sigmoid transposed convolution
    form the decoder.

    Returns:
        An uncompiled Model mapping the input image to the decoded image.
    """
    conv_kwargs = dict(kernel_size=(3, 3), activation='relu', padding='same')
    upsample_kwargs = dict(kernel_size=3, strides=(2, 2), padding="same", activation='relu')

    input_img = Input(shape=(IMG_HEIGHT, IMG_WIDTH, 1))

    # Encoder: conv -> pool, twice.
    features = input_img
    for _stage in range(2):
        features = Conv2D(filters=32, **conv_kwargs)(features)
        features = MaxPooling2D(pool_size=(2, 2), padding='same')(features)

    # Decoder: two stride-2 transposed convolutions, then the output layer.
    restored = features
    for _stage in range(2):
        restored = Conv2DTranspose(filters=32, **upsample_kwargs)(restored)
    decoded = Conv2DTranspose(filters=1, kernel_size=3, strides=(1, 1),
                              padding="same", activation='sigmoid')(restored)

    return Model(inputs=input_img, outputs=decoded)

In [None]:
from keras.optimizers import Adam, SGD
from tensorflow.keras.callbacks import ReduceLROnPlateau

In [None]:
autoencoder = build_encoder()
# min_lr is the floor for the reduced learning rate. It has to be below the
# optimizer's starting rate (0.001); with min_lr equal to the starting rate
# the callback could never lower the learning rate.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2,
                              patience=5, min_lr=1e-6)
autoencoder.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mse'])
autoencoder.summary()

In [None]:
def _load_image_stack(datagen, directory):
    """Read every image under `directory` into one array, un-shuffled.

    Args:
        datagen: ImageDataGenerator used to load and rescale the images.
        directory: folder passed to `flow_from_directory`.

    Returns:
        (images, filenames): all batches concatenated along the first axis,
        and the iterator's file list in the same order.
    """
    flow = datagen.flow_from_directory(directory=directory,
                                       batch_size=BATCH_SIZE,
                                       shuffle=False,
                                       target_size=(IMG_HEIGHT, IMG_WIDTH),
                                       color_mode='grayscale',
                                       class_mode='input')
    # class_mode='input' yields (inputs, targets); only the inputs are needed.
    images = np.concatenate([flow[batch_idx][0] for batch_idx in range(len(flow))])
    return images, list(flow.filenames)


# A denoiser has to be fitted on (noisy, cleaned) pairs. Feeding a
# class_mode='input' generator as `x` only teaches the network to reproduce
# its own input, so the noisy scans and their cleaned counterparts are loaded
# as two aligned arrays instead.
noisy_pages, noisy_names = _load_image_stack(train_datagen, "/kaggle/working/train")
clean_pages, clean_names = _load_image_stack(train_datagen, "/kaggle/working/train_cleaned")
if noisy_names != clean_names:
    raise ValueError("train and train_cleaned must contain the same file names")

history = autoencoder.fit(
      x=noisy_pages,
      y=clean_pages,
      batch_size=BATCH_SIZE,
      epochs=100,
      # Hold out the last 10% of the pairs so val_loss (monitored by
      # reduce_lr) measures denoising quality on unseen pages.
      validation_split=0.1,
      callbacks=[reduce_lr])

In [None]:
import matplotlib.pyplot as plt

In [None]:
def plot_graphs(history, string):
    """Plot a training metric and its validation counterpart against epochs.

    Args:
        history: object whose `.history` dict holds the per-epoch values.
        string: metric key; the validation curve is read from 'val_' + string.
    """
    val_key = 'val_' + string
    for curve_key in (string, val_key):
        plt.plot(history.history[curve_key])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, val_key])
    plt.show()

In [None]:
# Training vs. validation MSE per epoch.
plot_graphs(history, 'mse')

In [None]:
# Training vs. validation loss per epoch.
plot_graphs(history, 'loss')

In [None]:
# Run the autoencoder over every batch of the test generator.
preds = autoencoder.predict(test_generator)

In [None]:
def plot_preds(test, preds,n=1):
    """Show input images (top row) above their reconstructions (bottom row).

    Args:
        test: a single image or a batch of at least `n` images. Singleton
            axes (such as a trailing channel axis of size 1) are dropped.
        preds: the matching prediction(s), laid out like `test`.
        n: number of image/prediction columns to draw.

    Raises:
        ValueError: if an argument is not a 2-D image or a stack of 2-D
            images after squeezing, or holds fewer than `n` images.
    """
    def _as_stack(images):
        # Drop singleton axes, then guarantee a leading image axis so that a
        # single (H, W) image and an (n, H, W) batch are handled alike.
        stack = np.squeeze(np.asarray(images))
        if stack.ndim == 2:
            stack = stack[np.newaxis, ...]
        if stack.ndim != 3:
            raise ValueError("expected a 2-D image or a stack of 2-D images")
        return stack

    test_stack = _as_stack(test)
    pred_stack = _as_stack(preds)
    if n > len(test_stack) or n > len(pred_stack):
        raise ValueError("n exceeds the number of images supplied")

    plt.figure(figsize=(30, 20))
    for i in range(n):
        # Each column indexes its own image. Nothing is rescaled inside the
        # loop: imshow maps the data range onto the colormap by default, and
        # an in-loop `* 255.0` would compound on every iteration.
        # display original
        ax = plt.subplot(2, n, i + 1)
        plt.imshow(test_stack[i], cmap='gray')
        ax.get_xaxis().set_visible(False)
        ax.get_yaxis().set_visible(False)

        # display reconstruction
        ax = plt.subplot(2, n, i + 1 + n)
        plt.imshow(pred_stack[i], cmap='gray')
        ax.get_xaxis().set_visible(False)
        ax.get_yaxis().set_visible(False)
    plt.show()

In [None]:
# Shape of the first input image in the first test batch.
test_generator[0][0][0].shape

In [None]:
# Compare the input at index 15 of the first test batch with the prediction at index 15.
plot_preds(test_generator[0][0][15], preds[15],1)

In [None]:
from pathlib import Path

# Sorted list of the PNG paths in the test "text" folder.
dir_to_scan = "/kaggle/working/test/text"
data_dir = Path(dir_to_scan)
test_imgs = sorted(data_dir.glob('*.png'))

In [None]:
from matplotlib import image
import cv2

In [None]:
# Dry run of the resize step on the first test image.
first_test_path = str(test_imgs[0])
im = image.imread(first_test_path)
(height, width) = im.shape
# cv2.resize takes the target size as (width, height).
end_im = cv2.resize(preds[0], dsize=(width, height))
(rows, cols) = end_im.shape

In [None]:
# Write the submission file: a header, then one "<image>_<row>_<col>,<value>"
# line per pixel with 1-based row and column numbers.
with open("submissions.csv", "w") as file1:
    file1.write("id,value\n")
    for i, test_img in enumerate(test_imgs):
        # Named `image_id` rather than `id`: a top-level `id` would shadow
        # the builtin for the rest of the kernel session.
        image_id = test_img.stem
        im = image.imread(str(test_img))
        # Only the first two axes are needed, so images that carry a channel
        # axis do not break the unpacking.
        height, width = im.shape[:2]
        # Predictions have the fixed network size; resize each one back to
        # its source image's size (cv2 takes (width, height)).
        end_im = cv2.resize(preds[i], (width, height))
        for row, row_values in enumerate(end_im, start=1):
            # One writelines call per image row instead of one write per pixel.
            file1.writelines(
                "{}_{}_{},{}\n".format(image_id, row, col, value)
                for col, value in enumerate(row_values, start=1)
            )

In [None]:
# Count the lines of the submission file (the header line is included).
!cat submissions.csv | wc -l 

In [None]:
# Show the first lines of the submission file.
!head submissions.csv

In [None]:
import pandas as pd

# Load the submission file back into a DataFrame for inspection.
df = pd.read_csv('submissions.csv')

In [None]:
# Look up the row whose id is '100_1_1'.
df.loc[df['id'] == '100_1_1']