In [1]:
from matplotlib import pyplot as plt
%matplotlib notebook
import glob
import os
import numpy as np
import ipywidgets as widgets
from ipywidgets import interact, interact_manual
import os
from IPython.display import Image
import json


In [2]:
# Root of the FUNSD dataset. Set the FUNSD_DATASET_DIR environment variable to
# point somewhere else; the fallback is the location this notebook was written
# against.
DATASET_FOLDER = os.environ.get("FUNSD_DATASET_DIR", "/Users/user421/datasets/FUNSD/dataset/")
# Original (misspelled) name, kept so that any cell still using it keeps working.
DATASTE_FOLDER = DATASET_FOLDER

# Directories of the two dataset splits.
train = os.path.join(DATASET_FOLDER, "training_data")
test = os.path.join(DATASET_FOLDER, "testing_data")


def _sorted_files(split_dir, kind):
    """Return the sorted paths of everything directly inside ``split_dir/kind``."""
    return sorted(glob.glob(os.path.join(split_dir, kind, "*")))


# Both listings of a split are sorted, so images and annotations can later be
# paired by position.
train_images = _sorted_files(train, "images")
test_images = _sorted_files(test, "images")

train_annotations = _sorted_files(train, "annotations")
test_annotations = _sorted_files(test, "annotations")

print("Total Train images={}. Total Train annotations={}.".format(len(train_images), len(train_annotations)))
print("Total Test images={}. Total Test annotations={}.".format(len(test_images), len(test_annotations)))

Total Train images=149. Total Train annotations=149.
Total Test images=50. Total Test annotations=50.


In [3]:
class Constants:
    """Named three-channel colour values used when tinting entities."""

    # Each value is a mutable list of three 0-255 channel intensities.
    RED = [255, 0, 0]
    GREEN = [0, 255, 0]
    BLUE = [0, 0, 255]
    CYAN = [0, 255, 255]
    
    
class Box():
    """Axis-aligned rectangle described by its ``xmin, ymin, xmax, ymax`` corners.

    Boxes order by the tuple ``(ymin, xmin, ymax, xmax)``, i.e. by ``ymin``
    first and ``xmin`` second. Both ``<`` and ``>`` are implemented so that
    sorting does not depend on Python's reflected-operator fallback.
    """

    def __init__(self, xmin, ymin, xmax, ymax):
        """Store the four corner coordinates as given."""
        self.xmin, self.ymin, self.xmax, self.ymax = xmin, ymin, xmax, ymax

    @staticmethod
    def _sort_key(box):
        """Return the tuple that defines the ordering of ``box``."""
        return (box.ymin, box.xmin, box.ymax, box.xmax)

    def __gt__(self, rhs):
        """Return True when this box sorts after ``rhs``.

        Returns ``NotImplemented`` when ``rhs`` lacks the coordinate
        attributes, so Python can try the reflected operation or raise
        ``TypeError`` itself.
        """
        try:
            other_key = Box._sort_key(rhs)
        except AttributeError:
            return NotImplemented
        return Box._sort_key(self) > other_key

    def __lt__(self, rhs):
        """Return True when this box sorts before ``rhs`` (mirror of ``__gt__``)."""
        try:
            other_key = Box._sort_key(rhs)
        except AttributeError:
            return NotImplemented
        return Box._sort_key(self) < other_key

    def __repr__(self):
        return f"Box(xmin={self.xmin},ymin={self.ymin}, xmax={self.xmax}, ymax={self.ymax})"

class Word:
    """A piece of text together with the box it occupies.

    ``box`` is an iterable of four coordinates that is unpacked into ``Box``.
    Attributes can also be read with subscript syntax, e.g. ``word["text"]``.
    """

    def __init__(self, box, text):
        self.box = Box(*box)
        self.text = text

    def __getitem__(self, item):
        """Return the attribute named ``item``; raise ``KeyError`` if it is missing."""
        if hasattr(self, item):
            return getattr(self, item)
        raise KeyError(f"{item} is not present in {self.__class__}")

    def __repr__(self):
        return f"Word(box={self.box}, text={self.text})"
        
    
class Entity:
    """One annotated entity of a form.

    Holds an id, a ``Box``, the entity text, its label, the ``Word`` objects
    built from the ``words`` mappings, and the ``linking`` value as passed in.
    Attributes can also be read with subscript syntax, e.g. ``entity["label"]``.
    """

    def __init__(self, id_, box, text, label, words, linking):
        self.id_ = id_
        self.box = Box(*box)
        self.text = text
        self.label = label
        # Each element of ``words`` is a mapping whose keys become Word's keyword arguments.
        built_words = []
        for word_kwargs in words:
            built_words.append(Word(**word_kwargs))
        self.words = built_words
        self.linking = linking

    def __getitem__(self, item):
        """Return the attribute named ``item``; raise ``KeyError`` if it is missing."""
        if hasattr(self, item):
            return getattr(self, item)
        raise KeyError(f"{item} is not present in {self.__class__}")

    def __repr__(self):
        return f"Entity(id_={self.id_}, box={self.box}, text={self.text}, label={self.label}, words={self.words}, linking={self.linking})"
    
class Form():
    """All entities of one annotated form."""

    def __init__(self, entities):
        # Each mapping in ``entities`` supplies Entity's keyword arguments; the
        # entity id is its position in the list.
        self.entities = []
        for idx, entity in enumerate(entities):
            self.entities.append(Entity(id_=idx, **entity))

    def __repr__(self):
        return f"Form(entities={self.entities})"
    


In [4]:
# Bare file names of the training images, in the same order as train_images.
train_image_file_names = [os.path.basename(image_file) for image_file in train_images]
# Directory holding the training images. When the glob matched nothing, fall
# back to the expected location instead of failing with IndexError on
# train_images[0].
train_images_dir = os.path.dirname(train_images[0]) if train_images else os.path.join(train, "images")

# One Form per annotation file, in the same order as train_annotations.
annotations = []
for train_annotation in train_annotations:
    # Read as UTF-8 explicitly so decoding does not depend on the platform's
    # default encoding.
    with open(train_annotation, encoding="utf-8") as fp:
        annotations.append(Form(json.load(fp)["form"]))


In [5]:
# For every form, the words of all its entities flattened into one list
# (entity order first, then word order within each entity).
words_list = [
    [word for entity in annotation.entities for word in entity.words]
    for annotation in annotations
]

In [6]:
# Position (within words_list) of the form whose words are clustered below.
FORM_INDEX = 0

# Words of the selected form, ordered by their boxes (see Box for the ordering).
all_words = sorted(words_list[FORM_INDEX], key=lambda w_: w_.box)

# One [x_center, y_center] row per word, aligned index-for-index with all_words.
# Box width and height were computed here before but never used, so they are
# no longer calculated.
features = [
    [(word.box.xmin + word.box.xmax) / 2, (word.box.ymin + word.box.ymax) / 2]
    for word in all_words
]

In [7]:
from sklearn.cluster.k_means_ import check_random_state, check_array, _check_sample_weight, row_norms, _tolerance
class KmeansClustering(KMeans):
    """KMeans subclass whose ``fit`` runs a mini-batch style optimisation loop.

    NOTE(review): this cell does not run as written.

    * ``KMeans`` is not imported before this class in the visible notebook (the
      import appears in a later cell); the recorded run of this cell ended with
      ``NameError: name 'KMeans' is not defined``.
    * ``fit`` uses ``warnings``, ``_init_centroids``, ``_mini_batch_step``,
      ``_labels_inertia`` and ``_mini_batch_convergence``, none of which is
      imported in the visible notebook.
    * ``fit`` reads ``self.batch_size``, ``self.init_size``,
      ``self.reassignment_ratio`` and ``self.compute_labels`` and calls
      ``self._labels_inertia_minibatch``; none of these is defined in this
      class. Presumably they belong to ``MiniBatchKMeans`` rather than
      ``KMeans`` (the docstring and the warning text below both refer to
      mini-batches) -- confirm the intended base class.

    No later visible cell uses this class; the clustering further down calls
    ``KMeans`` directly.
    """
    
    def __init__(self, *args, **kwargs):
        # Forward every argument unchanged to the parent constructor.
        super(KmeansClustering, self).__init__(*args, **kwargs)
    
    def fit(self, X, y=None, sample_weight=None):
            """Compute the centroids on X by chunking it into mini-batches.

            Parameters
            ----------
            X : array-like or sparse matrix, shape=(n_samples, n_features)
                Training instances to cluster. It must be noted that the data
                will be converted to C ordering, which will cause a memory copy
                if the given data is not C-contiguous.

            y : Ignored
                not used, present here for API consistency by convention.

            sample_weight : array-like, shape (n_samples,), optional
                The weights for each observation in X. If None, all observations
                are assigned equal weight (default: None)

            Returns
            -------
            self
                The estimator itself, with ``cluster_centers_``, ``counts_``,
                ``init_size_`` and ``n_iter_`` set, plus ``labels_`` and
                ``inertia_`` when ``self.compute_labels`` is truthy.

            Raises
            ------
            ValueError
                If X has fewer samples than ``self.n_clusters``.

            """
            random_state = check_random_state(self.random_state)
            X = check_array(X, accept_sparse="csr", order='C',
                            dtype=[np.float64, np.float32])
            n_samples, n_features = X.shape
            if n_samples < self.n_clusters:
                raise ValueError("n_samples=%d should be >= n_clusters=%d"
                                 % (n_samples, self.n_clusters))

            sample_weight = _check_sample_weight(X, sample_weight)

            n_init = self.n_init
            # An array-like `init` is taken as explicit starting centers, so
            # only a single initialisation is performed.
            if hasattr(self.init, '__array__'):
                self.init = np.ascontiguousarray(self.init, dtype=X.dtype)
                if n_init != 1:
                    # NOTE(review): `warnings` is not imported in the visible
                    # notebook.
                    warnings.warn(
                        'Explicit initial center position passed: '
                        'performing only one init in MiniBatchKMeans instead of '
                        'n_init=%d'
                        % self.n_init, RuntimeWarning, stacklevel=2)
                    n_init = 1

            x_squared_norms = row_norms(X, squared=True)

            if self.tol > 0.0:
                tol = _tolerance(X, self.tol)

                # using tol-based early stopping needs the allocation of a
                # dedicated buffer which can be expensive for high dim data:
                # hence we allocate it outside of the main loop
                old_center_buffer = np.zeros(n_features, dtype=X.dtype)
            else:
                tol = 0.0
                # no need for the center buffer if tol-based early stopping is
                # disabled
                old_center_buffer = np.zeros(0, dtype=X.dtype)

            # NOTE(review): `batch_size`, `init_size`, `reassignment_ratio` and
            # `compute_labels` are read from self below but are not defined in
            # this class -- confirm the base class provides them.
            distances = np.zeros(self.batch_size, dtype=X.dtype)
            n_batches = int(np.ceil(float(n_samples) / self.batch_size))
            n_iter = int(self.max_iter * n_batches)

            # Default the init sample to three batches, capped at n_samples.
            init_size = self.init_size
            if init_size is None:
                init_size = 3 * self.batch_size
            if init_size > n_samples:
                init_size = n_samples
            self.init_size_ = init_size

            # Random subset (drawn with replacement) shared by all inits so
            # their inertias can be compared.
            validation_indices = random_state.randint(0, n_samples, init_size)
            X_valid = X[validation_indices]
            sample_weight_valid = sample_weight[validation_indices]
            x_squared_norms_valid = x_squared_norms[validation_indices]

            # perform several inits with random sub-sets
            best_inertia = None
            for init_idx in range(n_init):
                if self.verbose:
                    print("Init %d/%d with method: %s"
                          % (init_idx + 1, n_init, self.init))
                weight_sums = np.zeros(self.n_clusters, dtype=sample_weight.dtype)

                # TODO: once the `k_means` function works with sparse input we
                # should refactor the following init to use it instead.

                # Initialize the centers using only a fraction of the data as we
                # expect n_samples to be very large when using MiniBatchKMeans
                # NOTE(review): `_init_centroids`, `_mini_batch_step` and
                # `_labels_inertia` are not imported in the visible notebook.
                cluster_centers = _init_centroids(
                    X, self.n_clusters, self.init,
                    random_state=random_state,
                    x_squared_norms=x_squared_norms,
                    init_size=init_size)

                # Compute the label assignment on the init dataset
                _mini_batch_step(
                    X_valid, sample_weight_valid,
                    x_squared_norms[validation_indices], cluster_centers,
                    weight_sums, old_center_buffer, False, distances=None,
                    verbose=self.verbose)

                # Keep only the best cluster centers across independent inits on
                # the common validation set
                _, inertia = _labels_inertia(X_valid, sample_weight_valid,
                                             x_squared_norms_valid,
                                             cluster_centers)
                if self.verbose:
                    print("Inertia for init %d/%d: %f"
                          % (init_idx + 1, n_init, inertia))
                if best_inertia is None or inertia < best_inertia:
                    self.cluster_centers_ = cluster_centers
                    self.counts_ = weight_sums
                    best_inertia = inertia

            # Empty context to be used inplace by the convergence check routine
            convergence_context = {}

            # Perform the iterative optimization until the final convergence
            # criterion
            for iteration_idx in range(n_iter):
                # Sample a minibatch from the full dataset
                minibatch_indices = random_state.randint(
                    0, n_samples, self.batch_size)

                # Perform the actual update step on the minibatch data
                batch_inertia, centers_squared_diff = _mini_batch_step(
                    X[minibatch_indices], sample_weight[minibatch_indices],
                    x_squared_norms[minibatch_indices],
                    self.cluster_centers_, self.counts_,
                    old_center_buffer, tol > 0.0, distances=distances,
                    # Here we randomly choose whether to perform
                    # random reassignment: the choice is done as a function
                    # of the iteration index, and the minimum number of
                    # counts, in order to force this reassignment to happen
                    # every once in a while
                    random_reassign=((iteration_idx + 1)
                                     % (10 + int(self.counts_.min())) == 0),
                    random_state=random_state,
                    reassignment_ratio=self.reassignment_ratio,
                    verbose=self.verbose)

                # Monitor convergence and do early stopping if necessary
                # NOTE(review): `_mini_batch_convergence` is not imported in the
                # visible notebook.
                if _mini_batch_convergence(
                        self, iteration_idx, n_iter, tol, n_samples,
                        centers_squared_diff, batch_inertia, convergence_context,
                        verbose=self.verbose):
                    break

            # NOTE(review): if n_iter is 0 the loop above never runs and
            # `iteration_idx` is unbound here.
            self.n_iter_ = iteration_idx + 1

            if self.compute_labels:
                # NOTE(review): `_labels_inertia_minibatch` is not defined in
                # this class -- confirm the base class provides it.
                self.labels_, self.inertia_ = \
                        self._labels_inertia_minibatch(X, sample_weight)

            return self        

NameError: name 'KMeans' is not defined

In [10]:
from sklearn.cluster import KMeans
# Cluster the per-word centre points in `features` into 20 groups with a fixed
# random seed; `kmeans.labels_` is read by the grouping cell that follows.
kmeans = KMeans(
    n_clusters=20,
    n_init=50,
    max_iter=1000,
    tol=1e-6,
    algorithm="elkan",
    random_state=0,
).fit(features)

In [11]:
# Map each cluster label to the "<text> <box>" descriptions of the words
# assigned to it, in the order the words appear in all_words.
cluster_groupings = {}

for idx, label in enumerate(kmeans.labels_):
    member = all_words[idx]
    cluster_groupings.setdefault(label, []).append(member.text + " " + str(member.box))

In [12]:
# Display the sorted words that were turned into clustering features.
all_words

[Word(box=Box(xmin=292,ymin=91, xmax=376, ymax=175), text=R&D),
 Word(box=Box(xmin=341,ymin=201, xmax=418, ymax=211), text=IMPROVEMENT),
 Word(box=Box(xmin=257,ymin=203, xmax=279, ymax=214), text=R&D),
 Word(box=Box(xmin=285,ymin=203, xmax=334, ymax=216), text=QUALITY),
 Word(box=Box(xmin=331,ymin=214, xmax=387, ymax=228), text=SOLUTION),
 Word(box=Box(xmin=256,ymin=215, xmax=324, ymax=229), text=SUGGESTION/),
 Word(box=Box(xmin=395,ymin=215, xmax=423, ymax=228), text=FORM),
 Word(box=Box(xmin=324,ymin=216, xmax=332, ymax=230), text=),
 Word(box=Box(xmin=560,ymin=264, xmax=575, ymax=279), text=3/),
 Word(box=Box(xmin=575,ymin=264, xmax=590, ymax=279), text=92),
 Word(box=Box(xmin=543,ymin=265, xmax=560, ymax=279), text=9/),
 Word(box=Box(xmin=482,ymin=268, xmax=518, ymax=282), text=Date:),
 Word(box=Box(xmin=390,ymin=271, xmax=451, ymax=282), text=Martinez),
 Word(box=Box(xmin=169,ymin=272, xmax=196, ymax=287), text=Ext.),
 Word(box=Box(xmin=215,ymin=272, xmax=230, ymax=287), text=M.),

In [87]:
# Display, per cluster label, the words (text and box) grouped under it.
cluster_groupings

{18: ['R&D Box(xmin=292,ymin=91, xmax=376, ymax=175)',
  'IMPROVEMENT Box(xmin=341,ymin=201, xmax=418, ymax=211)',
  'R&D Box(xmin=257,ymin=203, xmax=279, ymax=214)',
  'QUALITY Box(xmin=285,ymin=203, xmax=334, ymax=216)',
  'SOLUTION Box(xmin=331,ymin=214, xmax=387, ymax=228)',
  'SUGGESTION/ Box(xmin=256,ymin=215, xmax=324, ymax=229)',
  'FORM Box(xmin=395,ymin=215, xmax=423, ymax=228)',
  ' Box(xmin=324,ymin=216, xmax=332, ymax=230)'],
 7: ['3/ Box(xmin=560,ymin=264, xmax=575, ymax=279)',
  '92 Box(xmin=575,ymin=264, xmax=590, ymax=279)',
  '9/ Box(xmin=543,ymin=265, xmax=560, ymax=279)',
  'Date: Box(xmin=482,ymin=268, xmax=518, ymax=282)',
  'Licensee Box(xmin=511,ymin=309, xmax=570, ymax=323)'],
 1: ['Martinez Box(xmin=390,ymin=271, xmax=451, ymax=282)',
  'P. Box(xmin=293,ymin=272, xmax=307, ymax=286)',
  'P. Box(xmin=370,ymin=272, xmax=384, ymax=285)',
  'Harper, Box(xmin=314,ymin=274, xmax=363, ymax=285)',
  'Wigand Box(xmin=278,ymin=313, xmax=327, ymax=327)'],
 5: ['Ext. Box(

In [24]:
def label_to_color(label):
    """Return the colour from ``Constants`` assigned to an entity label.

    Known labels are "question", "answer", "header" and "other"; any other
    label raises ``KeyError``.
    """
    palette = dict(
        question=Constants.RED,
        answer=Constants.GREEN,
        header=Constants.BLUE,
        other=Constants.CYAN,
    )
    return palette[label]
    
    
def color_image(image, annotation):
    """Tint the dark pixels inside every entity box with the entity's label colour.

    For each entity in ``annotation.entities`` the region
    ``image[ymin:ymax, xmin:xmax, :]`` is taken, and every pixel whose values
    along axis 2 are all below 127 is overwritten with
    ``label_to_color(entity.label)``. ``image`` is modified in place and is
    also returned.
    """
    dark_threshold = [127, 127, 127]
    for entity in annotation.entities:
        rows = slice(entity.box.ymin, entity.box.ymax)
        cols = slice(entity.box.xmin, entity.box.xmax)
        region = image[rows, cols, :]
        tint = label_to_color(entity.label)
        # True where every channel of the pixel is below the threshold.
        is_dark = (region < dark_threshold).all(axis=2)
        region[is_dark] = tint
        image[rows, cols, :] = region

    return image


In [None]:
# cv2 is used below but is not imported anywhere else in the visible notebook.
import cv2


def _load_and_color(image_file, annotation):
    """Read one training image and tint it according to ``annotation``.

    Raises ``FileNotFoundError`` when OpenCV cannot read the file, because
    ``cv2.imread`` signals failure by returning ``None`` rather than raising.
    """
    image_path = os.path.join(train_images_dir, image_file)
    image = cv2.imread(image_path)
    if image is None:
        raise FileNotFoundError(f"cv2.imread could not read image: {image_path}")
    return color_image(image, annotation)


# Tinted copy of every training image, keyed by file name. Images and
# annotations are paired by position in their sorted listings.
colored_images = {image_file: _load_and_color(image_file, annotations[idx])
                  for idx, image_file in enumerate(train_image_file_names)}