In [68]:
import cv2
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os

from PIL import Image

# Constants

In [69]:
# Directory holding the slip images; vis_cluster_sample reads `<id>.jpg` from here.
path2zettel: str = "/mnt/win_share/MLW/zettel/"
# JSON file loaded by load_data_json (provides the `id` and `lemma` columns used below).
path2datajson: str = "/mnt/win_share/MLW/data.json"
# Model output file used to build `datav1`.
# NOTE(review): double underscore in "output__v1" differs from "output_v2" - confirm the file name.
path2outputv1: str = "/home/philko/output__v1"
# Model output file used to build `datav2`.
path2outputv2: str = "/home/philko/output_v2"

# Data

## Helper Functions

In [70]:
def load_output(path: str) -> pd.DataFrame:
    """Load and Process Output File.

    Loads and processes the output file labeled by the previous visual
    grounding model. The file is read line by line, one JSON object per
    line; each object must provide ``file`` (a name whose part before the
    first ``.`` is an integer id) and ``result[0]["box"]`` holding
    ``[x1, y1, x2, y2]``. Coordinates are rounded to whole numbers. Besides
    the coordinates of the BBs, length and height are also added to the
    DataFrame. All BBs that start at (0.0, 0.0) are dropped and considered
    a failure.

    :param path: Path to output file.
    :return: DataFrame with the columns ``id`` (int64), ``x1``, ``y1``,
        ``x2``, ``y2``, ``length`` and ``height`` (floats). The index is
        not reset after failed BBs are dropped.
    """
    # Iterate over real lines (the previous code split on the literal "/n")
    # and skip blank ones, so the last record is kept whether or not the
    # file ends with a newline. The context manager closes the file.
    with open(str(path), "r") as f:
        records = [json.loads(line) for line in f if line.strip()]

    rows = []
    for record in records:
        box = record["result"][0]["box"]
        rows.append([record["file"].split(".")[0], box[0], box[1], box[2], box[3]])

    coords = ["x1", "y1", "x2", "y2"]
    outputs_bb = pd.DataFrame(rows, columns=["id", *coords]).astype(
        {"id": "int64", **{c: "float" for c in coords}}
    )
    for c in coords:
        outputs_bb[c] = outputs_bb[c].round()

    # Getting the length and height of the Bounding Boxes
    outputs_bb["length"] = outputs_bb["x2"] - outputs_bb["x1"]
    outputs_bb["height"] = outputs_bb["y2"] - outputs_bb["y1"]

    # Removing all BBs with first point at (0,0). Only boxes where BOTH
    # coordinates are zero are failures; a box touching just one image edge
    # (e.g. x1 == 0, y1 > 0) is kept.
    starts_at_origin = (outputs_bb["x1"] == 0) & (outputs_bb["y1"] == 0)
    outputs_bb_df = outputs_bb[~starts_at_origin].copy()
    return outputs_bb_df

In [71]:
def load_data_json(path: str) -> pd.DataFrame:
    """Load data.json.

    Load the `data.json` file from the main data ('MLW') directory into a
    DataFrame and add a `length_lemma` column holding ``len()`` of each
    entry in the `lemma` column.

    :param path: Path to data.json in the '<drive>/MLW' directory.
    :return: DataFrame of imported data with the extra `length_lemma` column.
    """
    # Context manager so the file handle is closed once parsing is done.
    with open(path) as f:
        records = json.load(f)

    data = pd.DataFrame(records)
    data["length_lemma"] = [len(lemma) for lemma in data["lemma"]]
    return data

In [72]:
def create_dataset(path2datajson: str, path2output: str) -> pd.DataFrame:
    """Build the combined dataset from `data.json` and a model output file.

    The `data.json` frame is loaded first, then the output frame; the two
    are joined on their shared `id` column using the default merge
    behaviour of `DataFrame.merge`.

    :param path2datajson: Path to `data.json` file.
    :param path2output: Path to `output` file.
    :return: Merged Dataset.
    """
    return load_data_json(path2datajson).merge(
        load_output(path2output),
        on="id",
    )

## Process Data

In [73]:
# One merged dataset per model output file: both share the same data.json
# and differ only in the output file they are merged with.
datav1: pd.DataFrame = create_dataset(path2datajson, path2outputv1)
datav2: pd.DataFrame = create_dataset(path2datajson, path2outputv2)

In [74]:
# Sanity check: display the columns of both merged datasets side by side.
datav1.columns, datav2.columns

(Index(['id', 'lemma', 'length_lemma', 'x1', 'y1', 'x2', 'y2', 'length',
        'height'],
       dtype='object'),
 Index(['id', 'lemma', 'length_lemma', 'x1', 'y1', 'x2', 'y2', 'length',
        'height'],
       dtype='object'))

# Scatterplots

## Prompt 1

_cursive text upper left_

### LemmaLen x Length

In [None]:
# Explicit fig/ax interface with a title and axis labels so the figure
# can be read on its own.
fig, ax = plt.subplots()
ax.scatter(datav1['length_lemma'], datav1['length'])
ax.set(
    title='Prompt 1: lemma length vs. bounding-box length',
    xlabel='Lemma length (length_lemma)',
    ylabel='Bounding-box length (x2 - x1)',
)
plt.show()

### LemmaLen x Height

In [None]:
# Explicit fig/ax interface with a title and axis labels so the figure
# can be read on its own.
fig, ax = plt.subplots()
ax.scatter(datav1['length_lemma'], datav1['height'])
ax.set(
    title='Prompt 1: lemma length vs. bounding-box height',
    xlabel='Lemma length (length_lemma)',
    ylabel='Bounding-box height (y2 - y1)',
)
plt.show()

### LemmaLen x Length + Height

In [None]:
# Explicit fig/ax interface with a title and axis labels so the figure
# can be read on its own.
fig, ax = plt.subplots()
ax.scatter(datav1['length_lemma'], (datav1['height'] + datav1['length']))
ax.set(
    title='Prompt 1: lemma length vs. bounding-box length + height',
    xlabel='Lemma length (length_lemma)',
    ylabel='Bounding-box length + height',
)
plt.show()

## Prompt 2

_handwritten cursive word upper left_

### LemmaLen x Length

In [None]:
# Explicit fig/ax interface with a title and axis labels so the figure
# can be read on its own.
fig, ax = plt.subplots()
ax.scatter(datav2['length_lemma'], datav2['length'])
ax.set(
    title='Prompt 2: lemma length vs. bounding-box length',
    xlabel='Lemma length (length_lemma)',
    ylabel='Bounding-box length (x2 - x1)',
)
plt.show()

### LemmaLen x Height

In [None]:
# Explicit fig/ax interface with a title and axis labels so the figure
# can be read on its own.
fig, ax = plt.subplots()
ax.scatter(datav2['length_lemma'], datav2['height'])
ax.set(
    title='Prompt 2: lemma length vs. bounding-box height',
    xlabel='Lemma length (length_lemma)',
    ylabel='Bounding-box height (y2 - y1)',
)
plt.show()

### LemmaLen x Length + Height

In [None]:
# Explicit fig/ax interface with a title and axis labels so the figure
# can be read on its own.
fig, ax = plt.subplots()
ax.scatter(datav2['length_lemma'], (datav2['height'] + datav2['length']))
ax.set(
    title='Prompt 2: lemma length vs. bounding-box length + height',
    xlabel='Lemma length (length_lemma)',
    ylabel='Bounding-box length + height',
)
plt.show()

# Further Look at Lemma of Length 2

## Helper Function

In [98]:
def vis_cluster_sample(
        data: pd.DataFrame,
        columns: int=4,
        rows: int=5,
        w: int=40,
        h: int=40) -> None:
    """Show a grid of images with their bounding boxes drawn on top.

    Up to ``columns * rows`` rows of ``data`` are shown. If ``data`` has at
    least that many rows, a random sample without repetition is drawn using
    NumPy's global random state; otherwise every row is shown and the
    remaining grid cells are left blank. For each row the image
    ``<id>.jpg`` is read from the module-level ``path2zettel`` directory and
    a rectangle from ``(x1, y1)`` to ``(x2, y2)`` is drawn onto it. All
    images are then cropped from the top-left corner to the smallest height
    and width among them so they can share axes.

    :param data: DataFrame with the columns ``id``, ``x1``, ``y1``, ``x2``
        and ``y2``.
    :param columns: Number of grid columns.
    :param rows: Number of grid rows.
    :param w: Figure width, passed to ``figsize``.
    :param h: Figure height, passed to ``figsize``.
    :raises ValueError: If ``data`` has no rows.
    """
    if len(data) == 0:
        raise ValueError("`data` is empty; there is nothing to visualise.")

    # Sample
    n_cells = columns * rows
    if len(data) < n_cells:
        X = data
    else:
        # replace=False: otherwise the same image can appear several times.
        indices = np.random.choice(len(data), n_cells, replace=False)
        X = data.iloc[indices]

    # Get Images
    images: list = list()
    # itertuples keeps each column's dtype, so an integer id is not turned
    # into a float (which would yield a file name like "12.0.jpg").
    for e in X.itertuples(index=False):
        image_path = os.path.join(path2zettel, str(e.id) + '.jpg')
        # np.array copies the pixel data, so the file can be closed right
        # away and OpenCV gets a writable buffer to draw into.
        with Image.open(image_path, mode="r") as image_file:
            img = np.array(image_file)
        cv2.rectangle(
            img,
            (int(e.x1), int(e.y1)),
            (int(e.x2), int(e.y2)),
            (0, 255, 0),
            3
        )
        images.append(img)

    # Process Images: crop everything to the smallest common height/width.
    size_height = min(img.shape[0] for img in images)
    size_width = min(img.shape[1] for img in images)
    images_cropped = [img[0:size_height, 0:size_width] for img in images]

    # squeeze=False always returns a 2-D array of axes, also for a 1x1 grid.
    fig, axs = plt.subplots(rows, columns, figsize=(w, h), constrained_layout=True,
                            sharex=True, sharey=True, squeeze=False)

    ids = X['id'].values
    for i, ax in enumerate(axs.flat):
        if i >= len(images_cropped):
            # Fewer images than grid cells: leave the rest of the grid blank.
            ax.axis('off')
            continue
        ax.imshow(Image.fromarray(images_cropped[i]))
        ax.set_title(
            str(ids[i]) + '.jpg',
            fontsize='small',
            loc='left')

    plt.show()

## Lemma, Len = 2, Prompt 1

In [83]:
# Rows of datav1 whose lemma has length 2.
# NOTE(review): the `_1` suffix presumably refers to prompt 1, not to the lemma length - confirm.
data_len_1 = datav1[(datav1['length_lemma'] == 2)]

In [None]:
# Plot an image grid with bounding boxes for the length-2 lemmas of datav1.
vis_cluster_sample(data_len_1)

## Lemma, Len = 2, Prompt 2

In [85]:
# Rows of datav2 whose lemma has length 2.
# NOTE(review): the `_2` suffix presumably refers to prompt 2; it happens to match the lemma length too - confirm.
data_len_2 = datav2[(datav2['length_lemma'] == 2)]

In [None]:
# Plot an image grid with bounding boxes for the length-2 lemmas of datav2.
vis_cluster_sample(data_len_2)

In [None]:
# Rows of datav2 whose lemma has length 10, shown as an image grid with bounding boxes.
data_len_10 = datav2[(datav2['length_lemma'] == 10)]
vis_cluster_sample(data_len_10)

In [None]:
# Rows of datav2 whose lemma has length 6, shown as an image grid with
# bounding boxes. The filter previously compared against 10 (copied from
# the cell above), so it did not match the variable name.
data_len_6 = datav2[(datav2['length_lemma'] == 6)]
vis_cluster_sample(data_len_6)