In [1]:
import sys
sys.path.append('..')

In [2]:
import math
import re
import sqlite3
import string
import unicodedata as uni
from pathlib import Path

import numpy as np
import pandas as pd
from PIL import Image, ImageDraw, ImageFont
from tqdm import tqdm

from digi_leap.pylib import (
    string_align as sa,
    db,
    font as fu,
    image_util as iu,
    label_transforms as lt,
)

In [3]:
# SQLite database used by this notebook: the OCR text is read from it and the
# `chars` score table is queried from it.
DB = Path('..').joinpath('data', 'sernec', 'sernec.sqlite')

In [4]:
def text_size(char, image_size, font):
    """Render a character and measure the bounding box of its lit pixels.

    The character is drawn in white, anchored at the top-left corner of a
    black, square, 8-bit grayscale image. A pixel counts as lit when its value
    is above 128.

    Args:
        char: The text to draw.
        image_size: Width and height, in pixels, of the scratch image.
        font: The font object handed to ``ImageDraw.text``.

    Returns:
        A ``(height, width, image)`` tuple: the extent in pixels of the lit
        area and the image that was drawn on. Height and width are both 0 when
        no pixel is lit (for example a blank glyph).
    """
    image = Image.new('L', (image_size, image_size), color='black')
    draw = ImageDraw.Draw(image)
    draw.text((0, 0), char, font=font, anchor="lt", fill="white")

    rows, cols = np.nonzero(np.asarray(image) > 128)

    # Nothing was drawn: np.max/np.min on an empty array would raise a
    # ValueError, so report an empty bounding box instead.
    if rows.size == 0:
        return 0, 0, image

    h = np.max(rows) - np.min(rows) + 1
    w = np.max(cols) - np.min(cols) + 1
    return h, w, image

In [5]:
def iou(image1, image2):
    """Intersection over union of the lit pixels of two images.

    A pixel is lit when its value is above 128.

    Args:
        image1: First image (anything ``np.asarray`` accepts).
        image2: Second image, compatible in shape with the first.

    Returns:
        The number of pixels lit in both images divided by the number lit in
        at least one of them, or 0.0 when the two share no lit pixel.
    """
    mask1 = np.asarray(image1) > 128
    mask2 = np.asarray(image2) > 128

    overlap = np.sum(mask1 & mask2)
    if not overlap > 0:
        return 0.0

    union = np.sum(mask1) + np.sum(mask2) - overlap
    return overlap / union

In [6]:
def iou_space(char2, image_size, font):
    """Count the lit pixels of a rendered character.

    The character is drawn in white at the top-left corner of a black, square
    grayscale image; a pixel is lit when its value is above 128.

    Args:
        char2: The text to draw.
        image_size: Width and height, in pixels, of the scratch image.
        font: The font object handed to ``ImageDraw.text``.

    Returns:
        The number of lit pixels, as a float. Note this is a pixel count, not
        a ratio between 0 and 1.
    """
    canvas = Image.new('L', (image_size, image_size), color='black')
    ImageDraw.Draw(canvas).text(
        (0, 0), char2, font=font, anchor="lt", fill="white"
    )

    lit = (np.asarray(canvas) > 128).astype("float")
    return np.sum(lit)

In [7]:
def compare(char1, char2, image_size=40, font_size=24):
    """Score how alike two characters look when rendered in the same font.

    Args:
        char1: The reference character; it is drawn centered on its canvas.
        char2: The character slid over every offset of the canvas.
        image_size: Width and height, in pixels, of the square canvases.
        font_size: Size passed to ``ImageFont.truetype`` for ``fu.FONT2``.

    Returns:
        * 1.0 when the two characters are equal.
        * The result of ``iou_space(char2, ...)`` when ``char1`` is a space;
          that is a lit-pixel count rather than a ratio.
        * Otherwise the largest ``iou`` found between the centered ``char1``
          and ``char2`` drawn at each (x, y) offset of the canvas.
    """
    if char1 == char2:
        return 1.0

    font = ImageFont.truetype(str(fu.FONT2), font_size)

    if char1 == ' ':
        return iou_space(char2, image_size, font)

    def render(char, left, top):
        """Draw one character on a fresh black canvas at the given offset."""
        canvas = Image.new('L', (image_size, image_size), color='black')
        ImageDraw.Draw(canvas).text(
            (left, top), char, font=font, anchor="lt", fill="white"
        )
        return canvas

    # Center the reference character using its measured bounding box.
    height, width, _ = text_size(char1, image_size, font)
    reference = render(
        char1, (image_size - width) // 2, (image_size - height) // 2
    )

    # Seed with 0.0 so the result is 0.0 when no offset overlaps at all.
    scores = [0.0]
    scores.extend(
        iou(reference, render(char2, x, y))
        for x in range(image_size)
        for y in range(image_size)
    )
    return max(scores)


# print(compare(';', ';'))
# print(compare(';', ','))
# print(compare(',', ';'))
# print(compare('W', 'w'))
# print(compare('w', 'W'))
# print(compare('o', '0'))
# print(compare('0', 'o'))
# print(compare(' ', ','))

In [8]:
def get_chars(database):
    """Collect the distinct characters found in the OCR text.

    Reads the records returned by ``db.select_ocr(database)``, gathers every
    character of each record's ``'text'``, then prints how many distinct
    characters there are followed by the characters themselves.

    Args:
        database: Passed through unchanged to ``db.select_ocr``.

    Returns:
        The distinct characters as a sorted list.
    """
    seen = set()
    for record in db.select_ocr(database):
        seen.update(record['text'])

    chars = sorted(seen)
    print(len(chars))
    print(' '.join(chars))
    return chars


# Sorted list of every distinct character in the OCR text; it is the input to
# build_table() below.
CHARS = get_chars(DB)

95
  ! " # % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; = ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z \ ^ _ ` a b c d e f g h i j k l m n o p q r s t u v w x y z ~ ° é — ‘ ’ “ ” ™


In [9]:
def build_table(chars, limit=99999999):
    """Score every unordered pair of characters with ``compare``.

    For each character a diagonal row (the character paired with itself,
    score 1.0) is emitted, followed by one row for each character after it in
    ``chars``. Within a row the two characters are ordered so that
    ``char1 < char2``.

    Args:
        chars: Sequence of characters to pair up.
        limit: Stop after this many leading characters have been processed,
            e.g. ``limit=1`` yields only the rows led by ``chars[0]``.

    Returns:
        A list of ``{'char1': ..., 'char2': ..., 'score': ...}`` dicts.
    """
    matrix = []

    # Iterate over all of chars (not chars[:-1]) so the last character also
    # gets its diagonal row. tqdm wraps the sequence itself, rather than the
    # enumerate object, so it can see the length and show real progress.
    for i, char1 in enumerate(tqdm(chars)):
        if i == limit:
            break

        matrix.append({
            'char1': char1,
            'char2': char1,
            'score': 1.0,
        })

        for char2 in chars[i + 1:]:
            c1, c2 = (char1, char2) if char1 < char2 else (char2, char1)
            matrix.append({
                'char1': c1,
                'char2': c2,
                'score': compare(c1, c2),
            })

    return matrix


# MATRIX = build_table(CHARS)

# df = pd.DataFrame(MATRIX)
# df.head()

# with sqlite3.connect(DB) as cxn:
#     df.to_sql('chars', cxn, if_exists='replace', index=False)

In [10]:
# limit=1 keeps only the rows led by the first character of CHARS, which is the
# space in the character list printed above.
SPACES = build_table(CHARS, limit=1)

most = max(row['score'] for row in SPACES[1:])

# Turn each raw value from build_table() into an alignment weight:
#   the space itself scores 2.0, any alphanumeric scores -2.0, and every other
#   character is bucketed by its raw value (< 20 -> 1.0, < 30 -> 0.0, else -1.0).
for row in SPACES:
    other, raw = row['char2'], row['score']
    if other == ' ':
        row['score'] = 2.0
    elif other.isalnum():
        row['score'] = -2.0
    elif raw < 20.0:
        row['score'] = 1.0
    elif raw < 30.0:
        row['score'] = 0.0
    else:
        row['score'] = -1.0

SPACES

1it [00:00,  3.22it/s]


[{'char1': ' ', 'char2': ' ', 'score': 2.0},
 {'char1': ' ', 'char2': '!', 'score': -1.0},
 {'char1': ' ', 'char2': '"', 'score': 0.0},
 {'char1': ' ', 'char2': '#', 'score': -1.0},
 {'char1': ' ', 'char2': '%', 'score': -1.0},
 {'char1': ' ', 'char2': '&', 'score': -1.0},
 {'char1': ' ', 'char2': "'", 'score': 1.0},
 {'char1': ' ', 'char2': '(', 'score': -1.0},
 {'char1': ' ', 'char2': ')', 'score': -1.0},
 {'char1': ' ', 'char2': '*', 'score': -1.0},
 {'char1': ' ', 'char2': '+', 'score': 0.0},
 {'char1': ' ', 'char2': ',', 'score': 1.0},
 {'char1': ' ', 'char2': '-', 'score': 1.0},
 {'char1': ' ', 'char2': '.', 'score': 1.0},
 {'char1': ' ', 'char2': '/', 'score': -1.0},
 {'char1': ' ', 'char2': '0', 'score': -2.0},
 {'char1': ' ', 'char2': '1', 'score': -2.0},
 {'char1': ' ', 'char2': '2', 'score': -2.0},
 {'char1': ' ', 'char2': '3', 'score': -2.0},
 {'char1': ' ', 'char2': '4', 'score': -2.0},
 {'char1': ' ', 'char2': '5', 'score': -2.0},
 {'char1': ' ', 'char2': '6', 'score': -2

In [11]:
# sql = """update chars set score = :score where char1 = :char1 and char2 = :char2"""
# with sqlite3.connect(DB) as cxn:
#     cxn.executemany(sql, SPACES)

In [28]:
sql = """select * from chars"""

# `with sqlite3.connect(...)` only scopes a transaction; it does not close the
# connection. Read every row while the connection is open, then close it
# explicitly so no handle on the database file is left behind.
cxn = sqlite3.connect(DB)
try:
    cxn.row_factory = sqlite3.Row
    fetched = cxn.execute(sql).fetchall()
finally:
    cxn.close()

# Keep one row per unordered pair (char1 <= char2), drop the rows led by a
# space (those are scored in SPACES), convert each row to a plain dict, and
# order the result by ascending score.
ROWS = sorted(
    (
        dict(r)
        for r in fetched
        if r['char1'] != ' ' and r['char1'] <= r['char2']
    ),
    key=lambda r: r['score'],
)

In [32]:
# List every pair whose score lies in the half-open range [0.0, 0.4).
for r in (row for row in ROWS if 0.0 <= row['score'] < 0.4):
    print(r)

{'char1': '-', 'char2': 'M', 'score': 0.04081632653061224}
{'char1': '.', 'char2': 'M', 'score': 0.04081632653061224}
{'char1': '/', 'char2': '—', 'score': 0.041666666666666664}
{'char1': '\\', 'char2': '—', 'score': 0.041666666666666664}
{'char1': '!', 'char2': '—', 'score': 0.043478260869565216}
{'char1': '-', 'char2': '@', 'score': 0.04411764705882353}
{'char1': '.', 'char2': '@', 'score': 0.04411764705882353}
{'char1': '%', 'char2': '-', 'score': 0.044642857142857144}
{'char1': '(', 'char2': '—', 'score': 0.04477611940298507}
{'char1': '-', 'char2': 'W', 'score': 0.045454545454545456}
{'char1': ')', 'char2': '—', 'score': 0.046153846153846156}
{'char1': '-', 'char2': 'H', 'score': 0.046511627906976744}
{'char1': '.', 'char2': 'H', 'score': 0.046511627906976744}
{'char1': '&', 'char2': '-', 'score': 0.04716981132075472}
{'char1': '-', 'char2': 'd', 'score': 0.047619047619047616}
{'char1': '.', 'char2': 'W', 'score': 0.048}
{'char1': 'W', 'char2': '_', 'score': 0.04929577464788732}
{

In [33]:
# Replace each row's score with a coarse alignment weight:
#   identical characters -> 2.0, raw >= 0.7 -> 1.0, raw >= 0.5 -> 0.0,
#   raw >= 0.4 -> -1.0, anything lower -> -2.0.
# The value loaded from the database is kept under 'raw_score' the first time
# through, so re-running this cell buckets that original value again instead
# of re-bucketing the weight written by the previous run.
for r in ROWS:
    raw = r.setdefault('raw_score', r['score'])
    if r['char1'] == r['char2']:
        score = 2.0
    elif raw >= 0.7:
        score = 1.0
    elif raw >= 0.5:
        score = 0.0
    elif raw >= 0.4:
        score = -1.0
    else:
        score = -2.0
    r['score'] = score

In [36]:
ALL_ROWS = SPACES + ROWS

# Source text of the generated module: a docstring, a notice, and one
# `"<char1><char2>": <score>,` entry per row inside the SUBS dict.
LINES = [
    '"""Weights for sequence alignment."""',
    '# This file is automatically generated',
    '',
    'SUBS = {',
]
for r in ALL_ROWS:
    key = r['char1'] + r['char2']
    # Escape backslashes first and double quotes second, so the backslash that
    # is added in front of a quote is not doubled itself. This needs no
    # placeholder character, so every character in a key survives intact.
    key = key.replace('\\', '\\\\').replace('"', '\\"')
    LINES.append(f'    "{key}": {r["score"]},')

LINES.append('}')

# Preview the start of the generated file.
for ln in LINES[:20]:
    print(ln)

"""Weights for sequence alignment."""
# Thie file is automacically generated

WEIGHTS = {
    "  ": 2.0,
    " !": -1.0,
    " \"": 0.0,
    " #": -1.0,
    " %": -1.0,
    " &": -1.0,
    " '": 1.0,
    " (": -1.0,
    " )": -1.0,
    " *": -1.0,
    " +": 0.0,
    " ,": 1.0,
    " -": 1.0,
    " .": 1.0,
    " /": -1.0,
    " 0": -2.0,


In [37]:
header = Path('..') / 'digi_leap' / 'pylib' / 'string_align_subs.py'

# The table has non-ASCII keys and the output is Python source, which Python
# reads as UTF-8 by default. Write it as UTF-8 explicitly instead of relying on
# the platform's default encoding.
with open(header, 'w', encoding='utf-8') as out_file:
    out_file.writelines(f'{ln}\n' for ln in LINES)