# Setup

In [1]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Notebook-wide matplotlib defaults: tall narrow figures, 16pt ticks and text.
mpl.rc('figure', figsize=(5, 20))
for tick_axis in ('xtick', 'ytick'):
    mpl.rc(tick_axis, labelsize=16)
mpl.rc('font', size=16)

In [3]:
import warnings
warnings.filterwarnings('ignore',category=FutureWarning)


import seaborn as sns
import pandas as pd
import json, os, glob, string

from time import time
from skimage import io
from tqdm import tqdm

from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [4]:
# Dataset folders: "ann" holds the JSON annotations, "img" the pictures.
ann_path, img_path = (
    os.path.join('..', 'HKR_Dataset_Words_Public', subdir)
    for subdir in ('ann', 'img')
)

In [5]:
def read_image(path: str, ax: matplotlib.axes.Axes,
               title: str='', img_path: str=img_path) -> None:

    '''Utility function for showing the image stored at "path" on "ax".

    path: image file name; it is joined onto "img_path" before reading.
    ax: matplotlib axes to draw on (annotated with the public
        matplotlib.axes.Axes class; the private matplotlib.axes._subplots
        module is not available in recent matplotlib releases).
    title: axes title; when empty, "path" is used instead.
    img_path: directory containing the image. The default is the value the
        global "img_path" had when this function was defined, so reassigning
        the global later does not change it.
    '''

    image = io.imread(os.path.join(img_path, path))
    ax.imshow(image, cmap='gray')
    ax.axis("off")
    ax.set_title(title if title else path)


In [6]:
def counts_to_df(df: pd.DataFrame, column: str='description') -> pd.DataFrame:

    '''Build a per-occurrence symbol table from the strings in "column".

    Every string is split into single characters, one output row per
    character, keeping the index label of the row it came from.
    The result has two columns:
      symbols - the character itself
      counts  - how many times that character occurs in the whole column
    Rows whose symbol is an empty string or a single space are removed.
    '''

    # One row per character; the original index label is repeated per character
    symbols = df[column].map(list).explode()

    # Total occurrences of every distinct character
    totals = symbols.value_counts()

    table = symbols.to_frame(name='symbols')
    # Positional assignment: the index contains repeated labels
    table['counts'] = symbols.map(totals).to_numpy()

    is_blank = table.symbols.isin(['', ' '])
    return table[~is_blank]

In [7]:
def meta_collect(ann_path: str, result_file: str, sep: str='\t') -> None:

    '''Collect metadata for all images into "result_file"
    from the json files in "ann_path" (execution time: about 5 mins).

    ann_path: directory that is searched for "*.json" annotation files.
    result_file: path of the text file to (over)write.
    sep: field separator used for the header and for every row.

    Each written row starts with the record's "name" followed by the six
    header fields, so rows have one more field than the header line.
    NOTE(review): the loading cell reads the file with index_col=0, which
    presumably turns that leading "name" field into the index - confirm
    before changing the header.

    Records that lack one of the expected fields, or hold a non-string where
    a string is required, are skipped; their description (or the file path
    when there is no description) is printed instead.
    '''

    start = time()
    header = ['width', 'height', 'description',
              'isModerated', 'moderatedBy', 'predicted']

    with open(result_file, 'w',  encoding='utf-8') as f:
        f.write(sep.join(header) + '\n')

        # Sorted so that the row order does not depend on the file system
        for file in tqdm(sorted(glob.glob(os.path.join(ann_path, '*.json')))):

            with open(file, encoding='utf-8') as js:
                tmp = json.load(js)

            try:
                # Build the whole line first so a bad record never leaves a partial row
                line = sep.join([tmp['name'], str(tmp['size']['width']), str(tmp['size']['height']),
                                 tmp['description'], str(tmp['moderation']['isModerated']),
                                 tmp['moderation']['moderatedBy'], str(tmp['moderation']['predicted'])]) + '\n'
            except (KeyError, TypeError):
                # Report the skipped record without assuming "description" exists
                skipped = tmp.get('description', file) if isinstance(tmp, dict) else file
                print(skipped)
                continue

            f.write(line)
    print('execution time:', (time() - start), 'secs')

# Collect into the very file that is read back below, and skip the slow
# (about 5 minutes) collection when that file already exists.
# Delete the file to force a rebuild, e.g. after an interrupted run.
metadata_file = os.path.join('..', 'metadata', 'metadata.tsv')
if not os.path.exists(metadata_file):
    os.makedirs(os.path.dirname(metadata_file), exist_ok=True)
    meta_collect(ann_path, metadata_file)
df = pd.read_csv(metadata_file, sep='\t', index_col=0)

print(df.shape)
df.head()

 25%|██▌       | 16371/64943 [05:17<15:43, 51.51it/s]


KeyboardInterrupt: 

# EDA with some preprocessing

In [None]:
# Show the loaded metadata frame
df

### Drop useless columns

In [None]:
# Rows whose "predicted" value is not missing
df[~df.predicted.isna()]

In [None]:
# How often each "isModerated" value occurs
df.isModerated.value_counts()

In [None]:
# Rebind instead of mutating in place; errors='ignore' keeps the cell
# re-runnable once the columns are already gone.
df = df.drop(columns=['predicted', 'isModerated'], errors='ignore')

### Moderator - useless or not? (need help)

In [None]:
# Number of pictures per "moderatedBy" value
cou = df.moderatedBy.value_counts()
cou

In [None]:
# Share of the second entry of `cou` among its first two entries.
# .iloc makes the access explicitly positional; plain cou[0] / cou[1] is a
# label lookup on the moderator index.
# NOTE(review): assumes positions (not moderator labels 0 / 1) were meant - confirm.
cou.iloc[1] / (cou.iloc[0] + cou.iloc[1]) # ?

### Some random pictures

In [None]:
n = 10
# `random` is numpy.random here (injected by %pylab).
# replace=False: never draw the same picture twice.
img_names = random.choice(df.index, n, replace=False)
fig, axes = subplots(n, 1)

# One picture per axes, titled with its file name and its label
for img_name, ax in zip(img_names, axes):
    read_image(img_name + '.jpg', ax=ax, title=img_name + f'  ({df.loc[img_name].description})')
tight_layout()

### Dataset symbol counts

In [None]:
# Per-occurrence symbol table; it keeps the index labels of the original df
counts = counts_to_df(df, 'description')

# Bar chart of the symbol counts, most frequent symbol first
fig, ax = subplots(figsize=(20, 10))
counts_desc = counts.sort_values('counts', ascending=False)
sns.barplot(data=counts_desc, x='symbols', y='counts', ax=ax)

tight_layout()
counts

### Find all non-ordinary symbols for Russian language

In [None]:
# Reference alphabet: Russian letters (lower- and uppercase) plus ASCII punctuation.
# The 32 letters а-я / А-Я are contiguous code points; "ё" (U+0451) and "Ё" (U+0401)
# lie outside those ranges and are appended explicitly. Without the uppercase "Ё",
# labels containing it would be reported as non-reference symbols.
alphabet_lower = [chr(ord("а") + i) for i in range(32)] + ["\u0451"]
alphabet_upper = [chr(ord("А") + i) for i in range(32)] + ["\u0401"]
punctuation = list(string.punctuation)

alphabet = set(alphabet_lower + alphabet_upper + punctuation)

# Symbols that actually occur in the dataset (symbol -> count)
counts_dict = counts.set_index('symbols')['counts'].to_dict()

# Dataset symbols that are not part of the reference alphabet
smth_symbols = set(counts_dict) - alphabet
smth_symbols

### Plotting the non-reference symbols 

In [None]:
# One example picture per non-reference symbol.
# subplots() rejects zero rows, and without squeeze=False it returns a bare
# Axes (not an array) for a single row - handle both cases.
if smth_symbols:
    fig, axes = subplots(len(smth_symbols), 1, squeeze=False)

    for sym, ax in zip(smth_symbols, axes.ravel()):
        # First picture whose label contains the symbol
        ind = counts[counts.symbols == sym].index[0]
        read_image(ind + '.jpg', ax, df.loc[ind].description + f'    ({sym})')
    tight_layout()

### Rows with non-reference symbols

In [None]:
# Allow up to 100 rows in rendered frames
pd.options.display.max_rows = 100

# Pictures containing at least one non-reference symbol, one row per distinct label
non_reference_rows = counts[counts.symbols.isin(smth_symbols)].index.drop_duplicates()
df.loc[non_reference_rows].drop_duplicates('description')

### Some Russian letters are typed as Latin look-alikes and some punctuation marks are non-ASCII (en/em dashes, ellipsis), so normalize those

In [None]:
# Literal (non-regex) replacements. The Cyrillic targets are written as escapes
# because they look identical to their Latin counterparts on screen.
df.description = (
    df.description
    .str.replace('o', '\u043e', regex=False)       # Latin "o" -> Cyrillic "о"
    .str.replace('H', '\u041d', regex=False)       # Latin "H" -> Cyrillic "Н"
    .str.replace('\u2013', '-', regex=False)       # en dash -> hyphen
    .str.replace('\u2014', '-', regex=False)       # em dash -> hyphen
    .str.replace('\u2026', '...', regex=False)     # ellipsis character -> three dots
)

### The remaining non-Russian and non-punctuation symbols

In [None]:
# Recount symbols after the normalization of the labels
counts = counts_to_df(df, 'description')

symbol_totals = counts.set_index('symbols')['counts']
counts_dict = symbol_totals.to_dict()

# Whatever is still outside the reference alphabet
kazakh_symbols = set(counts_dict).difference(alphabet)
kazakh_symbols

### Drop remaining non-reference symbols

In [None]:
# Index labels of all pictures whose label contains a remaining non-reference symbol
rows_with_kazakh = counts[counts.symbols.isin(kazakh_symbols)].index.drop_duplicates()

print("rows to drop:", df.loc[rows_with_kazakh].shape[0])
df = df.drop(rows_with_kazakh, axis=0)
df

### Comparison of letter frequencies in the Russian language ([reference](https://ru.wikipedia.org/wiki/%D0%A7%D0%B0%D1%81%D1%82%D0%BE%D1%82%D0%BD%D0%BE%D1%81%D1%82%D1%8C)) and in the dataset

In [None]:
fig, axes = subplots(2, 1, figsize=(15, 15))
dataset_ax, reference_ax = axes

# One row per distinct non-punctuation symbol with its total count
tmp = counts_to_df(df)
tmp = tmp[~tmp.symbols.isin(punctuation)].drop_duplicates('symbols')

# Merge upper- and lowercase variants of a letter into one lowercase entry
dictir = {}
for sym, cou in zip(tmp.symbols, tmp.counts):
    letter = sym.lower()
    dictir[letter] = dictir.get(letter, 0) + cou

tmp = pd.DataFrame(dictir, index=['counts']).T.reset_index()
tmp = tmp.sort_values('counts', ascending=False)

sns.barplot(data=tmp, x='index', y='counts', ax=dataset_ax)
dataset_ax.set_title('Dataset alphabet')

# Reference letter frequencies read from the alphabet.tsv file
ls = pd.read_csv(os.path.join('..', 'metadata', 'alphabet.tsv'), sep='\t', index_col=1)
ls = ls.sort_values('Частотность', ascending=False)
sns.barplot(data=ls, x='Буква', y='Частотность', ax=reference_ax)
reference_ax.set_title('Russian alphabet')
tight_layout()

### Label counts

In [None]:
# How often each distinct label (description) occurs
df.description.value_counts()

### Pictures sizes description

In [None]:
# Range and mean of both picture dimensions (blank line after the height summary)
for dim, trailer in (('width', ''), ('height', '\n')):
    sizes = df[dim]
    print(f"{dim}: max = {sizes.max()}, min = {sizes.min()}, mean = {sizes.mean()}{trailer}")

# Occurrences of every distinct size value, most common first
mheight = df.height.value_counts().sort_values(ascending=False)
mwidth = df.width.value_counts().sort_values(ascending=False)

# Ten most common values per dimension and the mean of those ten values
for dim, freq, trailer in (('width', mwidth, '\n'), ('height', mheight, '')):
    top = freq.head(10)
    print(f'most common {dim}s (of {freq.shape[0]} size):\n{dim}\tcount\n{top}\nand their mean = {top.index.to_series().mean()}{trailer}')

### Most common size values pictures

In [None]:
fig, axes = subplots(4, 1, figsize=(10, 15))

# mheight / mwidth are sorted by frequency (descending):
# position 0 is the most common value, position -1 the least common one
df_list = [df[df.height == mheight.index[0]],
           df[df.height == mheight.index[-1]],
           df[df.width == mwidth.index[0]],
           df[df.width == mwidth.index[-1]]]

titles = ['most common height example',
          'least common height example',
          'most common width example',
          'least common width example']

for tmp, title, ax in zip(df_list, titles, axes):
    # .iloc[0] is positional; plain [0] is a label lookup on the picture-name index
    read_image(tmp.index[0] + '.jpg', ax, title=title + f' ({tmp.description.iloc[0]})')
    ax.axis('on')

tight_layout()

### Extreme size values pictures

In [None]:
fig, axes = subplots(6, 1, figsize=(10, 15))

# Label lengths, computed once and reused for both length filters
desc_len = df.description.apply(len)

df_list = [df[df.height == df.height.max()],
           df[df.height == df.height.min()],
           df[df.width == df.width.max()],
           df[df.width == df.width.min()],
           df[desc_len == desc_len.max()],
           df[desc_len == desc_len.min()]]

titles = ['max hight example',
          'min hight example',
          'max width example',
          'min width example',
          'max len description',
          'min len description']

for tmp, title, ax in zip(df_list, titles, axes):
    # .iloc[0] is positional; plain [0] is a label lookup on the picture-name index
    read_image(tmp.index[0] + '.jpg', ax, title=title + f' ({tmp.description.iloc[0]})')
    ax.axis('on')

tight_layout()

In [None]:
# Target canvas size; pictures larger than it in either dimension are discarded
img_height, img_width = 100, 600
fits_canvas = (df.width <= img_width) & (df.height <= img_height)
df = df[fits_canvas]

# Longest remaining label, later used as the padded label length
max_length = df.description.str.len().max()
print(max_length)
df

# Train test val split

### Build a utility dataframe with one row per picture name and one column per symbol present in the dataset

In [None]:
# A cell holds 1.0 when the picture's label contains the symbol, otherwise 0
counts = counts_to_df(df, 'description')
counts['counts'] = 1

# One (picture, symbol) pair per row, then spread the symbols into columns
picture_symbol_pairs = counts.reset_index().drop_duplicates()
splitter = picture_symbol_pairs.pivot(index='index', columns='symbols').fillna(0)
splitter

### Split into three dataframes (train 85%, test 10%, val 5%)

In [None]:
# The split is purely random: no `stratify` argument is passed, so `splitter`
# does not influence which pictures land where - it is only split alongside df.
# train_test_split pairs its inputs by position and requires equal lengths, so
# bring `splitter` into the row order of df first (a picture without a row in
# `splitter` gets an all-zero row).
splitter_aligned = splitter.reindex(df.index, fill_value=0)
train, test, _, ls = train_test_split(df, splitter_aligned, shuffle=True,
                            test_size=0.15, random_state=12)

# Then split the held-out 15% into the final test (about 10%) and val (about 5%) parts
test, val, _, _ = train_test_split(test, ls, shuffle=True,
                            test_size=0.33, random_state=17)

train_counts = counts_to_df(train, 'description')
test_counts = counts_to_df(test, 'description')
val_counts = counts_to_df(val, 'description')

print('Sets differences between presented symbols in train, test and val data\n')

# Symbols present in one part but missing from another, for every ordered pair
split_symbols = {'train': set(train_counts.symbols),
                 'test': set(test_counts.symbols),
                 'val': set(val_counts.symbols)}
for left, left_symbols in split_symbols.items():
    for right, right_symbols in split_symbols.items():
        if left != right:
            print(f'{left}_counts - {right}_counts:   ', left_symbols - right_symbols)

# Plot frequencies of symbols in three new dataframes
fig, axes = subplots(3, 1, figsize=(15, 15))

for tmp, ax, name in zip((train_counts, test_counts, val_counts), axes, ['train', 'test', 'val']):
    sns.barplot(data=tmp.sort_values('counts', ascending=False),
                x='symbols', y='counts', ax=ax)
    ax.set_title(name)
tight_layout()

### As we can see, all symbols except "(" and ")" (which occur in the dataset only 2 and 1 times respectively) are present in all three dataframes, and the symbol frequencies are very close too. Is that good or not?

# Making tf Dataset

In [None]:
# Samples per batch: default of get_dataset() and the number of pictures show_batch() is asked to draw
batch_size = 16

### Creating mappers

In [None]:
# Mapping characters to integers.
# `counts` is rebound here from a dataframe to a plain list: the distinct symbols
# of the dataset with ' ' and '#' appended ('#' is the padding symbol used by
# encode_single_sample and stripped again by show_batch).
counts = counts_to_df(df)
counts = counts[~counts.isin(['', ' '])].symbols.unique().tolist() + [' ', '#']
# The vocabulary is handed to the lookup layer as UTF-8 encoded bytes
vocab = pd.Series(counts).str.encode('utf8')

char_to_num = layers.experimental.preprocessing.StringLookup(
    vocabulary=vocab,
    mask_token=None,
)

# Mapping integers back to original characters (inverse of char_to_num)
num_to_char = layers.experimental.preprocessing.StringLookup(
    vocabulary=char_to_num.get_vocabulary(), mask_token=None, invert=True,
)

# Integer that char_to_num assigns to '#'; labels are padded with it
blank_index = char_to_num(tf.strings.unicode_split('#', input_encoding="UTF-8")).numpy()[0]
blank_index  # index of the '#' padding symbol

### Functions for tf datasets

In [None]:
def encode_single_sample(img_path, label):

    """Turn one (image file path, text label) pair into a dataset sample.

    The image is decoded to one channel, scaled to float32, inverted, fitted to
    (img_height, img_width) by central crop / zero padding, shifted by
    `0.5 - x` and transposed so that width becomes the first axis.
    The label is split into characters, mapped through char_to_num and padded
    with blank_index up to max_length.

    Returns a dict with the keys "image" and "label".
    """

    # Load the file and decode it as a single-channel image
    raw_bytes = tf.io.read_file(img_path)
    pixels = tf.io.decode_png(raw_bytes, channels=1)

    # float32 in the [0, 1] range
    pixels = tf.image.convert_image_dtype(pixels, tf.float32)

    # Invert before fitting to the canvas, then apply the 0.5 - x shift;
    # the statement order matters because the padding is added in between
    inverted = 1 - pixels
    canvas = tf.image.resize_with_crop_or_pad(inverted, np.int32(img_height), np.int32(img_width))
    shifted = 0.5 - canvas

    # Width first: the time dimension should correspond to the image width
    image = tf.transpose(shifted, perm=[1, 0, 2])

    # Characters -> integers, padded on the right with the blank index
    chars = tf.strings.unicode_split(label, input_encoding="UTF-8")
    encoded = char_to_num(chars)
    right_padding = [[0, max_length-len(encoded)]]
    encoded = tf.pad(encoded, right_padding, constant_values=blank_index)

    # The model expects two named inputs
    return {"image": image, "label": encoded}

In [None]:
def get_dataset(samples: pd.DataFrame, batch_size=batch_size, 
                shuffle_buffer:int=1024, prefetch:int=tf.data.experimental.AUTOTUNE) -> tf.data.Dataset:

    """Build a batched tf dataset from a metadata frame.

    samples: frame whose index holds picture names (without extension) and
        whose "description" column holds the labels.
    batch_size: number of samples per batch.
    shuffle_buffer: accepted but not used - no shuffling is applied here.
    prefetch: buffer size passed to Dataset.prefetch.

    Every (file path, label) pair is run through encode_single_sample.
    """

    # Full file path for every picture: <img_path>/<name>.jpg
    file_names = [os.path.join(img_path, name) + '.jpg' for name in samples.index]
    labels = samples.description.tolist()

    pairs = tf.data.Dataset.from_tensor_slices((file_names, labels))
    encoded = pairs.map(encode_single_sample,
                        num_parallel_calls=tf.data.experimental.AUTOTUNE)

    return encoded.batch(batch_size).prefetch(prefetch)

In [None]:
def show_batch(batch, batch_size):

    """Utility function that draws the pictures of one batch with their labels.

    batch: dict with the keys 'image' and 'label', as produced by
        encode_single_sample and batched by get_dataset.
    batch_size: maximum number of pictures to draw; fewer are drawn when the
        batch itself holds fewer samples.
    """

    images = batch['image']
    labels = batch['label']

    # A final batch can be shorter than batch_size
    n_shown = min(batch_size, int(images.shape[0]))
    if n_shown == 0:
        return

    # squeeze=False keeps `ax` a 2-D array even when a single picture is drawn
    _, ax = plt.subplots(n_shown, 1, figsize=(10, n_shown * 2), squeeze=False)
    for i in range(n_shown):
        # Undo the -0.5 shift applied in encode_single_sample and rescale to 0..255
        img = ((images[i] + 0.5) * 255).numpy().astype('uint8')
        # Integers -> characters, joined into one string, padding symbol removed
        label = tf.strings.reduce_join(num_to_char(labels[i])).numpy().decode('utf-8').replace('#', '')

        # Draw the whole picture (transposed back to height x width);
        # slicing with the loop index here would cut pixels off picture i
        ax[i, 0].imshow(img[:, :, 0].T, cmap='gray')
        ax[i, 0].set_title(label)
    plt.tight_layout()
    plt.show()

### One batch from tf dataset

In [None]:
# Picture names in their current order (the shuffle below is disabled)
ind = df.index.tolist()
# random.shuffle(ind)
# Dataset over the first 16 pictures; the first batch of it is drawn
ls = get_dataset(df.loc[ind].iloc[:16])
for batch in ls.take(1):
    show_batch(batch, batch_size=batch_size)

# Data augmentation

In [None]:
import cv2
import imgaug as ia
import imgaug.augmenters as iaa
import imageio
import numpy as np

# Full file path of every picture in df
paths = df.index.to_series().apply(lambda x: os.path.join(img_path, x) + '.jpg')

# Output folder for the augmented pictures (created next to the originals)
aug_1 = os.path.join(img_path, 'aug_1')

if not os.path.exists(aug_1):
    os.mkdir(aug_1)
sometimes = lambda aug: iaa.Sometimes(0.5, aug)

# Augmentation pipeline; the augmenters are applied in random order
seq = iaa.Sequential(
    [

        iaa.Sometimes(0.1, iaa.GaussianBlur(3.0)),
    
        iaa.Sometimes(0.1, iaa.AveragePooling(2)),
        iaa.Sometimes(0.1, iaa.Emboss(alpha=(0.0, 1.0), strength=(0.75, 1.25))),
        iaa.Sometimes(0.1, iaa.GammaContrast((0.5, 1.0))),
        iaa.Invert(0.05, per_channel=True),
        iaa.Sometimes(0.1, iaa.CoarseDropout((0.0, 0.05), size_percent=(0.02, 0.25))),

        iaa.ElasticTransformation(alpha=(0.5, 3.5), sigma=0.25),

        iaa.PerspectiveTransform(scale=(0.02, 0.05)),

        iaa.Sometimes(0.1, iaa.SaltAndPepper(0.05)),
    ],
    random_order=True
)

# Number of augmented copies written per source picture
AUG_COPIES = 30

# Generation is switched off by default, so running the notebook writes no
# files; set to True to (re)create the augmented pictures in `aug_1`.
RUN_AUGMENTATION = False

if RUN_AUGMENTATION:
    for path in tqdm(paths):
        img = imageio.imread(path)
        image = [np.copy(img) for _ in range(AUG_COPIES)]

        ls = seq(images=image)

        # Output name: <copy number>_aug_<original file name>
        _, name = os.path.split(path)
        for i in range(AUG_COPIES):
            cv2.imwrite(os.path.join(aug_1, f'{i}_aug_' + name), ls[i])

# Preprocess module default use 

In [None]:
from preprocess import *

# Target picture size, passed to every preprocess helper below
img_width = 600

img_height = 100

# default paths
# NOTE(review): absolute, machine-specific location - adjust before running elsewhere
WORKING_DIR = os.path.join('/home', 'mts')
ann_path = os.path.join(WORKING_DIR, 'HKR_Dataset_Words_Public', 'ann')
img_path = os.path.join(WORKING_DIR, 'HKR_Dataset_Words_Public', 'img')

# collect metadata
# NOTE(review): after `from preprocess import *` this name may refer to the
# module's own meta_collect rather than the one defined in this notebook - confirm
meta_collect(ann_path, os.path.join(WORKING_DIR, 'metadata', 'metadata.tsv'))

# get preprocessed metadata dataframe
df = PreprocessFrame(metadata=os.path.join(WORKING_DIR, 'metadata', 'metadata.tsv'),
                     img_height=img_height, img_width=img_width)
print(df.shape)

# Create the augmented pictures. If they already exist, comment out or delete
# the make_augments call; aug_df then stays None and is rebuilt below.
aug_df = None
aug_df = make_augments(df=df, img_path=img_path, WORKING_DIR=WORKING_DIR,
                        img_height=img_height, img_width=img_width)

# Fallback when make_augments was not run: derive the augments metadata from
# the original frame by prefixing every picture name with 'aug_1/aug_'
if not isinstance(aug_df, pd.DataFrame):
    aug_df = df.copy()
    aug_df.index = aug_df.index.to_series().apply(lambda x: os.path.join('aug_1', 'aug_' + x))

# Dataset(...) is unpacked into three parts
# (presumably tf datasets, since .take() is called on them below - confirm)
train, test, val = list(Dataset(df, aug_df=aug_df,
                                test_size=0.1,
                                val_size=0.05,
                                img_path=img_path,
                                img_height=img_height,
                                img_width=img_width,
                                WORKING_DIR=WORKING_DIR,
                                shuffle=True,
                                random_state=12))
train

In [None]:
# Draw the first batch of the train part
for batch in train.take(1):
    show_batch(batch, batch_size=batch_size)

In [None]:
# Draw the first batch of the test part
for batch in test.take(1):
    show_batch(batch, batch_size=batch_size)

In [None]:
# Draw the first batch of the val part
for batch in val.take(1):
    show_batch(batch, batch_size=batch_size)