In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
!pip install imutils
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Define util functions

In [None]:
import json
import os
import tqdm

# Root folder of the competition data; the cells below build their input paths from it.
path = '../input/super-ai-engineer-2021-font-recognition/'
# Short alias for os.path.join, used throughout the notebook.
join = os.path.join

def get_ground_truth(json_path):
    """Load and return the parsed contents of a ground-truth JSON file.

    Args:
        json_path: Path of the JSON file to read.

    Returns:
        The decoded JSON document, exactly as ``json.load`` produces it.
    """
    # Use a context manager so the handle is closed immediately: this helper is
    # called once per sample folder, and leaked handles would pile up quickly.
    # Binary mode lets the json module detect the UTF encoding itself instead
    # of depending on the platform's default text encoding.
    with open(json_path, 'rb') as handle:
        return json.load(handle)

def get_font_family_dict(fonts_dir=None):
    """Map consecutive integer ids to the font names found in a fonts folder.

    Args:
        fonts_dir: Folder to scan. When omitted, the ``fonts`` folder under the
            notebook-level ``path`` is used.

    Returns:
        dict: ``{0: name0, 1: name1, ...}`` where each name is an entry's file
        name cut at the first ``".ttf"``, numbered in sorted name order.
    """
    if fonts_dir is None:
        fonts_dir = join(path, 'fonts')
    # os.scandir yields entries in arbitrary order; sorting keeps the
    # id -> font assignment identical between runs and machines.
    with os.scandir(fonts_dir) as entries:
        names = sorted(entry.name.split(".ttf")[0] for entry in entries)
    return dict(enumerate(names))
        
def _index_unique(values):
    """Number the distinct items of ``values`` in a reproducible order."""
    distinct = set(values)
    try:
        ordered = sorted(distinct)
    except TypeError:
        # Mutually unorderable types (e.g. int and str): order by repr instead.
        ordered = sorted(distinct, key=repr)
    return dict(enumerate(ordered))


def get_font_size_weight_style_dict():
    """Collect the distinct font sizes, weights and styles used in the train data.

    Walks ``<path>/train/<set>/<sample>/gt_text.json`` and reads the
    ``fontSize``, ``fontWeight`` and ``fontStyle`` entries of every element's
    ``style`` mapping.

    Returns:
        tuple: three dicts ``(size, weight, style)``, each mapping a consecutive
        integer id to one distinct value. Ids follow the sorted order of the
        values, so the mapping is the same on every run.
    """
    # Imported locally: a later cell rebinds the global name `tqdm` to the
    # progress-bar class itself, after which `tqdm.tqdm` no longer resolves.
    from tqdm import tqdm as progress_bar

    train_root = join(path, 'train')
    # Materialise the set folders first so the progress bar knows its total.
    # (The original passed os.path.getsize(...) as tqdm's second positional
    # argument, which is `desc`, not `total`.)
    with os.scandir(train_root) as entries:
        set_dirs = sorted((e.path for e in entries if e.is_dir()))

    sizes, weights, styles = set(), set(), set()
    for set_dir in progress_bar(set_dirs, desc='scanning train sets'):
        with os.scandir(set_dir) as samples:
            # Skip stray files; only sample folders hold a gt_text.json.
            sample_dirs = [s.path for s in samples if s.is_dir()]
        for sample_dir in sample_dirs:
            for element in get_ground_truth(join(sample_dir, 'gt_text.json')):
                style = element["style"]
                sizes.add(style['fontSize'])
                weights.add(style['fontWeight'])
                styles.add(style['fontStyle'])

    return _index_unique(sizes), _index_unique(weights), _index_unique(styles)

def get_label_dict():
    """Gather all four label lookup tables in a single call.

    Returns:
        tuple: ``(font_family, size, weight, style)`` - the result of
        ``get_font_family_dict()`` followed by the three dictionaries returned
        by ``get_font_size_weight_style_dict()``.
    """
    families = get_font_family_dict()
    sizes, weights, styles = get_font_size_weight_style_dict()
    labels = (families, sizes, weights, styles)
    return labels

def prepare_dataset_folder():
    """Create ``./datasets`` together with its ``train`` and ``test`` subfolders.

    Folders that already exist are left untouched. Returns nothing.
    """
    root = './datasets'
    # Root first, then each split folder beneath it.
    for folder in (root, join(root, 'train'), join(root, 'test')):
        os.makedirs(folder, exist_ok=True)

In [None]:
# Show the id -> font name mapping as this cell's output.
get_font_family_dict()

# Character cropping function using Label & Regionprops

In [None]:
import cv2
import imutils
from skimage.measure import label, regionprops

def get_char_segmented_image(segment, upsamp_multiplier = 3):
    """Split a grayscale text crop into one binary patch per connected component.

    The crop is enlarged, binarised with an inverted adaptive mean threshold
    and labelled with ``skimage.measure.label``; the bounding box of every
    labelled region is then cut out of the binary image.

    Args:
        segment: Image array as returned by ``cv2.imread(..., 0)``.
            ``cv2.adaptiveThreshold`` requires it to be single-channel 8-bit.
        upsamp_multiplier: Integer factor by which width and height are
            enlarged before thresholding.

    Returns:
        list: Binary (0/255) sub-images, one per region, in ``regionprops`` order.

    Raises:
        ValueError: If ``segment`` is ``None`` (``cv2.imread`` returns ``None``
            for a file it cannot read).
    """
    if segment is None:
        raise ValueError("segment is None; the source image could not be loaded")

    height, width = segment.shape[:2]
    # `interpolation` must be passed by keyword: the third positional parameter
    # of cv2.resize is `dst`, so a positional cv2.INTER_CUBIC is not used as
    # the interpolation mode.
    enlarged = cv2.resize(
        segment,
        (width * upsamp_multiplier, height * upsamp_multiplier),
        interpolation=cv2.INTER_CUBIC,
    )
    binary = cv2.adaptiveThreshold(
        enlarged, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY_INV, 21, 10
    )

    # Full connectivity (== ndim) also joins pixels that only touch diagonally.
    label_img = label(binary, connectivity=binary.ndim)

    rois = []
    for prop in regionprops(label_img):
        min_row, min_col, max_row, max_col = prop.bbox
        rois.append(binary[min_row:max_row, min_col:max_col])
    return rois

# Regionprops & Label example

In [None]:
import matplotlib.pyplot as plt
import cv2
import imutils
from skimage.measure import label, regionprops
import tqdm

# Ids are listed from, and read back out of, the same training-set folder so
# that every id is guaranteed to resolve to an existing sample. (The original
# listed 'set2' but read from 'set1'.)
example_set = 'set1'
max_image_count = 10
upsample_factor = 3

img_ids = []
with os.scandir(join(path, 'train', example_set)) as entries:
    for entry in entries:
        if len(img_ids) >= max_image_count:
            break
        img_ids.append(entry.name)

for img_id in tqdm.tqdm(img_ids):
    sample_dir = join(path, 'train', example_set, str(img_id))
    image = cv2.imread(join(sample_dir, 'image.png'), 0)
    if image is None:
        raise FileNotFoundError(f"could not read {join(sample_dir, 'image.png')}")
    with open(join(sample_dir, 'gt_text.json'), 'rb') as handle:
        js = json.load(handle)

    # Crop to the first annotated text rectangle of the sample.
    rect = js[0]["rect"]
    image = image[int(rect["y"]):int(rect["y"]+rect["height"]), int(rect["x"]):int(rect["x"]+rect["width"])]
    # `interpolation` has to be a keyword argument; the third positional
    # parameter of cv2.resize is `dst`.
    image = cv2.resize(
        image,
        (image.shape[1]*upsample_factor, image.shape[0]*upsample_factor),
        interpolation=cv2.INTER_CUBIC,
    )
    image = cv2.adaptiveThreshold(image, 255,cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY_INV, 21, 10)
    contours, _ = cv2.findContours(image,cv2.RETR_TREE,cv2.CHAIN_APPROX_SIMPLE)

    label_img = label(image, connectivity=image.ndim)
    props = regionprops(label_img)

    # One shared 2-row grid: regionprops crops on the top row, contour crops on
    # the bottom row, so the two methods can be compared without the second set
    # of axes being drawn over the first.
    n_cols = max(len(props), len(contours), 1)
    fig = plt.figure(figsize=(15, 7))

    for col, prop in enumerate(props, start=1):
        min_row, min_col, max_row, max_col = prop.bbox
        ax = fig.add_subplot(2, n_cols, col)
        ax.imshow(image[min_row:max_row, min_col:max_col])
        ax.axis('off')

    for col, cnt in enumerate(contours, start=1):
        x,y,w,h = cv2.boundingRect(cnt)
        ax = fig.add_subplot(2, n_cols, n_cols + col)
        ax.imshow(image[y:y+h, x:x+w])
        ax.axis('off')

    # Separate figure with the whole binarised crop for reference.
    fig2 = plt.figure(figsize=(10, 7))
    ax = fig2.add_subplot(3, 1, 1)
    ax.imshow(image)
    ax.axis('off')

# Preprocess Datasets

In [None]:
# Delete the ./datasets folder (if any) left behind by an earlier run.
!rm -rf ./datasets

In [None]:
# Create ./datasets with its train and test subfolders.
prepare_dataset_folder()

In [None]:
import pandas as pd
import cv2
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

dataset_path = '../input/w1n1-data'
output_path = './datasets'
split_size = 0.7
# Fixed seed so the shuffle - and therefore the train/test split - is repeatable.
split_seed = 42
print(os.listdir(dataset_path))

df = pd.read_csv(join(dataset_path, 'df.csv'))

df = df.sample(frac=1, random_state=split_seed).reset_index(drop=True)
print(df.head())
print(len(df))

split_pos = int(len(df) * split_size)


def _export_char_segments(rows, split_name):
    """Segment every crop listed in ``rows`` and save the pieces as PNG files.

    Args:
        rows: DataFrame slice with at least the ``file`` and ``fontFamily`` columns.
        split_name: Sub-folder of ``output_path`` to write into ('train' or 'test').

    Returns:
        int: Number of segment images written.

    Raises:
        FileNotFoundError: If a listed crop image cannot be read.
        OSError: If OpenCV reports that a segment could not be written.
    """
    written = 0
    for _, row in tqdm(rows.iterrows(), total=len(rows)):
        file = row["file"]
        font_family = row['fontFamily']
        image_path = join(dataset_path, 'crop_images', file)
        image = cv2.imread(image_path, 0)
        if image is None:
            raise FileNotFoundError(f'could not read image: {image_path}')
        segmented = get_char_segmented_image(image, upsamp_multiplier = 3)

        save_path = join(output_path, split_name, font_family)
        os.makedirs(save_path, exist_ok=True)

        # cv2.imwrite chooses the encoder from the file extension, so the
        # counter has to go before a real image suffix rather than after it.
        stem = os.path.splitext(os.path.basename(file))[0]
        for counter, seg in enumerate(segmented):
            target = join(save_path, f'{stem}_{counter}.png')
            if not cv2.imwrite(target, seg):
                raise OSError(f'could not write segment: {target}')
            written += 1
    return written


# Positional slices (iloc) are half-open, so no row lands in both splits;
# label slices with .loc include the end label and duplicated row `split_pos`.
print('processing train dataset...')
seg_count = _export_char_segments(df.iloc[:split_pos], 'train')

print('processing test dataset...')
seg_count += _export_char_segments(df.iloc[split_pos:], 'test')
    
                                    

In [None]:
# Index at which the shuffled frame is cut into its train and test parts.
print(split_pos)

In [None]:
train_path = './datasets/train/'
test_path = './datasets/test/'

# Report, for each split, how many files every class folder holds.
for position, (title, split_root) in enumerate(
        (('train dataset', train_path), ('test dataset', test_path))):
    if position:
        # Blank line between the two listings.
        print()
    print(title)
    for class_dir in os.scandir(split_root):
        file_count = len(os.listdir(class_dir.path))
        print(class_dir.name, " : ", file_count)


In [None]:
import os
# Switch to the Kaggle working directory so the relative path below resolves there.
os.chdir(r'/kaggle/working')
from IPython.display import FileLinks
# As the cell's last expression, this displays links to the files under ./datasets/train/.
FileLinks('./datasets/train/')

In [None]:
import cv2

# exist_ok=True keeps the cell re-runnable: without it a second run raises
# FileExistsError because the folder is already there.
os.makedirs("dataset", exist_ok=True)
def crop_image():
    """Walk every training sample and crop each annotated text rectangle.

    For every ``<path>/train/<set>/<sample>`` folder this reads ``image.png``
    as grayscale plus ``gt_text.json`` and slices the image to each element's
    ``rect`` (``x``, ``y``, ``width``, ``height``).

    NOTE(review): the crops are neither stored nor returned yet - the original
    cell ends right after computing them. Decide where they should go.

    Raises:
        FileNotFoundError: If a sample's ``image.png`` cannot be read.
    """
    # Imported locally: an earlier cell rebinds the global name `tqdm` to the
    # progress-bar class itself, after which `tqdm.tqdm` no longer resolves.
    from tqdm import tqdm as progress_bar

    # Materialise the set folders so the progress bar knows its total. (The
    # original passed os.path.getsize(...) as tqdm's `desc` argument.)
    with os.scandir(path+'train') as entries:
        set_dirs = [entry.path for entry in entries if entry.is_dir()]

    for set_dir in progress_bar(set_dirs, desc='cropping'):
        with os.scandir(set_dir) as samples:
            sample_dirs = [sample.path for sample in samples if sample.is_dir()]
        for sample_dir in sample_dirs:
            data = get_ground_truth(join(sample_dir,'gt_text.json'))
            # The grayscale flag belongs to imread, not to join.
            image_file = join(sample_dir,'image.png')
            image = cv2.imread(image_file, 0)
            if image is None:
                raise FileNotFoundError(f'could not read image: {image_file}')
            for ele in data:
                rect = ele["rect"]
                # Clamp at 0: a negative start would wrap around when slicing.
                top = max(0, int(rect["y"]))
                left = max(0, int(rect["x"]))
                bottom = max(0, int(rect["y"]+rect["height"]))
                right = max(0, int(rect["x"]+rect["width"]))
                cropped = image[top:bottom, left:right]
                # TODO(review): `cropped` is currently discarded.
                
                
                
                

In [None]:
import os
import cv2

class FontFamilyGenerator(keras.utils.Sequence):
    """Keras ``Sequence`` yielding batches of training images with one-hot labels.

    Each sample id names a folder under ``<path>/train/set1`` or
    ``<path>/train/set0`` that holds ``image.png`` and ``gt_text.json``.

    NOTE(review): ``keras`` is not imported in any visible cell of this
    notebook - make sure it is imported before this cell runs.
    """

    # Sub-folders of ``<path>/train`` searched, in this order, for a sample id.
    _SET_DIRS = ('set1', 'set0')

    def __init__(self, ids, path=path, batch_size=32, *, dim, n_channels,
                 n_classes, shuffle=True, font_to_index=None):
        """Store the configuration and prepare the first epoch's ordering.

        Args:
            ids: Sequence of sample ids (folder names).
            path: Root folder of the competition data.
            batch_size: Number of samples per batch.
            dim: Target image size; assumed to be ``(height, width)`` - TODO confirm.
            n_channels: Size of the trailing channel axis of each batch.
            n_classes: Number of classes for the one-hot encoding.
            shuffle: Whether to reshuffle the sample order after every epoch.
            font_to_index: Optional ``{font name: class index}`` mapping. When
                omitted it is derived by inverting ``get_font_family_dict()``.

        ``dim``, ``n_channels`` and ``n_classes`` are keyword-only: they have
        no defaults, and Python does not allow required positional parameters
        after ones that have defaults.
        """
        self.dim = dim
        self.batch_size = batch_size
        self.ids = ids
        self.n_channels = n_channels
        self.n_classes = n_classes
        self.shuffle = shuffle
        self.path = path
        if font_to_index is None:
            font_to_index = {name: idx for idx, name in get_font_family_dict().items()}
        self.font_to_index = font_to_index
        self.on_epoch_end()

    def __len__(self):
        'Denotes the number of batches per epoch'
        return len(self.ids) // self.batch_size

    def __getitem__(self, index):
        'Generate one batch of data'
        start = index * self.batch_size
        positions = self.indexes[start:start + self.batch_size]
        batch_ids = [self.ids[k] for k in positions]
        return self.__data_generation(batch_ids)

    def on_epoch_end(self):
        'Rebuild (and optionally shuffle) the sample order used by __getitem__'
        # Shuffle an index array rather than `self.ids`, so the caller's list
        # is not reordered in place and `self.indexes` always exists.
        self.indexes = np.arange(len(self.ids))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def _sample_dir(self, sample_id):
        'Return the folder of `sample_id`, searching the known set folders in order'
        for set_name in self._SET_DIRS:
            candidate = join(self.path, 'train', set_name, str(sample_id))
            if os.path.exists(candidate):
                return candidate
        raise FileNotFoundError(
            f'sample {sample_id!r} not found under any of {self._SET_DIRS}')

    def _class_index(self, annotations):
        'Return the integer class of a sample from its parsed gt_text.json'
        # NOTE(review): assumes every annotation has style["fontFamily"], that
        # the first annotation's family labels the whole image, and that the
        # value matches a key of `font_to_index` - confirm against the data.
        family = annotations[0]["style"]["fontFamily"]
        return self.font_to_index[family]

    def __data_generation(self, list_IDs_temp):
        'Load, resize and label the samples of one batch'
        height, width = self.dim
        X = np.empty((len(list_IDs_temp), height, width, self.n_channels), dtype=np.uint8)
        y = np.empty(len(list_IDs_temp), dtype=int)
        for row, sample_id in enumerate(list_IDs_temp):
            sample_dir = self._sample_dir(sample_id)
            image_file = join(sample_dir, 'image.png')
            image = cv2.imread(image_file, 0)
            if image is None:
                raise FileNotFoundError(f'could not read image: {image_file}')
            with open(join(sample_dir, 'gt_text.json'), 'rb') as handle:
                annotations = json.load(handle)

            # cv2.resize takes the target size as (width, height).
            resized = cv2.resize(image, (width, height))
            # The grayscale plane is broadcast over the channel axis.
            # NOTE(review): pixel values are left unscaled (0-255).
            X[row] = resized[..., np.newaxis]
            y[row] = self._class_index(annotations)

        return X, keras.utils.to_categorical(y, num_classes=self.n_classes)
            