In [1]:
!pip install english-words

Collecting english-words
  Downloading english-words-2.0.1.tar.gz (8.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: english-words
  Building wheel for english-words (pyproject.toml) ... [?25l[?25hdone
  Created wheel for english-words: filename=english_words-2.0.1-py3-none-any.whl size=8196236 sha256=fe0578e042dd3ba304e4d9ced5da339708a1dd1a32547a2fed9e90ce83685db0
  Stored in directory: /root/.cache/pip/wheels/f0/e6/d9/16a123647999fe535f03a36e7af23eef203736d84c7ca25b0b
Successfully built english-words
Installing collected packages: english-words
Successfully installed english-words-2.0.1


In [2]:
from google.colab import drive
# Attach Google Drive to the Colab filesystem; the dataset archive is read from it.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import os
import zipfile

# Unpack the by_class archive stored on Drive into the Colab working directory.
archive_path = '/content/drive/MyDrive/Datasets/SOURCE1/by_class.zip'
extract_root = '/content/'

with zipfile.ZipFile(archive_path, mode='r') as dataset_zip:
  dataset_zip.extractall(path=extract_root)

In [4]:
from PIL import Image
import numpy as np
import cv2

def get_grayscale_image(filepath):
  """Load an image file and return it as a grayscale NumPy array.

  Args:
    filepath: Path of the image file to read.

  Returns:
    np.ndarray with the pixel data of the image converted to Pillow
    mode 'L' (8-bit, single channel).
  """
  # The context manager closes the underlying file handle deterministically
  # instead of leaving it to Pillow's lazy loading / garbage collection.
  with Image.open(filepath) as img:
    return np.array(img.convert('L'))

def crop_grayscale(img):
  """Binarize a grayscale image and crop it to the bounding box of its foreground.

  The image is thresholded with ``cv2.THRESH_BINARY_INV`` at 128 (pixels
  above 128 become 0, all others become 255). The binary image is then
  cropped to the smallest rectangle that encloses the bounding boxes of all
  external contours.

  Args:
    img: Grayscale image array (e.g. the result of ``get_grayscale_image``).

  Returns:
    The inverted binary image, cropped to its foreground. If the binary
    image has no foreground at all, the whole (all-zero) binary image is
    returned, so both paths return an image of the same polarity.
  """
  _, bin_img = cv2.threshold(img, 128, 255, type=cv2.THRESH_BINARY_INV)
  contours, _ = cv2.findContours(bin_img, mode=cv2.RETR_EXTERNAL,
                                 method=cv2.CHAIN_APPROX_SIMPLE)

  if not contours:
    # Return the binarized image rather than the raw input: callers invert
    # the result, so handing back the un-inverted input here would turn a
    # blank sample into a solid dark tile.
    return bin_img

  # Each rect is (x, y, width, height).
  rects = [cv2.boundingRect(contour) for contour in contours]
  x_min = min(rect[0] for rect in rects)
  y_min = min(rect[1] for rect in rects)
  x_max = max(rect[0] + rect[2] for rect in rects)
  y_max = max(rect[1] + rect[3] for rect in rects)

  return bin_img[y_min:y_max, x_min:x_max]

In [5]:
from english_words import get_english_words_set
# Sorted so that list positions are stable between kernel sessions; turning a
# set of strings into a list directly yields an arbitrary order.
words_list = sorted(get_english_words_set(['web2']))


In [11]:
import os
import random
from PIL import Image, ImageOps
import time
import shutil
import numpy as np

train_dir = "/content/by_class/train"

# classes_imgs maps each class directory name to the file names inside it;
# classes_lengths maps the same name to how many files there are.
classes_imgs = {}
classes_lengths = {}

for c in sorted(os.listdir(train_dir)):
  class_dir = os.path.join(train_dir, c)
  # Skip stray files next to the class folders: listing a non-directory
  # would raise NotADirectoryError and abort the whole cell.
  if not os.path.isdir(class_dir):
    continue
  # Sorted because os.listdir returns entries in arbitrary order.
  classes_imgs[c] = sorted(os.listdir(class_dir))
  classes_lengths[c] = len(classes_imgs[c])

def generate_word(word, output_path, padding_range=(0, 4), vert_margin=3):
    """Render a word as one image assembled from random per-character samples.

    For every character of ``word`` (after stripping surrounding whitespace)
    a random image of that character's class is loaded from ``train_dir``,
    binarized and cropped with ``crop_grayscale``, inverted, and pasted left
    to right onto a white canvas with a random horizontal gap after each
    character except the last.

    Args:
        word: Text to render. Every character must be a key of
            ``classes_imgs`` whose class contains at least one image.
        output_path: Destination path passed to ``Image.save``.
        padding_range: Inclusive ``(low, high)`` bounds, in pixels, for the
            random gap placed after each character except the last.
        vert_margin: White margin, in pixels, added above and below the
            tallest character.

    Raises:
        ValueError: If ``word`` is empty after stripping, or a character's
            class contains no images.
        KeyError: If a character has no class in ``classes_imgs``.
    """
    chars = word.strip()
    if not chars:
        # Fail before any image work; an empty word would otherwise produce
        # a zero-width canvas.
        raise ValueError(f"cannot render an empty word: {word!r}")

    char_imgs = []
    for c in chars:
        if c not in classes_imgs:
            raise KeyError(f"no image class for character {c!r} in word {word!r}")
        candidates = classes_imgs[c]
        if not candidates:
            raise ValueError(f"image class {c!r} contains no images")

        img = get_grayscale_image(os.path.join(train_dir, c, random.choice(candidates)))
        char_imgs.append(Image.fromarray(crop_grayscale(img), mode='L').convert('RGB'))

    max_height = max(char_img.size[1] for char_img in char_imgs)
    # One random gap per character, except that the last character gets none.
    gaps = [random.randint(*padding_range) for _ in char_imgs[:-1]] + [0]

    canvas_width = sum(char_img.size[0] for char_img in char_imgs) + sum(gaps)
    canvas = Image.new('RGB', (canvas_width, max_height + 2 * vert_margin),
                       color=(255, 255, 255))

    x_offset = 0
    for char_img, gap in zip(char_imgs, gaps):
        # Center shorter characters vertically relative to the tallest one.
        vert_padding = (max_height - char_img.size[1]) // 2 + vert_margin
        # crop_grayscale binarizes with THRESH_BINARY_INV, so invert the crop
        # before surrounding it with white padding (border order is
        # left, top, right, bottom).
        padded = ImageOps.expand(ImageOps.invert(char_img),
                                 (0, vert_padding, gap, vert_padding),
                                 (255, 255, 255))
        canvas.paste(padded, (x_offset, 0))
        x_offset += padded.size[0]

    canvas.save(output_path)


def generate_filename(word, frmt='jpg'):
    """Build a file name shaped like ``<word>_<time.time()>_<number>.<frmt>``.

    The current ``time.time()`` value and a random integer in [100, 999] are
    embedded in the name, separated by underscores.

    Args:
        word: String used as the first part of the name.
        frmt: File extension, without the leading dot.

    Returns:
        The assembled file name as a string.
    """
    stem = '_'.join((word, str(time.time()), str(random.randint(100, 999))))
    return '.'.join((stem, frmt))


In [12]:
out_dir = '/content/words/train/'
n_words = 50000  # number of word images to generate

os.makedirs(out_dir, exist_ok=True)

# generate_word looks every character up in classes_lengths, so a single word
# with an unsupported character (or an empty word) would abort the whole run
# part-way through. Filter such words out once, up front.
renderable_words = [
    w for w in words_list
    if w.strip() and all(classes_lengths.get(ch, 0) > 0 for ch in w.strip())
]
if not renderable_words:
  raise ValueError('no word in words_list can be rendered with the available character classes')

for idx in range(n_words):
  word = random.choice(renderable_words)
  filename = generate_filename(word)
  # Draw a new name if this one is already taken in out_dir.
  while os.path.exists(os.path.join(out_dir, filename)):
    filename = generate_filename(word)
  generate_word(word, os.path.join(out_dir, filename))



In [13]:
import shutil
# Zip the contents of out_dir; the call returns the path of the archive it wrote.
shutil.make_archive(base_name="/content/words_archive", format="zip", root_dir=out_dir)


'/content/words_archive.zip'