# Notebook for configuring the project
The project already ships with the files needed to run it by default in `data AM substitution/`. Copy its contents to `data/` and the project will be configured.

Below are instructions for changing parts of this default configuration (the third section, on adding more images, is most likely the most useful one for you).

## Adding custom text
If you are not satisfied with the source text in `data AM substitution/newsgroup/newsgroup.txt`, you can edit it directly. The only thing that matters is the name of the folder and the file - keep them the same.

## Constructing necessary font models
This cell looks at the fonts listed in `data/fonts/fontlist.txt` and constructs the linear models needed for rendering them on images.

Run this after adding more fonts to `data/fonts/` to generate new font models at `data/models/font_px2pt.cp`.
"Font models map each pixel of the font to related points, because points have physical measure of length while pixels don't." - say the authors of the original paper.

In [None]:
# Author: Ankush Gupta
# Date: 2015

"""
THIS SCRIPT IS NOT GUARANTEED TO WORK 

Fonts and font_px2pt.cp in `data AM substitution/` were configured using this file, so it's probably working,
but it's extremely hard to test
"""

import pygame
from pygame import freetype
from SynthTextCore.text_utils import FontState
import numpy as np 
import matplotlib.pyplot as plt 
import pickle as pkl


pygame.init()


# Font sizes sampled for the fit; each one is passed to
# font.get_sized_glyph_height() below.
ys = np.arange(8, 200)
# Design matrix [size, 1], so the least-squares solution m satisfies
# height ~= m[0] * size + m[1].
A = np.c_[ys, np.ones_like(ys)]

xs = []  # measured glyph heights, one array per font
models = {}  # linear model: font name -> [slope, intercept]

FS = FontState()
# plt.figure()
for i, font_path in enumerate(FS.fonts):
    print(i)
    font = freetype.Font(font_path, size=12)
    h = np.array([font.get_sized_glyph_height(float(y)) for y in ys])
    # Pass rcond explicitly: omitting it emits a FutureWarning on
    # NumPy 1.14-1.x and makes the cutoff depend on the NumPy version.
    m, _, _, _ = np.linalg.lstsq(A, h, rcond=None)
    models[font.name] = m
    xs.append(h)

# The models are keyed by font name and pickled for later loading.
with open('data/models/font_px2pt.cp', 'wb') as f:
    pkl.dump(models, f)
# plt.plot(xs,ys[i])
# plt.show()


0
1
2
3
4
5
6
7
8
9


## Adding more source images
This section downloads a huge number of pre-processed images from the URLs specified below
(Alternatively, you can download it manually [here](https://academictorrents.com/details/2dba9518166cbd141534cbf381aa3e99a087e83c))

It then merges the three parts - depth (15.4GB), segmentation (7.2GB) and images (9GB) - into a single `data/dset.h5` file (82GB).

This cell just prepares functions and dependencies

In [None]:
from tqdm import tqdm
import numpy as np
import h5py
import os, sys
import wget, tarfile
from SynthTextCore.common import *
import os.path as osp
from PIL import Image

# Root folder that holds every data artefact used by this notebook.
DATA_PATH = 'data'
# Merged data-file that will contain image, depth and segmentation.
DB_FNAME = os.path.join(DATA_PATH, 'dset.h5')

# Where the downloaded pre-processed inputs are expected to live.
more_img_file_path = os.path.join(DATA_PATH, 'bg_img')
more_seg_path = os.path.join(DATA_PATH, 'seg.h5')
more_depth_path = os.path.join(DATA_PATH, 'depth.h5')

# Remote locations of the pre-processed inputs.
URL_DEPTH = 'http://thor.robots.ox.ac.uk/~vgg/data/scenetext/preproc/depth.h5'
URL_SEG = 'http://thor.robots.ox.ac.uk/~vgg/data/scenetext/preproc/seg.h5'
URL_IMG = 'http://thor.robots.ox.ac.uk/~vgg/data/scenetext/preproc/bg_img.tar.gz'

# download the pre-processed data: background image, depth and segmentation
def _fetch_preproc_part(url, final_path, start_msg, fail_msg, archive_name=None):
  """Download one pre-processed part next to `final_path`; exit on failure.

  Args:
    url: remote location of the part.
    final_path: path the part is expected at afterwards (it is the path the
      caller checks for existence and the one reported on success).
    start_msg: message prefix printed, followed by `url`, before downloading.
    fail_msg: message printed in red when anything goes wrong.
    archive_name: when given, `url` is treated as a tar archive: it is
      downloaded under this name into the directory of `final_path`,
      extracted there and then deleted.

  Raises:
    SystemExit: with status -1 if the download or extraction fails.
  """
  target_dir = osp.dirname(final_path) or '.'
  try:
    colorprint(Color.BLUE, start_msg + url, bold=True)
    print()
    sys.stdout.flush()
    os.makedirs(target_dir, exist_ok=True)
    if archive_name is None:
      wget.download(url, out=final_path)
    else:
      archive_path = osp.join(target_dir, archive_name)
      wget.download(url, out=archive_path)
      # NOTE(review): extractall() trusts the member paths stored in the
      # archive; only use it with archives from a trusted source.
      # Assumes the archive unpacks to a top-level entry named like
      # basename(final_path) -- TODO confirm against the real archive.
      with tarfile.open(archive_path) as tar:
        tar.extractall(path=target_dir)
      os.remove(archive_path)
    colorprint(Color.BLUE, '\n\tdata saved at:' + final_path, bold=True)
    sys.stdout.flush()
  except Exception as err:
    # Ctrl-C (KeyboardInterrupt) is deliberately not caught here so it
    # propagates instead of being reported as a download problem.
    print(colorize(Color.RED, fail_msg, bold=True))
    print(err)
    sys.stdout.flush()
    sys.exit(-1)


def download_preproc():
  """Download every pre-processed input that is not present yet.

  Checks `more_img_file_path`, `more_seg_path` and `more_depth_path`
  independently, so several missing parts are all fetched in one call.
  Each part is stored at the path that was checked, i.e. where
  `add_more_data_into_dset` is later pointed at. Does nothing when all
  three paths already exist.

  Raises:
    SystemExit: with status -1 if any download fails.
  """
  if not osp.exists(more_img_file_path):
    _fetch_preproc_part(
      URL_IMG,
      more_img_file_path,
      '\tdownloading image-data (8.9G) from: ',
      'Image-Data not found and have problems downloading.',
      archive_name='bg_img.tar.gz',
    )
  if not osp.exists(more_seg_path):
    _fetch_preproc_part(
      URL_SEG,
      more_seg_path,
      '\tdownloading segmentation-data (6.9G) from: ',
      'Segmentation-Data not found and have problems downloading.',
    )
  if not osp.exists(more_depth_path):
    _fetch_preproc_part(
      URL_DEPTH,
      more_depth_path,
      '\tdownloading depth-data (15G) from: ',
      'Depth-Data not found and have problems downloading.',
    )


# add/merge pre-processed data files into the dset.h5 database
def add_more_data_into_dset(DB_FNAME,more_img_file_path,more_depth_path,more_seg_path):
  """Merge background images with their depth and segmentation into one HDF5 file.

  For every `.jpg` in `more_img_file_path` that has an entry in the depth
  file and an entry (with `area` and `label` attributes) under `mask` in the
  segmentation file, the image, depth and segmentation are copied into the
  `image`, `depth` and `seg` groups of `DB_FNAME`. Images that cannot be
  opened, or that lack depth or segmentation, are skipped entirely, so the
  three groups always hold the same set of names.

  Args:
    DB_FNAME: output HDF5 path; opened in 'w' mode, so an existing file is
      overwritten.
    more_img_file_path: directory containing the background images.
    more_depth_path: HDF5 file with one depth dataset per image name.
    more_seg_path: HDF5 file with segmentation datasets under `mask`.

  A KeyboardInterrupt during the merge stops it early; what was written up
  to that point is kept and the output file is closed properly.
  """
  print (colorize(Color.GREEN,'adding data into h5 file..',bold=True))

  # Should be the only problematic images in the dataset; filtered out
  # rather than removed so a directory without them does not raise.
  excluded = {
    'turtles_5.jpg',
    'hubble_44.jpg',
    'aquarium_126.jpg',
    'van+gogh_19.jpg',
  }
  img_names = [name for name in os.listdir(more_img_file_path) if name not in excluded]

  # Open the read-only inputs first: if one is missing we fail before the
  # output file gets truncated by mode 'w'.
  with h5py.File(more_depth_path, 'r') as depth_db, h5py.File(more_seg_path, 'r') as seg_db:
    db = h5py.File(DB_FNAME, 'w')
    try:
      db.create_group('image')
      db.create_group('depth')
      db.create_group('seg')

      iterator = tqdm(img_names, desc="Constructing dset.h5", total=len(img_names))
      for imname in iterator:
        if not imname.endswith('.jpg'):
          continue
        full_path = osp.join(more_img_file_path, imname)
        iterator.set_description(f"'{full_path}': {os.path.getsize(full_path)} bytes")

        # Not every image has a corresponding depth and segmentation.
        # Resolve everything before writing so a missing piece never
        # leaves a half-written entry behind.
        try:
          depth_src = depth_db[imname]
          seg_src = seg_db['mask'][imname]
          seg_area = seg_src.attrs['area']
          seg_label = seg_src.attrs['label']
        except KeyError:
          continue

        try:
          with Image.open(full_path) as img:
            img_np = np.array(img)
        except OSError:
          continue

        db['depth'].create_dataset(imname, data=depth_src)
        seg_dst = db['seg'].create_dataset(imname, data=seg_src)
        seg_dst.attrs['area'] = seg_area
        seg_dst.attrs['label'] = seg_label
        db['image'].create_dataset(imname, data=img_np)
      print(colorize(Color.GREEN,'\t-> done',bold=True))
    except KeyboardInterrupt:
      print(colorize(Color.RED,'\tExiting early',bold=True))
    finally:
      db.close()
      print (colorize(Color.BLUE,'Stored the data in: '+DB_FNAME, bold=True))


If you downloaded the dataset manually, make sure to set up the files like this:
data/
  - depth.h5
  - seg.h5
  - bg_img/
    - img1.jpg
    - img2.jpg
    - ...

Otherwise, this script will try to download them.

After initialization, run this cell.
Make sure you have enough disk space! This script __should__ be safe in the sense that any data already saved will be fine, even if the script crashes in the middle of merging, but there might be something I did not catch!

In [None]:
if __name__ == '__main__':
  # Download pre-processed inputs; a no-op when bg_img/, seg.h5 and
  # depth.h5 are all already in place under data/.
  download_preproc()
  # Build the merged database from the images, depth and segmentation.
  add_more_data_into_dset(
    DB_FNAME=DB_FNAME,
    more_img_file_path=more_img_file_path,
    more_depth_path=more_depth_path,
    more_seg_path=more_seg_path,
  )