In [2]:

from PIL import Image
import cv2
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import matplotlib.path as mppath
import os
import json

img_dir = '/scratch/lt2326-h21/a1/images/'
meta_file = '/scratch/lt2326-h21/a1/info.json'
annotation_file = '/scratch/lt2326-h21/a1/train.jsonl'

# open folder, loop through files, check if file in training set
with open(meta_file, "r") as json_file:
    meta = json.load(json_file)
    train_file_names = [obj['file_name'] for obj in meta['train']]
    files_to_keep = [img_path for img_path in os.listdir(img_dir) if img_path in train_file_names]

# split into training, validation, and test datasets
from sklearn.model_selection import train_test_split
training_data, temp_data = train_test_split(files_to_keep, test_size=0.3)
validation_data = temp_data[:len(temp_data)//2]
testing_data = temp_data[len(temp_data)//2:]

# find the corresponding bounding box information for each image
def get_bounding_boxes(files_list):
    with open(annotation_file, "r") as jsonl_file:
        result = {}
        for line in jsonl_file:
            line_as_dict = json.loads(line)
            if line_as_dict['file_name'] in files_list:
                for sentence in line_as_dict['annotations']:
                    for instance in sentence:
                        if instance['is_chinese']:
                            if line_as_dict['file_name'] not in result:
                                result[line_as_dict['file_name']] = []
                            result[line_as_dict['file_name']].append(instance['polygon'])
        return result
                            
def get_bounding_box1(data):
    with open('/scratch/lt2326-h21/a1/train.jsonl') as jsonl_f:
        jsonl_data = [json.loads(x) for x in jsonl_f]
        #images_polygons = []

        for dictionary in jsonl_data:
            if dictionary['file_name'] in data:
                insider_data = {dictionary['file_name']:[]}

                for sentence in dictionary['annotations']:
                    for character in sentence:
                        if character['is_chinese']:
                            insider_data[dictionary['file_name']].append(character['polygon'])
                #images_polygons.append(insider_data)
        
        return insider_data #images_polygons
        
training_dict = get_bounding_boxes(training_data)
validation_dict = get_bounding_boxes(validation_data)
testing_dict = get_bounding_boxes(testing_data)



In [3]:

grid = np.array([[[a,b] for b in list(range(2048))] for a in list(range(2048))])
grid = grid.reshape(2048*2048, 2)

# training data is tuple (img, pixel_labels)
# where img is of shape (2048, 2048, 3) (3 is rgb values of the pixel)
# pixel_labels is of shape (4194304,)
# meaning each pixel in img should have a label 1 or 0, corresponding if that pixel is inside the polygon

def get_pixel_labels(img_key, data_dict):
    # convert to numpy array
    img_array = mpimg.imread(img_dir + img_key)

    # if pixel is inside polygon, give it label true
    # if not, give label false
    final_labels = []
    for polygon in data_dict[img_key]:
        path = mppath.Path(np.array(polygon))
        pixel_labels = np.asarray(path.contains_points(grid), int)
        if len(final_labels) < 1:
            final_labels = pixel_labels
        else:
            final_labels = np.maximum(final_labels, pixel_labels) # replace old zeros with ones

    return (img_array, pixel_labels)


def parallel_processing(img_key, data_dict):
    return (get_pixel_labels(img_key, data_dict))


def process_in_parallel(validation_data_list):

    tic = time.time()
    processed_data = Parallel(n_jobs=10)(delayed(parallel_processing)(img_key, validation_dict) for img_key in list(validation_dict.keys()))
    toc = time.time()

    print('Elapsed time for the entire processing: {:.2f} s'.format(toc - tic))
    
    return processed_data

In [4]:
from joblib import Parallel, delayed
import time

print("Processing training data...")
train_processed = process_in_parallel(training_dict)
print("Processing test data...")
test_processed = process_in_parallel(testing_dict)
print("Processing validation data...")
val_processed = process_in_parallel(validation_dict)

Processing training data...
Elapsed time for the entire processing: 83.36 s
Processing test data...
Elapsed time for the entire processing: 82.08 s
Processing validation data...
Elapsed time for the entire processing: 82.45 s


In [5]:
print(len(list(validation_dict.keys())))

127
