In [42]:
import pickle
import pandas as pd
import numpy as np
import time
import math
import os
from PIL import Image
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
import datetime

a1dir = "../../../scratch/lt2326-h21/a1"
imgs = os.listdir(a1dir+'/images')

batch_size = 4

In [71]:
def get_meta(direc):
    
    df = pd.read_json(direc + "/train.jsonl", lines=True)
    imgs = os.listdir(direc+'/images')
    #print(df.columns)
    df =  df[df['file_name'].isin(imgs)]
    df = df.drop(['ignore', 'image_id', 'height', 'width'], axis=1)    
    return df

In [72]:
df = get_meta(a1dir)

In [27]:
# this literally doesn't even take a second so no worries
def get_bboxes(df, k=None):
    
    if k is not None:
        df = df.sample(k, random_state=42)
    
    all_bboxes = {}
    for file in df.iterrows():
        img_bboxes = [] # bboxes per image
        for sign in file[1]['annotations']:
            for d in sign:
                if d['is_chinese']:
                    x, y, w, h = d['adjusted_bbox']
                    xmin = int(math.floor(x))
                    xmax = int(math.ceil(x + w))
                    ymin = int(math.floor(y))
                    ymax = int(math.ceil(y + h))
                    bbox = np.array([xmin, ymin, xmax, ymax])
                    img_bboxes.append(bbox)


        all_bboxes[file[1]['file_name']] = np.array(img_bboxes)
    
    return all_bboxes
    

In [51]:
tic = time.perf_counter()
all_bboxes = get_bboxes(df)
toc = time.perf_counter()
print(f"Finished in {toc - tic:0.4f} seconds")

Finished in 0.2483 seconds


In [52]:
def get_labels(bboxes=all_bboxes):
    
    print(f'Time now: {datetime.datetime.now().strftime("%H:%M:%S")}')
    
    p =  [(img_i0, img_i1) for img_i0, img_i1 in np.ndindex(2048, 2048)] # first two numbers of indices of each image

    tic = time.perf_counter()
    labelling = {}
    count = 0
    for fn, bbox in all_bboxes.items():
        tac = time.perf_counter()
        lab = ((p>=bbox[:,None,:2]) & (p<=bbox[:,None,2:])).all(2) # thanks https://stackoverflow.com/questions/62235257/determine-if-many-points-are-in-bounding-box
        lab2 = lab.reshape(len(bbox), 2048, 2048).astype(int)
        #lab2int = lab2.astype(int)
        lab2intsumclip = np.clip(np.sum(lab2, axis = 0), 0, 1).reshape(2048, 2048, 1)
        #lab2intsumreshape = lab2intsumclip.reshape(2048, 2048, 1)

        labelling[fn] = lab2intsumclip
        count += 1
        tuc = time.perf_counter()
        print(f"Finished image {fn} in {tuc - tac:0.4f} seconds, {count}/{len(all_bboxes)}")


    toc = time.perf_counter()
    final_time = str(datetime.timedelta(seconds=round(toc - tic)))
    s_per_img = (toc - tic)/len(all_bboxes)
    
    print(f"Finished in {final_time}\nAvg. time per img: {s_per_img} seconds.")
    print(f'Finished at: {datetime.datetime.now().strftime("%H:%M:%S")}')
    
    return labelling


In [53]:
labelling = get_labels()

Time now: 22:55:32
Finished image 0000172.jpg in 13.3935 seconds, 1/845
Finished image 0000174.jpg in 9.3783 seconds, 2/845
Finished image 0000176.jpg in 12.4703 seconds, 3/845
Finished image 0000181.jpg in 12.9510 seconds, 4/845
Finished image 0000183.jpg in 16.5338 seconds, 5/845
Finished image 0000223.jpg in 12.3339 seconds, 16/845
Finished image 0000225.jpg in 12.3185 seconds, 17/845
Finished image 0000230.jpg in 12.6340 seconds, 18/845
Finished image 0000233.jpg in 15.3009 seconds, 19/845
Finished image 0000234.jpg in 16.0894 seconds, 20/845
Finished image 0000392.jpg in 13.4432 seconds, 21/845
Finished image 0000395.jpg in 14.5208 seconds, 22/845
Finished image 0000397.jpg in 14.2179 seconds, 23/845
Finished image 0000400.jpg in 13.4316 seconds, 24/845
Finished image 0000401.jpg in 13.5317 seconds, 25/845
Finished image 0000406.jpg in 12.6224 seconds, 26/845
Finished image 0000415.jpg in 8.7132 seconds, 27/845
Finished image 0000419.jpg in 12.9708 seconds, 28/845
Finished image 0

Finished image 0000884.jpg in 9.9530 seconds, 163/845
Finished image 0000885.jpg in 10.2740 seconds, 164/845
Finished image 0000886.jpg in 19.8908 seconds, 165/845
Finished image 0000887.jpg in 15.4462 seconds, 166/845
Finished image 0000889.jpg in 11.5536 seconds, 167/845
Finished image 0000890.jpg in 9.6660 seconds, 168/845
Finished image 0000892.jpg in 14.8703 seconds, 169/845
Finished image 0000893.jpg in 17.4932 seconds, 170/845
Finished image 0000895.jpg in 19.5003 seconds, 171/845
Finished image 0000896.jpg in 14.0523 seconds, 172/845
Finished image 0000899.jpg in 12.2026 seconds, 173/845
Finished image 0000900.jpg in 13.5731 seconds, 174/845
Finished image 0000901.jpg in 12.0315 seconds, 175/845
Finished image 0000902.jpg in 9.2196 seconds, 176/845
Finished image 0000906.jpg in 12.7903 seconds, 177/845
Finished image 0000913.jpg in 10.9199 seconds, 178/845
Finished image 0000925.jpg in 9.9430 seconds, 179/845
Finished image 0000926.jpg in 9.8227 seconds, 180/845
Finished image 

Finished image 0001586.jpg in 15.1311 seconds, 314/845
Finished image 0001587.jpg in 14.5270 seconds, 315/845
Finished image 0001730.jpg in 8.4665 seconds, 316/845
Finished image 0001738.jpg in 9.2075 seconds, 317/845
Finished image 0001739.jpg in 11.2243 seconds, 318/845
Finished image 0001740.jpg in 11.2524 seconds, 319/845
Finished image 0001746.jpg in 7.5272 seconds, 320/845
Finished image 0001748.jpg in 11.5377 seconds, 321/845
Finished image 0001754.jpg in 9.8525 seconds, 322/845
Finished image 0001755.jpg in 9.2176 seconds, 323/845
Finished image 0001756.jpg in 11.8373 seconds, 324/845
Finished image 0001757.jpg in 8.9166 seconds, 325/845
Finished image 0001758.jpg in 10.2913 seconds, 326/845
Finished image 0001760.jpg in 7.5050 seconds, 327/845
Finished image 0001761.jpg in 9.3629 seconds, 328/845
Finished image 0001762.jpg in 9.3708 seconds, 329/845
Finished image 0001763.jpg in 11.2320 seconds, 330/845
Finished image 0001764.jpg in 10.6752 seconds, 331/845
Finished image 0001

Finished image 1000602.jpg in 11.8774 seconds, 464/845
Finished image 1000603.jpg in 13.2835 seconds, 465/845
Finished image 1000604.jpg in 13.4089 seconds, 466/845
Finished image 1000605.jpg in 12.0214 seconds, 467/845
Finished image 1000606.jpg in 12.6572 seconds, 468/845
Finished image 1000608.jpg in 12.9540 seconds, 469/845
Finished image 1000609.jpg in 10.8839 seconds, 470/845
Finished image 1000611.jpg in 10.9347 seconds, 471/845
Finished image 1000613.jpg in 17.1579 seconds, 472/845
Finished image 1000618.jpg in 16.0605 seconds, 473/845
Finished image 1000620.jpg in 8.8959 seconds, 474/845
Finished image 1000621.jpg in 10.1383 seconds, 475/845
Finished image 1000622.jpg in 11.3827 seconds, 476/845
Finished image 1000634.jpg in 18.2823 seconds, 477/845
Finished image 1000635.jpg in 17.3070 seconds, 478/845
Finished image 1000636.jpg in 16.7860 seconds, 479/845
Finished image 1000637.jpg in 14.5415 seconds, 480/845
Finished image 1000638.jpg in 16.9411 seconds, 481/845
Finished im

Finished image 1001557.jpg in 9.8585 seconds, 614/845
Finished image 1001558.jpg in 9.0795 seconds, 615/845
Finished image 1001560.jpg in 7.6395 seconds, 616/845
Finished image 1001561.jpg in 7.6310 seconds, 617/845
Finished image 1001562.jpg in 7.6647 seconds, 618/845
Finished image 1001563.jpg in 9.8488 seconds, 619/845
Finished image 1001564.jpg in 8.5701 seconds, 620/845
Finished image 1001573.jpg in 11.8885 seconds, 621/845
Finished image 1001574.jpg in 10.9366 seconds, 622/845
Finished image 1001575.jpg in 13.1165 seconds, 623/845
Finished image 1001576.jpg in 14.1201 seconds, 624/845
Finished image 1001577.jpg in 12.2079 seconds, 625/845
Finished image 1001578.jpg in 10.2439 seconds, 626/845
Finished image 1001579.jpg in 10.0014 seconds, 627/845
Finished image 1001581.jpg in 12.2030 seconds, 628/845
Finished image 1001582.jpg in 11.0622 seconds, 629/845
Finished image 1001583.jpg in 13.9346 seconds, 630/845
Finished image 1001584.jpg in 14.3674 seconds, 631/845
Finished image 10

Finished image 1002266.jpg in 10.7724 seconds, 764/845
Finished image 1002267.jpg in 11.7108 seconds, 765/845
Finished image 1002268.jpg in 11.8640 seconds, 766/845
Finished image 1002269.jpg in 11.5067 seconds, 767/845
Finished image 1002270.jpg in 10.7700 seconds, 768/845
Finished image 1002272.jpg in 13.0952 seconds, 769/845
Finished image 1002273.jpg in 10.6068 seconds, 770/845
Finished image 1002276.jpg in 27.1103 seconds, 771/845
Finished image 1002277.jpg in 28.6867 seconds, 772/845
Finished image 1002278.jpg in 30.6331 seconds, 773/845
Finished image 1002279.jpg in 23.8489 seconds, 774/845
Finished image 1002280.jpg in 23.5496 seconds, 775/845
Finished image 1002281.jpg in 17.1468 seconds, 776/845
Finished image 1002282.jpg in 19.3472 seconds, 777/845
Finished image 1002283.jpg in 11.8411 seconds, 778/845
Finished image 1002285.jpg in 9.7389 seconds, 779/845
Finished image 1002287.jpg in 9.9902 seconds, 780/845
Finished image 1002288.jpg in 9.1975 seconds, 781/845
Finished imag

In [55]:
labelling['0000172.jpg'].shape

(2048, 2048, 1)

In [54]:
pickle.dump(labelling, open("labelling_alsotrash.p", "wb"))

In [56]:
labelling = pickle.load(open("labelling_small.p", "rb"))

In [58]:
file_names = labelling.keys() 

In [59]:
def get_imgs(filenames, datadir, imgs):
    return {x : np.array(Image.open("{}/{}".format(datadir, x)).convert('RGB')) for x in filenames if x in imgs}

In [60]:
data = get_imgs(file_names, a1dir+'/images', imgs)

In [361]:
data_labels = [(data[i], labelling[i]) for i in labelling.keys()] # not sure if we need this oop

In [61]:
label_arrs = list(labelling.values())

In [62]:
img_arrs = list(data.values())

In [63]:
X_train, X_test, y_train, y_test = train_test_split(img_arrs, label_arrs, train_size=0.8, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, train_size=0.5, random_state=42)

In [64]:
load_train = DataLoader(list(zip(X_train, y_train)), batch_size=batch_size, shuffle=True)
load_test = DataLoader(list(zip(X_test, y_test)), batch_size=batch_size, shuffle=True)
load_val = DataLoader(list(zip(X_val, y_val)), batch_size=batch_size, shuffle=True)
