In [1]:
from pathlib import Path
import json
import pandas as pd
import pickle
from tqdm import tqdm

In [2]:
from dask import delayed
import dask.bag as db
from operator import add
from PIL import Image

In [3]:
from fastai.text import *
from fastai.callbacks import SaveModelCallback, ReduceLROnPlateauCallback, CSVLogger
from torchvision import transforms, models
from torch.utils.data import Dataset, DataLoader

In [4]:
!pwd
CURRENT_PATH = Path('.')

/home/data/Project/work2


## COCO (Read data)

In [5]:
!find COCO/cocoapi/images/test2014/ | wc -l
!find COCO/cocoapi/images/train2014/ | wc -l
!find COCO/cocoapi/images/val2014/ | wc -l

40776
82785
40506


In [5]:
COCO_PATH = Path('COCO/cocoapi/')

COCO_ANNOTATIONS = COCO_PATH/'annotations'

COCO_TRAIN_ANNOTATIONS = COCO_ANNOTATIONS/'captions_train2014.json'
COCO_VAL_ANNOTATIONS = COCO_ANNOTATIONS/'captions_val2014.json'

COCO_TRAIN_PATH = COCO_PATH/'train'
COCO_TEST_PATH = COCO_PATH/'test'
COCO_VAL_PATH = COCO_PATH/'val'


In [7]:
coco_train_json = json.load((COCO_TRAIN_ANNOTATIONS).open())
coco_val_json = json.load((COCO_VAL_ANNOTATIONS).open())

In [8]:
coco_val_json['images'][0], coco_val_json['annotations'][0], len(coco_val_json['images']), len(coco_val_json['annotations'])

({'license': 3,
  'file_name': 'COCO_val2014_000000391895.jpg',
  'coco_url': 'http://images.cocodataset.org/val2014/COCO_val2014_000000391895.jpg',
  'height': 360,
  'width': 640,
  'date_captured': '2013-11-14 11:18:45',
  'flickr_url': 'http://farm9.staticflickr.com/8186/8119368305_4e622c8349_z.jpg',
  'id': 391895},
 {'image_id': 203564,
  'id': 37,
  'caption': 'A bicycle replica with a clock as the front wheel.'},
 40504,
 202654)

In [9]:
coco_drask_val_images = db.from_sequence(coco_val_json['images'])
coco_drask_val_ann = db.from_sequence(coco_val_json['annotations'])
coco_drask_train_images = db.from_sequence(coco_train_json['images'])
coco_drask_train_ann = db.from_sequence(coco_train_json['annotations'])

In [10]:
coco_drask_val_images.take(2)

({'license': 3,
  'file_name': 'COCO_val2014_000000391895.jpg',
  'coco_url': 'http://images.cocodataset.org/val2014/COCO_val2014_000000391895.jpg',
  'height': 360,
  'width': 640,
  'date_captured': '2013-11-14 11:18:45',
  'flickr_url': 'http://farm9.staticflickr.com/8186/8119368305_4e622c8349_z.jpg',
  'id': 391895},
 {'license': 4,
  'file_name': 'COCO_val2014_000000522418.jpg',
  'coco_url': 'http://images.cocodataset.org/val2014/COCO_val2014_000000522418.jpg',
  'height': 480,
  'width': 640,
  'date_captured': '2013-11-14 11:38:44',
  'flickr_url': 'http://farm1.staticflickr.com/1/127244861_ab0c0381e7_z.jpg',
  'id': 522418})

In [11]:
coco_drask_val_ann.take(2)

({'image_id': 203564,
  'id': 37,
  'caption': 'A bicycle replica with a clock as the front wheel.'},
 {'image_id': 179765,
  'id': 38,
  'caption': 'A black Honda motorcycle parked in front of a garage.'})

In [12]:
def generate_captions(drask_ann, drask_images, pname=None):
    g1 = (drask_ann
      .foldby(key='image_id', 
              binop=(lambda tot, x : tot + [x['caption']]), 
              initial=[], 
              combine=add, 
              combine_initial=[])
      .map(lambda x: (x[0], x[1])))
    
    g2 = drask_images.map(lambda record: (record['id'], record['file_name']))
    data = list(g1.join(delayed(g2), lambda x: x[0]).map(lambda x: (x[0][1], x[1][1])))
    pickle.dump(data, open(pname, 'wb'))

In [13]:
#%time generate_captions(coco_drask_val_ann, coco_drask_val_images, COCO_PATH/'val_data.pk')

CPU times: user 8.21 s, sys: 199 ms, total: 8.41 s
Wall time: 10.6 s


In [14]:
#%time generate_captions(coco_drask_train_ann, coco_drask_train_images, COCO_PATH/'train_data.pk')

CPU times: user 17.9 s, sys: 379 ms, total: 18.3 s
Wall time: 23 s


In [6]:
coco_valid_fns_caps = pickle.load(open(CURRENT_PATH/'coco_val_fns_caps.pkl', 'rb'))
coco_train_fns_caps = pickle.load(open(CURRENT_PATH/'coco_train_fns_caps.pkl', 'rb'))

In [16]:
!ls -altr {COCO_PATH}/'val_data.pk'
!ls -altr {COCO_PATH}/'train_data.pk'

-rw-rw-r-- 1 shajikk shajikk 14798830 Jul 30 01:50 COCO/cocoapi/val_data.pk
-rw-rw-r-- 1 shajikk shajikk 30439033 Jul 30 01:50 COCO/cocoapi/train_data.pk


In [17]:
coco_valid_fns_caps[:2]

[('COCO_val2014_000000203564.jpg',
  ['A bicycle replica with a clock as the front wheel.',
   'The bike has a clock as a tire.',
   'A black metal bicycle with a clock inside the front wheel.',
   'A bicycle figurine in which the front wheel is replaced with a clock\n',
   'A clock with the appearance of the wheel of a bicycle ']),
 ('COCO_val2014_000000179765.jpg',
  ['A black Honda motorcycle parked in front of a garage.',
   'A Honda motorcycle parked in a grass driveway',
   'A black Honda motorcycle with a dark burgundy seat.',
   'Ma motorcycle parked on the gravel in front of a garage',
   'A motorcycle with its brake extended standing outside'])]

## VIZWIZ (read data)

In [18]:
!find vizwiz/data/test/ | wc -l
!find vizwiz/data/train/ | wc -l
!find vizwiz/data/val/ | wc -l

8001
23955
7751


In [9]:
VIZWIZ_PATH = Path('vizwiz/data')

VIZWIZ_ANNOTATIONS = VIZWIZ_PATH/'annotations'

VIZWIZ_TRAIN_ANNOTATIONS = VIZWIZ_ANNOTATIONS/'train.json'
VIZWIZ_VAL_ANNOTATIONS = VIZWIZ_ANNOTATIONS/'val.json'

VIZWIZ_TRAIN_PATH = VIZWIZ_PATH/'train'
VIZWIZ_VAL_PATH = VIZWIZ_PATH/'val'

In [None]:
!ls 

In [20]:
vizwiz_train_json = json.load((VIZWIZ_TRAIN_ANNOTATIONS).open())
vizwiz_val_json = json.load((VIZWIZ_VAL_ANNOTATIONS).open())

In [21]:
vizwiz_val_json['images'][0], vizwiz_val_json['annotations'][0], len(vizwiz_val_json['images']), len(vizwiz_val_json['annotations'])

({'file_name': 'VizWiz_val_00000000.jpg',
  'vizwiz_url': 'https://ivc.ischool.utexas.edu/VizWiz_visualization_img/VizWiz_val_00000000.jpg',
  'id': 23431,
  'text_detected': True},
 {'caption': 'A computer screen shows a repair prompt on the screen.',
  'image_id': 23431,
  'is_precanned': False,
  'is_rejected': False,
  'id': 117155,
  'text_detected': True},
 7750,
 38750)

In [22]:
vizwiz_drask_val_images = db.from_sequence(vizwiz_val_json['images'])
vizwiz_drask_val_ann = db.from_sequence(vizwiz_val_json['annotations'])
vizwiz_drask_train_images = db.from_sequence(vizwiz_train_json['images'])
vizwiz_drask_train_ann = db.from_sequence(vizwiz_train_json['annotations'])

In [23]:
vizwiz_drask_val_ann = vizwiz_drask_val_ann.filter(lambda x: x['is_rejected'] == False and x['is_precanned'] == False)
vizwiz_drask_train_ann = vizwiz_drask_train_ann.filter(lambda x: x['is_rejected'] == False and x['is_precanned'] == False)

In [24]:
%time generate_captions(vizwiz_drask_val_ann, vizwiz_drask_val_images, VIZWIZ_PATH/'val_data.pk')

CPU times: user 1.93 s, sys: 92.8 ms, total: 2.03 s
Wall time: 2.4 s


In [25]:
%time generate_captions(vizwiz_drask_train_ann, vizwiz_drask_train_images, VIZWIZ_PATH/'train_data.pk')

CPU times: user 5.59 s, sys: 146 ms, total: 5.74 s
Wall time: 7.01 s


In [7]:
vizwiz_valid_fns_caps = pickle.load(open(CURRENT_PATH/'vizwiz_val_fns_caps.pkl', 'rb'))

In [8]:
vizwiz_train_fns_caps = pickle.load(open(CURRENT_PATH/'vizwiz_train_fns_caps.pkl', 'rb'))

In [28]:
!ls -altr {VIZWIZ_PATH}/'val_data.pk'

-rw-rw-r-- 1 shajikk shajikk 2757076 Jul 30 01:50 vizwiz/data/val_data.pk


In [29]:
!ls -altr {VIZWIZ_PATH}/'train_data.pk'

-rw-rw-r-- 1 shajikk shajikk 8361353 Jul 30 01:51 vizwiz/data/train_data.pk


In [30]:
vizwiz_valid_fns_caps[:2]

[('VizWiz_val_00000000.jpg',
  ['A computer screen shows a repair prompt on the screen.',
   'a computer screen with a repair automatically pop up',
   'partial computer screen showing the need of repairs',
   'Part of a computer monitor showing a computer repair message.',
   'The top of a laptop with a blue background and dark blue text.']),
 ('VizWiz_val_00000001.jpg',
  ['A person is holding a bottle that has medicine for the night time.',
   'A bottle of medication has a white twist top.',
   'night time medication bottle being held by someone',
   'a person holding a small black bottle of NIGHT TIME',
   'A bottle of what appears to be cough syrup held in hand.'])]

In [8]:
len(vizwiz_valid_fns_caps), len(vizwiz_train_fns_caps)

(7542, 22866)

## Tokenize

In [32]:
coco_train_caps = list(itertools.chain.from_iterable(list(zip(*coco_train_fns_caps))[1]))

In [33]:
coco_valid_caps = list(itertools.chain.from_iterable(list(zip(*coco_valid_fns_caps))[1]))

In [34]:
vizwiz_train_caps = list(itertools.chain.from_iterable(list(zip(*vizwiz_train_fns_caps))[1]))

In [35]:
vizwiz_valid_caps = list(itertools.chain.from_iterable(list(zip(*vizwiz_valid_fns_caps))[1]))

In [36]:
total_caps = coco_train_caps + coco_valid_caps + vizwiz_train_caps + vizwiz_valid_caps
len(total_caps)

750487

In [37]:
total_caps[:2]

['A very clean and well decorated empty bathroom',
 'A blue and white bathroom with butterfly themed wall tiles.']

In [38]:
tokenizer = Tokenizer(n_cpus=6)
all_captions_tokenized = tokenizer.process_all(total_caps)

In [39]:
vocab = Vocab.create(all_captions_tokenized, max_vocab=100000, min_freq=2)
pickle.dump(vocab, open(CURRENT_PATH/"vocab.pkl", 'wb'))

In [9]:
vocab = pickle.load((CURRENT_PATH/"vocab.pkl").open('rb'))

In [10]:
len(vocab.itos)

22400

In [42]:
from dask.distributed import Client
client = Client(processes = True)

In [43]:
from multiprocessing import Pool, cpu_count, Queue
from timeit import default_timer as timer

In [44]:
num_processes = cpu_count() * 2

def numericalize_tokens(tok):
    return np.array([vocab.numericalize(q) + [1] for q in tok])

def process_stuff(x):    
    return((x[0], [ numericalize_tokens(tokenizer._process_all_1([e]))[0] for e in x[1]]))

def build_data(fns_caps, PATH, name) :
    print(len(fns_caps))
    pool = Pool(processes=num_processes)
    caps = fns_caps
    results = [pool.apply_async(process_stuff, args=(x,)).get() for x in caps]
    #output = [p.get() for p in results]
    k = list(zip(*results))
    print("Write : ", str(PATH)+"/"+name+".pkl")
    pickle.dump(k, open(str(PATH)+"/"+name+".pkl", 'wb'))

In [45]:
# def build_data(fns_caps, PATH, name):
#     caps_arr = (db.from_sequence(fns_caps)
#      .map(lambda x : (x[0], [ numericalize_tokens(tokenizer._process_all_1([e]))[0] for e in x[1]])   )).compute()
#     dataset = list(zip(*caps_arr))
#     print("Write : ", str(PATH)+"/"+name+".pkl")
#     pickle.dump(dataset, open(str(PATH)+"/"+name+".pkl", 'wb'))

In [46]:
%time build_data(coco_valid_fns_caps, CURRENT_PATH, "coco_valid_1")

40504
Write :  ./coco_valid_1.pkl
CPU times: user 1min 54s, sys: 17.9 s, total: 2min 12s
Wall time: 1h 41min 53s


In [14]:
coco_valid_data = pickle.load(open(CURRENT_PATH/'coco_valid.pkl', 'rb'))
len(coco_valid_data[0])

40504

In [48]:
%time build_data(coco_train_fns_caps, CURRENT_PATH, "coco_train_1")

82783
Write :  ./coco_train_1.pkl
CPU times: user 3min 48s, sys: 41.3 s, total: 4min 29s
Wall time: 3h 27min 47s


In [10]:
coco_train_data = pickle.load(open(CURRENT_PATH/'coco_train.pkl', 'rb'))
len(coco_train_data[0])

82783

In [50]:
%time build_data(vizwiz_valid_fns_caps, CURRENT_PATH, "vizwiz_valid_1")

7542
Write :  ./vizwiz_valid_1.pkl
CPU times: user 18.8 s, sys: 3.8 s, total: 22.6 s
Wall time: 16min 51s


In [11]:
vizwiz_valid_data = pickle.load(open(CURRENT_PATH/'vizwiz_valid.pkl', 'rb'))
len(vizwiz_valid_data[0])

7542

In [52]:
%time build_data(vizwiz_train_fns_caps, CURRENT_PATH, "vizwiz_train_1")

22866
Write :  ./vizwiz_train_1.pkl
CPU times: user 59.2 s, sys: 12.9 s, total: 1min 12s
Wall time: 51min 17s


In [12]:
vizwiz_train_data = pickle.load(open(CURRENT_PATH/'vizwiz_train.pkl', 'rb'))
len(vizwiz_train_data[0])

22866

In [15]:
coco_valid_data[1][:2]

([array([   9,  414, 4517,   15,    9,  101,  134,   13,   43, 1152,   10,    1]),
  array([   5,   13,  286,   48,    9,  101,  134,    9, 2054,   10,    1]),
  array([   9,   35,  315,  414,   15,    9,  101,  167,   13,   43, 1152,   10,    1]),
  array([   9,  414, 1988,   14,  446,   13,   43, 1152,   17, 8094,   15,    9,  101,  426,    1]),
  array([   9,  101,   15,   13, 7865,   11,   13, 1152,   11,    9,  414,    1])],
 [array([   9,   35,    5, 5164,  162,   90,   14,   43,   11,    9, 1161,   10,    1]),
  array([   9,    5, 5164,  162,   90,   14,    9,   82, 1519,    1]),
  array([   9,   35,    5, 5164,  162,   15,    9,  244, 3488,  489,   10,    1]),
  array([   5, 6408,  162,   90,   12,   13, 1681,   14,   43,   11,    9, 1161,    1]),
  array([   9,  162,   15,  163, 6388, 2217,   27,  141,    1])])

In [19]:
all_train_fns_caps = coco_train_fns_caps + vizwiz_train_fns_caps

In [20]:
all_valid_fns_caps = coco_valid_fns_caps + vizwiz_valid_fns_caps

In [26]:
all_train_fns_caps[:1]

[('COCO_train2014_000000318556.jpg',
  ['A very clean and well decorated empty bathroom',
   'A blue and white bathroom with butterfly themed wall tiles.',
   'A bathroom with a border of butterflies and blue paint on the walls above it.',
   'An angled view of a beautifully decorated bathroom.',
   'A clock that blends in with the wall hangs in a bathroom. '])]

In [21]:
len(all_train_fns_caps), len(all_valid_fns_caps)

(105649, 48046)

In [35]:
all_train_data = [[],[]]
all_val_data = [[],[]]
all_train_data[0] = coco_train_data[0] + vizwiz_train_data[0]
all_train_data[1] = coco_train_data[1] + vizwiz_train_data[1]
all_valid_data[0] = coco_valid_data[0] + vizwiz_valid_data[0]
all_valid_data[1] = coco_valid_data[1] + vizwiz_valid_data[1]

In [36]:
len(all_train_data[0]), len(all_valid_data[0])

(105649, 48046)

In [38]:
pickle.dump(all_train_fns_caps, open("all_train_fns_caps.pkl", 'wb'))
pickle.dump(all_valid_fns_caps, open("all_valid_fns_caps.pkl", 'wb'))
pickle.dump(all_train_data, open("all_train_data.pkl", 'wb'))
pickle.dump(all_valid_data, open("all_valid_data.pkl", 'wb'))

## Load the test set

In [40]:
!ls {VIZWIZ_ANNOTATIONS}

test.json  train.json  val.json


In [41]:
coco_test_json = json.load((VIZWIZ_ANNOTATIONS/'test.json').open())

In [42]:
vizwiz_drask_test_images = db.from_sequence(coco_test_json['images'])

In [43]:
vizwiz_drask_test_images.take(1)

({'file_name': 'VizWiz_test_00000000.jpg',
  'vizwiz_url': 'https://ivc.ischool.utexas.edu/VizWiz_visualization_img/VizWiz_test_00000000.jpg',
  'id': 31181,
  'text_detected': True},)

In [44]:
CURRENT_PATH = Path('.')
def get_path(fn, path=CURRENT_PATH, tag='val') :
    if ("COCO_" in fn and tag == 'val') : fn = path/'COCO'/'cocoapi'/'images'/'val2014'/fn
    elif ("VizWiz_" in fn and tag == 'val') : fn = path/'vizwiz'/'data'/'val'/fn
    elif ("VizWiz_" in fn and tag == 'test') : fn = path/'vizwiz'/'data'/'test'/fn
    elif ("COCO_" in fn and tag == 'train') : fn = path/'COCO'/'cocoapi'/'images'/'train2014'/fn
    elif ("VizWiz_" in fn and tag == 'train') : fn = path/'vizwiz'/'data'/'train'/fn
    else : fn = 'error'
    return fn

In [45]:
mapped_data = vizwiz_drask_test_images.map(lambda record: ({'file_name' : get_path(record['file_name'], CURRENT_PATH, 'test'), 
                                                            'image_id' : record['id'],  
                                                            'caption' : ""}))


mapped_data.take(2)

({'file_name': PosixPath('vizwiz/data/test/VizWiz_test_00000000.jpg'),
  'image_id': 31181,
  'caption': ''},
 {'file_name': PosixPath('vizwiz/data/test/VizWiz_test_00000001.jpg'),
  'image_id': 31182,
  'caption': ''})

In [47]:
vizwiz_test_dict = mapped_data.compute()

In [48]:
pickle.dump(vizwiz_test_dict, open("vizwiz_test_dict.pkl", 'wb'))

In [82]:
vizwiz_result = []
vizwiz_result_all = []
vizwiz_test_dict = pickle.load(open(CURRENT_PATH/'vizwiz_test_dict.pkl', 'rb'))
for v in vizwiz_test_dict :
    file = v['file_name']
    caption = 'test'
    vizwiz_result.append({ 'image_id' : v['image_id'], 'caption' : caption})
    vizwiz_result_all.append({ 'image_id' : v['image_id'], 'caption' : caption, 'file_name' : v['file_name']})
    #print(v['file_name'])
    
json_object = json.dumps(vizwiz_result, indent = 4)   
with open('vizwiz_result.json', 'w') as fp: fp.write(json_object)

In [86]:
len(vizwiz_result_all)

8000