In [None]:
import ast
import multiprocessing as mp
import os
import urllib.request
import _pickle as cPickle

import numpy as np
import pandas as pd
import scipy.misc
import tensorflow as tf
from IPython.display import display, SVG
from sklearn.decomposition import PCA

from pre_trained.cnn import PretrainedCNN

# Preprocess: Image

The raw images take about 20GB and may take days to download, so they are not included in the released files. If you'd like the original images, you can fetch them from MS-COCO on the fly:

In [None]:
def download_image(img_dir, img_id):
    """Fetch one MS-COCO image and store it as ``img_dir/img_id``.

    The remote name is ``img_id`` truncated at its first ``.``.
    """
    remote_name = img_id.split('.')[0]
    source_url = 'http://mscoco.org/images/{}'.format(remote_name)
    target_path = os.path.join(img_dir, img_id)
    urllib.request.urlretrieve(source_url, target_path)

## Transfer Learning: pre-trained CNN
Our task, image captioning, requires good understanding of images, like
 * objects appearing in the image
 * relative positions of objects
 * colors, sizes, ...etc

Training a good CNN from scratch is challenging and time-consuming, so we'll use an existing pre-trained CNN model. The one we've prepared for you is VGG-16 (also known as OxfordNet), one of the top-performing models of ILSVRC 2014, in [pre_trained/cnn.py](pre_trained/cnn.py).

In [None]:
# Read the architecture diagram as raw bytes, then render it inline.
with open('model_ckpt/cnn-model.svg', 'rb') as f:
    arch = f.read()
display(SVG(data=arch))

VGG-16 consists of 16 weight layers, and we'll take the output of fc2 - the last layer before the prediction layer - as input to our image-captioning model. However, since we have about 120,000 images, representing each image by 4,096 dimensions would make training inefficient and space-consuming. Therefore a dimensionality reduction technique, PCA, is used to reduce the image feature dimension from 4096 to 256.

In summary, for each image, we do the following steps:
 1. raw image is fed into VGG-16
 2. take the output of the second-to-last layer
 3. apply PCA to reduce dimension to 256

### Preprocess: Image (step 1)

The pre-trained VGG-16 is taken from this [repository](https://github.com/machrisaa/tensorflow-vgg), which converted the original Caffe model to numpy format. However, the input image has to meet some requirements: it must be
 * center cropped to $224\times224$
 * mean-subtracted (the per-channel mean is subtracted)
 * converted to BGR format

In [None]:
# step 1
def process_image(img_dir, img_id):
    """Load one image and convert it to the input layout used for the pre-trained VGG-16.

    Steps: read ``img_dir/img_id`` (downloading it first when the file is
    missing), center-crop to a square on the shorter edge, resize to
    224 x 224, force exactly three channels, reverse the channel axis
    (RGB => BGR) and subtract the per-channel mean.

    Args:
        img_dir: directory holding the images.
        img_id: file name of the image inside ``img_dir``.

    Returns:
        ``np.float32`` array of shape ``(224, 224, 3)``.
    """
    # NOTE(review): scipy.misc.imread / imresize are deprecated and absent from
    # recent SciPy releases - confirm the SciPy version this notebook is pinned to.
    MEAN = np.array([103.939, 116.779, 123.68]).astype(np.float32) # BGR
    path = os.path.join(img_dir, img_id)
    if not os.path.exists(path):
        download_image(img_dir, img_id)
    img = scipy.misc.imread(path)
    # center crop to a square whose side is the shorter edge
    short_edge = min(img.shape[:2])
    yy = (img.shape[0] - short_edge) // 2
    xx = (img.shape[1] - short_edge) // 2
    crop_img = img[yy: yy + short_edge, xx: xx + short_edge]
    img = scipy.misc.imresize(crop_img, (224,224,1))
    # single-channel images come back as 2-D; give them an explicit channel axis
    img = img.reshape((224,224,1)) if len(img.shape) < 3 else img
    n_channel = img.shape[2]
    if n_channel < 3:
        print('{}: dimension insufficient'.format(path))
        # pad the missing channels with copies of channel 0
        img = np.concatenate([img] + [img[:,:,:1]] * (3 - n_channel), axis=2)
    elif n_channel > 3:
        # Extra channels used to crash the final reshape; keep the first three.
        # Presumably the 4th channel is alpha (RGBA) - verify for other layouts.
        img = img[:,:,:3]
    # RGB => BGR; astype returns a fresh contiguous float32 copy
    img = img[:,:,::-1].astype(np.float32)
    # MEAN broadcasts over the last (channel) axis
    img -= MEAN
    return img

### Preprocess: Image (step 2)

Feed the processed images into VGG-16 and save the extracted features to [dataset/train_img4096.pkl](dataset/train_img4096.pkl) and [dataset/test_img4096.pkl](dataset/test_img4096.pkl).

In [None]:
# step 2
def cnn_output(cnn_mdl, img_dir, img_ids, fout, bsize=100, ckpt_every=5000, frac=0.3, dev=0):
    """Run every image through ``cnn_mdl`` and pickle ``{img_id: feature}`` to ``fout``.

    Images are preprocessed with ``process_image`` and passed to
    ``cnn_mdl.get_output`` in batches of ``bsize``. The mapping collected so
    far is re-written to ``fout`` whenever the processed count is a multiple of
    both ``bsize`` and ``ckpt_every``, and once more at the end.

    Args:
        cnn_mdl: model object exposing ``get_output(sess, images)``; element
            ``[0]`` of its result is used as the batch features (assumed to be
            2-D with one row per image - TODO confirm against pre_trained/cnn.py).
        img_dir: directory holding the images.
        img_ids: iterable of image file names.
        fout: path of the output pickle.
        bsize: number of images per forward pass.
        ckpt_every: checkpoint interval, in images.
        frac: value for ``per_process_gpu_memory_fraction``.
        dev: value written to ``CUDA_VISIBLE_DEVICES``.
    """
    # `chunks` holds one feature array per batch; they are stacked only when
    # saving, instead of re-copying everything after each batch.
    fs, ims, chunks, c = [], [], [], 0

    def save():
        # collapse the pending chunks once so later saves do not re-stack them
        if len(chunks) > 1:
            chunks[:] = [np.vstack(chunks)]
        feats = chunks[0] if chunks else None
        # `with` closes (and flushes) the pickle before processing continues
        with open(fout, 'wb') as f:
            cPickle.dump({fs[i]:feats[i,:] for i in range(len(fs))}, f)

    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction=frac
    os.environ["CUDA_VISIBLE_DEVICES"]='{}'.format(dev)
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        for img_id in img_ids:
            ims.append(process_image(img_dir, img_id))
            fs.append(img_id)
            c += 1
            if c % bsize == 0:
                print('{} done'.format(c))
                chunks.append(cnn_mdl.get_output(sess, ims)[0])
                ims = []
                if c % ckpt_every == 0:
                    save()
        # last, partially filled batch
        if c % bsize != 0:
            chunks.append(cnn_mdl.get_output(sess, ims)[0])
        save()

To speed up processing, we use multiple GPUs at the same time.

In [None]:
def parallel(i, n_gpu=3, th=0):
    """Extract CNN features for the ``i``-th shard of the train and test image lists.

    Each split's ``img_id`` column is cut into ``n_gpu`` contiguous shards of
    equal (ceil) size; this call handles shard ``i`` and writes
    ``dataset/<split>_img4096-<i>.pkl``.

    Args:
        i: shard / worker index.
        n_gpu: number of shards; also used to pick the GPU (``i % n_gpu``).
        th: number of leading ids of this shard to skip. The original code
            referenced an undefined ``th`` here (NameError); the default 0
            processes the whole shard.
    """
    dev = int(i % n_gpu)
    os.environ["CUDA_VISIBLE_DEVICES"]='{}'.format(dev)
    vgg = PretrainedCNN('pre_trained/vgg16_mat.pkl')
    def run(train_test):
        ids = pd.read_csv('dataset/{}.csv'.format(train_test))['img_id']
        shard = (len(ids) + n_gpu - 1) // n_gpu  # ceil(len / n_gpu)
        shard_ids = ids.iloc[shard*i + th: shard*(i+1)]
        # drop duplicate ids while keeping first-seen order (deterministic, unlike set())
        unique_ids = list(dict.fromkeys(shard_ids))
        cnn_output(vgg, 'dataset/image', unique_ids, 'dataset/{}_img4096-{}.pkl'.format(train_test, i), bsize=100, frac=0.99, dev=dev)
    run('train')
    run('test')
    
n_gpu = 3
# One worker per GPU. n_gpu is passed explicitly so the sharding inside
# `parallel` always matches the pool size.
pool=mp.Pool(processes=n_gpu)
pool.starmap(parallel, [(i, n_gpu) for i in range(n_gpu)])
pool.close()
pool.join()

def _merge_shards(train_test, n_shard):
    """Load the per-worker pickles of one split and merge them into a single dict."""
    merged = {}
    for shard in range(n_shard):
        with open('dataset/{}_img4096-{}.pkl'.format(train_test, shard), 'rb') as f:
            merged.update(cPickle.load(f))
    return merged

train_img = _merge_shards('train', n_gpu)
test_img = _merge_shards('test', n_gpu)

# Write into dataset/: that is where the later cells read the merged files from.
with open('dataset/train_img4096.pkl', 'wb') as f:
    cPickle.dump(train_img, f)
with open('dataset/test_img4096.pkl', 'wb') as f:
    cPickle.dump(test_img, f)

# the per-worker shards are no longer needed once the merged files exist
for i in range(n_gpu):
    os.remove('dataset/train_img4096-{}.pkl'.format(i))
    os.remove('dataset/test_img4096-{}.pkl'.format(i))

### Preprocess: Image (step 3)

Reduce dimension of image feature from 4096 to 256 and save reduced image feature as [dataset/train_img256.pkl](dataset/train_img256.pkl) and [dataset/test_img256.pkl](dataset/test_img256.pkl).

In [None]:
# step 3
def pca(fin, fout, pca_transformer=None):
    """Project the pickled image features in ``fin`` and write the result to ``fout``.

    ``fin`` must unpickle to a dict mapping a key to a feature vector. Every
    value is replaced by its transformed vector and the dict is pickled to
    ``fout``.

    Args:
        fin: path of the input pickle.
        fout: path of the output pickle.
        pca_transformer: object with a ``transform`` method. When ``None``, a
            ``PCA(n_components=256)`` is created and fitted on the features
            loaded from ``fin``.

    Returns:
        The transformer that was used (the fitted one when none was passed).
    """
    # NOTE: unpickling runs arbitrary code - only load files produced by this notebook.
    with open(fin, 'rb') as f:
        img = cPickle.load(f)
    K = list(img.keys())
    x = np.array([img[k] for k in K])
    print(x.shape)
    if pca_transformer is None:
        pca_transformer = PCA(n_components=256)
        pca_transformer.fit(x)
    x = pca_transformer.transform(x)
    # rows of x are in the same order as K
    for i, k in enumerate(K):
        img[k] = x[i]
    with open(fout, 'wb') as f:
        cPickle.dump(img, f)
    return pca_transformer

In [None]:
# perform PCA
pca_tf = None
# The first call fits a new transformer on the training features ...
pca_tf = pca(
    'dataset/train_img4096.pkl',
    'dataset/train_img256.pkl',
    pca_tf,
)
# ... and the second call reuses that fitted transformer for the test features.
pca_tf = pca(
    'dataset/test_img4096.pkl',
    'dataset/test_img256.pkl',
    pca_tf,
)

The matrix $U$ used to perform the PCA transformation is saved at [dataset/U.pkl](dataset/U.pkl).

In [None]:
# get PCA transformer matrix
with open('dataset/train_img256.pkl', 'rb') as f:
    img_train = cPickle.load(f)
with open('dataset/train_img4096.pkl', 'rb') as f:
    img_4096 = cPickle.load(f)

# Use the first 500 images (dict iteration order) as paired samples.
sample_keys = list(img_4096)[:500]
V4096 = np.array([img_4096[k] for k in sample_keys])
V256 = np.array([img_train[k] for k in sample_keys])
# Least-squares solution of V4096 . U = V256 via the pseudo-inverse.
# NOTE(review): U is fitted from 500 samples, fewer than the 4096 feature
# dimensions, and scikit-learn's PCA.transform centers the data first. U may
# therefore not reproduce the transform on images outside the sample -
# confirm this approximation is intended.
U = np.dot(np.linalg.pinv(V4096), V256)
with open('dataset/U.pkl', 'wb') as f:
    cPickle.dump(U, f)

It should be enough for you to train a good image-captioning model. However, you're always welcome to use other CNN models to extract image features.

# Preprocess: Text

Dealing with raw strings is inefficient, so we'll train on an encoded version of the captions. All the necessary vocabulary is extracted in [dataset/text/vocab.pkl](dataset/text/vocab.pkl), and we'd like to represent captions as sequences of integer IDs. However, since the length of captions may vary, our model needs to know where to start and stop, so we'll add 2 special tokens, `<ST>` and `<ED>`, to the beginning and end of each caption. Also, the smaller the vocabulary is, the more efficient training will be, so we'll replace rare words with the `<RARE>` token. In summary, we're going to
 * append `<ST>` and `<ED>` token to the beginning and end of each caption
 * replace rare words by `<RARE>` token
 * represent captions by vocabulary IDs

In [None]:
# `with` closes the file handle as soon as the vocabulary is loaded
with open('dataset/text/vocab.pkl', 'rb') as f:
    vocab = cPickle.load(f)
print('total {} vocabularies'.format(len(vocab)))

In [None]:
def count_vocab_occurance(vocab, df):
    """Count how often each vocabulary word occurs in the ``caption`` column of ``df``.

    Captions are split on single spaces. Every word of ``vocab`` starts at 0;
    a caption token that is not in ``vocab`` raises ``KeyError``.
    """
    counts = dict.fromkeys(vocab, 0)
    for _, record in df.iterrows():
        tokens = record['caption'].split(' ')
        for token in tokens:
            counts[token] = counts[token] + 1
    return counts

df_train = pd.read_csv(os.path.join('dataset', 'train.csv'))

print('count vocabulary occurances...')
voc_cnt = count_vocab_occurance(vocab, df_train)

# words appearing fewer than `thrhd` times are treated as rare
thrhd = 100
x = np.array(list(voc_cnt.values()))
# Report the threshold actually in use (the message used to hard-code 100);
# no sort is needed just to count the entries above it.
print('{} words appear >= {} times'.format(np.sum(x >= thrhd), thrhd))

In [None]:
def build_voc_mapping(voc_cnt, thrhd):
    """
    Build the word <-> ID lookup tables.

    enc_map: voc --encode--> id
    dec_map: id --decode--> voc

    IDs 0, 1, 2 go to <ST>, <ED>, <RARE>. Words counted fewer than ``thrhd``
    times share the ID of <RARE>; every other word gets the next free ID.
    """
    enc_map = {}
    dec_map = {}

    def register(token):
        # the next free ID is the current size of the decoding table
        new_id = len(dec_map)
        enc_map[token] = new_id
        dec_map[new_id] = token

    for special in ('<ST>', '<ED>', '<RARE>'):
        register(special)

    for token, freq in voc_cnt.items():
        if freq < thrhd:
            # rare words => <RARE>
            enc_map[token] = enc_map['<RARE>']
            continue
        register(token)
    return enc_map, dec_map

enc_map, dec_map = build_voc_mapping(voc_cnt, thrhd)
# save enc/decoding map to disk; `with` closes (and flushes) each file right away
with open('dataset/text/enc_map.pkl', 'wb') as f:
    cPickle.dump(enc_map, f)
with open('dataset/text/dec_map.pkl', 'wb') as f:
    cPickle.dump(dec_map, f)
vocab_size = len(dec_map)

In [None]:
def caption_to_ids(enc_map, df):
    """Encode every caption of ``df`` as a list of vocabulary IDs.

    Each caption is split on single spaces, mapped through ``enc_map`` and
    wrapped in the IDs of ``<ST>`` and ``<ED>``. Returns a DataFrame indexed by
    ``img_id`` with one ``caption`` column holding the ID lists.
    """
    encoded_caps = []
    image_names = []
    for _, record in df.iterrows():
        word_ids = [enc_map[word] for word in record['caption'].split(' ')]
        encoded_caps.append([enc_map['<ST>']] + word_ids + [enc_map['<ED>']])
        image_names.append(record['img_id'])
    frame = pd.DataFrame({'img_id': image_names, 'caption': encoded_caps})
    return frame.set_index(['img_id'])


# reload the encoding map from disk; `with` closes the file handle afterwards
with open('dataset/text/enc_map.pkl', 'rb') as f:
    enc_map = cPickle.load(f)
print('[transform captions into sequences of IDs]...')
df_proc = caption_to_ids(enc_map, df_train)
df_proc.to_csv('dataset/text/train_enc_cap.csv')

In [None]:
def decode(dec_map, ids):
    """Turn a sequence of vocabulary IDs back into a space-separated caption string."""
    words = []
    for token_id in ids:
        words.append(dec_map[token_id])
    return ' '.join(words)

with open('dataset/text/dec_map.pkl', 'rb') as f:
    dec_map = cPickle.load(f)

print('And you can decode back easily to see full sentence...\n')
for idx, row in df_proc.iloc[:8].iterrows():
    ids = row['caption']
    # df_proc built in this notebook holds real lists, and eval() on a list
    # raises TypeError. Only a frame re-read from the CSV stores the captions
    # as strings; parse those without executing arbitrary code.
    if isinstance(ids, str):
        ids = ast.literal_eval(ids)
    print('{}: {}'.format(idx, decode(dec_map, ids)))

In [None]:
def generate_embedding_matrix(w2v_path, dec_map, lang_dim=100):
    """Build the embedding matrix for the vocabulary from a pre-trained vector file.

    NOTE(review): this function is defined a second time in a later cell of
    this notebook; the later definition is the one in effect after a full run.

    Each non-blank line of ``w2v_path`` is ``word v1 v2 ...`` separated by
    whitespace. Rows start as uniform random values in [0, 1) and are
    overwritten with the pre-trained vector when the word is present in the
    file. Words without a vector are listed in a printed message.

    Args:
        w2v_path: path of the text file holding the pre-trained vectors.
        dec_map: dict mapping row index -> word.
        lang_dim: embedding dimension; must match the vectors in the file.

    Returns:
        Array of shape ``(len(dec_map), lang_dim)``.
    """
    embeddings_index = {}
    # Explicit encoding so parsing does not depend on the platform default
    # (assumes the vector file is UTF-8 - TODO confirm for other releases).
    with open(w2v_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # blank line, e.g. a trailing newline at the end of the file
                continue
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')
    # prepare embedding matrix
    embedding_matrix = np.random.rand(len(dec_map), lang_dim)
    out_vocab = []
    for idx, wd in dec_map.items():
        coefs = embeddings_index.get(wd)
        if coefs is None:
            out_vocab.append(wd)
        else:
            embedding_matrix[idx] = coefs
    print('words: "{}" not in pre-trained vocabulary list'.format(','.join(out_vocab)))
    return embedding_matrix

## Transfer Learning: pre-trained word embedding
Image captioning also requires a good understanding of word meaning, so it's a good idea to use a pre-trained word embedding. We'll take advantage of the embeddings released by Stanford NLP - [GloVe](http://nlp.stanford.edu/projects/glove). As an example, we choose one of the smaller releases, [pre_trained/glove.6B.100d.txt](pre_trained/glove.6B.100d.txt), which is trained on a 6-billion-token corpus of Wikipedia and Gigaword. Again, you're welcome to use any pre-trained word embedding.<br>

We'll pick the embedding vector of our vocabularies and save the embedding matrix in [dataset/text/embedding_matrix.pkl](dataset/text/embedding_matrix.pkl).

In [None]:
def generate_embedding_matrix(w2v_path, dec_map, lang_dim=100):
    """Build the embedding matrix for the vocabulary from a pre-trained vector file.

    Each non-blank line of ``w2v_path`` is ``word v1 v2 ...`` separated by
    whitespace. Rows start as uniform random values in [0, 1) and are
    overwritten with the pre-trained vector when the word is present in the
    file. Words without a vector are listed in a printed message.

    Args:
        w2v_path: path of the text file holding the pre-trained vectors.
        dec_map: dict mapping row index -> word.
        lang_dim: embedding dimension; must match the vectors in the file.

    Returns:
        Array of shape ``(len(dec_map), lang_dim)``.
    """
    embeddings_index = {}
    # Explicit encoding so parsing does not depend on the platform default
    # (assumes the vector file is UTF-8 - TODO confirm for other releases).
    with open(w2v_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # blank line, e.g. a trailing newline at the end of the file
                continue
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')
    # prepare embedding matrix
    embedding_matrix = np.random.rand(len(dec_map), lang_dim)
    out_vocab = []
    for idx, wd in dec_map.items():
        coefs = embeddings_index.get(wd)
        if coefs is None:
            out_vocab.append(wd)
        else:
            embedding_matrix[idx] = coefs
    print('words: "{}" not in pre-trained vocabulary list'.format(','.join(out_vocab)))
    return embedding_matrix

# `with` closes each file handle as soon as the load / dump is done
with open('dataset/text/dec_map.pkl', 'rb') as f:
    dec_map = cPickle.load(f)
embedding_matrix = generate_embedding_matrix('pre_trained/glove.6B.100d.txt', dec_map)
with open('dataset/text/embedding_matrix.pkl', 'wb') as f:
    cPickle.dump(embedding_matrix, f)