In [1]:
from collections import defaultdict
from collections import namedtuple
import time
import random
import json
import string
from nltk.stem.porter import *
from nltk.stem.snowball import SnowballStemmer
import h5py
import numpy as np
from nltk import word_tokenize
from nltk.corpus import stopwords
from heapq import nlargest
import dill
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import requests
from io import BytesIO
from PIL import Image
import PIL
torch.manual_seed(1)
random.seed(1)

In [2]:
# Detect once whether a CUDA device is usable; the tensor helpers and the
# model setup further down branch on this flag.
CUDA = torch.cuda.is_available()
print(CUDA)

False


In [3]:
# Pre-extracted image features (HDF5) and the JSON file that maps image ids
# to row indices in that feature matrix.
IMG_FEATURES = "./images/IR_image_features.h5"
IMG_ID = "./images/IR_img_features2id.json"

# Dataset splits (train / test / validation), each in a "Hard" and an "Easy" variant.
TRAIN_HARD = "./Data/Hard/IR_train_hard.json"
TRAIN_EASY = "./Data/Easy/IR_train_easy.json"
TEST_HARD = "./Data/Hard/IR_test_hard.json"
TEST_EASY = "./Data/Easy/IR_test_easy.json"
VAL_HARD = "./Data/Hard/IR_val_hard.json"
VAL_EASY = "./Data/Easy/IR_val_easy.json"

# Per-image metadata keyed by image id; read for each image's 'coco_url'.
IMGID2IMGINFO = "./Data/imgid2imginfo.json"

# Training hyper-parameters.
BATCH_SIZE = 100  # default chunk size of minibatch()
LEARNING_RATE = 1e-3  # learning rate handed to the Adam optimizer
EPOCHS = 1000  # upper bound on training passes; the loop can be interrupted earlier

In [4]:
# Load the image-id -> feature-row mapping and the image feature matrix.
# The paths come from the config cell (IMG_ID / IMG_FEATURES) so they are
# defined in exactly one place.
with open(IMG_ID, 'r') as f:
    visual_feat_mapping = json.load(f)['IR_imgid2id']

# np.asarray copies the dataset into memory, so the HDF5 handle can be closed
# right away instead of staying open for the lifetime of the kernel.
with h5py.File(IMG_FEATURES, 'r') as h5_file:
    img_features = np.asarray(h5_file['img_features'])

def get_feature_from_id(img_id):
    """Return the stored feature row for the image with id `img_id`.

    The id is converted to a string and looked up in `visual_feat_mapping`;
    the resulting value is used to index `img_features`.
    Raises KeyError when the id is not present in the mapping.
    """
    return img_features[visual_feat_mapping[str(img_id)]]

In [5]:
""" Helper function to show an image, from a image id """
def show_img_from_id(img_ids, target_id = -1):
    """Download the given images and display them side by side.

    Parameters
    ----------
    img_ids : iterable
        Image ids; each one is looked up (as a string) in the IMGID2IMGINFO
        JSON file and fetched from its 'coco_url'.
    target_id : int, optional
        Accepted for backward compatibility; not used.

    All images are converted to RGB, resized to the size of the smallest
    image (smallest width + height) and concatenated horizontally before
    being shown with PIL. Nothing is shown when `img_ids` is empty.

    Raises
    ------
    KeyError
        If an id is missing from the metadata file.
    requests.HTTPError
        If a download returns an error status.
    """
    with open(IMGID2IMGINFO, 'r') as f:
        imgid2info = json.load(f)

    imgs = []
    for img_id in img_ids:
        response = requests.get(imgid2info[str(img_id)]['coco_url'])
        # Fail loudly on a bad download instead of handing an error page to PIL.
        response.raise_for_status()
        # Force a common mode so grayscale and colour images can be stacked.
        imgs.append(Image.open(BytesIO(response.content)).convert('RGB'))

    if not imgs:
        return

    # Same ordering key as before: (width + height, size), smallest first.
    min_shape = min(imgs, key=lambda im: (sum(im.size), im.size)).size
    # np.hstack needs a real sequence; passing a generator is rejected by newer NumPy.
    tiles = [np.asarray(im.resize(min_shape)) for im in imgs]

    imgs_comb = PIL.Image.fromarray(np.hstack(tiles))
    imgs_comb.show()
    
# Smoke test: fetch the same image id four times and display the tiled result.
# NOTE(review): this issues network requests every time the cell runs.
show_img_from_id([378466, 378466, 378466, 378466])

In [6]:
"""
Preprocess a sentence
"""
def preprocess(sentence, stemmer, stop):
    """Lower-case `sentence` and split it into tokens with `word_tokenize`.

    `stemmer` and `stop` are accepted but not used: no stemming and no
    stop-word filtering is applied to the tokens.
    """
    return word_tokenize(sentence.lower())

In [7]:
"""
Convert Samples
"""
def get_dialog_caption_targets_from_sample(sample):
    """Split one raw sample dict into its parts.

    Returns a 4-tuple:
      * dialog  - the first element of every entry in sample['dialog'],
                  each prefixed with a single space and concatenated
      * caption - sample['caption']
      * targets - a new list with the items of sample['img_list']
      * target  - sample['target']
    """
    caption = sample['caption']
    target_index = sample['target']
    dialog = ''.join(' ' + turn[0] for turn in sample['dialog'])
    targets = list(sample['img_list'])
    return dialog, caption, targets, target_index

# One example: `words` holds vocabulary ids of the tokenised dialog + caption,
# `images` the sample's 'img_list' and `target` the sample's 'target' entry
# (used downstream as the index of the correct image within `images`).
Sample = namedtuple("Sample", ["words", "images", "target"])

"""
 For every Sample we retrieve the sentences and the img_ids, and the correct target_id. 
"""
def read_dataset(filename, stemmer, stopwords):
    """Lazily yield one `Sample` per entry of the JSON file `filename`.

    For every entry the dialog text and the caption are joined with a space,
    tokenised by `preprocess` and mapped to ids through the module-level
    `w2i`. `stemmer` and `stopwords` are only forwarded to `preprocess`.
    Progress (the running entry count) is printed every 10000 entries.
    """
    with open(filename, "r") as f:
        dataset = json.load(f)

    for count, entry in enumerate(dataset.values()):
        if count % 10000 == 0:
            print(count)
        dialog, caption, targets, targetidx = get_dialog_caption_targets_from_sample(entry)
        tokens = preprocess(dialog + ' ' + caption, stemmer, stopwords)
        word_ids = [w2i[token] for token in tokens]
        yield Sample(words=word_ids, images=targets, target=targetidx)



In [8]:
# Vocabulary: looking up an unseen word assigns it the next free integer id.
w2i = defaultdict(lambda: len(w2i))
UNK = w2i["<UNK>"]  # first key inserted, so id 0
PAD = w2i["<PAD>"]  # second key inserted, so id 1; also the embedding's padding_idx

# One-time setup of the NLTK resources.
import nltk
# nltk.download("english")
# NOTE: both objects are passed to read_dataset(), but preprocess() ignores them.
stemmer = SnowballStemmer("english")
stop = stopwords.words('english') + list(string.punctuation)


In [9]:
# Read the datasets and build the vocabulary with w2i (only do this once).
# Order matters: the training split is read while w2i still hands out fresh
# ids; afterwards w2i is re-wrapped so that unseen words map to UNK.
# NOTE: a defaultdict still stores every missing key it is asked for (here with
# value UNK), so len(w2i) keeps growing while val/test are read.
train = list(read_dataset(TRAIN_HARD, stemmer, stop))
w2i = defaultdict(lambda: UNK, w2i)
val = list(read_dataset(VAL_HARD, stemmer, stop))
test = list(read_dataset(TEST_HARD, stemmer, stop))

0
10000
20000
30000
0
0


In [10]:
# Number of keys in w2i, used as the embedding table size.
# NOTE(review): this also counts val/test words that were inserted with the
# UNK id, so it may exceed the number of distinct ids - confirm this is intended.
nwords = len(w2i)

In [11]:
class ClassificationNN(nn.Module):
    """Bag-of-words scorer for a set of candidate images.

    The word embeddings of a sentence are summed into one vector, that vector
    is paired with the feature vector of every candidate image, and a
    two-layer MLP (sigmoid hidden layer) produces `output_dim` values per
    candidate. There is no recurrent layer.

    Parameters
    ----------
    vocab_size : number of rows in the embedding table
    embedding_size : size of one word embedding
    img_feat_dim : size of one image feature vector
    hidden_dim_mlp : width of the hidden layer
    output_dim : number of outputs per candidate image
    batch_size : stored on the instance; not read by forward()
    """

    def __init__(self, vocab_size, embedding_size, img_feat_dim, hidden_dim_mlp, output_dim, batch_size):
        super(ClassificationNN, self).__init__()
        # The PAD row is used as padding_idx, so padded positions add nothing to the sum.
        self.word_embeddings = nn.Embedding(vocab_size, embedding_size, padding_idx=PAD)
        self.batch_size = batch_size
        self.linear1 = nn.Linear(embedding_size + img_feat_dim, hidden_dim_mlp)
        self.linear2 = nn.Linear(hidden_dim_mlp, output_dim)

    def forward(self, sentence, image_feat):
        """Score every candidate image against the sentence.

        sentence   : (batch, seq_len) tensor of word ids
        image_feat : (batch, n_images, img_feat_dim) tensor of image features
        returns    : (batch, n_images, output_dim) tensor of raw scores
        """
        embeds = self.word_embeddings(sentence)
        # Sum over the sequence dimension -> (batch, 1, embedding_size).
        text = torch.sum(embeds, 1).unsqueeze(1)
        # Repeat the sentence vector once per candidate; the count is read from
        # the input instead of being fixed at 10.
        text = text.repeat(1, image_feat.size(1), 1)
        # torch.sigmoid replaces the deprecated F.sigmoid.
        hidden = torch.sigmoid(self.linear1(torch.cat((text, image_feat), 2)))
        return self.linear2(hidden)

In [12]:
# Initialise the model and its optimizer.
print("Batch_size: ", BATCH_SIZE, "LEARNING_RATE: ",LEARNING_RATE)
print()

# Positional arguments, in order: vocab_size=nwords, embedding_size=300,
# img_feat_dim=2048, hidden_dim_mlp=48, output_dim=1 (one score per
# candidate image), batch_size=BATCH_SIZE.
model = ClassificationNN(nwords, 300, 2048, 48, 1, BATCH_SIZE)
if CUDA:
    model.cuda()
print(model)

# TODO: consider an adaptive learning rate for Adam.
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

Batch_size:  100 LEARNING_RATE:  0.001

ClassificationNN (
  (word_embeddings): Embedding(21999, 300, padding_idx=1)
  (linear1): Linear (2348 -> 48)
  (linear2): Linear (48 -> 1)
)


In [13]:
"""
HELPER FUNCTIONS
"""
def preprocessbatch(batch):
    """Turn a list of Samples into padded model inputs.

    Returns a 4-tuple:
      * seqs      - word-id lists, right-padded with PAD to the longest one
      * ims       - numpy array built from the feature rows of every
                    sample's images
      * idxs      - the `target` of every sample
      * image_ids - the `images` list of every sample
    """
    longest = max(len(sample.words) for sample in batch)

    seqs, feature_rows, idxs, image_ids = [], [], [], []
    for sample in batch:
        padding = [PAD] * (longest - len(sample.words))
        seqs.append(sample.words + padding)
        feature_rows.append([get_feature_from_id(img_id) for img_id in sample.images])
        idxs.append(sample.target)
        image_ids.append(sample.images)

    ims = np.array(feature_rows)
    return seqs, ims, idxs, image_ids

def minibatch(data, batch_size=BATCH_SIZE):
    """Yield consecutive slices of `data` holding at most `batch_size` items.

    The last slice is shorter when len(data) is not a multiple of `batch_size`.
    """
    yield from (data[lo:lo + batch_size] for lo in range(0, len(data), batch_size))

def getLongTensor(x):
    """Wrap `x` in a Variable holding a LongTensor (a CUDA one when CUDA is set)."""
    if CUDA:
        return Variable(torch.cuda.LongTensor(x))
    return Variable(torch.LongTensor(x))


def getFloatTensor(x):
    """Wrap `x` in a Variable holding a FloatTensor.

    With CUDA set, `x` goes through torch.from_numpy and is moved to the GPU;
    otherwise a CPU FloatTensor is built from it directly.
    """
    if CUDA:
        tensor = torch.cuda.FloatTensor(torch.from_numpy(x).cuda())
    else:
        tensor = torch.FloatTensor(x)
    return Variable(tensor)


In [14]:
def evaluate(model, data, show_n_images = 0):
    """Measure top-1 and top-5 accuracy of `model` on `data`.

    Parameters
    ----------
    model : callable
        Called with (word-id tensor, image-feature tensor); only the slice
        [:, :, 0] of its output is read, as one score per candidate image.
    data : list of Sample
        Evaluated in chunks produced by minibatch().
    show_n_images : int
        For the first `show_n_images` samples, display the target image
        followed by all candidates ordered by descending score.

    Returns
    -------
    (top1, top5) : fractions of samples whose target is the best-scored
    candidate / among the five best-scored candidates. (0.0, 0.0) is
    returned for empty `data`.
    """
    if len(data) == 0:
        return 0.0, 0.0

    top1 = 0
    top5 = 0
    shown = 0

    for batch in minibatch(data):
        seqs, image_features, idxs, image_ids = preprocessbatch(batch)
        scores = model(getLongTensor(seqs), getFloatTensor(image_features))
        candidate_scores = scores[:, :, 0].data
        n_candidates = candidate_scores.size(1)

        _, best = torch.max(candidate_scores, 1)
        # Do not ask for more entries than there are candidates.
        _, best_five = torch.topk(candidate_scores, min(5, n_candidates))

        # Bring the indices to the host once per batch; this also works when
        # the scores live on the GPU, where calling .numpy() directly fails.
        best = best.cpu().view(-1).tolist()
        best_five = best_five.cpu().tolist()

        ranking = None
        if shown < show_n_images:
            _, ranking = torch.topk(candidate_scores, n_candidates)
            ranking = ranking.cpu().tolist()

        for i, target in enumerate(idxs):
            if best[i] == target:
                top1 += 1
            if target in best_five[i]:
                top5 += 1

            # Shown for hits and misses alike, until the quota is used up.
            if ranking is not None and shown < show_n_images:
                ids = image_ids[i]
                image_list = [int(ids[target])] + [ids[j] for j in ranking[i]]
                show_img_from_id(image_list)
                shown += 1

    return top1 / len(data), top5 / len(data)


In [15]:
"""
    RUNNING THE MODEL!!!!!!!!!!
"""

# The loss module holds no per-batch state, so it is built once up front
# instead of once per batch.
loss = nn.CrossEntropyLoss()

try:
    for ITER in range(EPOCHS):
        # Per-epoch bookkeeping.
        random.shuffle(train)
        train_loss = 0.0
        start = time.time()
        updates = 0

        for batch in minibatch(train):
            updates += 1

            # Pad the word sequences and gather the image features.
            seqs, image_features, idxs, _ = preprocessbatch(batch)

            # Forward pass: one score per candidate image; the cross-entropy is
            # taken over the candidates with the target index as the class.
            scores = model(getLongTensor(seqs), getFloatTensor(image_features))
            targets = getLongTensor(idxs)
            output = loss(scores[:, :, 0], targets)
            # Works for both a one-element and a zero-dimensional loss tensor,
            # unlike `output.data[0]`.
            train_loss += float(output.data.view(-1)[0])

            # Backward pass.
            model.zero_grad()
            output.backward()

            # Update weights.
            optimizer.step()

            if updates % 100 == 0:
                print("update: {}, train_loss: {}, time {}".format(updates, train_loss/updates, time.time()-start))

        # max(..., 1) avoids a ZeroDivisionError when `train` is empty.
        print("iter %r: avg train loss=%.4f, time=%.2fs" % (ITER, train_loss/max(updates, 1), time.time()-start))
        top1, top5 = evaluate(model, val)
        print("TOP 1: {}, TOP 5: {} \n".format(top1, top5))

except KeyboardInterrupt:
    # Allow stopping training by hand; the final test evaluation still runs.
    print('Stopped at ITER: ' + str(ITER))

top1, top5 = evaluate(model, test, show_n_images=0)
print("TOP 1: {}, TOP 5: {}".format(top1, top5))

update: 100, train_loss: 2.291453647613525, time 16.41653084754944
update: 200, train_loss: 2.291338292360306, time 31.745835065841675
update: 300, train_loss: 2.2917072065671285, time 46.49011301994324
update: 400, train_loss: 2.291738773584366, time 61.255143880844116
iter 0: avg train loss=2.2917, time=61.26s
TOP 1: 0.0976, TOP 5: 0.5644 

update: 100, train_loss: 2.2983917593955994, time 15.716618061065674
update: 200, train_loss: 2.2974479007720947, time 31.559258937835693
update: 300, train_loss: 2.295619272391001, time 48.363394021987915
update: 400, train_loss: 2.2946953344345093, time 65.081218957901
iter 1: avg train loss=2.2947, time=65.08s
TOP 1: 0.1096, TOP 5: 0.583 

update: 100, train_loss: 2.288473765850067, time 16.144151210784912
update: 200, train_loss: 2.2863420820236207, time 33.487898111343384
update: 300, train_loss: 2.2884140841166176, time 52.32359600067139
update: 400, train_loss: 2.2881319373846054, time 70.17048621177673
iter 2: avg train loss=2.2881, time=7