In [22]:
from tracing.rl.actions import *
from tracing.rl.a3cmodel import A3CModel
from tracing.rl.rewards import PopupRewardsCalculator
from tracing.rl.environment import Environment
from tracing.rl.actor_learner import ActionsMemory
from tracing.rl.actor_learner import ActorLearnerWorker
import tensorflow as tf
import tensorflow.contrib.slim as slim
import threading
import json
import random
from scipy import misc
import numpy as np
import math

from tracing.selenium_utils.controls import Types
from nltk.tokenize import word_tokenize

In [2]:
def split(arr, batch_size):
    """Lazily cut an iterable into consecutive lists of at most `batch_size` items.

    Args:
        arr: any iterable.
        batch_size: upper bound on the length of each yielded list.

    Yields:
        Lists of items in their original order; the final list may be shorter.
        Nothing is yielded for an empty iterable.
    """
    pending = []
    for element in arr:
        # Flush the chunk collected so far before it would outgrow the limit.
        if not len(pending) < batch_size:
            yield pending
            pending = []

        pending.append(element)

    # Emit the trailing, possibly partial, chunk.
    if pending:
        yield pending
        
        
def read_img(ctrl, size = 300):
    """Load the image of a control and rescale its values around zero.

    Args:
        ctrl: mapping with an 'img_file' entry holding the image path.
        size: length that at least one of the first two image axes must have.

    Returns:
        The image array with every value mapped through (v - 128) / 128.

    Raises:
        AssertionError: if neither of the first two axes equals `size`.
    """
    pixels = misc.imread(ctrl['img_file'])
    scaled = (pixels - 128.) / 128.

    dims = scaled.shape
    fits = dims[0] == size or dims[1] == size
    assert fits, 'found image {}, control {}'.format(dims, ctrl)

    return scaled


In [6]:
import gensim
import numpy as np
import string 
from nltk.tokenize import word_tokenize


class Encoder:
    """Converts raw sentences into padded word-level and character-level arrays.

    Word vectors come from a GloVe-style text file (one token followed by its
    vector components per line); characters are mapped to ids through
    `string.printable`, with id 0 reserved for padding and unknown characters.
    """

    def load_glove(self, glove_file):
        """Read word vectors from `glove_file` into `self.glove`.

        Sets `self.glove` (token -> 1-D numpy array) and `self.embeddings_dim`
        (length of the last vector read, or None when the file has no entries).
        Blank lines are skipped.
        """
        self.glove = {}
        self.embeddings_dim = None

        # Read as UTF-8 explicitly instead of relying on the platform default
        # encoding, which breaks on non-ASCII tokens on some systems.
        with open(glove_file, 'r', encoding='utf-8') as f:
            for line in f:
                fields = line.split()
                if not fields:
                    continue

                embedding = np.array([float(val) for val in fields[1:]])
                self.embeddings_dim = embedding.shape[0]
                self.glove[fields[0]] = embedding


    def __init__(self, glove_file):
        """Load the embeddings and prepare the character vocabulary.

        Raises:
            ValueError: if `glove_file` contains no embeddings.
        """
        self.load_glove(glove_file)
        if self.embeddings_dim is None:
            raise ValueError('no embeddings found in {}'.format(glove_file))

        # Ids start at 1 so that 0 can serve as padding / unknown character.
        self.char2id = {c: i + 1 for i, c in enumerate(string.printable)}

        # Vector used for tokens missing from the GloVe vocabulary.
        self.empty = np.zeros(self.embeddings_dim)

    def encode(self, sentences):
        """Tokenize and encode a batch of sentences.

        Args:
            sentences: sequence of strings; each is lower-cased and tokenized
                with `word_tokenize`.

        Returns:
            dict with zero-padded numpy arrays:
              'word_embeddings'  float, (batch, max_sentence, embeddings_dim)
              'char_ids'         int32, (batch, max_sentence, max_token)
              'sentence_lengths' int32, (batch,)
              'word_lengths'     int32, (batch, max_sentence)
        """
        tokenized = [word_tokenize(sentence.lower()) for sentence in sentences]
        batch_size = len(tokenized)

        max_sentence = max((len(tokens) for tokens in tokenized), default=0)
        max_token = max((len(token) for tokens in tokenized for token in tokens),
                        default=0)

        word_embeddings = np.zeros((batch_size, max_sentence, self.embeddings_dim))
        # Ids and lengths are integral; keep them as int32 so they match the
        # integer placeholders they are fed into.
        char_ids = np.zeros((batch_size, max_sentence, max_token), dtype=np.int32)
        sentence_lengths = np.zeros(batch_size, dtype=np.int32)
        word_lengths = np.zeros((batch_size, max_sentence), dtype=np.int32)

        for i, tokens in enumerate(tokenized):
            sentence_lengths[i] = len(tokens)

            for j, token in enumerate(tokens):
                word_lengths[i, j] = len(token)
                word_embeddings[i, j, :] = self.glove.get(token, self.empty)

                for k, char in enumerate(token):
                    char_ids[i, j, k] = self.char2id.get(char, 0)

        return {
            'word_embeddings': word_embeddings,
            'char_ids': char_ids,
            'sentence_lengths': sentence_lengths,
            'word_lengths': word_lengths
        }
    

In [75]:
class ControlTextModel:
    """Image/text matching head built on top of an existing A3C network.

    The image side is a dense projection of `a3c_model.net`; the text side is
    a character CNN concatenated with pre-trained word embeddings, run through
    an LSTM and projected to the same size. Training pairs a control image
    with a control text and learns whether the two belong together.
    """

    # If session is None then the default TensorFlow session is used
    def __init__(self, a3c_model, session, encoder, repr_dim = 200):
        """Build the whole graph (inputs, representations, loss, train op).

        Args:
            a3c_model: object exposing `net` (tensor that is projected into the
                image representation) and `img` (tensor fed with images).
            session: TensorFlow session used by `train`, or None to use the
                default session.
            encoder: object exposing `embeddings_dim` and `encode(texts)`.
            repr_dim: size of the shared image/text representation.
        """
        self.word_repr = None
        self.word_embeddings = None

        self.a3c_model = a3c_model
        self.session = session
        self.encoder = encoder
        self.repr_dim = repr_dim

        self.device = '/cpu:0'

        self.add_img_repr()
        self.add_text_inputs()
        self.add_cnn_char_repr()
        self.add_pretrained_word_embeddings(encoder.embeddings_dim)
        self.add_context_repr(200)
        self.add_loss()
        self.add_training_op()


    def add_img_repr(self):
        """Project the A3C network output to `repr_dim` features."""
        with tf.device(self.device):
            # NOTE(review): assumes a3c_model.net is (batch, features) - confirm.
            self.img_repr = slim.fully_connected(self.a3c_model.net, self.repr_dim)


    def add_text_inputs(self):
        """Create the placeholders for text input, labels and hyper-parameters."""
        with tf.variable_scope("char_repr"):
            # shape = (batch size, sentence, word)
            self.char_ids = tf.placeholder(tf.int32, shape=[None, None, None], name="char_ids")

            # shape = (batch_size, sentence)
            self.word_lengths = tf.placeholder(tf.int32, shape=[None, None], name="word_lengths")

        with tf.variable_scope("word_repr"):
            # shape = (batch size)
            self.sentence_lengths = tf.placeholder(tf.int32, shape=[None], name="sentence_lengths")

        with tf.variable_scope("training", reuse=None):
            # shape = (batch, 2)
            # One hot labels: [1, 0] if a3c_model.img matches the current
            # sentence, [0, 1] otherwise
            self.similar = tf.placeholder(tf.float32, shape=[None, 2], name="similar")

            self.lr = tf.placeholder_with_default(0.005,  shape=(), name="lr")
            # Passed to tf.nn.dropout as its second positional argument;
            # the default of 1. disables dropout.
            self.dropout = tf.placeholder_with_default(1., shape=(), name="dropout")


    def add_cnn_char_repr(self, nchars = 101, dim=25, nfilters=25, pad=2):
        """Add a character-level CNN word representation to `self.word_repr`.

        Args:
            nchars: number of rows in the character embedding table.
            dim: character embedding size.
            nfilters: number of convolution filters (= output size per word).
            pad: unused; kept so existing callers keep working.
        """
        with tf.device(self.device):

            with tf.variable_scope("char_repr_cnn"):
                # 1. Lookup for character embeddings
                char_range = math.sqrt(3 / dim)
                embeddings = tf.get_variable(name="char_embeddings", dtype=tf.float32,
                    shape=[nchars, dim],
                    initializer=tf.random_uniform_initializer(-char_range, char_range))

                # shape = (batch, sentence, word_len, embeddings dim)
                char_embeddings = tf.nn.embedding_lookup(embeddings, self.char_ids)
                char_embeddings = tf.nn.dropout(char_embeddings, self.dropout)
                s = tf.shape(char_embeddings)

                # shape = (batch x sentence, word_len, embeddings dim)
                char_embeddings = tf.reshape(char_embeddings, shape=[-1, s[-2], dim])

                # batch x sentence, word_len, nfilters
                conv1d = tf.layers.conv1d(
                    char_embeddings,
                    filters=nfilters,
                    kernel_size=[3],
                    padding='same',
                    activation=tf.nn.relu
                )

                # Max across each filter, shape = (batch x sentence, nfilters)
                char_repr = tf.reduce_max(conv1d, axis=1)

                # (batch, sentence, nfilters)
                char_repr = tf.reshape(char_repr, shape=[s[0], s[1], nfilters])

                if self.word_repr is not None:
                    self.word_repr = tf.concat([self.word_repr, char_repr], axis=-1)
                else:
                    self.word_repr = char_repr


    def add_pretrained_word_embeddings(self, dim=100, trainable=True):
        """Add a placeholder for pre-computed word vectors to `self.word_repr`.

        Args:
            dim: size of each word vector.
            trainable: unused; kept so existing callers keep working.
        """
        with tf.device(self.device):
            with tf.variable_scope("word_repr"):
                # shape = (batch size, sentence, dim)
                self.word_embeddings = tf.placeholder(tf.float32, shape=[None, None, dim],
                                                      name="word_embeddings")

                if self.word_repr is not None:
                    self.word_repr = tf.concat([self.word_repr, self.word_embeddings], axis=-1)
                else:
                    self.word_repr = self.word_embeddings


    def extract_last(self, source, lengths):
        """Pick `source[b, lengths[b] - 1]` for every batch row `b`."""
        batch_range = tf.range(tf.shape(source)[0])
        batch_indices = tf.stack([batch_range, lengths - 1], axis=1)
        res = tf.gather_nd(source, batch_indices)

        return res


    # Adds LSTM with size of each cell hidden_size
    def add_context_repr(self, hidden_size=200):
        """Run an LSTM over the word representations and build `self.text_repr`."""
        with tf.device(self.device):

            with tf.variable_scope("context_repr"):
                cell = tf.contrib.rnn.LSTMCell(hidden_size)

                word_repr = tf.nn.dropout(self.word_repr, self.dropout)

                output, state = tf.nn.dynamic_rnn(
                    cell,
                    word_repr,
                    sequence_length=self.sentence_lengths,
                    dtype=tf.float32)

                context_repr = tf.nn.dropout(output, self.dropout)

                # batch x hidden_size: output at the last real token
                sentence_repr = self.extract_last(context_repr, self.sentence_lengths)

                # batch x repr_dim
                self.text_repr = slim.fully_connected(sentence_repr, self.repr_dim)


    def add_loss(self):
        """Build `self.text_loss`, a scalar cross-entropy over match/no-match.

        The probability that a text matches an image is the per-example cosine
        similarity of their representations; column 0 of `self.similar` marks
        a matching pair and column 1 a non-matching one.
        """
        eps = 1e-7

        # Cosine similarity needs unit vectors and must stay per example,
        # otherwise the per-example labels cannot be applied.
        text_unit = tf.nn.l2_normalize(self.text_repr, -1)
        img_unit = tf.nn.l2_normalize(self.img_repr, -1)
        cosine = tf.reduce_sum(text_unit * img_unit, axis=-1)

        # slim.fully_connected applies ReLU by default, so both representations
        # are non-negative and the cosine lies in [0, 1]; the clip additionally
        # keeps both logarithms finite.
        same = tf.clip_by_value(cosine, eps, 1. - eps)
        not_same = 1. - same

        # shape = (batch, 2), aligned with self.similar
        probas = tf.stack([same, not_same], axis=-1)

        # Negative log-likelihood of the labelled class.
        sim_loss = -tf.reduce_sum(self.similar * tf.log(probas), -1)

        self.text_loss = tf.reduce_mean(sim_loss)


    # clip_gradient < 0  - no gradient clipping
    def add_training_op(self, clip_gradient = 5.0):
        """Create `self.train_op` (momentum SGD) and `self.init_op`.

        Args:
            clip_gradient: gradients are clipped element-wise to
                [-clip_gradient, clip_gradient]; a non-positive value disables
                clipping.
        """
        with tf.variable_scope("training", reuse=None):
            optimizer = tf.train.MomentumOptimizer(learning_rate=self.lr, momentum=0.9)
            if clip_gradient > 0:
                gvs = optimizer.compute_gradients(self.text_loss)
                # Variables that do not influence the loss have no gradient.
                capped_gvs = [(tf.clip_by_value(grad, -clip_gradient, clip_gradient), var) for
                              grad, var in gvs if grad is not None]
                self.train_op = optimizer.apply_gradients(capped_gvs)
            else:
                self.train_op = optimizer.minimize(self.text_loss)

            self.init_op = tf.variables_initializer(tf.global_variables(), name="init")


    def get_control_text(self, ctrl):
        """Return the lower-cased label of a control, or its tip if the label is blank."""
        label = (ctrl.get('label') or '').strip()
        tip = (ctrl.get('tip') or '').strip()

        result = label if label != '' else tip
        return result.strip().lower()


    def is_for_training(self, ctrl):
        """A control is usable for training when its text has at least one token."""
        txt = self.get_control_text(ctrl)
        return len(word_tokenize(txt)) > 0


    @staticmethod
    def _token_overlap(tokens1, tokens2):
        """Share of the smaller token set that also occurs in the other set.

        Returns 0 when either set is empty, instead of dividing by zero.
        """
        smaller = min(len(tokens1), len(tokens2))
        if smaller == 0:
            return 0.

        return len(tokens1.intersection(tokens2)) / smaller


    def is_similar(self, ctrl1, ctrl2):
        """True when at least half of the shorter control text's tokens are shared.

        Controls without any text are never considered similar.
        """
        tokens1 = set(word_tokenize(self.get_control_text(ctrl1)))
        tokens2 = set(word_tokenize(self.get_control_text(ctrl2)))

        return self._token_overlap(tokens1, tokens2) >= 1 / 2


    def extract_pairs(self, controls, neg_samples = 5):
        """Build shuffled (image control, text control, is_match) triples.

        Every control yields one positive pair with itself and up to
        `neg_samples` negative pairs; each negative is searched for with at
        most `neg_samples` random draws and is dropped if none is dissimilar.
        """
        result = []

        for ctrl in controls:
            result.append((ctrl, ctrl, True))

            for _ in range(neg_samples):
                for _ in range(neg_samples):
                    neg_ctrl = random.choice(controls)
                    if not self.is_similar(ctrl, neg_ctrl):
                        result.append((ctrl, neg_ctrl, False))
                        break

        random.shuffle(result)
        return result


    def batch_to_input(self, batch):
        """Turn a list of (ctrl, ctrl2, similar) triples into a feed dict.

        The image comes from the first control and the text from the second.
        """
        imgs = []
        texts = []
        similarities = []

        for ctrl, ctrl2, similar in batch:
            imgs.append(read_img(ctrl))
            texts.append(self.get_control_text(ctrl2))

            sim = [1, 0] if similar else [0, 1]
            similarities.append(sim)

        text_input = self.encoder.encode(texts)

        return {
            self.a3c_model.img: imgs,
            self.similar: similarities,

            self.char_ids: text_input['char_ids'],
            self.word_embeddings: text_input['word_embeddings'],
            self.sentence_lengths: text_input['sentence_lengths'],
            self.word_lengths: text_input['word_lengths']
        }


    def train(self, controls, epoch_start = 0, epoch_end = 5, neg_samples = 5, batch_size = 10):
        """Train on `controls` for epochs in [epoch_start, epoch_end).

        Controls without text are skipped. After every epoch the mean loss
        per training pair is printed.
        """
        session = self.session if self.session is not None else tf.get_default_session()
        to_train = [ctrl for ctrl in controls if self.is_for_training(ctrl)]

        for epoch in range(epoch_start, epoch_end):
            pairs = self.extract_pairs(to_train, neg_samples)

            print('epoch {} started'.format(epoch))
            total_loss = 0.
            total_pairs = 0
            for batch in split(pairs, batch_size):
                feed = self.batch_to_input(batch)
                _, loss = session.run([self.train_op, self.text_loss], feed_dict = feed)
                # text_loss is already a per-batch mean: weight it by the batch
                # size so the epoch figure is a true per-pair average.
                total_loss += loss * len(batch)
                total_pairs += len(batch)

            print('epoch ended')
            print('loss: {}'.format(total_loss / max(total_pairs, 1)))


In [4]:
# Build the sentence encoder from the GloVe text file; the path is relative to
# the notebook's working directory.
encoder = Encoder('glove.6B/glove.6B.200d.txt')

In [7]:
#encoder.encode(['My little ponny', 'Enter your first name, please'])

In [76]:
# Clear the default graph before the models below are (re)built, then open the
# session that both models share.
tf.reset_default_graph()
session = tf.Session()

In [77]:
# The A3C model must exist first: ControlTextModel reads a3c.net and a3c.img
# while building its own part of the graph.
a3c = A3CModel(len(Actions.actions), session = session, train_deep=True)
model = ControlTextModel(a3c, session, encoder, repr_dim = 200)

# Initialize all global variables only after both models have been constructed.
session.run(tf.global_variables_initializer())

In [62]:
# Load the controls dataset: one JSON object per line.
controls = []
# Read as UTF-8 explicitly instead of relying on the platform default encoding.
with open('controls_dataset.jsonl', encoding='utf-8') as f:
    for line in f:
        # Blank lines would make json.loads raise; skip them.
        if not line.strip():
            continue

        ctrl = json.loads(line)
        controls.append(ctrl)

In [None]:
# Train for epochs 0..4; each control gets up to 5 negative text pairings and
# pairs are fed in batches of 10.
model.train(controls, epoch_start = 0, epoch_end = 5, neg_samples = 5, batch_size = 10)    

epoch 0 started


`imread` is deprecated in SciPy 1.0.0, and will be removed in 1.2.0.
Use ``imageio.imread`` instead.
  from ipykernel import kernelapp as app
