implementing simple model from http://arxiv.org/pdf/1512.01712v1.pdf
inspiration from https://github.com/llSourcell/How_to_make_a_text_summarizer/blob/master/train.ipynb

In [1]:
import pickle as pickle

In [2]:
maxlen = 50
rnn_size = 512
rnn_layers = 3
batch_norm = False

In [3]:
activation_rnn_size = 40

In [4]:
# training params
seed = 420
p_W, p_U, p_dense, p_emb, weight_decay = 0, 0, 0, 0, 0
optimizer = 'adam'
LR = 1e-4
batch_size = 64
nflips = 10

In [5]:
nb_train_samples = 3000
nb_val_samples = 1396

In [6]:
# read word embeddings

with open('data/vocab_embed.pkl', 'rb') as f:
    embedding, idx2word, word2idx, glove_idx2idx = pickle.load(f)

with open('data/vocab_embed.data.pkl', 'rb') as f:
    X, Y = pickle.load(f)

vocab_size, embedding_size = embedding.shape

In [7]:
nb_unknown_words = 10

In [8]:
# number of unknown words
len(idx2word)-vocab_size-len(glove_idx2idx)

64913

In [9]:
for i in range(nb_unknown_words):
    idx2word[vocab_size-1-i] = "<%d>" % i

In [10]:
oov0 = vocab_size - nb_unknown_words

for i in range(oov0, len(idx2word)):
    idx2word[i] = idx2word[i] + "^"

In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=nb_val_samples, random_state=seed)
len(X_train), len(Y_train), len(X_test), len(Y_test)

(3000, 3000, 1396, 1396)

In [12]:
del X
del Y

In [13]:
empty = 0
eos = 1
# idx2word[empty] = '-'
idx2word[eos] = '~'

In [14]:
import numpy as np
from keras.preprocessing import sequence
from keras.utils import np_utils
import random, sys

In [15]:
def pprint(label, x):
    print(label + ":", ' '.join([idx2word[w] for w in x]))

In [16]:
pprint('Summary', Y_train[220])
pprint('Article', X_train[220])

Summary: The Kannada Development Authority has directed the Bengaluru metro officials remove all Hindi signs without waiting for the state government's instructions.^ The authority also said that even the announcements can't made Hindi. This follows protests from pro-Kannada activists against the Centre's tri-language^ policy, calling forceful imposition Hindi non-Hindi speaking states.
Article: Namma^ Metro officials have finally relented,^ and said that Hindi signboards will removed from all stations. The move comes after the Kannada Development Authority mandated the immediate removal.The^ KDA^ took one step forward and said that even announcements cannot made Hindi.The^ managing director Bangalore Metropolitan Rail Corporation (BMRCL)^ said that would abide the directions the KDA.The^ KDA^ had also urged the state government move privilege motion against BMRCL.^ Non compliance would amount beach privilege.There^ have been massive protests across the state saying that Hindi was just

In [17]:
# model
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout, RepeatVector
from keras.layers.wrappers import TimeDistributed
from keras.layers.recurrent import LSTM
from keras.layers.embeddings import Embedding
from keras.regularizers import l2

In [18]:
random.seed(seed)
np.random.seed(seed)

In [19]:
regularizer = l2(weight_decay)

In [20]:
model = Sequential()
model.add(Embedding(vocab_size, embedding_size,
                    input_length=maxlen, weights=[embedding], mask_zero=True,
                    name='embedding_1'))
for i in range(rnn_layers):
    lstm = LSTM(rnn_size, return_sequences=True, # batch_norm=batch_norm,
#                 embeddings_regularizer=regularizer, U_regularizer=regularizer,
#                 b_regularizer=regularizer, dropout_W=p_W, dropout_U=p_U,
                name='lstm_%d'%(i+1)
                  )
    model.add(lstm)
    model.add(Dropout(p_dense,name='dropout_%d'%(i+1)))

2022-03-06 21:21:58.690671: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [21]:
from keras.layers.core import Lambda
import keras.backend as K

maxlend=25
maxlenh=25

def simple_context(X, mask, n=activation_rnn_size, maxlend=25, maxlenh=25):
    desc, head = X[:,:maxlend,:], X[:,25:,:]
    head_activations, head_words = head[:,:,:n], head[:,:,n:]
    desc_activations, desc_words = desc[:,:,:n], desc[:,:,n:]
    
    # RTFM http://deeplearning.net/software/theano/library/tensor/basic.html#theano.tensor.batched_tensordot
    # activation for every head word and every desc word
    activation_energies = K.batch_dot(head_activations, desc_activations, axes=(2,2))
    # make sure we dont use description words that are masked out
    activation_energies = activation_energies + -1e20*K.expand_dims(1.-K.cast(mask[:, :maxlend],'float32'),1)
    
    # for every head word compute weights for every desc word
    activation_energies = K.reshape(activation_energies,(-1,maxlend))
    activation_weights = K.softmax(activation_energies)
    activation_weights = K.reshape(activation_weights,(-1,maxlenh,maxlend))

    # for every head word compute weighted average of desc words
    desc_avg_word = K.batch_dot(activation_weights, desc_words, axes=(2,1))
    return K.concatenate((desc_avg_word, head_words))

class SimpleContext(Lambda):
    def __init__(self,**kwargs):
        super(SimpleContext, self).__init__(simple_context,**kwargs)
        self.supports_masking = True

    def compute_mask(self, input, input_mask=None):
        return input_mask[:, 25:]
    
    def get_output_shape_for(self, input_shape):
        nb_samples = input_shape[0]
        n = 2*(rnn_size - activation_rnn_size)
        return (nb_samples, maxlenh, n)

In [22]:
if activation_rnn_size:
    model.add(SimpleContext(name='simplecontext_1'))
model.add(TimeDistributed(Dense(vocab_size,
                                name = 'timedistributed_1')))
model.add(Activation('softmax', name='activation_1'))

In [23]:
from tensorflow.keras.optimizers import RMSprop # usually I prefer Adam but article used rmsprop
# opt = Adam(lr=LR)  # keep calm and reduce learning rate
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

In [24]:
K.set_value(model.optimizer.lr,np.float32(LR))

In [25]:
def str_shape(x):
    return 'x'.join(map(str,x.shape))
    
def inspect_model(model):
    for i,l in enumerate(model.layers):
        print(i, 'cls=%s name=%s'%(type(l).__name__, l.name))
        weights = l.get_weights()
        for weight in weights:
            print(str_shape(weight))
        print()

In [26]:
inspect_model(model)

0 cls=Embedding name=embedding_1
40000x100

1 cls=LSTM name=lstm_1
100x2048
512x2048
2048

2 cls=Dropout name=dropout_1

3 cls=LSTM name=lstm_2
512x2048
512x2048
2048

4 cls=Dropout name=dropout_2

5 cls=LSTM name=lstm_3
512x2048
512x2048
2048

6 cls=Dropout name=dropout_3

7 cls=SimpleContext name=simplecontext_1

8 cls=TimeDistributed name=time_distributed
944x40000
40000

9 cls=Activation name=activation_1



In [27]:
def lpadd(x, maxlend=maxlend, eos=eos):
    """left (pre) pad a description to maxlend and then add eos.
    The eos is the input to predicting the first word in the headline
    """
    assert maxlend >= 0
    if maxlend == 0:
        return [eos]
    n = len(x)
    if n > maxlend:
        x = x[-maxlend:]
        n = maxlend
    return [empty]*(maxlend-n) + x + [eos]

In [28]:
samples = [lpadd([3]*26)]
# pad from right (post) so the first maxlend will be description followed by headline
data = sequence.pad_sequences(samples, maxlen=maxlen, value=empty, padding='post', truncating='post')

In [29]:
np.all(data[:,maxlend] == eos)

True

In [30]:
data.shape,list(map(len, samples))

((1, 50), [26])

In [31]:
probs = model.predict(data, verbose=0, batch_size=1)
probs.shape

(1, 25, 40000)

In [78]:
def flip_summary(x, nflips=None, model=None, debug=False):
    """given a vectorized input (after `pad_sequences`) flip some of the words in the second half (headline)
    with words predicted by the model
    """
    if nflips is None or model is None or nflips <= 0:
        return x
    
    batch_size = len(x)
#     assert np.all(x[:,maxlend] == eos)
    probs = model.predict(x, verbose=0, batch_size=batch_size)
    x_out = x.copy()
    for b in range(batch_size):
        # pick locations we want to flip
        # 0...maxlend-1 are descriptions and should be fixed
        # maxlend is eos and should be fixed
        flips = sorted(random.sample(range(maxlend+1,maxlen), nflips))
        if debug and b < debug:
            print(b)
        for input_idx in flips:
            if x[b,input_idx] == empty or x[b,input_idx] == eos:
                continue
            # convert from input location to label location
            # the output at maxlend (when input is eos) is feed as input at maxlend+1
            label_idx = input_idx - (maxlend+1)
            prob = probs[b, label_idx]
            w = prob.argmax()
            if w == empty:  # replace accidental empty with oov
                w = oov0
            if debug and b < debug:
                print('%s => %s'%(idx2word[x_out[b,input_idx]],idx2word[w]))
            x_out[b,input_idx] = w
        if debug and b < debug:
            print()
    return x_out

In [79]:
def vocab_fold(xs):
    """convert list of word indexes that may contain words outside vocab_size to words inside.
    If a word is outside, try first to use glove_idx2idx to find a similar word inside.
    If none exist then replace all accurancies of the same unknown word with <0>, <1>, ...
    """
#     print(xs)
    xs_ = list()
    for x in xs:
        if not isinstance(x, int):
            continue
        if x < oov0:
            xs_.append(x)
        else:
            xs_.append(glove_idx2idx.get(x,x))
    xs = xs_
#     xs = [x if  x < oov0 else glove_idx2idx.get(x,x) for x in xs]
    # the more popular word is <0> and so on
    outside = sorted([x for x in xs if x >= oov0])
    # if there are more than nb_unknown_words oov words then put them all in nb_unknown_words-1
    outside = dict((x,vocab_size-1-min(i, nb_unknown_words-1)) for i, x in enumerate(outside))
    xs = [outside.get(x,x) for x in xs]
    return xs

def conv_seq_labels(xds, xhs, nflips=None, model=None, debug=False):
    """description and hedlines are converted to padded input vectors. headlines are one-hot to label"""
    batch_size = len(xhs)
    assert len(xds) == batch_size
    x = [vocab_fold(lpadd(xd)+xh) for xd,xh in zip(xds,xhs)]  # the input does not have 2nd eos
    x = sequence.pad_sequences(x, maxlen=maxlen, value=empty, padding='post', truncating='post')
    x = flip_summary(x, nflips=nflips, model=model, debug=debug)
    
    y = np.zeros((batch_size, maxlenh, vocab_size))
    for i, xh in enumerate(xhs):
        xh = vocab_fold(xh) + [eos] + [empty]*maxlenh  # output does have a eos at end
        xh = xh[:maxlenh]
        y[i,:,:] = np_utils.to_categorical(xh, vocab_size)
        
    return x, y

In [80]:
def gen(Xd, Xh, batch_size=batch_size, nb_batches=None, nflips=None, model=None, debug=False, seed=seed):
    """yield batches. for training use nb_batches=None
    for validation generate deterministic results repeating every nb_batches
    
    while training it is good idea to flip once in a while the values of the headlines from the
    value taken from Xh to value generated by the model.
    """
    c = nb_batches if nb_batches else 0
    while True:
        xds = []
        xhs = []
        if nb_batches and c >= nb_batches:
            c = 0
        new_seed = random.randint(0, sys.maxsize)
        random.seed(c+123456789+seed)
        for b in range(batch_size):
            t = random.randint(0,len(Xd)-1)

            xd = Xd[t]
            s = random.randint(min(maxlend,len(xd)), max(maxlend,len(xd)))
            xds.append(xd[:s])
            
            xh = Xh[t]
            s = random.randint(min(maxlenh,len(xh)), max(maxlenh,len(xh)))
            xhs.append(xh[:s])

        # undo the seeding before we yield inorder not to affect the caller
        c+= 1
        random.seed(new_seed)

        yield conv_seq_labels(xds, xhs, nflips=nflips, model=model, debug=debug)

In [81]:
r = next(gen(X_train, Y_train, batch_size=batch_size))
r[0].shape, r[1].shape, len(r)

((64, 50), (64, 25, 40000), 2)

In [82]:
def test_gen(gen, n=5):
    Xtr,Ytr = next(gen)
    for i in range(n):
        assert Xtr[i,maxlend] == eos
        x = Xtr[i,:maxlend]
        y = Xtr[i,maxlend:]
        yy = Ytr[i,:]
        yy = np.where(yy)[1]
        pprint('L',yy)
        pprint('H',y)
        if maxlend:
            pprint('D',x)

In [83]:
test_gen(gen(X_train, Y_train, batch_size=batch_size))

L: Former Indian cricketer Virender Sehwag, who was accused bullying Delhi University student Gurmehar Kaur, has clarified that his recent tweet was not intended for her.
H: ~ Former Indian cricketer Virender Sehwag, who was accused bullying Delhi University student Gurmehar Kaur, has clarified that his recent tweet was not intended for
D: fun but people construed the other way," told India Today. <0>^ student the Lady Sri Ram College, stirred storm when her Facebook post February went
L: The has ended four-month ban flyers carrying laptops <0>^ flights from airports North Africa and the Middle East. Last month, officials announced new security requirements
H: ~ The has ended four-month ban flyers carrying laptops <0>^ flights from airports North Africa and the Middle East. Last month, officials announced new security
D: Turkish Airlines, Saudi Arabian Airlines, Royal Jordanian Kuwait Airways, 747 and Royal Air <1>^ which are the only carriers fly direct the from the region.
L: Accordi

In [84]:
history = {}

In [85]:
traingen = gen(X_train, Y_train, batch_size=batch_size, nflips=nflips, model=model)
valgen = gen(X_test, Y_test, nb_batches=nb_val_samples//batch_size, batch_size=batch_size)

In [86]:
r = next(traingen)
r[0].shape, r[1].shape, len(r)

((64, 50), (64, 25, 40000), 2)

In [87]:
for iteration in range(500):
    print('Iteration', iteration)
    h = model.fit_generator(traingen, steps_per_epoch=nb_train_samples,
                        epochs=1, validation_data=valgen)
    for k,v in h.history.iteritems():
        history[k] = history.get(k,[]) + v
    with open('data/train.history.pkl', 'wb') as fp:
        pickle.dump(history,fp,-1)
    model.save_weights('data/train.hdf5', overwrite=True)
    gensamples(batch_size=batch_size)

Iteration 0


  h = model.fit_generator(traingen, steps_per_epoch=nb_train_samples,


  12/3000 [..............................] - ETA: 3:36:23 - loss: 10.5844

KeyboardInterrupt: 