In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from gensim.models import word2vec, FastText

from keras.models import Sequential, Model
from keras.layers import LSTM, Dense, Dropout, Embedding, Bidirectional
from keras.callbacks import ModelCheckpoint
from keras.layers.merge import add, concatenate
from keras import Input
from keras.utils import to_categorical
import tensorflow as tf
!pip install "nltk==3.4.5"
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import meteor_score



In [None]:
# Fix the global NumPy and TensorFlow random seeds to the same value so that
# repeated runs start from the same random state.
np.random.seed(105)
tf.random.set_seed(105)

## Import Data 

In [None]:
# Load the dataset from an absolute path; adjust it when running outside the
# environment that provides '/content/'.
# NOTE(review): later cells read the columns 'lemma_tokens', 'is_sarcastic',
# 'tokens.1' and 'entities', none of which appear in the saved `df.head()`
# preview below — confirm this CSV is the file that actually contains them.
df = pd.read_csv('/content/keyworded.csv')

In [None]:
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,heads,descs,keywords
0,0,0,inclement weather prevents liar from getting t...,PROVIDENCE RI—In spite of his best efforts to ...,"{'carlson': 0.25, 'weather': 0.196, 'spotty': ..."
1,1,1,mother comes pretty close to using word stream...,PATERSON NJ—Family sources told reporters Tues...,"{'burkhart': 0.356, 'she': 0.283, 'close': 0.2..."
2,2,2,richard bransons globalwarming donation nearly...,LONDON—Analysts are predicting that the 3 bill...,"{'branson': 0.439, 'virgin': 0.235, 'balloonba..."
3,3,3,shadow government getting too large to meet in...,COLUMBUS OH—With its membership swelling in re...,"{'marriotts': 0.177, 'meeting': 0.176, 'confer..."
4,4,4,ford develops new suv that runs purely on gaso...,DEARBORN MI—The Ford Motor Company announced W...,"{'gasoline': 0.354, 'petrola': 0.296, 'nair': ..."


In [None]:
df.shape

(10580, 3)

In [None]:
# use the lemma_tokens as model input
# Every 'lemma_tokens' cell is a string that is turned back into a Python object
# with eval().
# SECURITY(review): eval() executes whatever expression the CSV cell contains.
# If the file is not fully trusted, parse with ast.literal_eval instead
# (assuming the cells hold plain literals — confirm).
X_body = [eval(x) for x in df['lemma_tokens'].values]
# Per-row value of the 'is_sarcastic' column, as an array aligned with X_body.
X_is_sarcastic = df.is_sarcastic.values

In [None]:
# Per-row headline token sequences, rebuilt from their string form in the
# 'tokens.1' column.
# SECURITY(review): eval() runs arbitrary expressions found in the CSV — only
# safe for a trusted file; ast.literal_eval is the safer parser for literals.
headlines = [eval(x) for x in df['tokens.1'].values]

## Data Preparation

### Convert Entities Data

In [None]:
# Per-row entity annotations, rebuilt from their string form in the 'entities'
# column. The following cell reads index 1 of every item, so each item is
# expected to be indexable (presumably a (type, names) pair — confirm).
# SECURITY(review): eval() runs arbitrary expressions found in the CSV — only
# safe for a trusted file; ast.literal_eval is the safer parser for literals.
entities = [eval(x) for x in df['entities'].values]

In [None]:
# Drop the entity type: for every row keep only the de-duplicated values found
# at index 1 of each typed item.
# NOTE: this cell rebinds `entities`, so it is meant to run exactly once after
# the cell that parses the 'entities' column.
temp_entities = [
    list({name for typed_entities in row_entities for name in typed_entities[1]})
    for row_entities in entities
]

entities = temp_entities

In [None]:
# Lower-case every entity, rewriting each row's list in place so the list
# objects held by `entities` stay the same.
for row_entities in entities:
    row_entities[:] = [name.lower() for name in row_entities]

In [None]:
# Per-row list of 0/1 flags, one per body token: 1 when the token is one of the
# row's entities, 0 otherwise.
# Each row's entities are converted to a set once, so every membership test is a
# hash lookup instead of a scan of the whole entity list for every token.
X_is_entity = [
    [int(word in row_entity_set) for word in words]
    for words, row_entity_set in zip(X_body, map(set, entities))
]

### Convert Keywords Data

In [None]:
# Per-row keyword data, rebuilt from its string form in the 'keywords' column
# (the saved preview shows dict-like text mapping a word to a number).
# SECURITY(review): eval() runs arbitrary expressions found in the CSV — only
# safe for a trusted file; ast.literal_eval is the safer parser for literals.
keywords = [eval(x) for x in df['keywords'].values]

In [None]:
# Keep only the keys (the keyword strings) of every row; the mapped values are
# discarded. Rebinds `keywords`, so run it once after the parsing cell.
keywords = [[*row_keywords.keys()] for row_keywords in keywords]

In [None]:
# Per-row list of 0/1 flags, one per body token: 1 when the token is one of the
# row's keywords, 0 otherwise.
# Each row's keywords are converted to a set once, so every membership test is a
# hash lookup instead of a scan of the whole keyword list for every token.
X_is_keyword = [
    [int(word in row_keyword_set) for word in words]
    for words, row_keyword_set in zip(X_body, map(set, keywords))
]

### Word Embedding

In [None]:
def embed_words(embedding_model, sentence):
    """Look up the embedding vector of every token in ``sentence``.

    Args:
        embedding_model: either a trained model that exposes its vectors through
            a ``wv`` attribute (as gensim ``Word2Vec`` / ``FastText`` models do)
            or any object that can be indexed directly by token.
        sentence: iterable of tokens.

    Returns:
        list: one vector per token, in input order. A plain list is returned on
        purpose: callers overwrite individual items and concatenate the result
        with padding lists.

    Raises:
        Whatever the underlying lookup raises for an unknown token
        (``KeyError`` for a dict-like store).
    """
    # Indexing the model object itself (`model[word]`) is deprecated in gensim 3.x
    # and no longer supported in gensim 4; the vectors live on `model.wv`.
    # Objects without a `wv` attribute are indexed directly, as before.
    vectors = getattr(embedding_model, 'wv', embedding_model)
    return [vectors[token] for token in sentence]

#### Word2Vec

In [None]:
# Alias of the gensim Word2Vec class, used below only for its `load` method.
w2v = word2vec.Word2Vec

# Load the saved body and headline Word2Vec models (paths relative to the
# working directory).
# NOTE(review): the FastText cell further down assigns the same two names, so
# on a top-to-bottom run these models are replaced before they are ever used.
body_embedding = w2v.load('models/body_embedding.model')
head_embedding = w2v.load('models/head_embedding.model')

#### FastText

In [None]:
# Load the saved body and headline FastText models into the SAME names used by
# the Word2Vec cell above; whichever of the two cells ran last decides which
# embeddings the rest of the notebook uses.
body_embedding = FastText.load('models/body_embedding_fasttext.model')
head_embedding = FastText.load('models/head_embedding_fasttext.model')

### Padding

#### Pad Body

In [None]:
# Token count of the longest body; every body is padded up to this length.
BODY_LENGTH = max(map(len, X_body))

In [None]:
# Width of one body-token feature vector.
# data_generator() and predict() prepend three scalars (is_entity, is_keyword,
# is_sarcastic) to each word vector and then reshape to this width, so the body
# word vectors are expected to have 103 - 3 = 100 dimensions — confirm against
# the loaded embedding model.
EMBEDDING_DIM = 103

In [None]:
def pad_body(body):
    """Left-pad a list of body token vectors with zero vectors up to BODY_LENGTH.

    Args:
        body: list of per-token vectors (must be a list, it is concatenated to
            the padding list).

    Returns:
        list: the zero-vector padding followed by the items of ``body``. A body
        that already has BODY_LENGTH items or more gets no padding and is not
        truncated.
    """
    missing = BODY_LENGTH - len(body)
    zero_vector = np.zeros(EMBEDDING_DIM)
    # Repeating a list a non-positive number of times yields an empty list.
    padding = [zero_vector] * missing
    return padding + body

#### Create Function for Pad Head

In [None]:
# Token count of the longest headline; headline inputs are padded to this length.
HEAD_LENGTH = max(map(len, headlines))

In [None]:
# Width of one headline-token vector. Headline vectors are padded with zero
# vectors of this size and reshaped to this width without extra features, so it
# has to match the dimensionality of the headline embedding model — confirm.
HEAD_EMBEDDING_DIM = 100

In [None]:
def pad_head(headline):
    """Left-pad a list of headline token vectors with zero vectors up to HEAD_LENGTH.

    Args:
        headline: list of per-token vectors (must be a list, it is concatenated
            to the padding list).

    Returns:
        list: the zero-vector padding followed by the items of ``headline``. An
        input that already has HEAD_LENGTH items or more gets no padding and is
        not truncated.
    """
    missing = HEAD_LENGTH - len(headline)
    zero_vector = np.zeros(HEAD_EMBEDDING_DIM)
    # Repeating a list a non-positive number of times yields an empty list.
    padding = [zero_vector] * missing
    return padding + headline

### Create Dictionary for Head Word Index

In [None]:
# Vocabulary: every distinct headline token.
# Sorted so that the token <-> index mappings built from this list are identical
# on every run. The iteration order of a set of strings changes between
# interpreter sessions (hash randomisation), which would silently re-assign the
# class index of every word and make weights saved in one session meaningless
# when loaded in another. Assumes tokens are mutually comparable (all strings).
words_list = sorted({word for headline in headlines for word in headline})

In [None]:
# Position in `words_list` -> word; used to turn a predicted class index back
# into a token.
idx_to_word = dict(enumerate(words_list))

In [None]:
# Word -> position in `words_list`; used to build the one-hot training targets.
word_to_idx = dict(zip(words_list, range(len(words_list))))

### Count Head Vocab Size

In [None]:
# Number of distinct headline tokens; it is the width of the softmax output
# layer and of the one-hot training targets.
HEAD_VOCAB_SIZE = len(words_list)

## Create Model

### Data Generator

In [None]:
# data generator, intended to be used in a call to model.fit_generator()
def data_generator(X_body, X_is_entity, X_is_keyword, X_is_sarcastic, headlines, batch_size):
    """Endlessly yield training batches of (body, headline prefix) -> next headline word.

    A headline of n tokens contributes n - 1 samples: for every i in 1..n-1 the
    inputs are the row's body matrix and the embedded prefix ``headline[:i]``,
    and the target is the one-hot index of ``headline[i]``.

    Previously ``batch_size`` was accepted but ignored and every yield carried a
    single sample; samples are now accumulated until ``batch_size`` of them are
    available.

    Args:
        X_body: per-row token sequences of the article body.
        X_is_entity: per-row flag sequences aligned with the body tokens.
        X_is_keyword: per-row flag sequences aligned with the body tokens.
        X_is_sarcastic: per-row scalar, repeated on every token of the row.
        headlines: per-row headline token sequences.
        batch_size: number of samples per yielded batch (values below 1 are
            treated as 1).

    Yields:
        tuple: ``([bodies, heads], targets)`` with shapes
        ``(batch_size, BODY_LENGTH, EMBEDDING_DIM)``,
        ``(batch_size, HEAD_LENGTH, HEAD_EMBEDDING_DIM)`` and
        ``(batch_size, HEAD_VOCAB_SIZE)``.

    Notes:
        Relies on the module-level ``body_embedding``, ``head_embedding``,
        ``word_to_idx``, the padding helpers and the size constants. The
        generator never terminates; samples left over at the end of a pass over
        the data are carried into the next pass. One batch holds ``batch_size``
        full body matrices, so memory use grows linearly with ``batch_size``.
    """
    batch_size = max(1, int(batch_size))
    bodies, heads, targets = [], [], []

    while True:
        rows = zip(X_body, X_is_entity, X_is_keyword, X_is_sarcastic, headlines)
        for x_body, x_is_entity, x_is_keyword, x_is_sarcastic, headline in rows:
            # word embedding
            body_vectors = embed_words(body_embedding, x_body)
            head_vectors = embed_words(head_embedding, headline)

            # prepend the three per-token features to every body word vector
            flagged_vectors = [
                np.concatenate(([x_is_entity[pos]], [x_is_keyword[pos]],
                                [x_is_sarcastic], vector))
                for pos, vector in enumerate(body_vectors)
            ]
            # pad, then fix the shape of this row's body matrix
            body_matrix = np.array(pad_body(flagged_vectors)).reshape(BODY_LENGTH, EMBEDDING_DIM)

            for i in range(1, len(headline)):
                # input: embedded prefix, output: the word that follows it
                head_matrix = np.array(pad_head(head_vectors[:i])).reshape(HEAD_LENGTH, HEAD_EMBEDDING_DIM)
                target = to_categorical(word_to_idx[headline[i]], num_classes=HEAD_VOCAB_SIZE)

                bodies.append(body_matrix)
                heads.append(head_matrix)
                targets.append(target)

                if len(bodies) == batch_size:
                    yield [np.array(bodies), np.array(heads)], np.array(targets)
                    bodies, heads, targets = [], [], []

In [None]:
def generate_headlines(model_name, starting_words, input_length, rnn_size):
    """Restore a saved model and greedily extend every starting phrase.

    NOTE(review): `vocab_size`, `start`, `pad`, `numerize` and
    `translate_numerized` are not defined anywhere in the visible notebook,
    `Model` is called here with different keyword arguments than where the
    network is built, and no visible cell calls this function. It also relies on
    `tf.reset_default_graph` / `tf.Session`; confirm they exist in the installed
    TensorFlow before using it.

    Args:
        model_name: checkpoint file name, appended to the 'model/' directory.
        starting_words: iterable of seed phrases, one generated sentence each.
        input_length: fixed sequence length; generation of a sentence stops once
            it holds this many ids.
        rnn_size: forwarded to the model constructor.

    Returns:
        list | None: the generated sentences in the order of ``starting_words``,
        or ``None`` when the checkpoint could not be restored.
    """
    tf.reset_default_graph()

    with tf.Session() as sess:

        model_path = 'model/'
        model = Model(input_length=input_length, rnn_size=rnn_size, vocab_size=vocab_size)

        try:
            model.saver.restore(sess, model_path + model_name)
            print(f'Model {model_name} Restored')
        except Exception:
            # The message used to interpolate an undefined name `f`, so this
            # handler raised NameError instead of reporting the failure. The
            # bare `except:` is narrowed so Ctrl-C / SystemExit still propagate.
            print(f'Model {model_name} does not exist')
            return None

        generated_sentences = []

        for starting_word in starting_words:

            generated = [start] + numerize(starting_word)

            while len(generated) < input_length:

                # Pad current generated sentence to match the input_length
                padded = generated[:input_length] + [pad] * (input_length - len(generated))
                padded = np.array([padded])

                feed = {model.input_num: padded}

                logits = sess.run(model.logits, feed_dict=feed)

                # Scores for the position of the last generated id; the first 5
                # ids are excluded from the choice (presumably reserved/special
                # tokens — confirm), hence the +5 when mapping back to an id.
                last_logits = logits[0][len(generated) - 1][5:]

                generated.append(np.argmax(last_logits) + 5)

            generated_sentence = translate_numerized(generated)

            generated_sentences.append(generated_sentence)

    return generated_sentences

### Model Architecture

In [None]:
# Body branch: a sequence of BODY_LENGTH feature vectors is encoded by an LSTM
# into a 64-value vector.
input_body = Input(shape=(BODY_LENGTH, EMBEDDING_DIM), name='input_body')
# Dropout is created with rate 0, so it currently drops nothing.
do_body = Dropout(0)(input_body)
lstm_body = LSTM(64)(do_body)

# Headline branch: the padded headline prefix is encoded the same way.
input_head = Input(shape=(HEAD_LENGTH, HEAD_EMBEDDING_DIM), name='input_head')
# Dropout is created with rate 0, so it currently drops nothing.
do_head = Dropout(0)(input_head)
lstm_head = LSTM(64)(do_head)

# Merge both encodings by element-wise addition (this is why the two LSTMs use
# the same size), then map them to a probability for every headline word.
decoder1 = add([lstm_body, lstm_head])
decoder2 = Dense(1024, activation='relu', name='dense_decoder')(decoder1)
output = Dense(HEAD_VOCAB_SIZE, activation='softmax', name='output')(decoder2)

# Two-input, one-output model: (body, headline prefix) -> next-word distribution.
model = Model(inputs=[input_body, input_head], outputs=output)

In [None]:
model.summary()

Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_body (InputLayer)         (None, 6488, 103)    0                                            
__________________________________________________________________________________________________
input_head (InputLayer)         (None, 31, 100)      0                                            
__________________________________________________________________________________________________
dropout_7 (Dropout)             (None, 6488, 103)    0           input_body[0][0]                 
__________________________________________________________________________________________________
dropout_8 (Dropout)             (None, 31, 100)      0           input_head[0][0]                 
____________________________________________________________________________________________

In [None]:
# Categorical cross-entropy matches the one-hot next-word targets and the
# softmax output. No metrics are requested, so training reports the loss only.
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [None]:
# load model
# model.load_weights('./models/model_test19.h5')

### Training

In [None]:
# Training configuration.
epochs = 15
batch_size = 32

steps = len(X_body) // batch_size

# Build the generator ONCE. data_generator() loops over the data forever, so a
# single instance keeps advancing through the rows from one epoch to the next.
# Re-creating it inside the loop restarted it at the first row every time, so
# every epoch trained on exactly the same first `steps` yields.
generator = data_generator(X_body, X_is_entity, X_is_keyword, X_is_sarcastic, headlines, batch_size)

# NOTE(review): the loop starts at 6, which looks like resuming an earlier run,
# but the weight-loading cell above is commented out — confirm the intended
# starting point. The file name says 'w2v' whichever embedding cell ran last.
for i in range(6, epochs):
    model.fit_generator(generator, epochs=1, steps_per_epoch=steps, verbose=1)
    # One checkpoint per epoch, tagged with the epoch index.
    model.save('./models/model_w2v_e' + str(i) + '.h5')

Epoch 1/1


  after removing the cwd from sys.path.


Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1


### Predicting

In [None]:
def predict(x_body, x_is_entity, x_is_keyword, x_is_sarcastic):
    """Greedily decode a headline for one article body with the global ``model``.

    The body is embedded, extended with the three per-token features, padded and
    reshaped exactly as in training. Decoding starts from '<startseq>' and, for
    at most HEAD_LENGTH steps, appends the most probable next word until
    '<endseq>' is produced.

    Args:
        x_body: token sequence of the article body.
        x_is_entity: flags aligned with ``x_body``.
        x_is_keyword: flags aligned with ``x_body``.
        x_is_sarcastic: scalar repeated on every body token.

    Returns:
        list: the generated headline tokens with the start/end markers removed.
    """
    # word embedding plus the three extra features in front of every vector
    body_vectors = embed_words(body_embedding, x_body)
    flagged_vectors = [
        np.concatenate(([x_is_entity[pos]], [x_is_keyword[pos]],
                        [x_is_sarcastic], vector))
        for pos, vector in enumerate(body_vectors)
    ]

    # pad and add the batch dimension expected by the model
    body_input = np.array(pad_body(flagged_vectors)).reshape(1, BODY_LENGTH, EMBEDDING_DIM)

    in_text = '<startseq>'
    for _ in range(HEAD_LENGTH):
        # re-embed everything generated so far and pad it to the fixed length
        head_vectors = pad_head(embed_words(head_embedding, in_text.split()))
        head_input = np.array(head_vectors).reshape(1, HEAD_LENGTH, HEAD_EMBEDDING_DIM)

        probabilities = model.predict([body_input, head_input], verbose=0)

        # greedy choice: the single most probable next word
        word = idx_to_word[np.argmax(probabilities)]
        in_text += ' ' + word
        if word == '<endseq>':
            break

    cleaned = in_text.replace('<startseq>', '').replace('<endseq>', '')
    return cleaned.strip().split()

In [None]:
# Spot-check a single row. `i` stays defined at module level and is reused by
# the next cells that display the reference headline and its BLEU score.
i = 5
y_pred = predict(X_body[i], X_is_entity[i], X_is_keyword[i], X_is_sarcastic[i])
y_pred

  after removing the cwd from sys.path.


['its',
 'you',
 'you',
 'you',
 'me',
 'are',
 'your',
 'back',
 'are',
 'back',
 'back',
 'back',
 'back']

In [None]:
headlines[i][1:-1]

['its', 'not', 'you', 'its', 'me', 'are', 'your', 'holding', 'you', 'back']

In [None]:
# Seed phrases matching the `starting_words` parameter of generate_headlines().
# NOTE(review): no visible cell passes this list to anything — confirm whether
# it is still needed.
starting_words = ['court', 'samsung', 'apple', 'google', 'google and apple', 'google and samsung', 'samsung and apple']

In [None]:
# BLEU of the spot-checked prediction against its reference headline.
# NOTE(review): bleu_score() is only defined in a later cell (section
# "BLEU Score"), so on a fresh top-to-bottom run this cell raises NameError.
# The saved output below was produced in an earlier session; re-run to refresh.
bleu_score(headlines[i][1:-1], y_pred)

[1.0, 1.0, 1.0, 1.0]

### Testing

##### BLEU Score

In [None]:
def bleu_score(reference, generated):
    """Compute cumulative BLEU-1 to BLEU-4 for one hypothesis against one reference.

    Args:
        reference: token list of the reference headline.
        generated: token list of the generated headline.

    Returns:
        list: ``[BLEU-1, BLEU-2, BLEU-3, BLEU-4]``, each computed with uniform
        weights over the n-gram orders 1..n and zero weight on higher orders.
    """
    scores = []
    for order in range(1, 5):
        # Uniform weights that sum to exactly 1. The hand-written BLEU-3 weights
        # (0.33, 0.33, 0.33) summed to 0.99, which slightly inflated that score.
        weights = tuple(1.0 / order if n < order else 0.0 for n in range(4))
        scores.append(sentence_bleu([reference], generated, weights=weights))

    return scores

In [None]:
# BLEU-1..4 for the first 100 rows; the reference is the headline without its
# first and last token (the slice [1:-1]).
# NOTE(review): these rows are part of the data handed to the training
# generator, so the scores describe fit on training examples, not on held-out
# data (train_test_split is imported but not used in the visible cells).
bleu_scores = []
for i in range(100):
    y_pred = predict(X_body[i], X_is_entity[i], X_is_keyword[i], X_is_sarcastic[i])
    bleu_scores.append(bleu_score(headlines[i][1:-1], y_pred))

  after removing the cwd from sys.path.
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [None]:
# Column-wise mean over the evaluated rows: one average per BLEU order.
bleu_scores_avg = np.array(bleu_scores).mean(axis=0)

In [None]:
print('BLEU-1: %.3f' % bleu_scores_avg[0])
print('BLEU-2: %.3f' % bleu_scores_avg[1])
print('BLEU-3: %.3f' % bleu_scores_avg[2])
print('BLEU-4: %.3f' % bleu_scores_avg[3])

BLEU-1: 0.259
BLEU-2: 0.225
BLEU-3: 0.194
BLEU-4: 0.144


##### METEOR Score

In [None]:
# METEOR for the same first 100 rows. Reference and hypothesis are passed as
# space-joined strings, which is the input form of the pinned nltk 3.4.5.
# NOTE(review): predict() is called again for every row although the BLEU loop
# already computed the same predictions; the rows are also training rows.
meteor_scores = []
for i in range(100):
    y_pred = predict(X_body[i], X_is_entity[i], X_is_keyword[i], X_is_sarcastic[i])
    meteor_scores.append(meteor_score([' '.join(headlines[i][1:-1])], ' '.join(y_pred)))

  after removing the cwd from sys.path.


In [None]:
# Mean METEOR score over the evaluated rows.
meteor_scores_avg = np.array(meteor_scores).mean()

In [None]:
print('METEOR: %.3f' % meteor_scores_avg)

METEOR: 0.270
