In [2]:
import time
import numpy as np
import pandas as pd
import os
import pickle as pkl
from matplotlib import pyplot as plt
from collections import defaultdict
import nltk
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import sent_tokenize
from nltk.tokenize import word_tokenize

from collections import Counter

%matplotlib inline

from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.optimizers import SGD

from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout, LSTM, GRU, Bidirectional, TimeDistributed
from keras.models import Model, load_model

from keras import backend as K
from keras.engine.topology import Layer, InputSpec
from keras import initializers, regularizers, optimizers
from keras.callbacks import History, CSVLogger

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, precision_recall_curve, average_precision_score

Using TensorFlow backend.


NameError: name 'core' is not defined

## Read in train and test txt files

In [None]:
# Read each file inside a context manager so its handle is closed as soon as
# the contents are in memory. Lines are split on '\n' only, so the two lists
# stay aligned line-for-line.
with open("train_genres.txt", encoding="utf-8") as genres_file:
    train_genres = genres_file.read().split('\n')
with open("train_plots.txt", encoding="utf-8") as plots_file:
    train_plots = plots_file.read().split('\n')

In [None]:
train_data = pd.DataFrame({"genres": train_genres, "plots": train_plots})

In [None]:
len(train_data)

In [None]:
train_data.loc[4]["genres"].split(" ")

In [None]:
# Read each file inside a context manager so its handle is closed as soon as
# the contents are in memory. Lines are split on '\n' only, so the two lists
# stay aligned line-for-line.
with open("test_genres.txt", encoding="utf-8") as genres_file:
    test_genres = genres_file.read().split('\n')
with open("test_plots.txt", encoding="utf-8") as plots_file:
    test_plots = plots_file.read().split('\n')

In [None]:
test_data = pd.DataFrame({"genres": test_genres, "plots": test_plots})

In [None]:
len(test_data)

In [None]:
#Concat dataframes
full_data = pd.concat([train_data, test_data], ignore_index= True)

# BRIEF EDA

In [None]:
#Convert genres to list for easier analysis
def list_genres(row):
    """Split the space-separated ``genres`` field of ``row`` into a list of genre names.

    Empty fragments (produced by a trailing space or by doubled spaces) are
    discarded, so the result is the same whether or not the field ends with a
    space, and no genre is ever dropped just because it is last.

    Args:
        row: mapping-like object (e.g. a DataFrame row) with a string under "genres".

    Returns:
        list of non-empty genre strings, in their original order.
    """
    return [genre for genre in row["genres"].split(" ") if genre]

In [None]:
full_data["list_genres"] = full_data.apply(lambda row: list_genres(row), axis = 1)

In [None]:
count_dict = defaultdict(int)

In [None]:
#Count num of movies per genre
def dict_count(row):
    """Add one to the module-level ``count_dict`` tally for every genre in ``row["list_genres"]``.

    The tally is updated in place; nothing is returned.
    """
    tally = count_dict
    for genre_name in row["list_genres"]:
        tally[genre_name] = tally[genre_name] + 1

In [None]:
#RUN ONLY ONCE!!!
count_val_series = full_data.apply(lambda row: dict_count(row), axis = 1)

In [None]:
#Count num of movies per genre
for key,val in count_dict.items():
    print("{:0.2f}% of the movies are {}".format(100*val/len(full_data), key))



fig, ax = plt.subplots(1,1) 
ax.bar(range(len(count_dict.keys())), list(count_dict.values()))
ax.set_xticks(range(len(count_dict.keys())))
ax.set_xticklabels(list(count_dict.keys()), rotation='vertical', fontsize=18)



In [None]:
#Number of genres per movie
genre_labels = full_data["list_genres"].str.len()
print("Avg num of genres:", np.mean(genre_labels))
print("Median num of genres:", np.median(genre_labels))
print("Max number of genres:", np.max(genre_labels))
print("Min number of genres:", np.min(genre_labels))


plt.hist(genre_labels)

# CONSOLIDATE Genre Labels

In [None]:
import operator
sorted(count_dict.items(), key=operator.itemgetter(1))[::-1]

In [None]:
remove = ["Animation", "Fantasy", "Mystery", 
          "Biography", "Music", "History", "War", "Sport", "Western", "Musical"]

def remove_genres(genres):
    """Return the entries of ``genres`` that are not in the module-level ``remove`` list, order preserved."""
    kept = []
    for genre in genres:
        if genre in remove:
            continue
        kept.append(genre)
    return kept

full_data["list_genres_consol"] = full_data["list_genres"].apply(lambda row: remove_genres(row))

In [None]:
# Keep only movies that still have at least one genre after consolidation.
# The check looks at the list length directly instead of stringifying the whole
# frame, and the index is renumbered 0..n-1 so that label-based and positional
# access agree wherever rows are later addressed by number.
full_data = full_data[full_data["list_genres_consol"].str.len() > 0].reset_index(drop=True)

In [None]:
# Lower-case genre words; the tokenizers replace these with a generic "genre" token.
keep = ["comedy", "drama", "adventure", "action", "thriller", "family", "romance", "horror", "crime", "sci-fi", "scifi"]

# Forget the tallies of the consolidated-away genres (absent keys are ignored).
for dropped_genre in remove:
    count_dict.pop(dropped_genre, None)

# Share of the remaining movies carrying each remaining genre.
for genre, n_movies in count_dict.items():
    print("{:0.2f}% of the movies are {}".format(100*n_movies/len(full_data), genre))



fig, ax = plt.subplots(1,1) 
ax.bar(range(len(count_dict.keys())), list(count_dict.values()))
ax.set_xticks(range(len(count_dict.keys())))
ax.set_xticklabels(list(count_dict.keys()), rotation='vertical', fontsize=18)


In [None]:
#average length of plot summary (characters)
avg_num_chars = train_data["plots"].str.len().mean()
max_num_chars = train_data["plots"].str.len().max()
print(f'The average number of characters in a summary is {avg_num_chars}, max is {max_num_chars}')

In [None]:
# Count spaces and sentence terminators once per plot, then summarise.
# Words are approximated as (number of spaces + 1); sentences as the number of
# '.', '?' or '!' characters. The pattern is a raw string so it contains no
# invalid escape sequences.
spaces_per_plot = train_data["plots"].str.count(" ")
terminators_per_plot = train_data["plots"].str.count(r"[.?!]")

avg_num_words = spaces_per_plot.mean() + 1
max_num_words = spaces_per_plot.max() + 1
median_num_words = spaces_per_plot.median() + 1
avg_num_periods = terminators_per_plot.mean()
max_num_periods = terminators_per_plot.max()
median_num_periods = terminators_per_plot.median()
print(f'The average number of words in a summary is {avg_num_words}, max is {max_num_words}, median is {median_num_words}')
print(f'The average number of sentences in a summary is {avg_num_periods}, max is {max_num_periods}, median is {median_num_periods}')
# Rough words-per-sentence estimates (mean-based, then median-based).
print(avg_num_words/avg_num_periods)
print(median_num_words/median_num_periods)


In [None]:
train_data["plots"].str.count("\n").max()

In [None]:
#Alternative way to calculate plot length
print(train_data["plots"].str.lower().str.split().str.len().mean()) #Avg words
print(train_data["plots"].str.lower().str.split().str.len().max()) #Max words

In [None]:
dict_words = Counter(" ".join(train_data["plots"]).split(" "))
print(len(dict_words.keys())) #should be number of unique words
print(sum(dict_words.values())) #should be total number of words

# Preprocessing

In [None]:
#Lemmatizing function
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
def nltk2wn_tag(nltk_tag):
    """Translate an NLTK part-of-speech tag into the matching WordNet POS constant.

    The decision is made on the tag's leading letter: J -> adjective,
    V -> verb, N -> noun, R -> adverb. Anything else falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wn_pos in prefix_to_pos:
        if nltk_tag.startswith(prefix):
            return wn_pos
    return wordnet.NOUN

In [None]:
#Tokenize w/ lemmatization AFTER removing stopwords
#https://machinelearningmastery.com/clean-text-machine-learning-python/
def tokenize(plot, stop_words, lemmatize = False):
    """Clean a plot summary and split it into sentences of word tokens.

    Steps, in order: lower-case; replace numbers with the tag "DG"; collapse
    runs of sentence punctuation; split hyphenated words; drop underscores;
    strip periods from single-letter abbreviations; remove all punctuation
    except '.', '?', '!' and apostrophes; sentence- and word-tokenize with
    NLTK; drop stop words; replace any token found in the module-level
    ``keep`` list with "genre"; optionally lemmatize using POS tags.

    Args:
        plot: raw plot text.
        stop_words: iterable of tokens to discard.
        lemmatize: when True, POS-tag each sentence and lemmatize every token
            with the module-level ``lemmatizer``.

    Returns:
        list of sentences, each a list of token strings.
    """
    # Set membership is O(1); the original scanned the stop-word list per token.
    stop_set = frozenset(stop_words)

    plot = plot.lower() #lowercase
    plot = re.sub(r"[-+]?[.\d]*[\d]+[:,/.\d]*", "DG", plot) #generic tag for numbers
    plot = re.sub(r"([!?.]){2,}", r"\1", plot) #Convert multiple punctuations to the last punctuation mark
    plot = plot.replace('-',' ') #separating hyphenated words
    plot = plot.replace('_','') #remove underscores
    plot = re.sub(r'(?<!\w)([a-zA-Z])\.', r'\1', plot) #remove periods from abbreviations
    # Raw string: same character class as before, without invalid escape sequences.
    plot = re.sub(r"[^\w\s.?!']", '', plot) #remove punctuation besides sentence completers and apostrophes

    sentences = nltk.sent_tokenize(plot)
    words = [nltk.word_tokenize(sentence) for sentence in sentences]
    words = [[x for x in w if x not in stop_set] for w in words]
    words = [["genre" if x in keep else x for x in w] for w in words]
    if not lemmatize:
        return words

    tagged = [nltk.pos_tag(w) for w in words]
    return [[lemmatizer.lemmatize(token, pos = nltk2wn_tag(tag)) for token, tag in w] for w in tagged]

In [None]:
def tokenize_USE(plot):
    """Clean a plot summary into a single normalized string.

    Steps, in order: lower-case; replace numbers with the tag "DG"; collapse
    runs of sentence punctuation; split hyphenated words; drop underscores;
    strip periods from single-letter abbreviations; remove all punctuation
    except '.', '?', '!' and apostrophes; replace any whitespace-separated
    token found in the module-level ``keep`` list with "genre".

    Args:
        plot: raw plot text.

    Returns:
        the cleaned text, tokens joined by single spaces.
    """
    plot = plot.lower() #lowercase
    plot = re.sub(r"[-+]?[.\d]*[\d]+[:,/.\d]*", "DG", plot) #generic tag for numbers
    plot = re.sub(r"([!?.]){2,}", r"\1", plot) #Convert multiple punctuations to the last punctuation mark
    plot = plot.replace('-',' ') #separating hyphenated words
    plot = plot.replace('_','') #remove underscores
    plot = re.sub(r'(?<!\w)([a-zA-Z])\.', r'\1', plot) #remove periods from abbreviations
    # Raw string: same character class as before, without invalid escape sequences.
    plot = re.sub(r"[^\w\s.?!']", '', plot) #remove punctuation besides sentence completers and apostrophes
    tokens = ["genre" if token in keep else token for token in plot.split()]
    return " ".join(tokens)

def tokenize_USE_sentence(plot):
    """Split ``plot`` into a list of sentence strings with NLTK's sentence tokenizer."""
    sentences = sent_tokenize(plot)
    return sentences

In [None]:
nltk.download('punkt')
nltk.download('stopwords')
stop = nltk.corpus.stopwords.words('english')

In [None]:
ss = "Hello my name is sharad 50 years. I like actionable comedy alot!"
tokenize_USE(ss)

In [None]:
start = time.time()
full_data["USE_tokens"] = full_data.apply(lambda row: tokenize_USE(row['plots']), axis=1)
end = time.time()
print("Total Time to tokenize plots:", end - start, "seconds")

In [None]:
start = time.time()
full_data['tokenized_words'] = full_data.apply(lambda row: tokenize(row['plots'], stop, lemmatize = True), axis=1)
end = time.time()
print("Total Time to tokenize plots:", end - start, "seconds")

In [None]:
full_data['flattened_tokens'] = full_data.apply(lambda l: [item for sublist in l['tokenized_words'] for item in sublist], axis=1)

In [None]:
start = time.time()
full_data["USE_tokens_sentences"] = full_data.apply(lambda row: tokenize_USE_sentence(row['USE_tokens']), axis=1)
end = time.time()
print("Total Time to tokenize plots:", end - start, "seconds")

# Binarize labels

In [None]:
mlb = MultiLabelBinarizer()
labels = mlb.fit_transform(full_data["list_genres_consol"])
full_data["binarized_labels"] = labels.tolist()

In [None]:
mlb.classes_

In [None]:
labels.shape

# Pickle data/Checkpoint

In [None]:
full_data.to_pickle("./full_data.pkl")

# READ IN UNIVERSAL SENTENCE ENCODER

In [3]:
#full_data = pd.read_pickle("./local/full_data.pkl")

In [11]:
!pip install tensorflow-hub
import tensorflow_hub as hub
import tensorflow as tf



W0401 21:27:09.933175 140405303584576 __init__.py:56] Some hub symbols are not available because TensorFlow version is less than 1.14


In [12]:
start = time.time()
embed = hub.Module("https://tfhub.dev/google/universal-sentence-encoder/2")
end = time.time()
print("Total Time to import sentence embeddings:", end - start, "seconds")

Total Time to import sentence embeddings: 73.64054298400879 seconds


In [13]:
#EMBED ENTIRE PLOTS
start = time.time()
messages = np.array(full_data["USE_tokens"])

# Reduce logging output.
tf.logging.set_verbosity(tf.logging.ERROR)

with tf.Session() as session:
  session.run([tf.global_variables_initializer(), tf.tables_initializer()])
  message_embeddings = session.run(embed(messages))

#   for i, message_embedding in enumerate(np.array(message_embeddings).tolist()):
#     print("Message: {}".format(messages[i]))
#     print("Embedding size: {}".format(len(message_embedding)))
#     message_embedding_snippet = ", ".join(
#         (str(x) for x in message_embedding[:3]))
#     print("Embedding: [{}, ...]\n".format(message_embedding_snippet))
end = time.time()
print("Total Time to embed entire plots with USE:", end - start, "seconds")

Total Time to embed entire plots with USE: 607.6711592674255 seconds


In [14]:
full_data["USE_token_plot_embeddings"] = message_embeddings.tolist()

In [15]:
#TRUNCATE AND PAD NUM SENTENCES FOR USE_SENTENCE_LEVEL_ENCODING
MAX_SENT = 5
def truncate_and_pad_plots(sentences, max_sent):
    """Force a list of sentences to exactly ``max_sent`` entries.

    Shorter lists are right-padded with the string "EMPTY"; longer lists are
    cut to their first ``max_sent`` items; a list that already has the right
    length is returned unchanged.
    """
    missing = max_sent - len(sentences)
    if missing > 0:
        return sentences + ["EMPTY"] * missing
    if missing < 0:
        return sentences[0:max_sent]
    return sentences

start = time.time() 
full_data["USE_tokens_sentences_padded"] = full_data["USE_tokens_sentences"].apply(lambda row: 
                                                                                   truncate_and_pad_plots(row, 
                                                                                                          MAX_SENT))
end = time.time()
print("Total Time to pad and truncate plots for USE:", end - start, "seconds")

Total Time to pad and truncate plots for USE: 0.2917203903198242 seconds


In [16]:
#EMBED INDIVID SENTENCES
MAX_SENT = 5
start = time.time()
messages = tf.reshape(np.stack(full_data["USE_tokens_sentences_padded"]), [-1]) #Make 1 dimensional for tf_HUB, will reshape later

# Reduce logging output.
tf.logging.set_verbosity(tf.logging.ERROR)

with tf.Session() as session:
  session.run([tf.global_variables_initializer(), tf.tables_initializer()])
  message_embeddings = session.run(embed(messages))

#   for i, message_embedding in enumerate(np.array(message_embeddings).tolist()):
#     print("Message: {}".format(messages[i]))
#     print("Embedding size: {}".format(len(message_embedding)))
#     message_embedding_snippet = ", ".join(
#         (str(x) for x in message_embedding[:3]))
#     print("Embedding: [{}, ...]\n".format(message_embedding_snippet))

end = time.time()
print("Total Time to embed individual sentences in the plots with USE:", end - start, "seconds")

Total Time to embed individual sentences in the plots with USE: 1504.7398443222046 seconds


In [17]:
full_data_sent_embeddings = message_embeddings.reshape(len(full_data), MAX_SENT, 512)
full_data["USE_token_sentence_embeddings"] = full_data_sent_embeddings.tolist()
np.save("USE_token_sentence_embeddings.npy" , message_embeddings.reshape(len(full_data), MAX_SENT, 512))

## Checkpoint

In [None]:
full_data = pd.read_pickle("./full_data.pkl")
USE_sent_embed = np.load("USE_token_sentence_embeddings.npy")
full_data["USE_token_sentence_embeddings"] = USE_sent_embed.tolist()

In [None]:
full_data.head()

# Token analysis

Iterate over tokenized words and create dictionaries that keep track of number of tokens, length of sentences, and sentences per plot summary

In [None]:
word_dict = Counter()              # token -> number of occurrences
sent_per_summary_dict = Counter()  # sentences in a summary -> number of summaries
word_per_sent_dict = Counter()     # tokens in a sentence -> number of sentences
rows = len(full_data['tokenized_words'])
print(rows)#number of plot summaries
# Iterate over the column's values rather than indexing with range(len(...)):
# `series[i]` is a label lookup, so it fails (or skips rows) once rows have been
# filtered out of full_data and the index has gaps.
for summary in full_data['tokenized_words']:
    sent_per_summary_dict[len(summary)] += 1
    for sentence in summary:
        word_per_sent_dict[len(sentence)] += 1
        word_dict.update(sentence)

In [None]:
print(len(word_dict.keys())) #should be number of unique words
print(sum(word_dict.values())) #should be total number of words

In [None]:
count = 0
twoOrOne = 0
for value in word_dict.values():
    if value == 1:
        count +=1
    if value <3:
        twoOrOne +=1
print(len(word_dict.keys()) - count) # words that appear more than once
print(len(word_dict.keys()) - twoOrOne) # words that appear more than twice

In [None]:
print(len(word_per_sent_dict.keys())) #should be number of unique sentence lengths
print(sum(word_per_sent_dict.values())) #should be number of sentences in all plots
print(max(word_per_sent_dict.keys())) #should be largest sentence length
total = 0
weight_sum = 0
for key, value in word_per_sent_dict.items():
    total += value
    weight_sum += key*value
print(weight_sum/total) #should be average sentence length
#print(word_per_sent_dict)

In [None]:
print(len(sent_per_summary_dict.keys())) #should be number of unique sentence lengths per summary
print(max(sent_per_summary_dict.keys())) #should be highest amount of sentences per summary
total = 0
weight_sum = 0
for key, value in sent_per_summary_dict.items():
    total += value
    weight_sum += key*value
print(weight_sum/total) #should be average sentence count per summary
#print(sent_per_summary_dict)

# Load GloVe Word Embeddings

In [None]:
#compute an index mapping words to known embeddings, by parsing the data dump of pre-trained embeddings
embeddings_index = {}
GLOVE_DIR = './glove.6B/'
# Open with an explicit UTF-8 encoding (the platform default may not decode the
# file) and inside a context manager so the handle is closed even on error.
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'), encoding='utf-8') as f:
    for line in f:
        # Each line is: <word> <coef_1> ... <coef_n>
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

In [None]:
#create average word vector. This will later be used in place of unknown words
# First pass: count the vectors and read the dimensionality off the last line,
# so the full matrix can be pre-allocated. Files are opened with an explicit
# UTF-8 encoding so the platform default cannot break decoding.
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'), 'r', encoding='utf-8') as f:
    for i, line in enumerate(f):
        pass
n_vec = i + 1
hidden_dim = len(line.split(' ')) - 1  # first field is the word itself

vecs = np.zeros((n_vec, hidden_dim), dtype=np.float32)

# Second pass: fill one row per word vector.
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'), 'r', encoding='utf-8') as f:
    for i, line in enumerate(f):
        vecs[i] = np.array([float(n) for n in line.split(' ')[1:]], dtype=np.float32)

# Column-wise mean over all vectors.
average_vec = np.mean(vecs, axis=0)
#print(average_vec)

# Keras Modeling

Based on the above token analysis, we set the following hyperparameters at these initial values

In [None]:
MAX_SENT_LENGTH = 15 #Gets ~80% of sentences
MAX_SENTS = 5 #Gets ~80% of summaries
MAX_NB_WORDS = 80000 #Eliminates words seen two or fewer times
EMBEDDING_DIM = 100

In [None]:
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(full_data['flattened_tokens'])

In [None]:
data = np.zeros((len(full_data), MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')
doc_lst = []

# keep the MAX_NB_WORDS most frequent words and replace the rest with 'UNK'
# truncate to the first MAX_SENTS sentences per doc and MAX_SENT_LENGTH words per sentence

# enumerate() yields the row POSITION. The original used the iterrows() index
# label, which no longer matches positions once rows have been filtered out of
# full_data, so `data` rows could misalign with `labels` or overflow the array.
for summary_num, sentences in enumerate(full_data['tokenized_words']):
    for sent_num, sent in enumerate(sentences[:MAX_SENTS]):
        words_in_sent = []
        for word_num, word in enumerate(sent[:MAX_SENT_LENGTH]):
            token_id = tokenizer.word_index.get(word)
            if token_id is not None and token_id < MAX_NB_WORDS:
                data[summary_num, sent_num, word_num] = token_id
                words_in_sent.append(word)
            else:
                # out-of-vocabulary words share the reserved index MAX_NB_WORDS
                data[summary_num, sent_num, word_num] = MAX_NB_WORDS
                words_in_sent.append('UNK')
        doc_lst.append(words_in_sent)

In [None]:
word_index = tokenizer.word_index
print('Total %s unique tokens.' % len(word_index))

print('Shape of data tensor:', data.shape)

In [None]:
#leverage our embedding_index dictionary and our word_index to compute our embedding matrix
embedding_matrix = np.zeros((MAX_NB_WORDS+1, EMBEDDING_DIM))
count = 0    # words with no row of their own (no GloVe vector, or index above MAX_NB_WORDS)
unknown =0   # 1 if the vocabulary contains a word whose index is exactly MAX_NB_WORDS
added =0     # words below MAX_NB_WORDS that received a GloVe vector
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    # words not found in embedding index will be all-zeros.
    if embedding_vector is not None and i < MAX_NB_WORDS:
        embedding_matrix[i] = embedding_vector
        added+=1
    elif i == MAX_NB_WORDS:
        unknown+=1
    else:
        count +=1
# index MAX_NB_WORDS in data corresponds to 'UNK'. Fill that row unconditionally:
# previously it was only set when the vocabulary happened to contain a word with
# exactly that index, leaving UNK as all-zeros for smaller vocabularies.
embedding_matrix[MAX_NB_WORDS] = average_vec #use average vector for unknown
print(added) #of the MAX_NB_WORDS most frequent tokens in our corpus, this many have GloVe embeddings
print(unknown)
print(count)

In [None]:
# Alternative embedding matrix with one row per vocabulary word (plus row 0).
# NOTE(review): this rebinds `embedding_matrix`, replacing the
# (MAX_NB_WORDS + 1)-row matrix built in the previous cell, while `data` still
# encodes out-of-vocabulary words as index MAX_NB_WORDS. Confirm which matrix /
# Embedding layer pair is intended before running both cells.
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
absent_words = 0  # vocabulary words with no GloVe vector
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
    else:
        absent_words += 1
print('Total absent words are', absent_words, 'which is', "%0.2f" % (absent_words * 100 / len(word_index)), '% of total words')

In [None]:
#load this embedding matrix into an Embedding layer.
REG_PARAM = 1e-13
l2_reg = regularizers.l2(REG_PARAM)

embedding_layer = Embedding(MAX_NB_WORDS + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SENT_LENGTH,
                            embeddings_regularizer=l2_reg,
                            mask_zero = True, #determines whether masking is performed, i.e. whether the layers ignore the padded zeros in shorter documents
                            trainable=False) #prevent weights from being updated during training

In [None]:
embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix], 
                            input_length=MAX_SENT_LENGTH, 
                            trainable=False)

In [None]:
#https://medium.com/jatana/report-on-text-classification-using-cnn-rnn-han-f0e887214d5f
# sentence_input = Input(shape=(MAX_SENT_LENGTH,), dtype='int32')
# embedded_sequences = embedding_layer(sentence_input)
# l_lstm = Bidirectional(LSTM(100))(embedded_sequences)
# sentEncoder = Model(sentence_input, l_lstm)

# review_input = Input(shape=(MAX_SENTS,MAX_SENT_LENGTH), dtype='int32')
# review_encoder = TimeDistributed(sentEncoder)(review_input)
# l_lstm_sent = Bidirectional(LSTM(100))(review_encoder)
# preds = Dense(20, activation='softmax')(l_lstm_sent)
# model = Model(review_input, preds)

In [None]:
#https://github.com/Hsankesara/DeepResearch/blob/master/Hierarchical_Attention_Network/attention_with_context.py
#https://medium.com/analytics-vidhya/hierarchical-attention-networks-d220318cf87e
from keras.preprocessing.text import Tokenizer,  text_to_word_sequence
from keras.engine.topology import Layer
from keras import initializers as initializers, regularizers, constraints
from keras.callbacks import Callback, ModelCheckpoint
from keras.utils.np_utils import to_categorical
from keras.layers import Embedding, Input, Dense, LSTM, GRU, Bidirectional, TimeDistributed
from keras import backend as K
from keras import optimizers
from keras.models import Model
def dot_product(x, kernel):
    """
    Backend-agnostic dot product of an input with a weight tensor.

    On the TensorFlow backend the kernel is given a trailing axis before the
    product and that axis is squeezed out of the result; on any other backend
    the plain ``K.dot`` is used.
    Args:
        x (): input
        kernel (): weights
    Returns:
        the result of the dot product
    """
    if K.backend() != 'tensorflow':
        return K.dot(x, kernel)
    projected = K.dot(x, K.expand_dims(kernel))
    return K.squeeze(projected, axis=-1)

class AttentionWithContext(Layer):
    """
    Attention operation, with a context/query vector, for temporal data.
    Supports Masking.
    Follows the work of Yang et al. [https://www.cs.cmu.edu/~diyiy/docs/naacl16.pdf]
    "Hierarchical Attention Networks for Document Classification"
    by using a context vector to assist the attention
    # Input shape
        3D tensor with shape: `(samples, steps, features)`.
    # Output shape
        2D tensor with shape: `(samples, features)`.
    How to use:
    Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True.
    The dimensions are inferred based on the output shape of the RNN.
    Note: The layer has been tested with Keras 2.0.6
    Example:
        model.add(LSTM(64, return_sequences=True))
        model.add(AttentionWithContext())
        # next add a Dense layer (for classification/regression) or whatever...
    """

    def __init__(self,
                 W_regularizer=None, u_regularizer=None, b_regularizer=None,
                 W_constraint=None, u_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        """Store regularizers/constraints for the W, b and u weights.

        Args:
            W_regularizer, u_regularizer, b_regularizer: regularizer
                identifiers (or None) for the respective weights.
            W_constraint, u_constraint, b_constraint: constraint identifiers
                (or None) for the respective weights.
            bias: whether to add the bias vector b before the tanh.
            **kwargs: forwarded to the base ``Layer`` constructor.
        """
        # Initialise the base layer first so that attributes assigned below
        # cannot be overwritten by the base initialiser.
        super(AttentionWithContext, self).__init__(**kwargs)

        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.u_regularizer = regularizers.get(u_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.u_constraint = constraints.get(u_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias

    def build(self, input_shape):
        """Create W (features x features), optional b (features,) and u (features,)."""
        assert len(input_shape) == 3

        # `shape` is passed by keyword: it does not rely on the legacy
        # positional-shape form of add_weight.
        self.W = self.add_weight(shape=(input_shape[-1], input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        if self.bias:
            self.b = self.add_weight(shape=(input_shape[-1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)

        self.u = self.add_weight(shape=(input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_u'.format(self.name),
                                 regularizer=self.u_regularizer,
                                 constraint=self.u_constraint)

        super(AttentionWithContext, self).build(input_shape)

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of ``x`` over its steps axis (axis 1)."""
        uit = dot_product(x, self.W)

        if self.bias:
            uit += self.b

        uit = K.tanh(uit)
        ait = dot_product(uit, self.u)

        a = K.exp(ait)

        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            a *= K.cast(mask, K.floatx())

        # in some cases especially in the early stages of training the sum may be almost zero
        # and this results in NaN's. A workaround is to add a very small positive number ε to the sum.
        # a /= K.cast(K.sum(a, axis=1, keepdims=True), K.floatx())
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        """Output drops the steps axis: (samples, features)."""
        return input_shape[0], input_shape[-1]

    def get_config(self):
        """Return the constructor arguments so a saved model can rebuild the layer."""
        config = {
            'W_regularizer': regularizers.serialize(self.W_regularizer),
            'u_regularizer': regularizers.serialize(self.u_regularizer),
            'b_regularizer': regularizers.serialize(self.b_regularizer),
            'W_constraint': constraints.serialize(self.W_constraint),
            'u_constraint': constraints.serialize(self.u_constraint),
            'b_constraint': constraints.serialize(self.b_constraint),
            'bias': self.bias,
        }
        base_config = super(AttentionWithContext, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

In [None]:
# Words level attention model
# Word encoder: embeds one sentence of token ids, runs a bidirectional LSTM
# over the words and pools the result with attention.
word_input = Input(shape=(MAX_SENT_LENGTH,), dtype='float32')
word_sequences = embedding_layer(word_input)
word_lstm = Bidirectional(LSTM(150, return_sequences=True, kernel_regularizer=l2_reg))(word_sequences)
word_dense = TimeDistributed(Dense(200, kernel_regularizer=l2_reg))(word_lstm)
word_att = AttentionWithContext()(word_dense)
wordEncoder = Model(word_input, word_att)

# Sentence level attention model
# The word encoder is applied to every sentence of a summary, then a second
# bidirectional LSTM + attention pools the sentence vectors.
sent_input = Input(shape=(MAX_SENTS, MAX_SENT_LENGTH), dtype='float32')
sent_encoder = TimeDistributed(wordEncoder)(sent_input)
sent_lstm = Bidirectional(LSTM(150, return_sequences=True, kernel_regularizer=l2_reg))(sent_encoder)
sent_dense = TimeDistributed(Dense(200, kernel_regularizer=l2_reg))(sent_lstm)
sent_att = Dropout(0.5)(AttentionWithContext()(sent_dense))
# One sigmoid unit per binarized genre column. The width is taken from
# `labels` instead of a hard-coded 20 so it always matches the targets, which
# were binarized after the genre consolidation step.
preds = Dense(labels.shape[1], activation='sigmoid')(sent_att)
model = Model(sent_input, preds)

In [None]:
model.compile(loss='binary_crossentropy',optimizer='adam',  metrics=['accuracy'])

In [None]:
model.summary()

# Create train/test split

In [None]:
x_train, x_test, y_train, y_test = train_test_split(data, labels, test_size=0.10, random_state=42)

In [None]:
#Separate training set into train and dev. Roughly 80% of original data is train, 10% dev, 10% test
x_train, x_dev, y_train, y_dev = train_test_split(x_train, y_train, test_size=0.11, random_state=42)

# Training the model

In [None]:
fname = 'han_food'
history = History()
csv_logger = CSVLogger('./{0}_{1}.log'.format(fname, REG_PARAM), separator=',', append=True)

In [None]:
BATCH_SIZE = 50
NUM_EPOCHS = 10

In [None]:
print(x_train.shape)
print(y_train.shape)

In [None]:
t1 = time.time()

model.fit(x_train, y_train, epochs=NUM_EPOCHS, batch_size=BATCH_SIZE, shuffle=False, 
          callbacks=[history, csv_logger], verbose=2)

t2 = time.time()

# Evaluation and Metrics

In [None]:
preds = model.predict(x_dev)

In [None]:
def indiv_class_scores(y_true, y_pred, threshold, metric = None):
    """Print one score per class in ``mlb.classes_`` for thresholded predictions.

    Args:
        y_true: 2-D indicator array, indexed as ``y_true[:, i]`` per class.
        y_pred: 2-D array of scores with the same layout; binarized as
            ``y_pred >= threshold``.
        threshold: cut-off applied to ``y_pred``.
        metric: one of "precision", "recall" or "f1".

    Returns:
        None after printing the scores, or the string "Not a valid metric"
        when ``metric`` is not one of the supported names (nothing is printed).
    """
    scorers = {"precision": precision_score, "recall": recall_score, "f1": f1_score}
    # Validate once, up front, rather than inside the per-class loop.
    if not isinstance(metric, str) or metric not in scorers:
        return "Not a valid metric"
    score_fn = scorers[metric]

    y_pred = y_pred >= threshold
    for i, class_name in enumerate(mlb.classes_):
        score = score_fn(y_true[:,i], y_pred[:,i])
        print("The {} for {} is {}".format(metric, class_name, score))

In [None]:
indiv_class_scores(y_dev, preds, threshold = 0.5, metric = "precision")

In [None]:
indiv_class_scores(y_dev, preds, threshold=0.5, metric = "recall")

In [None]:
indiv_class_scores(y_dev, preds, threshold= 0.5, metric= "f1")

In [None]:
y_pred = preds >= 0.5
micro_precision = precision_score(y_dev, y_pred, average = 'micro')
weighted_macro_precision = precision_score(y_dev, y_pred, average = 'weighted')
micro_recall = recall_score(y_dev, y_pred, average = 'micro')
weighted_macro_recall = recall_score(y_dev, y_pred, average = 'weighted')
micro_f1 = f1_score(y_dev, y_pred, average = 'micro')
weighted_macro_f1 = f1_score(y_dev, y_pred, average = 'weighted')


In [None]:
print("The micro precision is", micro_precision)
print("The weighted macro precision is", weighted_macro_precision)
print("The micro recall is", micro_recall)
print("The weighted macro recall is", weighted_macro_recall)
print("The micro f1 is", micro_f1)
print("The weighted macro f1 is", weighted_macro_f1)

In [None]:
from keras.models import load_model

model.save('HAN_10epochs.h5')  # creates the HDF5 file 'HAN_10epochs.h5'