In [2]:
import numpy as np 
import pandas as pd 
from scipy import spatial
from collections import Counter


import os
# Show every file that Kaggle mounted under the input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# Folder holding the poetry CSV.
ds_path = "/kaggle/input/complete-poetryfoundationorg-dataset/"
# GloVe file template; the "%d" slot is filled with the embedding dimension.
glove_path = "/kaggle/input/glove-global-vectors-for-word-representation/glove.6B.%dd.txt"

In [3]:
import keras.backend as K
from keras.models import Model
from keras.optimizers import Adam, SGD
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Embedding, Input, LSTM, GRU

import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
# Read the poem table from the dataset folder and preview the first rows.
poems_csv_path = os.path.join(ds_path, "kaggle_poem_dataset.csv")
poems_df = pd.read_csv(poems_csv_path)
poems_df.head(10)

In [5]:
# Five authors with the most (non-null) poem contents.
(
    poems_df
    .groupby("Author")
    .agg({"Content": "count"})
    .sort_values("Content", ascending=False)
    .head(5)
)

In [6]:
# Keep only the poems written by William Shakespeare.
william_poems = poems_df[poems_df["Author"] == "William Shakespeare"]
# Fail with a readable message instead of an IndexError further down.
assert not william_poems.empty, "No poems found for author 'William Shakespeare'"

print("Some of the lines are: ")
# Select the text by column label rather than by position, so the cell keeps
# working if the CSV column order changes.
print(william_poems["Content"].iloc[0].split('\n')[:4])

In [7]:
# Merge every poem into one newline-separated corpus.
# The text column is selected by label (not position) and missing contents are
# dropped, because str.join raises TypeError on NaN entries.
poem_texts = william_poems["Content"].dropna()
poems_combined = "\n".join(poem_texts)
print("Total number of characters: ", len(poems_combined))

In [8]:
# One list entry per line of verse (blank separator lines are kept as empty strings).
poem_lines = poems_combined.split("\n")
print(
    "Number of lines in the dataset: ",
    len(poem_lines),
)

In [9]:
# Model inputs: every line is prefixed with the <sos> token marking the beginning of a line.
input_lines = [" ".join(("<sos>", line)) for line in poem_lines]
# Model targets: every line is suffixed with the <eos> token marking the end of a line.
target_lines = [" ".join((line, "<eos>")) for line in poem_lines]

In [10]:
# Histogram of whitespace-token counts per input line (the <sos> token is included).
tokenized_lines = (line.split() for line in input_lines)
len_of_lines = (len(tokens) for tokens in tokenized_lines)
len_frequencies = Counter(len_of_lines)

# Display (token count, number of lines) pairs in ascending token-count order.
sorted(len_frequencies.items())

In [11]:
# Training / model hyper-parameters used by the cells below.

# Number of epochs passed to `Encoder.fit`.
EPOCHS = 500 
# Mini-batch size passed to `Encoder.fit`.
BATCH_SIZE = 64 
# Number of LSTM units; also the width of the initial hidden/cell state inputs.
LATENT_DIM = 200 
# Width of the GloVe vectors (selects which glove.6B.<dim>d.txt file is read).
# NOTE: `get_sample_line` feeds a vector of this size in as the LSTM hidden
# state, so this value has to stay equal to LATENT_DIM.
EMBEDDING_DIM = 200 
# Upper bound on the vocabulary size handed to `SequenceGenerator`.
MAX_VOCAB_SIZE = 30000 
# Fraction of the training data passed as `validation_split` to `Encoder.fit`.
VALIDATION_SPLIT = 0.2

In [12]:
class SequenceGenerator():
    """Turn raw input/target text lines into padded id sequences, one-hot
    targets and a GloVe-initialised embedding matrix.

    Typical use: construct, call `prepare_sequences()`, then `one_hot_encoding()`.

    Attributes created along the way:
        tokenizer, word2idx, idx2word      -- vocabulary mappings
        word2vec                           -- word -> GloVe vector
        num_words, embeddings_matrix       -- embedding table and its row count
        input_sequences, target_sequences  -- padded integer id arrays
        one_hot_targets                    -- (n_lines, MAX_SEQ_LEN, num_words) array
    """

    def __init__(self, input_lines, target_lines, max_seq_len=None, max_vocab_size=10000, embedding_dim=200):
        """Store the raw lines and the size limits.

        Args:
            input_lines: list of input strings (lines prefixed with "<sos>").
            target_lines: list of target strings (lines suffixed with "<eos>").
            max_seq_len: optional cap on the padded sequence length (in tokens);
                None means "use the longest tokenised line".
            max_vocab_size: cap on the number of word ids kept.
            embedding_dim: width of the GloVe vectors to load.
        """
        self.input_lines = input_lines
        self.target_lines = target_lines

        self.MAX_SEQ_LEN = max_seq_len
        self.MAX_VOCAB_SIZE = max_vocab_size
        self.EMBEDDING_DIM = embedding_dim

    def initialize_embeddings(self):
        """Load the GloVe vectors and build the embedding matrix.

        Requires `self.word2idx` (set by `prepare_sequences`). Reads the file
        named by the module-level `glove_path` template.
        """
        self.word2vec = self._load_word_vectors(glove_path % self.EMBEDDING_DIM)
        self._build_embeddings_matrix()

    @staticmethod
    def _load_word_vectors(path):
        """Parse a GloVe text file into a {word: float32 vector} dict."""
        word2vec = {}
        # GloVe files are UTF-8; do not depend on the platform default encoding.
        with open(path, 'r', encoding="utf-8") as file:
            for line in file:
                vectors = line.split()
                if not vectors:
                    # Skip blank lines instead of failing on vectors[0].
                    continue
                word2vec[vectors[0]] = np.asarray(vectors[1:], dtype="float32")
        return word2vec

    def _build_embeddings_matrix(self):
        """Fill `embeddings_matrix` (row = word id) from `word2vec` and build `idx2word`."""
        # Row 0 is reserved for padding, hence the +1.
        self.num_words = min(self.MAX_VOCAB_SIZE, len(self.word2idx) + 1)
        self.embeddings_matrix = np.zeros((self.num_words, self.EMBEDDING_DIM))

        for word, idx in self.word2idx.items():
            # Valid rows are 0 .. num_words-1; `<=` would index one past the end
            # as soon as the vocabulary is capped by MAX_VOCAB_SIZE.
            if idx < self.num_words:
                word_embeddings = self.word2vec.get(word)
                if word_embeddings is not None:
                    self.embeddings_matrix[idx] = word_embeddings

        self.idx2word = {v: k for k, v in self.word2idx.items()}

    def prepare_sequences(self, filters=''):
        """Tokenise the lines, build embeddings and pad everything to MAX_SEQ_LEN.

        Args:
            filters: characters the Keras Tokenizer strips from the text. The
                default '' keeps punctuation, so "<sos>"/"<eos>" stay intact.
        """
        # Honour the `filters` argument (it used to be ignored).
        self.tokenizer = Tokenizer(num_words=self.MAX_VOCAB_SIZE, filters=filters)
        self.tokenizer.fit_on_texts(self.input_lines + self.target_lines)
        self.word2idx = self.tokenizer.word_index
        self.initialize_embeddings()

        self.input_sequences = self.tokenizer.texts_to_sequences(self.input_lines)
        self.target_sequences = self.tokenizer.texts_to_sequences(self.target_lines)

        # Longest line measured in tokens. Measuring the raw strings would
        # count characters and massively over-pad when no cap is given.
        max_seq_len = max(map(len, self.input_sequences + self.target_sequences), default=0)
        if self.MAX_SEQ_LEN:
            self.MAX_SEQ_LEN = min(self.MAX_SEQ_LEN, max_seq_len)
        else:
            self.MAX_SEQ_LEN = max_seq_len

        self.input_sequences = pad_sequences(self.input_sequences, maxlen=self.MAX_SEQ_LEN, padding="post")
        self.target_sequences = pad_sequences(self.target_sequences, maxlen=self.MAX_SEQ_LEN, padding="post")

        print("1st input sequence: ", self.input_sequences[0])
        print("1st target sequence: ", self.target_sequences[0])

    def one_hot_encoding(self):
        """Build `one_hot_targets` from `target_sequences`.

        Padding positions (id 0) are left as all-zero rows.
        """
        targets = np.asarray(self.target_sequences)
        # float32 halves the memory of this large array compared to the
        # default float64.
        self.one_hot_targets = np.zeros(
            (len(targets), self.MAX_SEQ_LEN, self.num_words), dtype="float32"
        )

        # Vectorised equivalent of "for every non-padding token set its class to 1".
        seq_idx, word_idx = np.nonzero(targets > 0)
        self.one_hot_targets[seq_idx, word_idx, targets[seq_idx, word_idx]] = 1

    def get_closest_word(self, word_vec):
        """Return the GloVe word whose vector has the smallest cosine distance
        to `word_vec`, or "NULL" if no distance could be computed.
        """
        min_dist = float("inf")
        closest_word = "NULL"

        for word, vec in self.word2vec.items():
            dist = spatial.distance.cosine(word_vec, vec)
            # NaN distances (zero vectors) never compare as smaller and are skipped.
            if dist < min_dist:
                min_dist = dist
                closest_word = word

        return closest_word

# Build the data pipeline; sequences are capped at 12 tokens.
sg_obj = SequenceGenerator(
    input_lines,
    target_lines,
    max_seq_len=12,
    max_vocab_size=MAX_VOCAB_SIZE,
    embedding_dim=EMBEDDING_DIM,
)

# Tokenise + pad first, then derive the one-hot training targets.
sg_obj.prepare_sequences()
sg_obj.one_hot_encoding()

# Both marker tokens must have made it into the vocabulary.
for special_token in ('<sos>', '<eos>'):
    assert special_token in sg_obj.word2idx

# Create Seq2Seq model

## Create the Encoder

In [13]:

# Embedding layer with one row per word id; the matrix built by
# SequenceGenerator is passed in as the layer's weights.
embedding = Embedding(
    input_dim=sg_obj.num_words,
    output_dim=sg_obj.EMBEDDING_DIM,
    weights=[sg_obj.embeddings_matrix]
)


# The LSTM's initial hidden (h) and cell (c) states are model inputs, so the
# same layers can later be driven step by step with an explicit state.
state_h = Input(shape=(LATENT_DIM,))
state_c = Input(shape=(LATENT_DIM,))
# A full padded line of word ids.
sequence_input = Input(shape=(sg_obj.MAX_SEQ_LEN,))

embedding_ = embedding(sequence_input)


# return_sequences=True yields one output per time step; return_state=True
# additionally returns the final h and c (unused by this model).
lstm = LSTM(LATENT_DIM, return_state=True, return_sequences=True)
x, h_, c_ = lstm(embedding_, initial_state=[state_h, state_c])
# Softmax over the vocabulary at every time step.
dense = Dense(sg_obj.num_words, activation="softmax")
output = dense(x)

# Training model: (word ids, initial h, initial c) -> per-step word distribution.
# `embedding`, `lstm` and `dense` are reused by the Decoder model defined later.
Encoder = Model([sequence_input, state_h, state_c], output)

## Create the Decoder

In [14]:

# Sampling model: processes a single word id per call.
deco_inp = Input(shape=(1,))
# The same `embedding`, `lstm` and `dense` layer objects as the Encoder model
# are applied here, so both models share their weights.
deco_embed = embedding(deco_inp)
deco_x, h, c = lstm(deco_embed, initial_state=[state_h, state_c])
deco_output = dense(deco_x)
# Returns the word distribution together with the updated h and c, which the
# caller feeds back in for the next step.
Decoder = Model([deco_inp, state_h, state_c], [deco_output, h, c])

In [15]:
# NOTE(review): `lr` is the legacy spelling of the learning-rate argument;
# newer Keras releases expect `learning_rate` -- confirm against the installed version.
Encoder.compile(
    loss='categorical_crossentropy',
    optimizer=Adam(lr=0.01),
    metrics=['accuracy']
)

# All-zero initial state, one row per training line; used for both h and c.
initial_state = np.zeros((len(sg_obj.input_sequences), LATENT_DIM))
# Train to predict the one-hot target line from the input line;
# verbose=0 silences the per-epoch log. `history` keeps the value returned by fit.
history = Encoder.fit(
    [sg_obj.input_sequences, initial_state, initial_state], 
    sg_obj.one_hot_targets, 
    batch_size=BATCH_SIZE, 
    epochs=EPOCHS, 
    validation_split=VALIDATION_SPLIT,
    verbose=0
)

In [16]:
def get_context(sequences, query_word):
    """Build a context vector from the lines generated so far.

    Each line is embedded as the sum of its words' GloVe vectors (unknown
    words contribute nothing). Lines are weighted by their cosine distance to
    the query word's vector, scaled so the largest weight is 1, and the
    weighted line embeddings are summed.

    NOTE(review): the weight is the cosine *distance*, so lines that are less
    similar to the query word get more weight -- confirm this is intended.

    Args:
        sequences: list of lines, each a list of word strings.
        query_word: word that seeds the poem; must be in the vocabulary.

    Returns:
        1-D array of length EMBEDDING_DIM. With no lines yet this is the query
        word's embedding. It is always finite: lines with a zero embedding get
        weight 0 instead of producing NaN.

    Raises:
        AssertionError: if `query_word` is not in `sg_obj.word2idx`.
    """
    assert query_word in sg_obj.word2idx

    query_word_embed = sg_obj.word2vec.get(query_word, np.zeros(shape=(EMBEDDING_DIM)))

    if len(sequences) == 0:
        return query_word_embed

    # One row per line: sum of the GloVe vectors of its known words.
    seq_embeddings = np.zeros((len(sequences), EMBEDDING_DIM))
    for row, seq in enumerate(sequences):
        for word in seq:
            word_vec = sg_obj.word2vec.get(word)
            if word_vec is not None:
                seq_embeddings[row] += word_vec

    # Cosine distance is undefined (NaN) for zero vectors, e.g. an empty line,
    # a line of unknown words or a query word without a GloVe vector. Such
    # lines keep weight 0 so NaN never reaches the decoder state.
    weights = np.zeros(len(sequences))
    if np.any(query_word_embed):
        for row, seq_embed in enumerate(seq_embeddings):
            if np.any(seq_embed):
                weights[row] = spatial.distance.cosine(seq_embed, query_word_embed)

    # Scale to a maximum of 1; skip when every weight is 0 to avoid 0/0.
    largest = weights.max()
    if largest > 0:
        weights = weights / largest

    context = (weights[:, np.newaxis] * seq_embeddings).sum(axis=0)

    return context

In [17]:
def get_sample_line(context):
    """Sample one line of text from the Decoder model.

    Generation starts from the "<sos>" token with `context` as the initial
    hidden state and zeros as the initial cell state. At every step the next
    word is drawn at random from the predicted distribution (padding id 0
    excluded). It stops at "<eos>", after `sg_obj.MAX_SEQ_LEN` steps, or when
    the model leaves no usable probability mass.

    Args:
        context: 1-D vector used as the LSTM hidden state; its length must
            match the state size the Decoder expects.

    Returns:
        List of sampled word strings (without "<sos>"/"<eos>"); may be empty.
    """
    # Holds the token fed to the decoder; starts as <sos>, then the last sample.
    current_token = np.array([[sg_obj.word2idx.get("<sos>")]])

    h = np.array([context])
    c = np.zeros(shape=(1, LATENT_DIM))

    eos_token = sg_obj.word2idx['<eos>']

    output_sequence = []

    for _ in range(sg_obj.MAX_SEQ_LEN):
        o, h, c = Decoder.predict([current_token, h, c])
        # Work on a float64 copy so the renormalised probabilities sum to 1
        # within np.random.choice's tolerance.
        probs = np.array(o[0, 0], dtype=np.float64)

        if np.argmax(probs) == 0:
            print("Something went wrong!!")

        probs = np.nan_to_num(probs)
        probs[0] = 0  # never sample the padding id
        total = probs.sum()
        if not np.isfinite(total) or total <= 0:
            # Nothing left to sample from (all mass on padding, or NaN output):
            # end the line here instead of dividing by zero and crashing.
            break
        probs /= total

        selected_idx = np.random.choice(len(probs), p=probs)
        if selected_idx == eos_token:
            break
        output_sequence.append(sg_obj.idx2word.get(selected_idx, "Error <%d>" % selected_idx))

        current_token[0][0] = selected_idx
    return output_sequence

In [18]:

# Generate an 8-line poem seeded by `query_word`.
query_word = "sun"
# NOTE: this rebinds `poem_lines`, which earlier cells use for the dataset
# lines; re-run the notebook from the top before re-running those cells.
poem_lines = []
sequences = []
for line_no in range(8):
    # The context is built from all lines generated so far.
    context = get_context(sequences, query_word)

    try:
        sampled_words = get_sample_line(context)
    except Exception as err:
        # Report and skip a failed line. Silently swallowing the error used to
        # duplicate the previous line, or raise IndexError on the first line.
        print("Could not generate line %d: %s" % (line_no + 1, err))
        continue

    sequences.append(sampled_words)
    poem_lines.append(" ".join(sampled_words))

print("\n\n")
print("\n".join(poem_lines))