<a href="https://colab.research.google.com/github/shobhitsundriyal/Revisit_ML/blob/master/Poetry_gen_with_seq2seq_keras.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
!wget 'http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip'
!unzip 'glove.6B.zip'


2020-05-23 12:52:59 (2.12 MB/s) - ‘glove.6B.zip’ saved [862182613/862182613]

Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [0]:
import string
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding
from keras.models import Model
from sklearn.metrics import roc_auc_score
from keras.optimizers import Adam, SGD

In [0]:
# --- Data / vocabulary limits ---
MAX_SEQUENCE_LENGTH = 100   # upper bound applied to the padded sequence length
MAX_VOCAB_SIZE = 3_000      # passed to the tokenizer as num_words
EMBEDDING_DIM = 50          # kept small; selects the glove.6B.<dim>d.txt file to load

# --- Model size ---
LATENT_DIM = 25             # number of LSTM units

# --- Training schedule ---
BATCH_SIZE = 128
EPOCHS = 2_000
VALIDATION_SPLIT = 0.2      # fraction of samples passed to fit() as validation_split

#### Preparing data


In [0]:
# Preparing data
# Every non-empty line of the poem file yields one training pair:
#   input  = '<sos> ' + line
#   target = line + ' <eos>'
input_texts = []
target_texts = []

# The context manager closes the file handle as soon as reading is finished.
with open('robert_frost.txt') as poem_file:
  for line in poem_file:
    line = line.rstrip()
    if not line:
      # Skip blank lines.
      continue

    input_line = '<sos> ' + line
    target_line = line + ' <eos>'

    input_texts.append(input_line)
    target_texts.append(target_line)

# Inputs followed by targets, as one list.
all_lines = input_texts + target_texts

#### Tokenization

In [0]:
# filters='' disables character filtering; otherwise the special characters
# in the '<sos>' / '<eos>' markers would be removed.
tokenizer = Tokenizer(filters='', num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(all_lines)

# Convert both text lists to integer sequences with the same fitted tokenizer.
input_sequences, target_sequences = (
    tokenizer.texts_to_sequences(texts)
    for texts in (input_texts, target_texts)
)

In [6]:
# Length (in tokens) of the longest tokenized input line.
max_seq_len = max(map(len, input_sequences))
print('Max Sentence length:', max_seq_len)

Max Sentence length: 12


In [7]:
# Word -> integer index mapping taken from the fitted tokenizer.
word2idx = tokenizer.word_index
print('No of unique tokens', len(word2idx))

No of unique tokens 3056


In [8]:
# Sanity check: print whether each sequence marker is in the tokenizer
# vocabulary, one result per line.
print(*(marker in word2idx for marker in ('<sos>', '<eos>')), sep='\n')

True
True


In [12]:
# Never pad beyond the configured maximum sequence length.
max_seq_len = min(MAX_SEQUENCE_LENGTH, max_seq_len)

# Pad inputs and targets to the same width, padding at the end of each
# sequence (padding='post').
input_sequences, target_sequences = (
    pad_sequences(seqs, padding='post', maxlen=max_seq_len)
    for seqs in (input_sequences, target_sequences)
)
input_sequences.shape

(1436, 12)

In [13]:
# Loading word vectors
# Each line of the GloVe file is split on whitespace: the first field is the
# token, the remaining fields are the components of its vector.
word2vec = {}
# Read the file explicitly as UTF-8 so loading does not depend on the
# platform's default encoding (the vocabulary contains non-ASCII tokens).
with open(f'glove.6B.{EMBEDDING_DIM}d.txt', encoding='utf-8') as f:
  for line in f:
    values = line.split()
    word = values[0]
    vec = np.array(values[1:], dtype='float32')
    word2vec[word] = vec
print(f'Found {len(word2vec)} word vectors')

Found 400000 word vectors


In [15]:
len(word2idx) # compare with MAX_VOCAB_SIZE (3000); the recorded run shows 3056, slightly above the cap

3056

In [0]:
# Preparing embedding matrix
# Row i receives the pre-trained vector of the word whose tokenizer index is i;
# rows for words without a pre-trained vector stay all-zero.

num_words = min(len(word2idx) + 1, MAX_VOCAB_SIZE)
embedding_matrix = np.zeros(shape=(num_words, EMBEDDING_DIM))

for word, i in word2idx.items():
  # Indices at or beyond the vocabulary cap are skipped.
  if i >= MAX_VOCAB_SIZE:
    continue
  embedding_vector = word2vec.get(word)
  # No pre-trained vector for this word: leave its row as zeros.
  if embedding_vector is None:
    continue
  embedding_matrix[i] = embedding_vector

In [26]:
# One-hot the targets because sparse cross-entropy can't be used here.
# Resulting shape: (number of sequences, max_seq_len, num_words).
one_hot_targets = np.zeros((len(input_sequences), max_seq_len, num_words))
print(one_hot_targets.shape)

# Only positions holding a positive word index get a 1; positions holding 0
# (the trailing zeros of the padded sequences) keep an all-zero target row.
# A single vectorized assignment replaces the per-token Python loop.
target_array = np.asarray(target_sequences)
sample_idx, step_idx = np.nonzero(target_array > 0)
one_hot_targets[sample_idx, step_idx, target_array[sample_idx, step_idx]] = 1

(1436, 12, 3000)
[104 537 538   9   7 539 540   2   0   0   0   0]


In [0]:
# Loading pre-trained embeddings
# The layer is initialised from embedding_matrix. The trainable=False option
# is left commented out, so the layer keeps Keras' default trainability.
embedding_layer = Embedding(
    input_dim=num_words,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    # trainable=False
)

#### Model Building

In [0]:
# Model inputs: the padded token-id sequence plus the LSTM's initial hidden
# and cell states, which are supplied explicitly by the caller.
input_ = Input(shape=(max_seq_len,))
initial_h = Input(shape=(LATENT_DIM,))
initial_c = Input(shape=(LATENT_DIM,))

x = embedding_layer(input_)
# return_sequences=True gives one output per timestep; the final states that
# return_state=True also returns are discarded here.
lstm = LSTM(LATENT_DIM, return_sequences=True, return_state=True)
x, _, _ = lstm(x, initial_state=[initial_h, initial_c])
# Softmax over the num_words vocabulary entries.
dense = Dense(num_words, activation='softmax')
output = dense(x)

model = Model([input_, initial_h, initial_c], output)
model.compile(
    loss = 'categorical_crossentropy',
    # `learning_rate` is the current argument name; `lr` is a deprecated alias
    # that newer Keras releases no longer accept.
    optimizer = Adam(learning_rate=0.01),
    metrics = ['accuracy']
)

#### Training

In [35]:
# All-zero array used as both the initial hidden state and the initial cell
# state for every training sample.
z = np.zeros((len(input_sequences), LATENT_DIM))

# Fit on the padded inputs and the one-hot targets; `his` keeps the returned
# History object for the plots below.
his = model.fit(
    x=[input_sequences, z, z],
    y=one_hot_targets,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_split=VALIDATION_SPLIT,
)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 1148 samples, validate on 288 samples
Epoch 1/2000
Epoch 2/2000
Epoch 3/2000
Epoch 4/2000
Epoch 5/2000
Epoch 6/2000
Epoch 7/2000
Epoch 8/2000
Epoch 9/2000
Epoch 10/2000
Epoch 11/2000
Epoch 12/2000
Epoch 13/2000
Epoch 14/2000
Epoch 15/2000
Epoch 16/2000
Epoch 17/2000
Epoch 18/2000
Epoch 19/2000
Epoch 20/2000
Epoch 21/2000
Epoch 22/2000
Epoch 23/2000
Epoch 24/2000
Epoch 25/2000
Epoch 26/2000
Epoch 27/2000
Epoch 28/2000
Epoch 29/2000
Epoch 30/2000
Epoch 31/2000
Epoch 32/2000
Epoch 33/2000
Epoch 34/2000
Epoch 35/2000
Epoch 36/2000
Epoch 37/2000
Epoch 38/2000
Epoch 39/2000
Epoch 40/2000
Epoch 41/2000
Epoch 42/2000
Epoch 43/2000
Epoch 44/2000
Epoch 45/2000
Epoch 46/2000
Epoch 47/2000
Epoch 48/2000
Epoch 49/2000
Epoch 50/2000
Epoch 51/2000
Epoch 52/2000
Epoch 53/2000
Epoch 54/2000
Epoch 55/2000
Epoch 56/2000
Epoch 57/2000
Epoch 58/2000
Epoch 59/2000
Epoch 60/2000
Epoch 61/2000
Epoch 62/2000
Epoch 63/2000
Epoch 64/2000
Epoch 65/2000
Epoch 66/2000
Epoch 67/2000
Epoch 68/2000
Epoch 69/2

KeyboardInterrupt: ignored

In [0]:
# Training vs. validation loss per epoch, drawn on an explicit Axes object.
fig, ax = plt.subplots()
ax.plot(his.history['loss'], label='Train Loss')
ax.plot(his.history['val_loss'], label='Val Loss')
ax.legend()
plt.show()

In [0]:
# Training vs. validation accuracy per epoch, drawn on an explicit Axes object.
fig, ax = plt.subplots()
ax.plot(his.history['accuracy'], label='Train Acc')
ax.plot(his.history['val_accuracy'], label='Val Acc')
ax.legend()
plt.show()

Accuracy is not the best metric here because, for example, the word that follows "The" can be any one of several words rather than one specific word.

Encoder Done

---