In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
kaggle_input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in kaggle_input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import SimpleRNN
from keras.layers import Embedding

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [None]:
# Toy corpus: four lines of the nursery rhyme, with the full stop written as a
# separate, space-delimited token at the end of each line.
data = (
    " Jack and Jill went up the hill .\n"
    " To fetch a pail of water .\n"
    " Jack fell down and broke his crown .\n"
    " And Jill came tumbling after ."
)
print(data)

In [None]:
# Split the rhyme into one training sentence per line.
data_splitted = data.split('\n')
# '.' is not part of the filter set, so the space-separated full stops in
# `data` are kept as tokens. The backslash is spelled '\\' because '\]' is an
# invalid escape sequence (SyntaxWarning on Python 3.12+); the resulting
# filter string is unchanged.
tokenizer = Tokenizer(filters='!"#$%&()*+,-/:;<=>?@[\\]^_`{|}~')
tokenizer.fit_on_texts(data_splitted)
# +1 because index 0 is not assigned to any word; it is used for padding below.
vocab_size = len(tokenizer.word_index) + 1
sequences = tokenizer.texts_to_sequences(data_splitted)

In [None]:
# Show the learned vocabulary, the raw sentences and their integer encodings,
# with a dashed rule between consecutive items.
wide_rule = '\n' + '-' * 100 + '\n'
narrow_rule = '\n' + '-' * 50 + '\n'
print('', '\t\t\tVocab Dictionary:', tokenizer.word_index, '', sep=wide_rule)
print('', "\t\tSentences:", *data_splitted, '', sep=narrow_rule)
print('', "\tSequences of words in Sentences:", *sequences, '', sep=narrow_rule)

In [None]:
# Inputs drop the final token of each sentence; targets keep the whole sentence.
X = [tokens[:-1] for tokens in sequences]
y = [tokens for tokens in sequences]

print('', "\t\tTraining X:", *X, '', "\t\tGround Truth Y: ", *y, '', sep='\n' + '-' * 50 + '\n')

In [None]:
# Left-pad every row with zeros so that X and y become rectangular arrays.
length = max(map(len, X))
# The common width is length + 1, so every row of X starts with at least one 0.
padded_width = length + 1
X = pad_sequences(X, maxlen=padded_width, padding='pre')
y = pad_sequences(y, maxlen=padded_width, padding='pre')

In [None]:
# Display the padded arrays together with their shapes.
rule = '\n' + '-' * 50 + '\n'
print('', "\t\t Training X:", X, f'Shape: {X.shape}', '', sep=rule)
print('', "\t\tGround Truth Y:", y, f'Shape: {y.shape}', '', sep=rule)

In [None]:
# One-hot encode y: every integer word id becomes a vector with vocab_size classes
y=keras.utils.to_categorical(y,num_classes=vocab_size) 

In [None]:
# Embedding -> SimpleRNN -> softmax Dense. return_sequences=True makes the RNN
# emit an output at every time step, so the Dense layer scores each position.
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=10),
    SimpleRNN(units=100, return_sequences=True),
    Dense(units=vocab_size, activation='softmax'),
])

In [None]:
# Print the layer-by-layer summary of the model built above.
model.summary()

In [None]:
# Evaluates to the character count of the first layer's input_shape when rendered as text.
# NOTE(review): looks like leftover scratch work for sizing the table printed
# in the next cell — confirm whether this cell is still needed.
len(str(model.layers[0].input_shape))

In [None]:
# Tabulate the input shape of every layer; names are padded towards a
# 25-character column and rows are separated by a rule of '=' characters.
double_rule = '\n' + 50 * '=' + '\n'
print('', "\tInput Shape of all Layers:", '', sep=double_rule)
for layer in model.layers:
    layer_name = layer.name
    layer_input_shape = layer.input_shape
    gap = ' ' * (25 - len(layer_name))
    print(f'{layer_name}:', gap, layer_input_shape, end=double_rule)

In [None]:
# Configure training: RMSprop optimizer, categorical cross-entropy loss
# (y was one-hot encoded above) and accuracy as the reported metric.
model.compile(optimizer="rmsprop", loss="categorical_crossentropy", metrics=['accuracy'])

In [None]:
# Train for 200 epochs with progress output silenced (verbose=0); the returned
# object is kept so its recorded history can be plotted below.
# NOTE(review): no random seed is set in the visible cells, so results
# presumably differ from run to run — confirm whether that is intended.
history=model.fit(X, y, epochs=200, verbose=0)

In [None]:
# Wrap the recorded training history (history.history) in a DataFrame for plotting.
historydf=pd.DataFrame(history.history)

In [None]:
# Plot the columns of the training-history DataFrame against its index.
historydf.plot()

In [None]:
def sample_all_seq_wo_seed(model, tokenizer, n_words, vocab_size):
    """Generate ``n_words`` words by sampling from the model, starting without a seed.

    Generation starts from a single 0 id. At every step the ids produced so
    far (prefixed with that 0) are passed to the model, and the next id is
    drawn at random from the distribution predicted at the last time step.
    Index 0 is never emitted.

    Args:
        model: object whose ``predict(ids, verbose=0)`` returns an array of
            shape ``(1, timesteps, vocab_size)`` holding per-step probabilities.
        tokenizer: object exposing ``word_index``, a mapping of word -> integer id.
        n_words: number of words to generate.
        vocab_size: number of ids the model scores (ids ``0 .. vocab_size - 1``).

    Returns:
        The generated words, each followed by a single space (so the result
        ends with a space); an empty string when ``n_words`` is 0.

    Raises:
        ValueError: if the model puts all probability on index 0 and there is
            no other id to fall back to (``vocab_size < 2``).
    """
    # Build the id -> word lookup once instead of scanning word_index every step.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    sampled_ids = []
    pieces = []
    for _ in range(n_words):
        # Feed the ids directly (leading 0 first) rather than re-tokenising text.
        # A positional reshape avoids np.reshape's deprecated `newshape=` keyword.
        encoded = np.array([0] + sampled_ids).reshape(1, -1)
        prob = model.predict(encoded, verbose=0)

        # The prediction at the last time step is the distribution of the next word.
        next_probs = np.array(prob[0, -1], dtype=np.float64).ravel()

        # Exclude index 0 by zeroing it and renormalising. This yields the same
        # distribution as redrawing until a non-zero id appears, but cannot
        # spin when index 0 holds (almost) all of the probability mass.
        next_probs[0] = 0.0
        remaining_mass = next_probs.sum()
        if remaining_mass > 0:
            next_probs /= remaining_mass
        else:
            if vocab_size < 2:
                raise ValueError("no non-zero word id available to sample from")
            # All mass was on index 0: fall back to a uniform draw over real words.
            next_probs[1:] = 1.0 / (vocab_size - 1)

        next_id = int(np.random.choice(vocab_size, p=next_probs))
        sampled_ids.append(next_id)
        # An id missing from word_index contributes an empty word.
        pieces.append(index_to_word.get(next_id, '') + ' ')
    return ''.join(pieces)

In [None]:
# Sample an 8-word sentence from the trained model and print it between dashed rules.
generated = sample_all_seq_wo_seed(model, tokenizer, 8, vocab_size)
print('', '\t\tSampling without seed', f'Generate 8 word sentence:\n {generated}', '', sep='\n' + 50 * '-' + '\n')

In [None]:
# Sample a 7-word sentence; the printed label matches the n_words argument.
print('','\t\tSampling without seed',f'Generate 7 word sentence:\n {sample_all_seq_wo_seed(model, tokenizer, 7, vocab_size)}','',sep='\n'+(50*'-')+'\n')

In [None]:
# Sample a 12-word sentence; the printed label matches the n_words argument.
print('','\t\tSampling without seed',f'Generate 12 word sentence:\n {sample_all_seq_wo_seed(model, tokenizer, 12, vocab_size)}','',sep='\n'+(50*'-')+'\n')

In [None]:
# Sample a 15-word sentence; the printed label matches the n_words argument.
print('','\t\tSampling without seed',f'Generate 15 word sentence:\n {sample_all_seq_wo_seed(model, tokenizer, 15, vocab_size)}','',sep='\n'+(50*'-')+'\n')

In [None]:
def prob_of_input_sentence(model, tokenizer, sentence):
    """Print and return the probability the model assigns to ``sentence``.

    The sentence is converted to ids with ``tokenizer.texts_to_sequences``,
    prefixed with a 0 and passed to ``model.predict``. The result is the
    product, over positions ``i``, of the probability that the output at
    step ``i`` gives to the id found at position ``i + 1`` of the input.

    Args:
        model: object whose ``predict(ids, verbose=0)`` returns an array
            indexed as ``[0, step, word_id]``.
        tokenizer: object providing ``texts_to_sequences``.
        sentence: text to score.

    Returns:
        The product described above (``1`` when the sentence yields no ids).
        The intermediate values and the final probability are also printed.

    NOTE(review): words the tokenizer does not know are presumably dropped by
    ``texts_to_sequences``, in which case the score is for the shortened
    sentence — confirm.
    """
    print("Input Sentence:", sentence)
    token_ids = tokenizer.texts_to_sequences([sentence])[0]
    print("encoded before insert:", token_ids)
    # Prefix the 0 start id. A positional reshape is used because the
    # `newshape=` keyword of np.reshape is deprecated since NumPy 2.1.
    encoded = np.array([0] + list(token_ids)).reshape(1, -1)
    print("Encoded:", encoded, encoded.shape)
    prob = model.predict(encoded, verbose=0)
    print("Prob:", prob, prob.shape)
    probability = 1
    # The output at step i scores the id at position i + 1; the output at the
    # final step has no following id and is therefore not used.
    for i in range(prob.shape[1] - 1):
        probability = probability * prob[0, i, encoded[0, i + 1]]
    print("Probability of Sentence", "\"", sentence, "\"", "is:", probability)
    # Returned as well as printed so callers can use the value.
    return probability


In [None]:
# Score a handful of example sentences, framed by a header and a footer rule.
print("-------------------Probability of Input Sentence------------------------------")
for sentence in (
    "Jack and Jill Went up the hill",
    "jack and jill went up the hill .",
    "jack fell down and broke",
    "and jill came tumbling after .",
):
    prob_of_input_sentence(model, tokenizer, sentence)
print("-------------------------------------------------------------------------------")