## Import Essential library

In [1]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

Using TensorFlow backend.


## Connecting to Drive 
Because I am using colab.research.googe.com

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


## Read File from Destination Path

In [0]:
filename ='/content/drive/My Drive/LSTM/dataset/ravi.txt'            #Change According to your path in your system.
raw_text = open(filename).read()
raw_text = raw_text.lower()

### Mapping every unique character present in file to number.

In [0]:
chars = sorted(list(set(raw_text)))
char_to_int = dict((c,i) for i, c in enumerate(chars))

### Dictionary of Mapping

In [5]:
print (char_to_int)

{'\n': 0, ' ': 1, '!': 2, '(': 3, ')': 4, '*': 5, ',': 6, '-': 7, '.': 8, ':': 9, ';': 10, '?': 11, '[': 12, ']': 13, '_': 14, 'a': 15, 'b': 16, 'c': 17, 'd': 18, 'e': 19, 'f': 20, 'g': 21, 'h': 22, 'i': 23, 'j': 24, 'k': 25, 'l': 26, 'm': 27, 'n': 28, 'o': 29, 'p': 30, 'q': 31, 'r': 32, 's': 33, 't': 34, 'u': 35, 'v': 36, 'w': 37, 'x': 38, 'y': 39, 'z': 40, '‘': 41, '’': 42, '“': 43, '”': 44, '\ufeff': 45}


## Analyzing Text file

In [6]:
n_chars = len(raw_text)
n_vocab = len(chars)
print ("Total Characters=",n_chars)
print ("Total Vocab=",n_vocab)

Total Characters= 144354
Total Vocab= 46


## Prepare the Dataset to train Our Model
Here I choose the sequence length=100 i.e each character of sequence length is input for LSTM time
step. This sequence act as input and output is 101th character of text.

In [23]:
# prepare the dataset of input to output pairs encoded as integers
seq_length=100
dataX = []
dataY = []
for i in range(0,n_chars-seq_length,1):
  seq_in = raw_text[i:i + seq_length]
  seq_out = raw_text[i+seq_length]
  dataX.append([char_to_int[char] for char in seq_in])
  dataY.append(char_to_int[seq_out])
n_patterns = len(dataX)
print ("Total Patterns:", n_patterns)   # Total Pattern means number of input example
print ("Number of example in input:",len(dataX))
print ("Number of target:",len(dataY))

Total Patterns: 144254
Number of example in input: 144254
Number of target: 144254


## Lets take look of one training example
Sequence is in integer.

In [10]:
print (dataX[0])

[45, 17, 22, 15, 30, 34, 19, 32, 1, 23, 8, 1, 18, 29, 37, 28, 1, 34, 22, 19, 1, 32, 15, 16, 16, 23, 34, 7, 22, 29, 26, 19, 0, 0, 15, 26, 23, 17, 19, 1, 37, 15, 33, 1, 16, 19, 21, 23, 28, 28, 23, 28, 21, 1, 34, 29, 1, 21, 19, 34, 1, 36, 19, 32, 39, 1, 34, 23, 32, 19, 18, 1, 29, 20, 1, 33, 23, 34, 34, 23, 28, 21, 1, 16, 39, 1, 22, 19, 32, 1, 33, 23, 33, 34, 19, 32, 1, 29, 28, 1]


Now how data look in character.

In [24]:
int_to_char =dict((i,c) for i,c in enumerate(chars))
#this dataX[0] is in real
s=[]
print ("\"",''.join([int_to_char[value] for value in dataX[0]]),"\"")

" ﻿chapter i. down the rabbit-hole

alice was beginning to get very tired of sitting by her sister on  "


## Now we have to reshape the Data
1. Input data in [samples,time steps, features]<br>
2.Normalize the input data
3. One hot encode the output variable.

In [0]:
#reshape X to be [samples,time steps, features]
X =np.reshape(dataX, (n_patterns, seq_length,1))      #Here feature is 1 becasue our every character is input to LSTM time step and each is represented by one integer.
#normalize
X=X/float(n_vocab)
#one hot encode the output variable
y = np_utils.to_categorical(dataY)

## Now its time to define our RNN model using LSMT cell.

In [0]:
# define the LSTM model
model =Sequential()
model.add(LSTM(256,input_shape=(X.shape[1],X.shape[2])))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1],activation='softmax'))
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

## Train the model on Data.

In [19]:
model.fit(X, y, epochs=20, batch_size=128)

Instructions for updating:
Use tf.cast instead.
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
  8576/144254 [>.............................] - ETA: 3:28 - loss: 2.4507

KeyboardInterrupt: ignored

## Generate the text using above trained model.

In [21]:
import sys
# pick a random seed
start =np.random.randint(0,len(dataX)-1)
pattern = dataX[start]
print ("Seed")
print ("\"",''.join([int_to_char[value] for value in pattern]),"\"")
# generate characters
for i in range(1000):
  x = np.reshape(pattern,(1,len(pattern),1))
  x = x/float(n_vocab)
  prediction =model.predict(x,verbose=0)
  index = np.argmax(prediction)
  result = int_to_char[index]
  seq_in = [int_to_char[value] for value in pattern]
  sys.stdout.write(result)
  pattern.append(index)
  pattern = pattern[1:len(pattern)]


Seed
" her was sitting on the ground near the
door, staring stupidly up into the sky.

alice went timidly u "
o the soiee  the woued to tee io the wooee  and the wooed to the soiee to the sooee  and the wooed to the sooee  an cel toe toeee to the sooee  and the wooed to the sooee  an cel toe toeee to the sooee  and the wooed to the sooee  an cel toe toeee to the sooee  and the wooed to the sooee  an cel toe toeee to the sooee  and the wooed to the sooee  an cel toe toeee to the sooee  and the wooed to the sooee  an cel toe toeee to the sooee  and the wooed to the sooee  an cel toe toeee to the sooee  and the wooed to the sooee  an cel toe toeee to the sooee  and the wooed to the sooee  an cel toe toeee to the sooee  and the wooed to the sooee  an cel toe toeee to the sooee  and the wooed to the sooee  an cel toe toeee to the sooee  and the wooed to the sooee  an cel toe toeee to the sooee  and the wooed to the sooee  an cel toe toeee to the sooee  and the wooed to the sooee  an cel toe

### Above sentence is repeating because it is trained on very shallow network and less number of epoch, for better result we have to trained model for large number of epochs.