In [22]:
import numpy as np
import pandas as pd
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, GRU, Embedding
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [56]:
filename = 'character_encoding_text.txt'
txt_file = open(filename, 'r')
text_data = txt_file.read()
text_data

"The unanimous Declaration of the thirteen united States of America, When in the Course of human events, it becomes necessary for one people to dissolve the political bands which have connected them with another, and to assume among the powers of the earth, the separate and equal station to which the Laws of Nature and of Nature's God entitle them, a decent respect to the opinions of mankind requires that they should declare the causes which impel them to the separation.\n\nWe hold these truths to be self-evident, that all men are created equal, that they are endowed by their Creator with certain unalienable Rights, that among these are Life, Liberty and the pursuit of Happiness.--That to secure these rights, Governments are instituted among Men, deriving their just powers from the consent of the governed, --That whenever any Form of Government becomes destructive of these ends, it is the Right of the People to alter or to abolish it, and to institute new Government, laying its foundat

### Preprocessing the Text Data

In [57]:
import re
def text_cleaner(text):
    new_string = text.lower()
    new_string = re.sub(r"'s\b","",new_string)
    # remove punctuations
    new_string = re.sub("[^a-zA-Z]", " ", new_string) 
    long_words =[]
    for i in new_string.split():
        if len(i)>= 3:
            long_words.append(i)
    return(" ".join(long_words)).strip()        

In [58]:
new_data = text_cleaner(text_data)

In [59]:
new_data[0:500]

'the unanimous declaration the thirteen united states america when the course human events becomes necessary for one people dissolve the political bands which have connected them with another and assume among the powers the earth the separate and equal station which the laws nature and nature god entitle them decent respect the opinions mankind requires that they should declare the causes which impel them the separation hold these truths self evident that all men are created equal that they are e'

### Creating Sequences

In [60]:
def create_sequences(text):
    length = 30  # 30 sequence per head
    sequences = []
    for i in range(length, len(text)):
        # select sequence of tokens
        seq = text[i-length: i+1]
        #print("length of sequeces", len(sequences))
        #30 to 7082 iterating and taking 30 as batch per call.
        # 30= i 1st iteration took 30 sequence
        #31 = i 2nd iteration took anather 30 seq 
        #32=i 3rd iteration
        sequences.append(seq) # storing the sequences
    print('Total Sequences: %d' % len(sequences))
    return sequences
    
    

In [61]:
#create sequences
sequences = create_sequences(new_data)

Total Sequences: 7052


In [62]:
sequences

['the unanimous declaration the t',
 'he unanimous declaration the th',
 'e unanimous declaration the thi',
 ' unanimous declaration the thir',
 'unanimous declaration the thirt',
 'nanimous declaration the thirte',
 'animous declaration the thirtee',
 'nimous declaration the thirteen',
 'imous declaration the thirteen ',
 'mous declaration the thirteen u',
 'ous declaration the thirteen un',
 'us declaration the thirteen uni',
 's declaration the thirteen unit',
 ' declaration the thirteen unite',
 'declaration the thirteen united',
 'eclaration the thirteen united ',
 'claration the thirteen united s',
 'laration the thirteen united st',
 'aration the thirteen united sta',
 'ration the thirteen united stat',
 'ation the thirteen united state',
 'tion the thirteen united states',
 'ion the thirteen united states ',
 'on the thirteen united states a',
 'n the thirteen united states am',
 ' the thirteen united states ame',
 'the thirteen united states amer',
 'he thirteen united states 

### Encoding Sequences

- Once the sequences are generated, the next step is to encode each character. This would give us a sequence of numbers.
- we need to extract unique characters from text and then encode it with integer numbers bcz computer understand only numbers

In [63]:

# create a character mapping index

chars = sorted(list(set(new_data)))  # set() is used to extract unique letters from the text
mapping = dict((c, i) for i,c in enumerate(chars))
mapping

{' ': 0,
 'a': 1,
 'b': 2,
 'c': 3,
 'd': 4,
 'e': 5,
 'f': 6,
 'g': 7,
 'h': 8,
 'i': 9,
 'j': 10,
 'k': 11,
 'l': 12,
 'm': 13,
 'n': 14,
 'o': 15,
 'p': 16,
 'q': 17,
 'r': 18,
 's': 19,
 't': 20,
 'u': 21,
 'v': 22,
 'w': 23,
 'x': 24,
 'y': 25,
 'z': 26}

In [65]:
# a b c d e f g ....z
# 1 2 3 4 5 6 7..10

# now a sequence of text 
# ex : seq-1:  the mall is big
# encoded seq 
#seq-1: 64107830450234 for the mall is big   #This is encoding of the sequence of text



# create a character mapping index
chars = sorted(list(set(new_data)))
mapping = dict((c, i) for i, c in enumerate(chars))

def encode_seq(seq):
    sequences = list()
    for line in seq:
        # integer encode line
        encoded_seq = [mapping[char] for char in line]
        # store
        sequences.append(encoded_seq)
    return sequences

# encode the sequences
sequences = encode_seq(sequences)

In [74]:
sequences
np.array(sequences)

array([[20,  8,  5, ...,  5,  0, 20],
       [ 8,  5,  0, ...,  0, 20,  8],
       [ 5,  0, 21, ..., 20,  8,  9],
       ...,
       [15, 21, 18, ...,  8, 15, 14],
       [21, 18,  0, ..., 15, 14, 15],
       [18,  0,  6, ..., 14, 15, 18]])

### Create Training and Validation set

Once we are ready with our sequences, we split the data into training and validation splits. This is because while training, I want to keep a track of how good my language model is working with unseen data.

In [76]:
from sklearn.model_selection import train_test_split

In [93]:
# vocabulary size
vocab = len(mapping)
sequences = np.array(sequences)

#li = [2, 4, 6, 7, 8]
#li[: :-1]
# [8, 7, 6, 4, 2]
# li[:]
# [2,4,6,7]
#create X and Y 
X, y = sequences[:, :-1], sequences[:, -1] 

# one hot encode y
y = to_categorical(y, num_classes=vocab)
# this will encode the values of sequences like [[20, 3, 55, 32]] into [[0, 0, 1, 0, 1,1]]


In [94]:
# create train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.1, random_state=42)
print('Train shape:', X_train.shape, 'Val shape:', X_val.shape)

Train shape: (6346, 30) Val shape: (706, 30)


### Model Building
Time to build our language model!

I have used the embedding layer of Keras to learn a 50 dimension embedding for each character. This helps the model in understanding complex relationships between characters. I have also used a GRU layer as the base model, which has 150 timesteps. Finally, a Dense layer is used with a softmax activation for prediction.

In [97]:
# define model 
model = Sequential()
model.add(Embedding(vocab, 50, input_length=30, trainable = True))
model.add(GRU(150, recurrent_dropout = 0.1, dropout=0.1))
model.add(Dense(vocab, activation='softmax'))
model.summary()


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 30, 50)            1350      
_________________________________________________________________
gru_1 (GRU)                  (None, 150)               90900     
_________________________________________________________________
dense_1 (Dense)              (None, 27)                4077      
Total params: 96,327
Trainable params: 96,327
Non-trainable params: 0
_________________________________________________________________


In [100]:
#compile the model 
model.compile(loss='categorical_crossentropy', metrics=['acc'], optimizer='adam')

#fit the model
model.fit(X_train, y_train, epochs =100, verbose=2, validation_data=(X_val, y_val))

Epoch 1/100
199/199 - 21s - loss: 2.7422 - acc: 0.2071 - val_loss: 2.3963 - val_acc: 0.3059
Epoch 2/100
199/199 - 13s - loss: 2.2939 - acc: 0.3275 - val_loss: 2.2388 - val_acc: 0.3513
Epoch 3/100
199/199 - 13s - loss: 2.1613 - acc: 0.3580 - val_loss: 2.1744 - val_acc: 0.3612
Epoch 4/100
199/199 - 13s - loss: 2.0585 - acc: 0.3766 - val_loss: 2.0971 - val_acc: 0.3810
Epoch 5/100
199/199 - 13s - loss: 1.9753 - acc: 0.4070 - val_loss: 2.0323 - val_acc: 0.3924
Epoch 6/100
199/199 - 13s - loss: 1.8969 - acc: 0.4354 - val_loss: 1.9951 - val_acc: 0.4065
Epoch 7/100
199/199 - 13s - loss: 1.8185 - acc: 0.4513 - val_loss: 1.9582 - val_acc: 0.4348
Epoch 8/100
199/199 - 13s - loss: 1.7371 - acc: 0.4715 - val_loss: 1.9136 - val_acc: 0.4476
Epoch 9/100
199/199 - 13s - loss: 1.6642 - acc: 0.4989 - val_loss: 1.8825 - val_acc: 0.4504
Epoch 10/100
199/199 - 13s - loss: 1.5905 - acc: 0.5164 - val_loss: 1.8744 - val_acc: 0.4745
Epoch 11/100
199/199 - 13s - loss: 1.5275 - acc: 0.5432 - val_loss: 1.8647 - va

Epoch 90/100
199/199 - 12s - loss: 0.2940 - acc: 0.9003 - val_loss: 3.1787 - val_acc: 0.4745
Epoch 91/100
199/199 - 12s - loss: 0.2852 - acc: 0.9037 - val_loss: 3.1394 - val_acc: 0.4646
Epoch 92/100
199/199 - 12s - loss: 0.2934 - acc: 0.9014 - val_loss: 3.2089 - val_acc: 0.4660
Epoch 93/100
199/199 - 12s - loss: 0.2770 - acc: 0.9095 - val_loss: 3.2419 - val_acc: 0.4603
Epoch 94/100
199/199 - 12s - loss: 0.2934 - acc: 0.9015 - val_loss: 3.2401 - val_acc: 0.4646
Epoch 95/100
199/199 - 12s - loss: 0.2781 - acc: 0.9083 - val_loss: 3.2457 - val_acc: 0.4703
Epoch 96/100
199/199 - 12s - loss: 0.2900 - acc: 0.9053 - val_loss: 3.2599 - val_acc: 0.4589
Epoch 97/100
199/199 - 12s - loss: 0.2772 - acc: 0.9064 - val_loss: 3.2960 - val_acc: 0.4646
Epoch 98/100
199/199 - 12s - loss: 0.2930 - acc: 0.9050 - val_loss: 3.2990 - val_acc: 0.4773
Epoch 99/100
199/199 - 12s - loss: 0.2824 - acc: 0.9059 - val_loss: 3.2582 - val_acc: 0.4547
Epoch 100/100
199/199 - 12s - loss: 0.2664 - acc: 0.9070 - val_loss: 3

<tensorflow.python.keras.callbacks.History at 0x1d4f35173c8>

### Inference
Once the model has finished training, we can generate text from the model given an input sequence using the below code:

In [101]:
# generate a sequence of characters with a language model
def generate_seq(model, mapping, seq_length, seed_text, n_chars):
    in_text = seed_text
    # generate a fixed number of characters
    
    for _ in range(n_chars):
        
        # encode the characters as integers
        encoded = [mapping[char] for char in in_text]
        # truncate sequences to a fixed length
        encoded = pad_sequences([encoded], max_length=seq_length, truncating='pre')
        # predict character
        y_hat = model.predict_classes(encoded, verbose=0)
        # reverse map integer to character
        to_char = ''
        for char, index in mapping.items():
            if index==y_hat:
                to_char = char
                break
                
        # append to input
        in_text+=char
        
    return in_text     
        

TypeError: unhashable type: 'numpy.ndarray'