# Language models with Keras

Keras does not provide language models out of the box.
Instead, we will build them ourselves using neural networks.

In [1]:
# Toy corpus: the text is assumed to be already split into sentences.
# NOTE(review): "ordianteur" looks like a misspelling of "ordinateur"; it is
# kept as-is because it counts as a separate vocabulary entry downstream.
sentences = [
    'un ordianteur peut vous aider',
    'il veut vous aider',
    'il veut un ordinateur',
    'il peut nager',
]

sentences

['un ordianteur peut vous aider',
 'il veut vous aider',
 'il veut un ordinateur',
 'il peut nager']

## I. Simple FeedForward 3-gram model

In [2]:
# Order of the n-gram model: each word is predicted from the N-1 previous ones
N = 3

# Surround every sentence with N-1 start markers and N-1 end markers.
# NLTK offers helpers for this, but plain Python keeps the notebook usable
# in case you didn't install NLTK.
start_pad = "<s> " * (N - 1)
end_pad = " </s>" * (N - 1)
sentencesPad = [start_pad + text + end_pad for text in sentences]

sentencesPad

['<s> <s> un ordianteur peut vous aider </s> </s>',
 '<s> <s> il veut vous aider </s> </s>',
 '<s> <s> il veut un ordinateur </s> </s>',
 '<s> <s> il peut nager </s> </s>']

In [3]:
from tensorflow.keras.preprocessing.text import Tokenizer

# We can use NLTK vocab to simplify this
tokenizer = Tokenizer(oov_token='<UNK>')
tokenizer.fit_on_texts(sentencesPad)
# 10 tokens + UNK
total_words = len(tokenizer.word_index)

sentencesOrdinal = []
for line in sentencesPad:
    token_list = tokenizer.texts_to_sequences([line])[0]
    sentencesOrdinal.append(token_list)

sentencesOrdinal

[[2, 2, 4, 9, 5, 6, 7, 2, 2],
 [2, 2, 3, 8, 6, 7, 2, 2],
 [2, 2, 3, 8, 4, 10, 2, 2],
 [2, 2, 3, 5, 11, 2, 2]]

In [4]:
# create ngrams 
# we can use NLTK, but I will showcase how to do it just using Keras
# in case you didn't install NLTK

# the inputs are two consecutive words
# So, we will have a two dimentinal array
X = []
# the outputs are the third word
# So, we will have a one dimentional array
Y = []

for sentence in sentencesOrdinal:
    slen = len(sentence)
    for i in range(slen-N):
        Xi = []
        for j in range(N-1):
            Xi.append(i+j)
        X.append(Xi)
        Y.append(i+N-1)

X, Y

([[0, 1],
  [1, 2],
  [2, 3],
  [3, 4],
  [4, 5],
  [5, 6],
  [0, 1],
  [1, 2],
  [2, 3],
  [3, 4],
  [4, 5],
  [0, 1],
  [1, 2],
  [2, 3],
  [3, 4],
  [4, 5],
  [0, 1],
  [1, 2],
  [2, 3],
  [3, 4]],
 [2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 2, 3, 4, 5, 6, 2, 3, 4, 5])

In [5]:
from tensorflow.keras.utils import to_categorical
import numpy as np

# One-hot encode the targets: one row per n-gram, one column per class
Ybin = to_categorical(Y, num_classes=total_words)

# One-hot encode every context word, then lay the one-hot vectors of each
# context side by side in a single flat input row
X_onehot = np.array(to_categorical(X, num_classes=total_words))
n_samples, n_context, n_classes = X_onehot.shape
input_len = n_context * n_classes
Xbin = X_onehot.reshape(n_samples, input_len)

Xbin[0,:], Ybin[0]

(array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
        0., 0., 0., 0., 0.], dtype=float32),
 array([0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32))

### I.1. Without embedding

In [6]:
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Sequential
# Feed-forward network: flat one-hot context -> 10 ReLU units -> softmax over the classes
ff_model = Sequential([
    Dense(10, input_dim=input_len, activation='relu'),
    Dense(total_words, activation='softmax'),
])

ff_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 10)                230       
_________________________________________________________________
dense_1 (Dense)              (None, 11)                121       
Total params: 351
Trainable params: 351
Non-trainable params: 0
_________________________________________________________________


In [7]:
# Train with Adam on the cross-entropy between the softmax output and the one-hot target
ff_model.compile(optimizer='adam', loss='categorical_crossentropy')
ff_model.fit(x=Xbin, y=Ybin, epochs=100, verbose=1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x7fb43d717700>

In [8]:
def estimate(words):
    """Predict the word following a context with the one-hot feed-forward model.

    Args:
        words: string holding exactly N-1 words separated by spaces.
            Words unknown to the tokenizer are encoded with its OOV token.

    Returns:
        The word with the highest predicted probability after the context.

    Raises:
        ValueError: if ``words`` does not contain exactly N-1 words.
    """
    Xp = tokenizer.texts_to_sequences([words])[0]
    # the network input has a fixed width, so the context size must match
    if len(Xp) != N - 1:
        raise ValueError(f"expected {N - 1} words, got {len(Xp)}: {words!r}")
    # one-hot encode each word, then flatten into a single input row
    Xp = to_categorical(Xp, num_classes=total_words).reshape(1, -1)
    prob = ff_model.predict(Xp)
    i = prob.argmax()
    return tokenizer.sequences_to_texts([[i]])[0]

estimate('peut aider')

'un'

### I.2. With embedding

In [9]:
from tensorflow.keras.layers import Input, Dense, Embedding, Flatten
from tensorflow.keras.models import Sequential
# Same task, but the N-1 context ids go through a 10-dimensional embedding
# (flattened before the softmax layer) instead of one-hot vectors
ff_model_emb = Sequential([
    Embedding(total_words, 10, input_length=N-1),
    Flatten(),
    Dense(total_words, activation='softmax'),
])

ff_model_emb.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 2, 10)             110       
_________________________________________________________________
flatten (Flatten)            (None, 20)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 11)                231       
Total params: 341
Trainable params: 341
Non-trainable params: 0
_________________________________________________________________


In [10]:
ff_model_emb.compile(optimizer='adam', loss='categorical_crossentropy')
# The embedding layer takes the integer contexts (X), not the one-hot Xbin
ff_model_emb.fit(x=np.array(X), y=Ybin, epochs=100, verbose=1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x7fb43c02a820>

In [11]:
# NOTE: this redefinition shadows the one-hot version of estimate() from
# section I.1; from here on, estimate() uses the embedding model.
def estimate(words):
    """Predict the word following a context with the embedding feed-forward model.

    Args:
        words: string holding exactly N-1 words separated by spaces.
            Words unknown to the tokenizer are encoded with its OOV token.

    Returns:
        The word with the highest predicted probability after the context.

    Raises:
        ValueError: if ``words`` does not contain exactly N-1 words.
    """
    Xp = tokenizer.texts_to_sequences([words])[0]
    # the embedding layer was built for contexts of exactly N-1 ids
    if len(Xp) != N - 1:
        raise ValueError(f"expected {N - 1} words, got {len(Xp)}: {words!r}")
    # a batch containing a single context of integer ids
    Xp = np.array([Xp])
    prob = ff_model_emb.predict(Xp)
    i = prob.argmax()
    return tokenizer.sequences_to_texts([[i]])[0]

estimate('peut aider')

'un'

## II. LSTM model

https://machinelearningmastery.com/develop-word-based-neural-language-models-python-keras/

In recurrent models, we pad each sentence with just one start marker and one end marker.

In [12]:
# Wrap each sentence in a single start marker and a single end marker.
# Done by hand rather than with NLTK, in case you didn't install NLTK.
sentencesPad2 = ["<s> " + text + " </s>" for text in sentences]

sentencesPad2

['<s> un ordianteur peut vous aider </s>',
 '<s> il veut vous aider </s>',
 '<s> il veut un ordinateur </s>',
 '<s> il peut nager </s>']

In [13]:
from tensorflow.keras.preprocessing.text import Tokenizer

# We can use NLTK vocab to simplify this
tokenizer2 = Tokenizer(oov_token='<UNK>')
tokenizer2.fit_on_texts(sentencesPad2)
# number of classes: 10 tokens + UNK + index 0 used as the padding code
total_words2 = 1 + len(tokenizer2.word_index)

# every sentence contributes all of its prefixes, from 2 words up to its full length
sequences = [
    encoded[:end]
    for encoded in tokenizer2.texts_to_sequences(sentencesPad2)
    for end in range(2, len(encoded) + 1)
]

sequences

[[2, 4],
 [2, 4, 9],
 [2, 4, 9, 5],
 [2, 4, 9, 5, 6],
 [2, 4, 9, 5, 6, 7],
 [2, 4, 9, 5, 6, 7, 2],
 [2, 3],
 [2, 3, 8],
 [2, 3, 8, 6],
 [2, 3, 8, 6, 7],
 [2, 3, 8, 6, 7, 2],
 [2, 3],
 [2, 3, 8],
 [2, 3, 8, 4],
 [2, 3, 8, 4, 10],
 [2, 3, 8, 4, 10, 2],
 [2, 3],
 [2, 3, 5],
 [2, 3, 5, 11],
 [2, 3, 5, 11, 2]]

In [14]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# length of the longest prefix, in number of words
max_length = max(map(len, sequences))
# left-pad the shorter prefixes with 0 so that every row has max_length entries
sequences = np.array(
    pad_sequences(sequences, maxlen=max_length, padding='pre')
)

sequences

array([[ 0,  0,  0,  0,  0,  2,  4],
       [ 0,  0,  0,  0,  2,  4,  9],
       [ 0,  0,  0,  2,  4,  9,  5],
       [ 0,  0,  2,  4,  9,  5,  6],
       [ 0,  2,  4,  9,  5,  6,  7],
       [ 2,  4,  9,  5,  6,  7,  2],
       [ 0,  0,  0,  0,  0,  2,  3],
       [ 0,  0,  0,  0,  2,  3,  8],
       [ 0,  0,  0,  2,  3,  8,  6],
       [ 0,  0,  2,  3,  8,  6,  7],
       [ 0,  2,  3,  8,  6,  7,  2],
       [ 0,  0,  0,  0,  0,  2,  3],
       [ 0,  0,  0,  0,  2,  3,  8],
       [ 0,  0,  0,  2,  3,  8,  4],
       [ 0,  0,  2,  3,  8,  4, 10],
       [ 0,  2,  3,  8,  4, 10,  2],
       [ 0,  0,  0,  0,  0,  2,  3],
       [ 0,  0,  0,  0,  2,  3,  5],
       [ 0,  0,  0,  2,  3,  5, 11],
       [ 0,  0,  2,  3,  5, 11,  2]], dtype=int32)

In [15]:
# The last column holds the word to predict;
# all the columns before it form the context
X2 = sequences[:, :-1]
Y2 = sequences[:, -1]
# Only the targets are one-hot encoded: the inputs stay as integer ids
# because the embedding layer of the model handles their encoding
Y2 = to_categorical(Y2, num_classes=total_words2)

X2[0,:], Y2[0]

(array([0, 0, 0, 0, 0, 2], dtype=int32),
 array([0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.], dtype=float32))

In [16]:
from tensorflow.keras.layers import LSTM
# define model: embedding -> LSTM -> softmax over the classes
model_lstm_emb = Sequential([
    Embedding(total_words2, 10, input_length=max_length-1),
    LSTM(50),
    Dense(total_words2, activation='softmax'),
])

model_lstm_emb.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 6, 10)             120       
_________________________________________________________________
lstm (LSTM)                  (None, 50)                12200     
_________________________________________________________________
dense_3 (Dense)              (None, 12)                612       
Total params: 12,932
Trainable params: 12,932
Non-trainable params: 0
_________________________________________________________________


In [17]:
# compile network (accuracy is tracked in addition to the loss)
model_lstm_emb.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
# fit network
model_lstm_emb.fit(x=X2, y=Y2, epochs=100, verbose=2)

Epoch 1/100
1/1 - 2s - loss: 2.4837 - accuracy: 0.1500
Epoch 2/100
1/1 - 0s - loss: 2.4813 - accuracy: 0.2000
Epoch 3/100
1/1 - 0s - loss: 2.4790 - accuracy: 0.3000
Epoch 4/100
1/1 - 0s - loss: 2.4767 - accuracy: 0.3000
Epoch 5/100
1/1 - 0s - loss: 2.4743 - accuracy: 0.2500
Epoch 6/100
1/1 - 0s - loss: 2.4719 - accuracy: 0.4000
Epoch 7/100
1/1 - 0s - loss: 2.4694 - accuracy: 0.4000
Epoch 8/100
1/1 - 0s - loss: 2.4668 - accuracy: 0.4000
Epoch 9/100
1/1 - 0s - loss: 2.4641 - accuracy: 0.4000
Epoch 10/100
1/1 - 0s - loss: 2.4613 - accuracy: 0.4000
Epoch 11/100
1/1 - 0s - loss: 2.4583 - accuracy: 0.3500
Epoch 12/100
1/1 - 0s - loss: 2.4552 - accuracy: 0.3500
Epoch 13/100
1/1 - 0s - loss: 2.4520 - accuracy: 0.3500
Epoch 14/100
1/1 - 0s - loss: 2.4485 - accuracy: 0.3500
Epoch 15/100
1/1 - 0s - loss: 2.4449 - accuracy: 0.3500
Epoch 16/100
1/1 - 0s - loss: 2.4410 - accuracy: 0.3500
Epoch 17/100
1/1 - 0s - loss: 2.4370 - accuracy: 0.3500
Epoch 18/100
1/1 - 0s - loss: 2.4326 - accuracy: 0.3500
E

<tensorflow.python.keras.callbacks.History at 0x7fb4355d5400>

In [18]:
def estimate_lstm(words):
    """Predict the word following a context with the LSTM model.

    Args:
        words: string holding the context words separated by spaces.
            Words unknown to the tokenizer are encoded with its OOV token.

    Returns:
        The word with the highest predicted probability after the context.
    """
    Xp = tokenizer2.texts_to_sequences([words])[0]
    # the model was trained on contexts of max_length-1 ids (the last column
    # of each padded sequence was the target), so pad to that same width
    Xp = pad_sequences([Xp], maxlen=max_length-1, padding='pre')
    prob = model_lstm_emb.predict(Xp)
    i = prob.argmax()
    # decode with the tokenizer that produced this model's ids
    return tokenizer2.sequences_to_texts([[i]])[0]

estimate_lstm('<s> il peut aider')



'vous'