# Language models with Keras

Keras does not afford language models out of the box.
Instead, we will build them ourselves using neural networks.

In [5]:
# suppose we have already segmented sentences
sentences = [
    "un ordianteur peut vous aider",
    "il veut vous aider",
    "il veut un ordinateur",
    "il peut nager"
]

sentences

['un ordianteur peut vous aider',
 'il veut vous aider',
 'il veut un ordinateur',
 'il peut nager']

## I. Simple FeedForward 3-gram model

In [14]:
N = 3

# Padding the sentences
# we can use NLTK, but I will showcase how to do it just using Keras
# in case you didn't install NLTK
sentencesPad = []
for sentence in sentences:
    new_sentence = sentence
    for i in range(N-1):
        new_sentence = "<s> " + new_sentence + " </s>"
    sentencesPad.append(new_sentence)

sentencesPad

['<s> <s> un ordianteur peut vous aider </s> </s>',
 '<s> <s> il veut vous aider </s> </s>',
 '<s> <s> il veut un ordinateur </s> </s>',
 '<s> <s> il peut nager </s> </s>']

In [36]:
from tensorflow.keras.preprocessing.text import Tokenizer

# We can use NLTK vocab to simplify this
tokenizer = Tokenizer(oov_token='<UNK>')
tokenizer.fit_on_texts(sentencesPad)
# 10 tokens + UNK
total_words = len(tokenizer.word_index)

sentencesOrdinal = []
for line in sentencesPad:
    token_list = tokenizer.texts_to_sequences([line])[0]
    sentencesOrdinal.append(token_list)

sentencesOrdinal

[[2, 2, 4, 9, 5, 6, 7, 2, 2],
 [2, 2, 3, 8, 6, 7, 2, 2],
 [2, 2, 3, 8, 4, 10, 2, 2],
 [2, 2, 3, 5, 11, 2, 2]]

In [37]:
# create ngrams 
# we can use NLTK, but I will showcase how to do it just using Keras
# in case you didn't install NLTK

# the inputs are two consecutive words
# So, we will have a two dimentinal array
X = []
# the outputs are the third word
# So, we will have a one dimentional array
Y = []

for sentence in sentencesOrdinal:
    slen = len(sentence)
    for i in range(slen-N):
        Xi = []
        for j in range(N-1):
            Xi.append(i+j)
        X.append(Xi)
        Y.append(i+N-1)

X, Y

([[0, 1],
  [1, 2],
  [2, 3],
  [3, 4],
  [4, 5],
  [5, 6],
  [0, 1],
  [1, 2],
  [2, 3],
  [3, 4],
  [4, 5],
  [0, 1],
  [1, 2],
  [2, 3],
  [3, 4],
  [4, 5],
  [0, 1],
  [1, 2],
  [2, 3],
  [3, 4]],
 [2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 2, 3, 4, 5, 6, 2, 3, 4, 5])

In [38]:
from tensorflow.keras.utils import to_categorical
import numpy as np

Ybin = to_categorical(Y, num_classes=total_words)
Xbin = np.array(to_categorical(X, num_classes=total_words))
input_len = Xbin.shape[1] * Xbin.shape[2]
Xbin = Xbin.reshape(Xbin.shape[0], input_len)

Xbin[0,:], Ybin[0]

(array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
        0., 0., 0., 0., 0.], dtype=float32),
 array([0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32))

### I.1. Without embedding

In [39]:
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Sequential
ff_model = Sequential()
ff_model.add(Dense(10, input_dim=input_len, activation='relu'))
ff_model.add(Dense(total_words, activation='softmax'))
ff_model.compile(loss='categorical_crossentropy', optimizer='adam')
ff_model.fit(Xbin, Ybin, epochs=100, verbose=1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x7f1df2439730>

In [54]:
# the function which takes N-1 words separated by space
# and returns the Nth most probable word
def estimate(words):
    Xp = tokenizer.texts_to_sequences([words])[0]
    Xp = to_categorical(Xp, num_classes=total_words)
    Xp = np.array([(np.array(Xp)).flatten()])
    prob = ff_model.predict(Xp)
    i = prob.argmax()
    return tokenizer.sequences_to_texts([[i]])[0]
    
estimate('peut aider')

'vous'

### I.2. With embedding

In [61]:
from tensorflow.keras.layers import Input, Dense, Embedding, Flatten
from tensorflow.keras.models import Sequential
ff_model_emb = Sequential()
ff_model_emb.add(Embedding(total_words, 10, input_length=N-1))
ff_model_emb.add(Flatten())
ff_model_emb.add(Dense(total_words, activation='softmax'))
ff_model_emb.compile(loss='categorical_crossentropy', optimizer='adam')
#Here we use X instead of Xbin
ff_model_emb.fit(np.array(X), Ybin, epochs=100, verbose=1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x7f1df240f4f0>

In [62]:
# the function which takes N-1 words separated by space
# and returns the Nth most probable word
def estimate(words):
    Xp = tokenizer.texts_to_sequences([words])[0]
    Xp = np.array([Xp])
    prob = ff_model_emb.predict(Xp)
    i = prob.argmax()
    return tokenizer.sequences_to_texts([[i]])[0]
    
estimate('peut aider')

'vous'