Building a Language Model

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Toy training text: a single sentence serves as the entire dataset.
text = "The quick brown fox jumps over the lazy dog"
# Break the sentence into words on runs of whitespace (sep=None), keeping
# each word's original casing.
corpus = text.split(sep=None)

In [None]:
# Tokenize text
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
# word_index is 1-based (index 0 is reserved, and is what padding uses), so
# the vocabulary size is the number of distinct words plus one.
total_words = len(tokenizer.word_index) + 1

# Encode the sentence once with the fitted tokenizer. Going through the
# tokenizer (rather than a plain whitespace split) applies the same
# lowercasing and punctuation filtering that built the vocabulary, so every
# word is guaranteed to map to an id.
token_ids = tokenizer.texts_to_sequences([text])[0]

# Each prefix of two or more tokens is one training sequence: the leading
# tokens are the context and the final token is the word to predict.
input_sequences = [token_ids[:end] for end in range(2, len(token_ids) + 1)]

In [None]:
# Left-pad every sequence to the length of the longest one so that all rows
# have the same width.
max_seq_len = max(map(len, input_sequences))
input_sequences = pad_sequences(
    input_sequences,
    padding='pre',
    maxlen=max_seq_len,
)

In [None]:
# Predictors are every token except the last one in each row; the label is
# that last token.
X = input_sequences[:, :-1]
next_word_ids = input_sequences[:, -1]
# One-hot encode the labels across the whole vocabulary.
y = tf.keras.utils.to_categorical(next_word_ids, num_classes=total_words)

In [None]:
# Build model
# The input shape is declared with an explicit Input layer rather than the
# Embedding `input_length` argument, which is deprecated in Keras 3. Declaring
# it up front also means the model is built before summary() is called.
model = Sequential([
    # One row of context tokens: the padded sequence minus the label column.
    tf.keras.Input(shape=(max_seq_len - 1,)),
    # Map each token id to a 50-dimensional vector.
    Embedding(total_words, 50),
    LSTM(100),
    # One probability per vocabulary entry for the next word.
    Dense(total_words, activation='softmax'),
])
# Labels are one-hot vectors, hence categorical (not sparse) cross-entropy.
model.compile(loss='categorical_crossentropy', optimizer='adam')
model.summary()

In [None]:
# Fit the network on the (context, next word) pairs for 20 epochs.
model.fit(
    x=X,
    y=y,
    batch_size=128,
    epochs=20,
)