### Environment setup

In [3]:
import re
import numpy as np
import pandas as pd
from tqdm import tqdm

import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
nltk.download("punkt")
nltk.download("stopwords")

import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, Dropout, LSTM, Dense, Bidirectional
from keras.callbacks import EarlyStopping
from keras.models import Sequential


# Both paths below live under /content/drive, i.e. they only resolve once a
# drive has been mounted at that location. The two mount lines are commented
# out, so on a fresh kernel they must be re-enabled (and run) before any cell
# that reads data_path or writes model_save_path.
#from google.colab import drive
#drive.mount("/content/drive")
data_path = "/content/drive/MyDrive/elm.txt"  # Raw text corpus read by the data-loading cell.
# NOTE(review): the ".h5" suffix presumably selects Keras' legacy HDF5 format,
# which looks like the source of the warning fragment recorded under the save
# cell - confirm before switching formats.
model_save_path = "/content/drive/MyDrive/lstm_med_elm.h5"

### Load and clean data

In [None]:
# Read the whole corpus into memory as one UTF-8 string.
with open(data_path, mode="r", encoding="utf-8") as corpus_file:
    text = corpus_file.read()

# Drop URLs first: every run of non-whitespace characters starting at "http"
# is deleted before the text is split into sentences.
text = re.sub(r"http\S+", "", text)

# One list entry per sentence, as segmented by NLTK.
sentences = sent_tokenize(text)

In [5]:
# One row per sentence. The chain trims surrounding whitespace, discards
# missing and empty entries, and renumbers the rows from 0.
df = pd.DataFrame(sentences, columns=["sentence"])
df = (
    df.assign(sentence=lambda frame: frame["sentence"].str.strip())
    .dropna()
    .loc[lambda frame: frame["sentence"] != ""]
    .reset_index(drop=True)
)

In [None]:
def remove_stopwords(sentence):
    """Return ``sentence`` with every stopword token removed.

    The sentence is tokenized with ``word_tokenize``; a token is dropped when
    its lowercase form is in the module-level ``stop_words`` collection. The
    surviving tokens keep their original casing and are joined with single
    spaces.
    """
    kept_tokens = []
    for token in word_tokenize(sentence):
        if token.lower() in stop_words:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

# remove_stopwords reads this module-level name at call time, so it only has
# to exist before the .apply() below runs. A set is used for fast membership
# checks on every token.
stop_words = set(stopwords.words("english"))
df["sentence"] = df["sentence"].apply(remove_stopwords)

In [42]:
def clean_sentence(sentence):
    """Normalise a sentence to lowercase ASCII letters separated by single spaces.

    Every character that is neither an ASCII letter nor whitespace is deleted,
    each run of whitespace is collapsed to one space, and the result is
    trimmed and lowercased.
    """
    letters_and_spaces = re.sub(r"[^a-zA-Z\s]", "", sentence)
    single_spaced = re.sub(r"\s+", " ", letters_and_spaces)
    trimmed = single_spaced.strip()
    return trimmed.lower()

# After this step every sentence holds only lowercase a-z words separated by
# single spaces (it may also be empty if nothing survived the cleaning).
df["sentence"] = df["sentence"].apply(clean_sentence)

In [9]:
def extract_last_word(df):
    """Split each sentence into a context and its final word.

    The final whitespace-delimited word of ``df["sentence"]`` is moved into a
    new ``last_word`` column and ``df["sentence"]`` is replaced by the
    remaining words joined with single spaces.

    Both columns are derived from the same ``str.split()`` result. Mixing
    ``split(" ")`` and ``split()`` made a sentence with trailing or repeated
    whitespace yield an empty ``last_word`` while still dropping a real word
    from the context.

    Args:
        df: DataFrame with a string ``sentence`` column.

    Returns:
        The same DataFrame object, modified in place. A sentence without any
        word gets ``last_word == ""`` and an empty context; a one-word
        sentence gets that word as ``last_word`` and an empty context.
    """
    # Tokenize once so the label and the context can never disagree.
    words = df["sentence"].apply(lambda text: text.split())
    df["last_word"] = words.apply(lambda tokens: tokens[-1] if tokens else "")
    df["sentence"] = words.apply(lambda tokens: " ".join(tokens[:-1]))
    return df

# extract_last_word modifies and returns the same frame, so re-running this
# cell strips one more word from every sentence. Rebuild df from the earlier
# cells before executing it again.
df = extract_last_word(df)

### Tokenize and encode data

In [10]:
# Tokenization.
tokenizer = Tokenizer(oov_token="<UNK>")
tokenizer.fit_on_texts(df["sentence"])
vocab_size = len(tokenizer.word_index) + 1

In [11]:
# Convert sentences to sequences.
sequences = tokenizer.texts_to_sequences(df["sentence"])

# Pad sequences.
max_len = max([len(seq) for seq in sequences])
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding="post")

In [12]:
# Create input and output sequences.
X = np.array(padded_sequences)
y_text = df["last_word"]

# Tokenize output data-
y_sequences = tokenizer.texts_to_sequences(y_text)
for i, seq in enumerate(y_sequences):
    if len(seq) == 0:
      y_sequences[i] = [tokenizer.word_index["<UNK>"]]
# Convert to numpy array
y = np.array([sequence[0] for sequence in y_sequences if len(sequence) != 0])

### Model configuration and training

In [14]:
# Model architecture
embedding_dim = 50
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_len))
model.add(Bidirectional(LSTM(128)))
model.add(Dense(vocab_size, activation="softmax"))

# Compile the model
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train the model
model.fit(X, y, epochs=30, verbose=1)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.src.callbacks.History at 0x7d82511c9ba0>

In [19]:
# Write the trained model to model_save_path (set in the configuration at the
# top of the notebook).
model.save(model_save_path)

  saving_api.save_model(


### Query completion

In [27]:
def complete_query(seed_text, next_words, tokenizer, model, max_sequence_len, padding="post"):
    """Extend ``seed_text`` by up to ``next_words`` greedily predicted words.

    On every step the current text is encoded with ``tokenizer``, padded to
    ``max_sequence_len`` and passed to ``model``; the word with the highest
    predicted score is appended.

    Args:
        seed_text: Text to continue.
        next_words: Maximum number of words to append.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences`` and
            ``word_index``.
        model: Trained model exposing ``predict``.
        max_sequence_len: Input length the model was built with.
        padding: Side on which to pad. It must match the padding used for the
            training inputs; the notebook pads its training data with
            ``"post"``, so that is the default (the previous hard-coded
            ``"pre"`` fed the model inputs in a layout it never saw).

    Returns:
        ``seed_text`` followed by the predicted words, separated by single
        spaces. Generation stops early when the predicted index has no word
        in the vocabulary (e.g. the padding index 0).
    """
    # Build the index -> word lookup once instead of scanning word_index on
    # every generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        # Pad to the model's input shape. pad_sequences truncates from the
        # front by default, so an over-long seed keeps its most recent tokens.
        model_input = pad_sequences(
            [token_list], maxlen=max_sequence_len, padding=padding
        )

        predictions = model.predict(model_input, verbose=0)
        predicted_class_index = int(np.argmax(predictions[0]))

        output_word = index_to_word.get(predicted_class_index, "")
        if not output_word:
            # Appending nothing leaves the tokens unchanged, so every further
            # step would repeat this prediction and only add stray spaces.
            break
        seed_text += " " + output_word
    return seed_text

In [35]:
# An empty seed encodes to no tokens, so the model would only see padding and
# the result would start with a stray space. The recorded output of this cell
# begins with "fever", which an empty seed cannot produce, so the seed is set
# to the query that output reflects.
seed_text = "fever"
output = complete_query(
    seed_text=seed_text,
    next_words=1,
    tokenizer=tokenizer,
    model=model,
    max_sequence_len=max_len,
)
print(output)

fever feb
