In [None]:
!pip install numpy keras matplotlib

In [None]:
import numpy as np
import re
from collections import Counter
from keras.models import Sequential, load_model
from keras.layers import Embedding, LSTM, Dense
from keras.callbacks import ModelCheckpoint, EarlyStopping
import matplotlib.pyplot as plt

In [None]:
# Corpus files to train on; add or remove paths as needed.
training_files = [
    'dataset/conversation_1.txt',
    'dataset/movie_scripts.txt',
    'dataset/sherlock.txt'
]

# Read every file, then concatenate. Each file's contents are followed by a
# single space so the last word of one file cannot fuse with the first word
# of the next.
file_contents = []
for fname in training_files:
    with open(fname, 'r', encoding='utf-8') as f:
        file_contents.append(f.read())
all_text = ''.join(chunk + ' ' for chunk in file_contents)
print(f"Loaded {len(all_text):,} characters from {len(training_files)} files.")

In [None]:
def tokenize(text):
    """Split ``text`` into word tokens.

    A token is a maximal run of ``\\w`` characters (letters, digits and
    underscore). Everything else -- whitespace, punctuation, apostrophes --
    acts as a separator and is discarded. Case is preserved.

    Args:
        text: The string to tokenize.

    Returns:
        A list of token strings in order of appearance (empty for text
        containing no word characters).
    """
    word_pattern = re.compile(r"\b\w+\b")
    return [match.group() for match in word_pattern.finditer(text)]

tokens = tokenize(all_text)
print(f"Total tokens: {len(tokens):,}")

# Vocabulary: every distinct token seen at least `min_freq` times, sorted so
# that the word -> index assignment is the same on every run over the same
# corpus. Indices start at 0, so index 0 is an ordinary word, not a reserved
# padding/unknown slot.
min_freq = 1  # set >1 to filter rare words
freq = Counter(tokens)
vocab = sorted(word for word, count in freq.items() if count >= min_freq)
idx2word = dict(enumerate(vocab))
word2idx = {word: idx for idx, word in idx2word.items()}
vocab_size = len(vocab)
print(f"Vocabulary size: {vocab_size}")

In [None]:
seq_lenght = 10  # context window, adjust as needed
step = 1         # sliding window step size

# Encode the corpus once. Tokens that were filtered out of the vocabulary
# become None so that any window touching them can be skipped.
token_ids = [word2idx.get(w) for w in tokens]

# Slide a window of `seq_lenght` context ids plus one target id over the
# corpus; each fully in-vocabulary window yields one training example.
input_sequences = []
target_words = []
for start in range(0, len(token_ids) - seq_lenght, step):
    window = token_ids[start:start + seq_lenght + 1]  # context + target
    if None in window:
        continue
    input_sequences.append(window[:-1])
    target_words.append(window[-1])
input_sequences = np.array(input_sequences)
target_words = np.array(target_words)
print(f"Number of sequences: {len(input_sequences):,}")

In [None]:
embedding_dim = 100
hidden_dim = 256

# Next-word classifier: token ids -> embeddings -> LSTM summary of the
# context window -> softmax over the whole vocabulary.
model = Sequential([
    # No `input_length=` here: the argument is deprecated in Keras 3, and the
    # input shape is already supplied by model.build() below.
    Embedding(input_dim=vocab_size, output_dim=embedding_dim),
    # return_sequences=False: only the final LSTM state feeds the classifier.
    LSTM(hidden_dim, return_sequences=False),
    Dense(vocab_size, activation='softmax')
])
# Targets are integer word indices rather than one-hot vectors, hence the
# sparse variant of categorical cross-entropy.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')
model.build(input_shape=(None, seq_lenght))
model.summary()

In [None]:
batch_size = 128
epochs = 40

# Keep only the best model seen so far on disk, and stop training once the
# monitored metric has gone 3 epochs without improving, restoring the best
# weights in memory.
checkpoint_cb = ModelCheckpoint('model.keras', save_best_only=True)
early_stop_cb = EarlyStopping(patience=3, restore_best_weights=True)
callbacks = [checkpoint_cb, early_stop_cb]

fit_options = dict(
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,  # hold out 10% of the samples for validation
    callbacks=callbacks,
)
history = model.fit(input_sequences, target_words, **fit_options)

In [None]:
# Training vs. validation loss per epoch, drawn on an explicit Axes.
fig, ax = plt.subplots()
for metric_key, curve_label in (('loss', 'train loss'), ('val_loss', 'val loss')):
    ax.plot(history.history[metric_key], label=curve_label)
ax.legend()
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('Training and Validation Loss')
plt.show()

In [None]:
# Persist both lookup tables so text can be encoded/decoded without
# rebuilding the vocabulary from the corpus.
# NOTE(review): these are Python dicts, which NumPy stores as pickled object
# arrays; reading them back presumably needs
# np.load(..., allow_pickle=True)[name].item() -- confirm in the loading code.
vocab_tables = {'word2idx': word2idx, 'idx2word': idx2word}
np.savez_compressed('word_vocab.npz', **vocab_tables)

In [10]:
def sample(preds, temperature=1.0):
    """Draw one index from a probability vector, reshaped by ``temperature``.

    The probabilities are converted to log space, divided by ``temperature``
    and re-normalised with a softmax before sampling. Temperatures below 1
    sharpen the distribution, temperatures above 1 flatten it.

    Args:
        preds: 1-D array-like of non-negative probabilities.
        temperature: Sampling temperature. Values ``<= 0`` select the most
            probable index deterministically (greedy decoding) instead of
            dividing by zero.

    Returns:
        The sampled index into ``preds``.
    """
    preds = np.asarray(preds).astype('float64')
    if temperature <= 0:
        # The zero-temperature limit of the softmax is an argmax.
        return int(np.argmax(preds))
    # The epsilon keeps log() finite for zero-probability entries.
    logits = np.log(preds + 1e-8) / temperature
    # Shift so the largest logit is 0. Without this, a small temperature can
    # underflow every exp() to 0 and turn the normalisation into 0/0 = NaN.
    logits -= np.max(logits)
    exp_preds = np.exp(logits)
    probs = exp_preds / np.sum(exp_preds)
    return np.random.choice(len(probs), p=probs)

def predict_next_words(model, seed_text, word2idx, idx2word, seq_lenght, num_words=10, temperature=1.0):
    """Extend ``seed_text`` by sampling ``num_words`` words from ``model``.

    Each step encodes the last ``seq_lenght`` context words, asks the model
    for a next-word distribution, samples from it with ``sample`` and feeds
    the chosen word back in as context for the next step.

    Args:
        model: Object whose ``predict(x, verbose=0)`` returns one probability
            vector per input row.
        seed_text: Text to continue.
        word2idx: Mapping from word to integer index.
        idx2word: Mapping from integer index to word.
        seq_lenght: Number of context words the model expects.
        num_words: How many words to generate.
        temperature: Sampling temperature passed to ``sample``.

    Returns:
        The whitespace-split seed words followed by the generated words,
        joined with single spaces.
    """
    # Words of the returned text: the seed as typed, split on whitespace.
    words = seed_text.split()
    # Model context: extracted with the same pattern the corpus tokenizer
    # uses, so seed words carrying punctuation ("house,") still match their
    # vocabulary entry instead of being treated as unknown.
    context = re.findall(r"\b\w+\b", seed_text)
    for _ in range(num_words):
        # NOTE(review): unknown words and left-padding both use index 0; if
        # the vocabulary does not reserve 0 for that purpose, this feeds
        # whichever real word owns index 0 into the model -- confirm.
        seq = [word2idx.get(w, 0) for w in context[-seq_lenght:]]
        if len(seq) < seq_lenght:
            seq = [0]*(seq_lenght - len(seq)) + seq
        x = np.array([seq])  # batch of one sequence
        preds = model.predict(x, verbose=0)[0]
        next_idx = sample(preds, temperature)
        next_word = idx2word.get(next_idx, '')
        words.append(next_word)
        context.append(next_word)
    return ' '.join(words)

In [None]:
# Generate a 20-word continuation of the seed at a mildly conservative
# temperature.
# NOTE(review): "posession" looks misspelled; if it is not in the vocabulary,
# predict_next_words will encode it as index 0 -- confirm this is intended.
seed = "My most valuable posession is a "
generation_kwargs = dict(seq_lenght=seq_lenght, num_words=20, temperature=0.8)
generated = predict_next_words(model, seed, word2idx, idx2word, **generation_kwargs)
print(generated)