### Load and pre-process the data

In [1]:
import re

with open('/kaggle/input/the-mysterious-island/The Mysterious Island.txt', 'r', encoding='utf-8') as f:
    text = f.read()

text = re.sub(r"[^a-zA-Z\s]", '', text)
text = re.sub(r'\s+', ' ', text).lower().strip()

In [2]:
len(text)

1067356

### Tokenization

In [3]:
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])

2025-05-16 16:37:26.511193: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747413446.726967      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747413446.787381      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [4]:
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1

In [5]:
print(f"Total unique words: {vocab_size}")

Total unique words: 10067


In [6]:
word_index

{'the': 1,
 'of': 2,
 'to': 3,
 'and': 4,
 'a': 5,
 'was': 6,
 'in': 7,
 'it': 8,
 'that': 9,
 'which': 10,
 'had': 11,
 'not': 12,
 'he': 13,
 'they': 14,
 'be': 15,
 'on': 16,
 'at': 17,
 'his': 18,
 'by': 19,
 'this': 20,
 'but': 21,
 'as': 22,
 'for': 23,
 'with': 24,
 'is': 25,
 'were': 26,
 'pencroft': 27,
 'from': 28,
 'would': 29,
 'harding': 30,
 'their': 31,
 'have': 32,
 'we': 33,
 'been': 34,
 'island': 35,
 'no': 36,
 'there': 37,
 'could': 38,
 'herbert': 39,
 'all': 40,
 'i': 41,
 'cyrus': 42,
 'said': 43,
 'you': 44,
 'them': 45,
 'an': 46,
 'him': 47,
 'if': 48,
 'will': 49,
 'then': 50,
 'engineer': 51,
 'or': 52,
 'some': 53,
 'one': 54,
 'so': 55,
 'replied': 56,
 'spilett': 57,
 'neb': 58,
 'who': 59,
 'what': 60,
 'these': 61,
 'granite': 62,
 'its': 63,
 'more': 64,
 'two': 65,
 'are': 66,
 'sailor': 67,
 'time': 68,
 'when': 69,
 'house': 70,
 'did': 71,
 'now': 72,
 'very': 73,
 'into': 74,
 'only': 75,
 'any': 76,
 'well': 77,
 'reporter': 78,
 'without': 79,


### Input Sequence Generation

In [7]:
from tensorflow.keras.utils import to_categorical
import numpy as np

input_sequences = []
token_list = tokenizer.texts_to_sequences([text])[0]

In [8]:
print(len(token_list))

193276


In [9]:
token_list = token_list[:100000]

In [10]:
seq_length = 15

for i in range(seq_length ,len(token_list)):
    n_gram_sequence = token_list[i-seq_length: i+1]
    input_sequences.append(n_gram_sequence)

In [11]:
input_sequences = np.array(input_sequences)

In [12]:
X = input_sequences[:, :-1]
y = input_sequences[:, -1]
y = to_categorical(y, num_classes=vocab_size)

In [13]:
print("Data:", X[:1])
print("Response:", y[:1])

Data: [[   1  947 3696 2476    2    1  803   35   20 2476   25   23    1  271
     2]]
Response: [[0. 0. 0. ... 0. 0. 0.]]


In [14]:
print("X" , X.shape)
print("y" , y.shape)

X (99985, 15)
y (99985, 10067)


In [15]:
len(y)

99985

In [16]:
print(input_sequences)

[[   1  947 3696 ...  271    2 4505]
 [ 947 3696 2476 ...    2 4505 3172]
 [3696 2476    2 ... 4505 3172    7]
 ...
 [  45   22  114 ...  463    8    4]
 [  22  114   22 ...    8    4  172]
 [ 114   22   14 ...    4  172 5526]]


### Model Building and Training

In [17]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam

model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=100))
model.add(LSTM(150, return_sequences=True))
model.add(LSTM(100))
model.add(Dense(vocab_size, activation='softmax'))

model.build(input_shape=(None, seq_length))
model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])
model.summary()

I0000 00:00:1747413489.841193      31 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15513 MB memory:  -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0


In [20]:
# from tensorflow.keras.models import load_model
# model = load_model('/kaggle/input/model_best2/keras/default/1/my_model_best_2.keras')

In [18]:
from tensorflow.keras.callbacks import EarlyStopping

early_stop = EarlyStopping(monitor='loss', patience=5)
model.fit(X, y, epochs=100, batch_size=64, callbacks=[early_stop])

Epoch 1/100


I0000 00:00:1747413513.414331      91 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 12ms/step - accuracy: 0.0913 - loss: 6.7934
Epoch 2/100
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 12ms/step - accuracy: 0.1142 - loss: 5.8972
Epoch 3/100
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 12ms/step - accuracy: 0.1425 - loss: 5.5875
Epoch 4/100
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 12ms/step - accuracy: 0.1561 - loss: 5.3851
Epoch 5/100
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 12ms/step - accuracy: 0.1652 - loss: 5.2231
Epoch 6/100
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 12ms/step - accuracy: 0.1695 - loss: 5.0890
Epoch 7/100
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 12ms/step - accuracy: 0.1750 - loss: 4.9503
Epoch 8/100
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 12ms/step - accuracy: 0.1824 - loss: 4.8099
Epoch 9/100


<keras.src.callbacks.history.History at 0x7ac7ca2d8850>

In [19]:
early_stop = EarlyStopping(monitor='loss', patience=5)
model.fit(X, y, epochs=50, batch_size=64, callbacks=[early_stop])

Epoch 1/50
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 13ms/step - accuracy: 0.8356 - loss: 0.7474
Epoch 2/50
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 13ms/step - accuracy: 0.8371 - loss: 0.7396
Epoch 3/50
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 13ms/step - accuracy: 0.8406 - loss: 0.7242
Epoch 4/50
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 13ms/step - accuracy: 0.8430 - loss: 0.7130
Epoch 5/50
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 13ms/step - accuracy: 0.8419 - loss: 0.7079
Epoch 6/50
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 13ms/step - accuracy: 0.8488 - loss: 0.6841
Epoch 7/50
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 13ms/step - accuracy: 0.8502 - loss: 0.6813
Epoch 8/50
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 13ms/step - accuracy: 0.8500 - loss: 0.6741
Epoch 9/

<keras.src.callbacks.history.History at 0x7ac7ca022910>

In [20]:
model.save('my_model_best_2.keras')

In [21]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

def generate_text(seed_text, next_words=50):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=seq_length, padding='pre')

        predicted_probs = model.predict(token_list, verbose=0)
        predicted_word_index = np.argmax(predicted_probs)

        output_word = tokenizer.index_word[predicted_word_index]
        seed_text += ' ' + output_word

    return seed_text

seed_text = "the"
generated_text = generate_text(seed_text, next_words=50)
print(generated_text)


the skill again disappeared on the beach below granite house could be who had been a good opportunity in collecting the water above the vast forest herbert and pencroft having cast him in the sea he would reach but had surveyed up the island if you must retrace this few minutes


In [22]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [25]:
from tensorflow.keras.models import load_model

model = load_model('/kaggle/input/model_best2/keras/default/1/my_model_best_2.keras')  
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.4f}")
print(f"Loss:{loss:.4f}")

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 10ms/step - accuracy: 0.9167 - loss: 0.3847
Test Accuracy: 0.9156
Loss:0.3864
