<a href="https://colab.research.google.com/github/rasharahim/LSTM/blob/main/WordPrediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [27]:
text = """
Ice Hockey Rules.
Game time is forfeit time.
If you do not have a legal team (5 skaters and a goalie) at game time, your team must forfeit. You can play a game with a less-than-legal team, but the game is recorded as a forfeit.
No more than 11 can dress per game; only one manager/coach allowed on the bench per team - they must wear a helmet at all times!
Skates only allowed on the ice - no dress shoes. A manager or coach MUST enter the bench from the stands, not the ice.
Ice time is tight to the minute; the clock starts at game time. Come plenty early to get your equipment.
No one is allowed on the ice once the Zamboni doors are open and until they are closed after an ice make.
No varsity hockey players are allowed on IM teams
"""

In [28]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])

total_words = len(tokenizer.word_index) + 1
print("Total words:", total_words)
tokenizer.word_index

Total words: 79


{'the': 1,
 'a': 2,
 'ice': 3,
 'game': 4,
 'time': 5,
 'is': 6,
 'team': 7,
 'no': 8,
 'allowed': 9,
 'on': 10,
 'forfeit': 11,
 'at': 12,
 'must': 13,
 'are': 14,
 'hockey': 15,
 'you': 16,
 'not': 17,
 'legal': 18,
 'and': 19,
 'your': 20,
 'can': 21,
 'than': 22,
 'dress': 23,
 'per': 24,
 'only': 25,
 'one': 26,
 'manager': 27,
 'coach': 28,
 'bench': 29,
 'they': 30,
 'to': 31,
 'rules': 32,
 'if': 33,
 'do': 34,
 'have': 35,
 '5': 36,
 'skaters': 37,
 'goalie': 38,
 'play': 39,
 'with': 40,
 'less': 41,
 'but': 42,
 'recorded': 43,
 'as': 44,
 'more': 45,
 '11': 46,
 'wear': 47,
 'helmet': 48,
 'all': 49,
 'times': 50,
 'skates': 51,
 'shoes': 52,
 'or': 53,
 'enter': 54,
 'from': 55,
 'stands': 56,
 'tight': 57,
 'minute': 58,
 'clock': 59,
 'starts': 60,
 'come': 61,
 'plenty': 62,
 'early': 63,
 'get': 64,
 'equipment': 65,
 'once': 66,
 'zamboni': 67,
 'doors': 68,
 'open': 69,
 'until': 70,
 'closed': 71,
 'after': 72,
 'an': 73,
 'make': 74,
 'varsity': 75,
 'players': 76,

In [29]:
input_sequences = []

for line in text.split("\n"):
    token_list = tokenizer.texts_to_sequences([line])[0]
    print(token_list,"\n")
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

print(input_sequences)

[] 

[3, 15, 32] 

[4, 5, 6, 11, 5] 

[33, 16, 34, 17, 35, 2, 18, 7, 36, 37, 19, 2, 38, 12, 4, 5, 20, 7, 13, 11, 16, 21, 39, 2, 4, 40, 2, 41, 22, 18, 7, 42, 1, 4, 6, 43, 44, 2, 11] 

[8, 45, 22, 46, 21, 23, 24, 4, 25, 26, 27, 28, 9, 10, 1, 29, 24, 7, 30, 13, 47, 2, 48, 12, 49, 50] 

[51, 25, 9, 10, 1, 3, 8, 23, 52, 2, 27, 53, 28, 13, 54, 1, 29, 55, 1, 56, 17, 1, 3] 

[3, 5, 6, 57, 31, 1, 58, 1, 59, 60, 12, 4, 5, 61, 62, 63, 31, 64, 20, 65] 

[8, 26, 6, 9, 10, 1, 3, 66, 1, 67, 68, 14, 69, 19, 70, 30, 14, 71, 72, 73, 3, 74] 

[8, 75, 15, 76, 14, 9, 10, 77, 78] 

[] 

[[3, 15], [3, 15, 32], [4, 5], [4, 5, 6], [4, 5, 6, 11], [4, 5, 6, 11, 5], [33, 16], [33, 16, 34], [33, 16, 34, 17], [33, 16, 34, 17, 35], [33, 16, 34, 17, 35, 2], [33, 16, 34, 17, 35, 2, 18], [33, 16, 34, 17, 35, 2, 18, 7], [33, 16, 34, 17, 35, 2, 18, 7, 36], [33, 16, 34, 17, 35, 2, 18, 7, 36, 37], [33, 16, 34, 17, 35, 2, 18, 7, 36, 37, 19], [33, 16, 34, 17, 35, 2, 18, 7, 36, 37, 19, 2], [33, 16, 34, 17, 35, 2, 18, 7, 36, 3

In [33]:
max_sequence_len = max(len(seq) for seq in input_sequences)

input_sequences = pad_sequences(
    input_sequences,
    maxlen=max_sequence_len,
    padding='pre'
)

X = input_sequences[:, :-1]
y = input_sequences[:, -1]

y = tf.keras.utils.to_categorical(y, num_classes=total_words)

In [34]:
model = Sequential()
model.add(Embedding(total_words, 64, input_length=max_sequence_len-1))
model.add(LSTM(100))
model.add(Dense(total_words, activation='softmax'))

model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

model.summary()

In [35]:
model.fit(X, y, epochs=300, verbose=1)

Epoch 1/300
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 30ms/step - accuracy: 0.0400 - loss: 4.3703
Epoch 2/300
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - accuracy: 0.0626 - loss: 4.3548
Epoch 3/300
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step - accuracy: 0.0682 - loss: 4.3291
Epoch 4/300
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step - accuracy: 0.0287 - loss: 4.2525
Epoch 5/300
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step - accuracy: 0.0535 - loss: 4.1673
Epoch 6/300
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step - accuracy: 0.0468 - loss: 4.1612
Epoch 7/300
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step - accuracy: 0.0402 - loss: 4.0783
Epoch 8/300
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step - accuracy: 0.0782 - loss: 4.0740
Epoch 9/300
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[3

<keras.src.callbacks.history.History at 0x7dc2c5e42ab0>

In [36]:
def generate_text(seed_text, next_words):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences(
            [token_list],
            maxlen=max_sequence_len-1,
            padding='pre'
        )

        predicted = model.predict(token_list, verbose=0)
        predicted_index = np.argmax(predicted)

        for word, index in tokenizer.word_index.items():
            if index == predicted_index:
                seed_text += " " + word
                break

    return seed_text

In [39]:
generate_text("if you", 5)

'if you do not have a legal'