### Prerequisites

In [4]:
!pip install tensorflow keras numpy



### Step 1: Import Necessary Libraries

In [12]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.utils import to_categorical

### Step 2: Load and Read the Text File

In [31]:
# Load the entire training corpus into memory as a single string
with open('corpus2.txt', mode='r', encoding='utf-8') as corpus_file:
    data = corpus_file.read()

### Step 3: Preprocess the Text Data

#### 1. Tokenize the Text



In [32]:
# One training sentence per newline-separated line of the lower-cased corpus
sentences = data.lower().split('\n')

# Fit a fresh tokenizer so that every distinct word receives an integer id
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)

# Vocabulary size: one more than the number of known words
total_words = 1 + len(tokenizer.word_index)

#### 2. Create Sequences

We'll create input sequences where each sequence adds one more word.

In [33]:
# For every sentence, emit each prefix of its token ids that is at least two
# tokens long: [w1, w2], [w1, w2, w3], ... up to the full sentence.
input_sequences = [
    token_ids[:prefix_len]
    for token_ids in tokenizer.texts_to_sequences(sentences)
    for prefix_len in range(2, len(token_ids) + 1)
]

#### 3. Pad Sequences

In [34]:
# The longest prefix sets the common width for all sequences
max_sequence_len = max(map(len, input_sequences))

# Left-pad every prefix to that width and keep the result as a NumPy array
input_sequences = np.array(
    pad_sequences(input_sequences, padding='pre', maxlen=max_sequence_len)
)


#### 4. Create Predictors and Labels



In [35]:
# The last column is the word to predict; all columns before it are the context
predictors, labels = input_sequences[:, :-1], input_sequences[:, -1]

# Turn the integer targets into one-hot rows of width total_words
labels = to_categorical(labels, num_classes=total_words)

### Step 4: Build the Neural Language Model

In [36]:
# Define the model: word embedding -> LSTM -> softmax over the vocabulary.
# `input_length` is intentionally not passed to Embedding: Keras 3 deprecates
# the argument (it only triggers a warning and is otherwise ignored).
model = Sequential()
model.add(Embedding(total_words, 64))
model.add(LSTM(100))
model.add(Dense(total_words, activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Build explicitly with the context width (batch dimension left open) so that
# summary() can report output shapes and parameter counts before training.
model.build(input_shape=(None, max_sequence_len - 1))

# View the model summary
model.summary()




### Step 5: Train the Model

In [37]:
# Fit on the (context, next-word) pairs for 50 epochs, printing per-epoch progress
history = model.fit(x=predictors, y=labels, epochs=50, verbose=1)

Epoch 1/50
133/133 ━━━━━━━━━━━━━━━━━━━━ 4s 14ms/step - accuracy: 0.1120 - loss: 5.7937
Epoch 2/50
133/133 ━━━━━━━━━━━━━━━━━━━━ 2s 15ms/step - accuracy: 0.1307 - loss: 4.8309
Epoch 3/50
133/133 ━━━━━━━━━━━━━━━━━━━━ 2s 14ms/step - accuracy: 0.1455 - loss: 4.4265
Epoch 4/50
133/133 ━━━━━━━━━━━━━━━━━━━━ 3s 18ms/step - accuracy: 0.1853 - loss: 4.1079
Epoch 5/50
133/133 ━━━━━━━━━━━━━━━━━━━━ 3s 22ms/step - accuracy: 0.2154 - loss: 3.9271
Epoch 6/50
133/133 ━━━━━━━━━━━━━━━━━━━━ 4s 14ms/step - accuracy: 0.2210 - loss: 3.6652
Epoch 7/50
133/133 ━━━━━━━━━━━━━━━━━━━━ 3s 14ms/step - accuracy: 0.2499 - loss: 3.4236
Epoch 8/50
133/133 ━━━━━━━━━━━━━━━━━━━━ 2s 15ms/step - accuracy: 0.2587 - loss: 3.2973
Epoch 9/50
133/133 ... (output truncated)

### Step 6: Generate New Sentences

In [41]:
import random

def generate_sentence(seed_text, next_words):
    """Extend ``seed_text`` by greedily predicting up to ``next_words`` words.

    On every step the text generated so far is tokenized, left-padded to the
    model's input width (``max_sequence_len - 1``) and fed to the model; the
    word with the highest predicted probability is appended.

    Relies on the notebook-level ``tokenizer``, ``model`` and
    ``max_sequence_len`` defined in earlier cells.

    Args:
        seed_text: Text to start from.
        next_words: Maximum number of words to append.

    Returns:
        ``seed_text`` followed by the generated words, separated by single
        spaces. Generation stops early when the predicted index has no
        corresponding word (for example the padding index 0), so the result
        never ends in dangling spaces.
    """
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        probabilities = model.predict(token_list, verbose=0)
        # predict() returns one row per input; index row 0 so the id is a
        # plain int rather than a one-element array.
        predicted_index = int(np.argmax(probabilities[0]))

        # Direct id -> word lookup instead of scanning word_index on each step
        output_word = tokenizer.index_word.get(predicted_index)
        if not output_word:
            # The input would not change on the next step, so the same
            # word-less prediction would simply repeat; stop here.
            break
        seed_text += ' ' + output_word
    return seed_text

# Example usage
# Opening words to continue
seed_sentences = ["The", 'He', 'She', 'We']

# Print a six-word continuation for every seed
for sentence_start in seed_sentences:
    completed = generate_sentence(sentence_start, 6)
    print(completed)

The cat was napping in the sun
He will be baking cookies for the
She will be baking cookies for the
We will be visiting the museum next


#### Explanation of the Code
- **Tokenizer**: Converts words to numeric tokens.
- **Sequences**: We create sequences of words to train the model on context.
- **Padding**: Ensures all sequences are the same length.
- **Embedding Layer**: Learns word representations.
- **LSTM Layer**: Captures temporal dependencies.
- **Dense Layer**: Outputs a probability distribution over the vocabulary.
- **Training**: The model learns to predict the next word in a sequence.
- **Generation**: Starting with a seed text, the model predicts subsequent words.

### Saving and Loading the Model (Optional)

In [28]:
# Persist the trained model to disk in the native .keras format
model.save(filepath='language_model.keras')

In [29]:
from tensorflow.keras.models import load_model

# Restore the model from the saved .keras file, rebinding `model`
model = load_model(filepath='language_model.keras')

  saveable.load_own_variables(weights_store.get(inner_path))
