# **Imports and loading the training text file**

In [6]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the training corpus from disk into `mytext`.
# NOTE(review): absolute path (looks like a Colab `/content` upload) — adjust
# it when running the notebook elsewhere.
file_path = "/content/Nepal is a beautiful, landlocked co.txt"
try:
    with open(file_path, 'r', encoding='utf-8') as myfile:
        mytext = myfile.read()
except FileNotFoundError:
    print(f"File {file_path} not found.")
    # Re-raise instead of continuing: every later cell needs `mytext`, so
    # swallowing the error would only resurface as a confusing NameError
    # (or silently reuse a stale value left in the kernel by an earlier run).
    raise
else:
    print(mytext)




Nepal is a beautiful, landlocked country located in South Asia, nestled mainly in the Himalayas between China (to the north) and India (to the south, east, and west). It's famous for its stunning natural landscapes, including Mount Everest (Sagarmatha), the world’s highest peak.

Nepal has a rich and diverse cultural heritage, home to multiple ethnic groups, languages, and religions — with Hinduism and Buddhism being the most prominent. Its capital city, Kathmandu, is known for its historic temples, bustling markets, and vibrant traditions.

Nepal is also the birthplace of Siddhartha Gautama, who became the Buddha, making it an important pilgrimage site for Buddhists worldwide.

Would you like me to continue and expand it into a full paragraph or a few sections (like geography, culture, tourism)











**Tokenizing the text**

In [7]:
# Fit the tokenizer on the whole corpus, passed as a single document.
mytokenizer = Tokenizer()
mytokenizer.fit_on_texts([mytext])

# Vocabulary size used for the embedding and output layers: one slot per
# known word plus one extra, because word ids start at 1 and id 0 is what
# the padded sequences are filled with.
total_words = 1 + len(mytokenizer.word_index)


In [8]:
# Display the word -> integer-id mapping learned by the tokenizer.
mytokenizer.word_index

{'the': 1,
 'and': 2,
 'a': 3,
 'to': 4,
 'nepal': 5,
 'is': 6,
 'for': 7,
 'its': 8,
 'in': 9,
 'south': 10,
 'it': 11,
 'like': 12,
 'beautiful': 13,
 'landlocked': 14,
 'country': 15,
 'located': 16,
 'asia': 17,
 'nestled': 18,
 'mainly': 19,
 'himalayas': 20,
 'between': 21,
 'china': 22,
 'north': 23,
 'india': 24,
 'east': 25,
 'west': 26,
 "it's": 27,
 'famous': 28,
 'stunning': 29,
 'natural': 30,
 'landscapes': 31,
 'including': 32,
 'mount': 33,
 'everest': 34,
 'sagarmatha': 35,
 'world’s': 36,
 'highest': 37,
 'peak': 38,
 'has': 39,
 'rich': 40,
 'diverse': 41,
 'cultural': 42,
 'heritage': 43,
 'home': 44,
 'multiple': 45,
 'ethnic': 46,
 'groups': 47,
 'languages': 48,
 'religions': 49,
 '—': 50,
 'with': 51,
 'hinduism': 52,
 'buddhism': 53,
 'being': 54,
 'most': 55,
 'prominent': 56,
 'capital': 57,
 'city': 58,
 'kathmandu': 59,
 'known': 60,
 'historic': 61,
 'temples': 62,
 'bustling': 63,
 'markets': 64,
 'vibrant': 65,
 'traditions': 66,
 'also': 67,
 'birthplac

In [36]:
# Turn every line of the corpus into its growing token prefixes
# ([w1, w2], [w1, w2, w3], ...); each prefix later becomes one training row
# whose last token is the word to predict.
my_input_sequence = []
for line in mytext.split('\n'):
    print(line)
    token_list = mytokenizer.texts_to_sequences([line])[0]
    print(token_list)
    # Prefixes of length 2..len(token_list); blank lines tokenize to [] and
    # therefore contribute nothing.
    my_input_sequence.extend(
        token_list[:end] for end in range(2, len(token_list) + 1)
    )


Nepal is a beautiful, landlocked country located in South Asia, nestled mainly in the Himalayas between China (to the north) and India (to the south, east, and west). It's famous for its stunning natural landscapes, including Mount Everest (Sagarmatha), the world’s highest peak.
[5, 6, 3, 13, 14, 15, 16, 9, 10, 17, 18, 19, 9, 1, 20, 21, 22, 4, 1, 23, 2, 24, 4, 1, 10, 25, 2, 26, 27, 28, 7, 8, 29, 30, 31, 32, 33, 34, 35, 1, 36, 37, 38]

[]
Nepal has a rich and diverse cultural heritage, home to multiple ethnic groups, languages, and religions — with Hinduism and Buddhism being the most prominent. Its capital city, Kathmandu, is known for its historic temples, bustling markets, and vibrant traditions.
[5, 39, 3, 40, 2, 41, 42, 43, 44, 4, 45, 46, 47, 48, 2, 49, 50, 51, 52, 2, 53, 54, 1, 55, 56, 8, 57, 58, 59, 6, 60, 7, 8, 61, 62, 63, 64, 2, 65, 66]

[]
Nepal is also the birthplace of Siddhartha Gautama, who became the Buddha, making it an important pilgrimage site for Buddhists worldwide.


In [12]:
# The longest prefix sets the common row width; shorter prefixes are padded
# at the front so the target word always sits in the last column.
max_sequence_len = max(map(len, my_input_sequence))
input_sequences = np.array(
    pad_sequences(
        my_input_sequence,
        maxlen=max_sequence_len,
        padding='pre',
    )
)

In [13]:
# Inspect the padded training matrix (one row per n-gram prefix).
input_sequences

array([[ 0,  0,  0, ...,  0,  5,  6],
       [ 0,  0,  0, ...,  5,  6,  3],
       [ 0,  0,  0, ...,  6,  3, 13],
       ...,
       [ 0,  0,  0, ..., 92, 12, 93],
       [ 0,  0,  0, ..., 12, 93, 94],
       [ 0,  0,  0, ..., 93, 94, 95]], dtype=int32)

In [14]:
# Features are every column but the last; the label is the final token id
# of each row (the word that follows the prefix).
x, y = input_sequences[:, :-1], input_sequences[:, -1]

In [19]:
# Peek at the label (final token id) of the second training row.
y[1]

np.int32(3)

In [20]:
# One-hot encode the next-word labels for use with categorical cross-entropy.
# The labels are taken from `input_sequences` rather than from `y` itself so
# this cell is idempotent: with `y = to_categorical(y)`, running the cell a
# second time would one-hot encode an already one-hot array.
# `to_categorical` already returns a NumPy array, so no np.array() wrapper is needed.
y = tf.keras.utils.to_categorical(input_sequences[:, -1], num_classes=total_words)

In [21]:
# One-hot label of the first training row: a single 1 at the target word's id.
y[0]

array([0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

**Training the model**

In [28]:
# Next-word model: embedding -> LSTM -> softmax over the vocabulary.
model = Sequential()
# Declare the input shape explicitly instead of passing the deprecated
# `input_length` argument to Embedding; this also builds the model up front
# so the summary below can report output shapes and parameter counts.
model.add(keras.Input(shape=(max_sequence_len - 1,)))
model.add(Embedding(total_words, 100))
model.add(LSTM(150))
model.add(Dense(total_words, activation='softmax'))
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" to the cell output.
model.summary()



None


In [30]:
# Labels are one-hot vectors, hence categorical (not sparse) cross-entropy.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# Keep the returned History object: it holds the per-epoch loss/accuracy for
# later inspection, and assigning it stops its bare repr from being echoed as
# the cell's output.
history = model.fit(x, y, epochs=100, verbose=1)


Epoch 1/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.0902 - loss: 4.3153
Epoch 2/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.0322 - loss: 4.2799
Epoch 3/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.0554 - loss: 4.1142
Epoch 4/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.0714 - loss: 4.0527
Epoch 5/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.0723 - loss: 3.9970 
Epoch 6/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.0837 - loss: 3.8891
Epoch 7/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.0860 - loss: 3.8178
Epoch 8/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.1072 - loss: 3.7247
Epoch 9/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

<keras.src.callbacks.history.History at 0x7895453488d0>

**Predicting the next words from a seed input**

In [35]:
input_text = "Nepal"
predict_next_words = 5

# Greedy generation: repeatedly feed the text produced so far back into the
# model and append the single most probable next word.
for _ in range(predict_next_words):
    token_list = mytokenizer.texts_to_sequences([input_text])[0]
    print(f"Tokenized input: {token_list}")

    # Pad to the width the model was trained on. A separate name is used so
    # `token_list` stays a plain list rather than being rebound to an array.
    padded_tokens = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')

    predicted_probs = model.predict(padded_tokens, verbose=0)
    predicted = int(np.argmax(predicted_probs, axis=-1)[0])

    # `index_word` is the tokenizer's id -> word mapping, so the word is a
    # direct lookup instead of a linear scan over `word_index`.
    output_word = mytokenizer.index_word.get(predicted, "")
    if not output_word:
        # The arg-max landed on an id with no word (e.g. the padding id 0).
        # Appending an empty word would leave the input unchanged and repeat
        # the same prediction, so stop generating here.
        break

    input_text += " " + output_word
    print(f"Updated input_text: {input_text}")


Tokenized input: [5]
Updated input_text: Nepal is
Tokenized input: [5, 6]
Updated input_text: Nepal is a
Tokenized input: [5, 6, 3]
Updated input_text: Nepal is a beautiful
Tokenized input: [5, 6, 3, 13]
Updated input_text: Nepal is a beautiful landlocked
Tokenized input: [5, 6, 3, 13, 14]
Updated input_text: Nepal is a beautiful landlocked country
