### importing core libraries

In [13]:
import pickle

import numpy as np
import tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,LSTM,Dense

### loading the file

In [3]:
# Read the entire corpus file into one UTF-8 decoded string.
with open('C:\\Users\\odhia\\OneDrive\\Desktop\\nlpProjects\\data\\ruto.txt', mode='r', encoding='utf-8') as source_file:
    text = source_file.read()

In [4]:
# Echo the loaded text so the cell output shows what was read from disk.
text

"At the onset, let me take this opportunity to warmly welcome you and your team to this beautiful resort town of Naivasha.I also take this opportunity to express my deep appreciation tomy fellow African Union Heads of State and Government for theirconfidence in assigning me the task of finalising the outstanding institutional reforms of the African Union. \n\nI recognise the admirable effort and progress made by my predecessor in this noble task and sincerely thank His Excellency Paul Kagame,President of the Republic of Rwanda, for the exemplary delivery of the reforms so far. I am honoured to have this chance to build on his good work and commit to expeditiously advancing the remaining reforms to their logical conclusion.Africa is endowed with abundant potential that largely remains untapped.\n\nThis denies the continent immense opportunities in various economic sectors, including agriculture, renewable energy, manufacturing, mining, and climate action. Under the African Union Agenda 

### tokenizing the text

In [5]:
# Fit the tokenizer on the whole text, passed as a single document.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
# +1 because word indices start at 1; index 0 is the value the padded sequences are filled with.
input_dim = len(tokenizer.word_index) + 1

In [6]:
# Inspect the word -> integer index mapping learned by the tokenizer.
tokenizer.word_index

{'the': 1,
 'and': 2,
 'to': 3,
 'of': 4,
 'african': 5,
 'in': 6,
 'this': 7,
 'a': 8,
 'our': 9,
 'for': 10,
 'we': 11,
 'au': 12,
 'i': 13,
 'have': 14,
 'with': 15,
 'that': 16,
 'it': 17,
 'union': 18,
 'reforms': 19,
 'is': 20,
 'must': 21,
 'been': 22,
 'as': 23,
 'commission': 24,
 'on': 25,
 'us': 26,
 'are': 27,
 'has': 28,
 'new': 29,
 'at': 30,
 'take': 31,
 'opportunity': 32,
 'so': 33,
 'work': 34,
 'africa': 35,
 'potential': 36,
 'continent': 37,
 'economic': 38,
 'into': 39,
 'pan': 40,
 'reform': 41,
 'make': 42,
 'development': 43,
 'will': 44,
 'let': 45,
 'me': 46,
 'also': 47,
 'my': 48,
 'heads': 49,
 'state': 50,
 'government': 51,
 'task': 52,
 'effort': 53,
 'made': 54,
 'by': 55,
 'his': 56,
 'exemplary': 57,
 'far': 58,
 'am': 59,
 'their': 60,
 'climate': 61,
 'agenda': 62,
 '2063': 63,
 'highly': 64,
 'delivering': 65,
 'same': 66,
 'out': 67,
 'fit': 68,
 'purpose': 69,
 'objectives': 70,
 'execute': 71,
 'not': 72,
 'them': 73,
 'keep': 74,
 'structure':

### Generating n-gram sequences: for each line, the first 2 words form one sequence, the first 3 words form the next sequence, and so on

In [7]:
# Expand every line into its growing token prefixes: ids[:2], ids[:3], ..., ids[:len(ids)].
# Lines with fewer than two tokens contribute nothing.
input_seq = []
for raw_line in text.split('\n'):
    # texts_to_sequences takes a list of texts; pass one line and unwrap its single result.
    line_ids = tokenizer.texts_to_sequences([raw_line])[0]
    input_seq.extend(line_ids[:stop] for stop in range(2, len(line_ids) + 1))

### Padding the sequences so they all have equal length

In [8]:
# The longest n-gram sets the common length; shorter sequences are padded at the front.
max_len = max(map(len, input_seq))
inp_seq = np.array(
    pad_sequences(input_seq, padding='pre', maxlen=max_len)
)
max_len

79

In [9]:
# Features are every token except the last; the target is the final token of each padded row.
X, y = inp_seq[:, :-1], inp_seq[:, -1]

In [10]:
# Preview the input matrix.
X

array([[  0,   0,   0, ...,   0,   0,  30],
       [  0,   0,   0, ...,   0,  30,   1],
       [  0,   0,   0, ...,  30,   1,  88],
       ...,
       [  0,   0,   0, ..., 349, 350, 351],
       [  0,   0,   0, ..., 350, 351,  16],
       [  0,   0,   0, ...,   0,   0,   9]])

In [14]:
# One-hot encode the target word ids (the loss used at compile time is categorical cross-entropy).
# The labels are read from inp_seq instead of from y itself, so re-running this cell
# cannot one-hot encode an array that has already been encoded.
y = np.array(tensorflow.keras.utils.to_categorical(inp_seq[:, -1], num_classes=input_dim))

### building model

In [15]:
# Next-word model: embedding -> LSTM -> softmax over the vocabulary.
# The input length is declared with an explicit Input instead of Embedding's
# `input_length` argument, which Keras 3 deprecates and ignores.
model = Sequential([
    tensorflow.keras.Input(shape=(max_len - 1,)),  # one row of X: max_len - 1 token ids
    Embedding(input_dim, 100),                      # 100-dimensional word vectors
    LSTM(150),                                      # 150 recurrent units
    Dense(input_dim, activation='softmax')          # one probability per word index
])




In [16]:
# categorical_crossentropy matches the one-hot encoded targets in y.
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])




In [17]:
# Train on all n-gram samples for 45 epochs; no validation data is passed.
model.fit(X,y,epochs =45,verbose=1)

Epoch 1/45


Epoch 2/45
Epoch 3/45
Epoch 4/45
Epoch 5/45
Epoch 6/45
Epoch 7/45
Epoch 8/45
Epoch 9/45
Epoch 10/45
Epoch 11/45
Epoch 12/45
Epoch 13/45
Epoch 14/45
Epoch 15/45
Epoch 16/45
Epoch 17/45
Epoch 18/45
Epoch 19/45
Epoch 20/45
Epoch 21/45
Epoch 22/45
Epoch 23/45
Epoch 24/45
Epoch 25/45
Epoch 26/45
Epoch 27/45
Epoch 28/45
Epoch 29/45
Epoch 30/45
Epoch 31/45
Epoch 32/45
Epoch 33/45
Epoch 34/45
Epoch 35/45
Epoch 36/45
Epoch 37/45
Epoch 38/45
Epoch 39/45
Epoch 40/45
Epoch 41/45
Epoch 42/45
Epoch 43/45
Epoch 44/45
Epoch 45/45


<keras.src.callbacks.History at 0x1b2b7ae1710>

### Predicting the next words with the trained model

In [18]:
def generate_text(seed_text, next_words):
    """Extend ``seed_text`` with up to ``next_words`` greedily predicted words.

    At each step the text generated so far is tokenized, left-padded (or
    truncated) to the model's input length of ``max_len - 1``, and the word
    with the highest predicted probability is appended.

    Args:
        seed_text: Starting text to continue.
        next_words: Maximum number of words to append.

    Returns:
        The seed text followed by the predicted words, separated by spaces.
    """
    generated = seed_text
    for _ in range(next_words):
        token_ids = tokenizer.texts_to_sequences([generated])[0]
        padded = pad_sequences([token_ids], maxlen=max_len - 1, padding='pre')
        # verbose=0 suppresses the progress bar Keras would print for every word.
        probabilities = model.predict(padded, verbose=0)
        # argmax over a batch of one returns a length-1 array; unwrap it to a plain int.
        predicted_id = int(np.argmax(probabilities, axis=-1)[0])
        # index_word is the tokenizer's reverse (index -> word) mapping, so no scan
        # over word_index is needed.
        next_word = tokenizer.index_word.get(predicted_id)
        if next_word is None:
            # No word has this id (e.g. the padding index 0). Appending nothing would
            # leave the tokens unchanged and repeat the same prediction, so stop here.
            break
        generated += " " + next_word
    return generated


input_text = "I recognise the admirable effort"
next_words = 10

input_text = generate_text(input_text, next_words)
print(input_text)

I recognise the admirable effort and progress made by my predecessor in this noble task


In [20]:
# Save the trained model to a .keras file in the artifacts directory.
model.save(r'C:\Users\odhia\OneDrive\Desktop\nlpProjects\artifacts\model.keras')

In [21]:
# Persist the fitted tokenizer alongside the saved model
# (presumably so the same word -> index mapping can be reloaded with it).
# `pickle` is imported in the imports cell at the top of the notebook.
# NOTE(review): only unpickle this file from a trusted location; pickle.load can execute arbitrary code.
file_path = 'C:/Users/odhia/OneDrive/Desktop/nlpProjects/artifacts/tokenizer.pkl'
with open(file_path, 'wb') as file_obj:
    pickle.dump(tokenizer, file_obj)