# LGM DATA SCIENCE INTERNSHIP
# NAME: VAVILA S S V S SIRI SUDHEEKSHA

# ADVANCED LEVEL TASK-2
# NEXT WORD PREDICTION

##   import Libraries

In [23]:
import  numpy as np
from  nltk.tokenize import RegexpTokenizer
from keras.models import Sequential, load_model
from keras.layers import Dense
from keras.layers import LSTM
from tensorflow.keras.optimizers import RMSprop
import  pickle
import heapq



##  Load data

In [27]:
# Read the whole corpus into memory and lower-case it, so every token derived
# from it is lower-case. The context manager closes the file handle as soon
# as the read finishes instead of leaving it open until garbage collection.
with open('1661-0.txt', encoding='UTF-8') as corpus_file:
    text = corpus_file.read().lower()

In [28]:
# Report the size of the lower-cased corpus, measured in characters.
print('corpus length:\t', len(text))

corpus length:	 581888


### Split the entire dataset into individual words

In [29]:
# r'\w+' matches runs of word characters, so punctuation and whitespace are
# discarded and only word tokens are kept.
tokenizer = RegexpTokenizer(r'\w+')
# Every word token of the corpus, in reading order.
words = tokenizer.tokenize(text)

### Build the sorted list of unique words.

In [30]:
# np.unique returns the distinct words as a sorted array; a word's position
# in this array serves as its one-hot index further down.
unique_words= np.unique(words)
# Vocabulary size (shown as the cell's output).
len(unique_words)

8201

In [31]:
# Lookup table: word -> its position in the sorted vocabulary array.
unique_word_index = {
    word: position for position, word in enumerate(unique_words)
}

### Feature Engineering & One Hot Encoding

In [32]:
# Number of consecutive words used as context for predicting the next one.
WORD_LENGTH = 5

# prev_words[k] is the window of WORD_LENGTH words starting at position k,
# and next_words[k] is the word that immediately follows that window.
prev_words = [
    words[start:start + WORD_LENGTH]
    for start in range(len(words) - WORD_LENGTH)
]
next_words = words[WORD_LENGTH:]

# Show the first (context, target) pair as a sanity check.
print(prev_words[0])
print(next_words[0])

['project', 'gutenberg', 's', 'the', 'adventures']
of


### X: stores the features; Y: stores the corresponding label (here, the next word)

In [33]:
# One-hot encode the training data.
#   X[i, j, k] is True when the j-th word of context window i is vocabulary
#              word k.
#   Y[i, k]    is True when the word following window i is vocabulary word k.
# The window length must be the WORD_LENGTH constant defined together with
# prev_words/next_words; no lower-case `word_length` name exists in this file.
X = np.zeros((len(prev_words), WORD_LENGTH, len(unique_words)), dtype=bool)
Y = np.zeros((len(next_words), len(unique_words)), dtype=bool)
for i, each_words in enumerate(prev_words):
    for j, each_word in enumerate(each_words):
        X[i, j, unique_word_index[each_word]] = True
    Y[i, unique_word_index[next_words[i]]] = True


In [34]:
# One-hot row for the first word of the first context window.
print(X[0][0])

[False False False ... False False False]


## model building

In [35]:
# Next-word model: one LSTM layer with 128 units reads a window of
# WORD_LENGTH one-hot word vectors, and a softmax Dense layer outputs one
# probability per vocabulary word.
# input_shape uses the WORD_LENGTH constant so it matches the shape of X; no
# lower-case `word_length` name exists in this file.
model = Sequential()
model.add(LSTM(128, input_shape=(WORD_LENGTH, len(unique_words))))
model.add(Dense(len(unique_words), activation='softmax'))

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 128)               4264960   
_________________________________________________________________
dense_1 (Dense)              (None, 8201)              1057929   
Total params: 5,322,889
Trainable params: 5,322,889
Non-trainable params: 0
_________________________________________________________________


### Training

In [36]:
# `learning_rate` is the supported keyword for the optimizer step size; the
# older `lr` alias is deprecated and rejected by recent Keras releases.
optimizer = RMSprop(learning_rate=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
# Hold out 5% of the samples for validation. Only the `.history` dict of
# per-epoch metrics is kept, since that is what gets pickled afterwards.
history = model.fit(X, Y, validation_split=0.05, batch_size=128, epochs=2, shuffle=True).history


Epoch 1/2
Epoch 2/2


### Save and reload the model

In [37]:
# Persist the trained model and its training history, then read both back.
# The pickle file is opened through context managers so the written bytes
# are flushed and the handle is closed before the file is read again.
model.save('keras_next_word_model.h5')
with open("history.p", "wb") as history_file:
    pickle.dump(history, history_file)

model = load_model('keras_next_word_model.h5')
# NOTE: pickle.load executes arbitrary code from the file; this is acceptable
# here only because the file was written by this same script just above.
with open("history.p", "rb") as history_file:
    history = pickle.load(history_file)

## Testing on user input data 

In [38]:
def prepare_input(text):
    """One-hot encode a whitespace-separated word sequence for the model.

    Args:
        text: Words separated by whitespace. There may be at most
            WORD_LENGTH words and each must be a key of
            ``unique_word_index`` (the vocabulary is lower-case because the
            corpus is lower-cased when it is loaded).

    Returns:
        A float array of shape ``(1, WORD_LENGTH, len(unique_words))`` where
        row ``t`` holds a 1 at the vocabulary index of the t-th word. Rows
        past the number of supplied words stay all-zero.

    Raises:
        KeyError: If a word is not in the vocabulary.
        IndexError: If ``text`` contains more than WORD_LENGTH words.
    """
    # The window length is the WORD_LENGTH constant the model was built with.
    x = np.zeros((1, WORD_LENGTH, len(unique_words)))
    for t, word in enumerate(text.split()):
        print(word)  # echo each word as it is encoded
        x[0, t, unique_word_index[word]] = 1
    return x


# Demo: encode a five-word phrase; the array is shown as the cell's output.
prepare_input("It is not a lack".lower())

it
is
not
a
lack


array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]])

### To choose the best possible n words

In [39]:
def sample(preds, top_n=3):
    """Return the indices of the ``top_n`` largest entries of ``preds``.

    Args:
        preds: One-dimensional sequence of scores (e.g. softmax
            probabilities), one per vocabulary word.
        top_n: How many indices to return.

    Returns:
        A list of at most ``top_n`` integer positions into ``preds``, ordered
        from the highest score to the lowest. Ties keep their original order.
    """
    preds = np.asarray(preds).astype('float64')
    # The ranking is taken on the raw scores. Passing them through
    # log -> exp -> renormalise does not change the order, but it emits a
    # divide-by-zero warning for zero probabilities and yields NaN when every
    # score is zero.
    return heapq.nlargest(top_n, range(len(preds)), preds.take)

### Function for prediction

In [40]:
def predict_completions(text, n=3):
    """Suggest the ``n`` most likely next words for ``text``.

    Args:
        text: Context words in the form accepted by ``prepare_input``.
        n: Number of candidate words to return.

    Returns:
        The string ``"0"`` when ``text`` is the empty string; otherwise a
        list of ``n`` vocabulary words, most probable first.
    """
    # Empty input short-circuits with the "0" sentinel, before any encoding
    # or model call happens.
    if text == "":
        return "0"

    encoded = prepare_input(text)
    probabilities = model.predict(encoded, verbose=0)[0]
    best_indices = sample(probabilities, n)
    return [unique_words[index] for index in best_indices]

### Here, we use tokenizer.tokenize to remove the punctuation, and we keep only the first 5 words because the model predicts from the 5 previous words.

In [41]:
# Demo: predict the word that follows the opening words of a sentence.
q =  "Light the candle instead of cursuing darkness"
print("correct sentence: ",q)
# Tokenise the same way as the corpus (lower-case, punctuation removed) and
# keep exactly WORD_LENGTH words, the window size the model was built for.
# Slicing by the constant keeps the demo valid if the window size changes.
seq = " ".join(tokenizer.tokenize(q.lower())[:WORD_LENGTH])
print("Sequence: ",seq)
print("next possible words: ", predict_completions(seq, 5))

correct sentence:  Light the candle instead of cursuing darkness
Sequence:  light the candle instead of
light
the
candle
instead
of
next possible words:  ['the', 'his', 'a', 'my', 'it']
