In [5]:
import numpy as np
np.random.seed(42)
import tensorflow as tf
tf.set_random_seed(42)
from keras.models import Sequential, load_model
from keras.layers import Dense, Activation
from keras.layers import LSTM, Dropout
from keras.layers import TimeDistributed
from keras.layers.core import Dense, Activation, Dropout, RepeatVector
from keras.optimizers import RMSprop
import matplotlib.pyplot as plt
import pickle
import sys
import heapq
import seaborn as sns
from pylab import rcParams

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [6]:
from nltk.corpus import PlaintextCorpusReader
import os
import sys
# Load the training corpus: take the first 500 files that the NLTK reader
# finds under "Nepali_Corpus" and merge them into one newline-separated string.
wordlists = PlaintextCorpusReader("Nepali_Corpus", '.*txt')
data = wordlists.fileids()[:500]
documents = []
for i in data:
    # Decode explicitly: without `encoding`, open() uses the platform's locale
    # encoding, so the same notebook can fail or mis-decode on another machine.
    # Assumes the corpus files are stored as UTF-8 -- TODO confirm.
    with open(os.path.join("Nepali_Corpus", i), encoding="utf-8") as file:
        documents.append(file.read())
# `text` is bound only once, to the final string, instead of being a list
# first and a string afterwards.
text = "\n".join(documents)

In [7]:
# Character vocabulary: every distinct character of the corpus, sorted so the
# index assigned to each character is the same on every run.
chars = sorted(set(text))

# Two lookup tables over the same enumeration:
#   char_indices: character -> integer index
#   indices_char: integer index -> character
char_indices = {symbol: position for position, symbol in enumerate(chars)}
indices_char = dict(enumerate(chars))
print("Chars", len(char_indices))
print("No. of Chars", len(text))

Chars 102
No. of Chars 1052559


In [8]:
# Sliding-window settings: every sample is SEQUENCE_LENGTH characters long and
# consecutive windows start `step` characters apart.
SEQUENCE_LENGTH = 40
step = 3

# Window start offsets; stopping at len(text) - SEQUENCE_LENGTH guarantees that
# one more character exists after each window to act as the prediction target.
window_starts = range(0, len(text) - SEQUENCE_LENGTH, step)

# sentences[k] is the k-th input window, next_chars[k] the character that
# immediately follows it in the corpus.
sentences = [text[start:start + SEQUENCE_LENGTH] for start in window_starts]
next_chars = [text[start + SEQUENCE_LENGTH] for start in window_starts]
print("Comlpeted")


Comlpeted


In [9]:
# Pre-allocate the one-hot training tensors, initially all False:
#   X: (number of windows, SEQUENCE_LENGTH, vocabulary size)
#   y: (number of windows, vocabulary size)
# Use the builtin `bool` as dtype: `np.bool` was only a deprecated alias for
# it and no longer exists in NumPy >= 1.24, so the old spelling raises
# AttributeError there. The resulting arrays are identical.
X = np.zeros((len(sentences), SEQUENCE_LENGTH, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
X.shape, y.shape

((350840, 40, 102), (350840, 102))

In [10]:
# Fill the pre-allocated tensors: for every window, switch on the vocabulary
# slot of each character in X, and the slot of the following character in y.
for row, (window, target) in enumerate(zip(sentences, next_chars)):
    for position, symbol in enumerate(window):
        X[row, position, char_indices[symbol]] = 1
    y[row, char_indices[target]] = 1

# Sanity check: the one-hot vector of the first character of the first window.
print(X[0, 0])

[False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False  True False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False]


In [11]:

# Network: one 128-unit LSTM over the one-hot character window, followed by a
# dense layer with one output per vocabulary character and a softmax.
model = Sequential()
model.add(LSTM(128, input_shape=(SEQUENCE_LENGTH, len(chars))))
model.add(Dense(len(chars)))
model.add(Activation('softmax'))

optimizer = RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Train for 20 epochs, holding out 5% of the samples for validation; keep only
# the per-epoch metrics dictionary of the returned History object.
history = model.fit(X, y, validation_split=0.05, batch_size=128, epochs=20, shuffle=True).history

# Persist the trained model and its training history next to the notebook.
model.save('keras_modelseq2.h5')
# Write through a context manager so the file handle is flushed and closed
# deterministically instead of being left to the garbage collector.
with open("historyseq2.p", "wb") as history_file:
    pickle.dump(history, history_file)

Train on 333298 samples, validate on 17542 samples
Epoch 1/20
  2560/333298 [..............................] - ETA: 13:41 - loss: 3.6727 - acc: 0.1145

KeyboardInterrupt: 

In [12]:
# Bare expression: the notebook displays the repr of the current `model` object.
model

<keras.models.Sequential at 0x7fd89bd065c0>

In [23]:
# Replace the in-memory model with the one stored in 'keras_modelseq2.h5',
# the same path the training cell passes to model.save().
# NOTE(review): the recorded training output ends in KeyboardInterrupt before
# model.save() could run, so this file presumably comes from an earlier,
# completed run -- confirm that it exists before relying on Run All.
model = load_model('keras_modelseq2.h5')

In [24]:

def prepare_input(text):
    """One-hot encode `text` into a single-sample batch for the model.

    Args:
        text: string whose characters are all keys of `char_indices` and whose
            length does not exceed SEQUENCE_LENGTH.

    Returns:
        Array of shape (1, SEQUENCE_LENGTH, len(chars)) filled with zeros,
        except for a 1 at [0, t, char_indices[text[t]]] for every position t.

    Raises:
        KeyError: if a character is missing from `char_indices`.
        IndexError: if `text` is longer than SEQUENCE_LENGTH.
    """
    encoded = np.zeros((1, SEQUENCE_LENGTH, len(chars)))
    # View onto the only sample of the batch; writes go straight into `encoded`.
    window = encoded[0]
    for position, symbol in enumerate(text):
        window[position, char_indices[symbol]] = 1
    return encoded

def sample(preds, top_n=3):
    """Return the indices of the `top_n` largest entries of `preds`.

    Despite the name, nothing is drawn at random: this is a deterministic
    top-n selection.

    Args:
        preds: one-dimensional sequence of scores (NumPy array, list or tuple).
        top_n: maximum number of indices to return.

    Returns:
        List of integer indices ordered from the highest score to the lowest,
        holding at most `top_n` elements (fewer if `preds` is shorter).
    """
    # Coerce once so that plain Python sequences are accepted as well; an
    # ndarray passes through unchanged.
    scores = np.asarray(preds)
    return heapq.nlargest(top_n, range(len(scores)), key=scores.take)

def predict_completion(text, max_length=100):
    """Greedily extend `text` one character at a time until a space is produced.

    At every step the current window is encoded with prepare_input(), the
    model's most likely next character is appended to the completion, and the
    window is shifted left by one character to make room for it.

    Args:
        text: seed window passed to prepare_input().
        max_length: upper bound on the number of generated characters. It
            guards against a model that never predicts a space, which would
            otherwise keep the loop running forever.

    Returns:
        The generated characters as a string. It ends with ' ' unless the
        `max_length` bound was reached first.
    """
    completion = ''
    for _ in range(max_length):
        x = prepare_input(text)
        preds = model.predict(x, verbose=0)[0]
        next_index = sample(preds, top_n=1)[0]
        next_char = indices_char[next_index]

        # Slide the window: drop the oldest character, append the new one.
        text = text[1:] + next_char
        completion += next_char
        if next_char == ' ':
            break
    return completion

def predict_completions(text, n):
    """Build completions for the `n` most likely next characters of `text`.

    Args:
        text: seed window passed to prepare_input().
        n: number of candidate first characters to expand.

    Returns:
        A single string: for each candidate, in the order returned by
        sample(), the candidate character followed by the result of
        predict_completion() on the shifted window, all concatenated without
        a separator.
    """
    encoded = prepare_input(text)
    distribution = model.predict(encoded, verbose=0)[0]

    pieces = []
    for candidate in sample(distribution, n):
        first_char = indices_char[candidate]
        # Shift the window by one so it ends with the candidate character.
        pieces.append(first_char + predict_completion(text[1:] + first_char))
    return "".join(pieces)

# Seed sentences used to try the model out.
quotes = [
    "एसिड प्रहारबाट घाइते भएकी महिलाको स्वास्",
    "प्रमुख निर्वाचन आयुक्त अयोधिप्रसाद यादवको घरमा बम विस्फोट भएको भन्दै"
]

for q in quotes:
    # Cut the seed to the model's window size through the shared constant
    # rather than a second hard-coded 40, so both stay in sync.
    # NOTE(review): the visible cells never lower-case the training text, so
    # .lower() is at best redundant here -- confirm whether it should stay.
    seq = q[:SEQUENCE_LENGTH].lower()
    print(seq)
    print(predict_completions(seq, 5))
    print()

एसिड प्रहारबाट घाइते भएकी महिलाको स्वास्
 र ा ्न 
का !ी 

प्रमुख निर्वाचन आयुक्त अयोधिप्रसाद यादवक
 स्ट ा ्ष 
काठ !ी 



In [25]:
\

ready
