In [1]:
from keras.preprocessing.text import Tokenizer
from nltk.tokenize import sent_tokenize
import numpy as np
from sklearn.utils import shuffle
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, Masking, Embedding
from keras.callbacks import EarlyStopping, ModelCheckpoint

Using TensorFlow backend.


In [2]:
# Read the whole corpus into memory as a single string.
with open("ShakeWord.txt", "r") as corpus_file:
    shakesWords = corpus_file.read()

# Size of the corpus in characters.
len(shakesWords)

5338885

In [3]:
# Split the raw corpus into sentences with NLTK's sentence tokenizer.
shakesWordsTokens  = sent_tokenize(shakesWords)

In [4]:
# Spot-check one sentence from the tokenized corpus.
shakesWordsTokens[100]

'If my slight muse do please these curious days,\n    The pain be mine, but thine shall be the praise.'

In [5]:
# Word-level tokenizer over the full vocabulary (num_words=None).
# Only newlines are filtered out, so punctuation stays attached to the
# neighbouring word (e.g. "days," is a token of its own); text is lower-cased
# and split on single spaces.
tokenizer = Tokenizer(
    num_words=None,
    filters='\n',
    lower=True,
    split=' ',
)
tokenizer.fit_on_texts(shakesWordsTokens)

In [6]:
# Convert every sentence into a list of integer word ids.
sequences = tokenizer.texts_to_sequences(shakesWordsTokens)

In [7]:
# Integer-encoded form of sentence 100.
sequences[100]

[32,
 7,
 3111,
 3371,
 33,
 257,
 88,
 6768,
 2503,
 1,
 2088,
 17,
 715,
 19,
 262,
 31,
 17,
 1,
 3224]

In [8]:
# index_word maps each integer id back to its word (id -> word), so iterating
# over this dict yields ids, not words.
word_idx = tokenizer.index_word

# Decode sentence 100 to check that the integer encoding round-trips.
' '.join([word_idx[token_id] for token_id in sequences[100]])

'if my slight muse do please these curious days, the pain be mine, but thine shall be the praise.'

In [9]:
# Build (features, label) training pairs with a sliding window over each
# sentence: words 1-50 are the features and word 51 the label, then words 2-51
# are the features and word 52 the label, and so on.
# Sentences with 50 words or fewer produce no examples.
training_length = 50

feat = []
label = []

for seq in sequences:
    # Each start offset yields `training_length` input words followed by the
    # single word to predict.
    for start in range(len(seq) - training_length):
        target_pos = start + training_length
        feat.append(seq[start:target_pos])
        label.append(seq[target_pos])

features = np.array(feat)

In [10]:
# (number of training examples, words per example)
features.shape

(26603, 50)

In [11]:
# One-hot encode the labels: one row per example, one column per word id.
# The vocabulary size is padded by one because Keras Tokenizer ids start at 1
# (id 0 is reserved), so column 0 is never set.
nums_words = len(word_idx) + 1

# int8 keeps the (examples x vocabulary) matrix as small as possible.
label_array = np.zeros((len(features), nums_words), dtype=np.int8)

# Mark each example's target word in a single vectorised assignment.
label_array[np.arange(len(label)), np.asarray(label, dtype=np.intp)] = 1

label_array.shape

(26603, 59187)

In [12]:
# One-hot row for example 10.
label_array[10]

array([0, 0, 0, ..., 0, 0, 0], dtype=int8)

In [13]:
# Recover the label word of example 10 from the position of the 1 in its one-hot row.
word_idx[np.argmax(label_array[10])]

'thy'

In [14]:
# Run configuration constants.
RANDOM_STATE = 50
EPOCHS = 150
BATCH_SIZE = 2048
TRAINING_LENGTH = 50
TRAIN_FRACTION = 0.7
LSTM_CELLS = 64
VERBOSE = 0
SAVE_MODEL = True


def create_train_valid(features, labels, num_words, train_fraction=TRAIN_FRACTION):
    """Shuffle the examples and split them into training and validation sets.

    Args:
        features: sequence of input examples.
        labels: integer word id to predict for each example, aligned with
            `features`.
        num_words: width of the one-hot label vectors.
        train_fraction: share of the examples assigned to the training split.

    Returns:
        X_train, X_valid, y_train, y_valid -- the feature arrays of the two
        splits followed by their int8 one-hot label matrices.
    """

    def one_hot(word_ids):
        # int8 keeps the (examples x num_words) matrix as small as possible.
        encoded = np.zeros((len(word_ids), num_words), dtype=np.int8)
        encoded[np.arange(len(word_ids)), np.asarray(word_ids, dtype=np.intp)] = 1
        return encoded

    # Shuffle features and labels together, with a fixed seed for repeatability.
    features, labels = shuffle(features, labels, random_state=RANDOM_STATE)

    # Everything before the cut-off trains; the remainder validates.
    cutoff = int(train_fraction * len(labels))

    X_train = np.array(features[:cutoff])
    X_valid = np.array(features[cutoff:])

    return X_train, X_valid, one_hot(labels[:cutoff]), one_hot(labels[cutoff:])


In [15]:
# Shuffle and split the examples; labels are one-hot encoded over the full
# vocabulary plus one extra column for id 0.
X_train, X_valid, y_train, y_valid = create_train_valid(
    features, label, num_words=len(word_idx) + 1
)

In [17]:
# Load the pre-trained GloVe embeddings as a 2-D array of strings: column 0 is
# the word, the remaining columns are the vector components.
# comments=None stops loadtxt from treating '#' inside a token as a comment marker.
glove_vectors = 'glove.6B.100d.txt'
glove = np.loadtxt(glove_vectors, dtype='str', comments=None, encoding='utf8')
glove.shape

(400000, 101)

In [25]:
# Split the GloVe table into its word column and its numeric vector columns.
vectors = glove[:, 1:].astype('float')
words = glove[:, 0]

# word -> pre-trained vector
word_lookup = {word: vector for word, vector in zip(words, vectors)}

# Row k holds the vector for token id k; rows stay all-zero for id 0 and for
# words that have no GloVe entry.
embedding_matrix = np.zeros((nums_words, vectors.shape[1]))

# word_idx maps id -> word, so iterate over items(): looking up the keys
# (integer ids) in word_lookup never matches and would leave the matrix all zeros.
for token_id, word in word_idx.items():
    vector = word_lookup.get(word)

    if vector is not None:
        embedding_matrix[token_id, :] = vector

In [26]:
# NOTE(review): this cell repeats the embedding-matrix construction of the
# previous cell; one of the two can be removed.

# Split the GloVe table into its word column and its numeric vector columns.
vectors = glove[:, 1:].astype('float')
words = glove[:, 0]

# word -> pre-trained vector
word_lookup = {word: vector for word, vector in zip(words, vectors)}

# Row k holds the vector for token id k; rows stay all-zero for id 0 and for
# words that have no GloVe entry.
embedding_matrix = np.zeros((nums_words, vectors.shape[1]))

# word_idx maps id -> word, so iterate over items(): looking up the keys
# (integer ids) in word_lookup never matches and would leave the matrix all zeros.
for token_id, word in word_idx.items():
    vector = word_lookup.get(word)

    if vector is not None:
        embedding_matrix[token_id, :] = vector

In [27]:
# Next-word prediction model: frozen pre-trained embeddings -> LSTM -> dense
# softmax over the whole vocabulary.
model = Sequential()
model.add(
    Embedding(input_dim=nums_words,
              input_length=training_length,
              # Width is read from the matrix so it always matches the weights.
              output_dim=embedding_matrix.shape[1],
              weights=[embedding_matrix],
              trainable=False,
              mask_zero=True))

# NOTE(review): the Embedding above already masks id 0 via mask_zero=True; this
# extra layer presumably also masks words whose embedding row is all zeros
# (no pre-trained vector) -- confirm that this is intended.
model.add(Masking(mask_value=0.0))
# LSTM width comes from the LSTM_CELLS config constant instead of a repeated literal.
model.add(LSTM(LSTM_CELLS, return_sequences=False,
               dropout=0.1, recurrent_dropout=0.1))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
# One output unit per word id, matching the one-hot label width.
model.add(Dense(nums_words, activation='softmax'))
model.compile(
    optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [28]:
# Sanity check: look up the pre-trained vector of a single word.
word_lookup['rohan']

array([-0.53943  , -1.0541   ,  0.10218  , -0.36593  , -0.43664  ,
       -0.35874  , -0.082717 , -0.16641  ,  0.19637  ,  0.24793  ,
       -0.19658  ,  0.30248  , -0.49776  ,  0.12554  , -0.69693  ,
       -0.41446  ,  0.039589 ,  0.23527  , -0.88304  , -0.052161 ,
       -0.48762  , -0.61307  ,  0.67959  ,  0.28287  ,  0.67792  ,
       -0.23312  ,  0.61842  , -0.0095483,  0.16551  , -0.39402  ,
       -0.99557  , -0.81131  ,  0.22108  ,  0.54819  ,  0.36218  ,
       -0.16718  , -0.20254  ,  0.087024 , -0.023018 ,  0.4129   ,
       -0.1662   ,  0.21074  , -0.32115  , -0.48445  ,  0.3132   ,
       -0.5886   , -0.19807  , -0.37927  ,  0.078238 ,  0.012201 ,
        0.15415  ,  0.63248  , -0.072892 , -0.36836  , -0.25165  ,
       -0.35199  , -0.64826  ,  0.55861  , -0.98292  , -0.026837 ,
        0.095128 , -0.17437  ,  0.28334  , -0.38278  , -0.44446  ,
        0.64377  ,  0.68749  ,  0.60716  ,  0.54148  ,  1.0806   ,
       -0.21919  , -0.62316  , -0.056255 , -0.85292  , -0.0317

In [23]:
# history = model.fit(X_train,  y_train, 
#                     batch_size=2048, epochs=10,
#                     validation_data=(X_valid, y_valid))

In [24]:
# model.evaluate(X_valid, y_valid)