In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np

In [2]:
# Read the whole novel and lowercase every line up front.
with open('crime_and_punishment.txt', encoding='utf-8') as f:
    lines = [text_line.lower() for text_line in f.readlines()]

# Let's separate the data based on sentences: every line is cut at each full stop.
raw_sentences = [piece for text_line in lines for piece in text_line.split('.')]
print(' --- Sentences before additional cleaning --- ')
print(raw_sentences[:10])
print(f'Number of sentences: {len(raw_sentences)}')

# Those \ns sure are annoying... keep only the text before the first newline
# of each piece, and drop the pieces that end up empty.
first_parts = (piece.split('\n')[0] for piece in raw_sentences)
sentences = [part for part in first_parts if part]

print(' --- Sentences after additional cleaning --- ')
print(sentences[:10])
print(f'Number of sentences: {len(sentences)}')

 --- Sentences before additional cleaning --- 
['part i\n', 'chapter i\n', '\n', 'on an exceptionally hot evening early in july a young man came out of the garret in which he lodged in s', ' place and walked slowly, as though in hesitation, towards k', ' bridge', '\n', '\n', 'he had successfully avoided meeting his landlady on the staircase', ' his garret was under the roof of a high, five-storied house and was more like a cupboard than a room']
Number of sentences: 23649
 --- Sentences after additional cleaning --- 
['part i', 'chapter i', 'on an exceptionally hot evening early in july a young man came out of the garret in which he lodged in s', ' place and walked slowly, as though in hesitation, towards k', ' bridge', 'he had successfully avoided meeting his landlady on the staircase', ' his garret was under the roof of a high, five-storied house and was more like a cupboard than a room', ' the landlady who provided him with garret, dinners, and attendance, lived on the floor below, 

In [3]:
# It's already time to tokenize! Let's check the total vocabulary size
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fit the tokenizer on the cleaned sentences, then report how many distinct
# words ended up in its word -> index mapping.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
vocabulary = tokenizer.word_index
print(len(vocabulary))

10732


In [4]:
# Build the training set for next-word prediction.
# Every prefix (of length >= 2) of every tokenized sentence becomes one example:
# all tokens but the last are the predictors, the last token is the label.
max_sentence_length = 100
input_sequences = []
for sentence in sentences:
    token_list = tokenizer.texts_to_sequences([sentence])[0]
    input_sequences.extend(token_list[:end] for end in range(2, len(token_list) + 1))

# Left-pad short prefixes with zeros so the label always sits in the last column.
# Prefixes longer than the window are truncated from the FRONT ('pre'): this keeps
# the most recent tokens and, crucially, the prefix's own final token as the label.
# With truncating='post' every over-long prefix would be cut down to the same first
# max_sentence_length + 1 tokens, producing duplicate examples with the wrong label.
# pad_sequences already returns a NumPy array, so no extra np.array() is needed.
input_sequences = pad_sequences(
    input_sequences,
    maxlen=max_sentence_length + 1,
    padding='pre',
    truncating='pre',
)
word_index = tokenizer.word_index
# +1 because index 0 is the padding value and is not present in word_index.
num_tokens = len(word_index) + 1

predictors, label = input_sequences[:, :-1], input_sequences[:, -1]

# One-hot encode the labels; this yields a dense (num_examples, num_tokens) matrix.
label = keras.utils.to_categorical(label, num_classes=num_tokens)

In [5]:
# Sanity check: width of one one-hot label row, then width of one predictor row.
for array in (label, predictors):
    print(len(array[0]))

10733
100


In [6]:
# Let's use the GloVe word embeddings for this project, available here:
# nlp.stanford.edu/data/glove.6B.zip
# https://nlp.stanford.edu/pubs/glove.pdf
# Jeffrey Pennington, Richard Socher, and Christopher D. Manning. 2014.  GloVe: Global Vectors for Word Representation.
#
# Each line of the file is "<word> <coef> <coef> ..."; map every word to its
# coefficient vector (parsed as float32).
embeddings_index = {}
with open('glove.6B.100d.txt', encoding='utf-8') as glove_file:
    for row in glove_file:
        token, vector_text = row.split(maxsplit=1)
        embeddings_index[token] = np.fromstring(vector_text, "f", sep=" ")

In [7]:
# This code is adapted from here:
# https://keras.io/examples/nlp/pretrained_word_embeddings/
embedding_dim = 100

# Prepare embedding matrix: row i holds the pretrained vector of the word whose
# tokenizer index is i.
embedding_matrix = np.zeros((num_tokens, embedding_dim))
hits = misses = 0
for token, row in word_index.items():
    vector = embeddings_index.get(token)
    if vector is None:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        misses += 1
        continue
    embedding_matrix[row] = vector
    hits += 1
print("Converted %d words (%d misses)" % (hits, misses))
# That should do!
# That should do!

Converted 8920 words (1812 misses)


In [50]:
# Let's make the model, first editing the Embedding layer
from tensorflow.keras.layers import Embedding

# Start the embedding weights from the prepared embedding_matrix; trainable=True
# lets training keep adjusting them.
pretrained_initializer = keras.initializers.Constant(embedding_matrix)
embedding_layer = Embedding(
    input_dim=num_tokens,
    output_dim=embedding_dim,
    embeddings_initializer=pretrained_initializer,
    input_length=max_sentence_length,
    trainable=True,
)

In [60]:
import tensorflow.keras.layers as layers
import tensorflow.keras.regularizers as regularizers

# Network: embedding -> bidirectional LSTM -> LSTM -> LSTM -> softmax over the
# vocabulary, with dropout after each of the first two recurrent layers.
inputs = keras.Input(shape=(max_sentence_length,))
embedded = embedding_layer(inputs)

bidirectional_out = layers.Bidirectional(
    layers.LSTM(1024, return_sequences=True)
)(embedded)
bidirectional_out = layers.Dropout(0.2)(bidirectional_out)

sequence_out = layers.LSTM(512, return_sequences=True)(bidirectional_out)
sequence_out = layers.Dropout(0.2)(sequence_out)

# The last LSTM returns only its final state, which feeds the classifier.
summary_vector = layers.LSTM(512)(sequence_out)
preds = layers.Dense(num_tokens, activation='softmax')(summary_vector)

model = keras.Model(inputs, preds)
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

Model: "functional_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_7 (InputLayer)         [(None, 100)]             0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 100, 100)          1073300   
_________________________________________________________________
bidirectional_6 (Bidirection (None, 100, 2048)         9216000   
_________________________________________________________________
dropout_9 (Dropout)          (None, 100, 2048)         0         
_________________________________________________________________
lstm_16 (LSTM)               (None, 100, 512)          5244928   
_________________________________________________________________
dropout_10 (Dropout)         (None, 100, 512)          0         
_________________________________________________________________
lstm_17 (LSTM)               (None, 512)             

In [61]:
# Train on all (predictors, label) pairs; verbose=2 prints one line per epoch.
history = model.fit(
    x=predictors,
    y=label,
    batch_size=64,
    epochs=10,
    verbose=2,
)

Epoch 1/10
3033/3033 - 666s - loss: 6.2833 - accuracy: 0.0691
Epoch 2/10
3033/3033 - 667s - loss: 5.3824 - accuracy: 0.1400
Epoch 3/10
3033/3033 - 667s - loss: 4.9588 - accuracy: 0.1668
Epoch 4/10
3033/3033 - 668s - loss: 4.6732 - accuracy: 0.1824
Epoch 5/10
3033/3033 - 668s - loss: 4.4416 - accuracy: 0.1955
Epoch 6/10
3033/3033 - 667s - loss: 4.2223 - accuracy: 0.2084
Epoch 7/10
3033/3033 - 669s - loss: 4.0106 - accuracy: 0.2225
Epoch 8/10
3033/3033 - 670s - loss: 3.8169 - accuracy: 0.2377
Epoch 9/10
3033/3033 - 670s - loss: 3.6262 - accuracy: 0.2557
Epoch 10/10
3033/3033 - 670s - loss: 3.4496 - accuracy: 0.2757


In [76]:
# Using the model to generate some new text
# Starting from seed_text, repeatedly predict the most likely next word and
# append it, feeding the growing text back into the model each time.
seed_text = "upon"
next_words = 100

for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    # Left-pad (or keep only the most recent tokens) to the model's input length.
    token_list = pad_sequences([token_list], maxlen=max_sentence_length, padding='pre')
    # predict() returns one probability row per input; we pass a single input,
    # so take row 0 and reduce it to a plain int index.
    probabilities = model.predict(token_list, verbose=0)[0]
    predicted = int(np.argmax(probabilities))
    # index_word is the tokenizer's index -> word mapping; index 0 (padding)
    # has no word, in which case an empty string is appended as before.
    output_word = tokenizer.index_word.get(predicted, '')
    seed_text += ' ' + output_word

print(seed_text)

upon my word i am not going to be a card chantant and have a great deal of it and i am not going to be a card chantant and have a debate to her to day and i am not going to be a card chantant and have a debate to him to day ” he said suddenly raising his head and laughing maliciously gibing at the axe “i am not going to be crucified crucified on the kingdom of heaven for me and i am glad to be crucified crucified on the children and sister living with a subscription


In [None]:
# quite strange text, but it sure does sound like Dostoevsky!