In [1]:
import re
import nltk
import numpy as np
from tensorflow import keras
from nltk.tokenize import sent_tokenize, word_tokenize

In [2]:
def encode_sentence(sentence):
    curr_s = []
    word_tok = word_tokenize(sentence)
    for i in range(len(word_tok)):
        word = word_tok[i]
        for ch in word:
            curr_ch = [0] * len(alphabet)
            curr_ch[dct[ch]] = 1
            curr_s.append(curr_ch)
        if i != (len(word_tok) - 1) and i != (len(word_tok) - 2):  # before and after '.' not space
            curr_ch = [0] * len(alphabet)
            curr_ch[dct[' ']] = 1
            curr_s.append(curr_ch)
    return curr_s

def generate_text(beginning, length=100):
    generated = ""
    e_beginning = encode_sentence(beginning)
    while True:
        pred = model.predict(np.array([e_beginning]))
        next_index = np.argmax(pred)
        next_char = alphabet[next_index]
        generated += next_char
        if next_char == '.' or len(generated) > length:
            break
        curr_ch = [0] * len(alphabet)
        curr_ch[dct[next_char]] = 1
        e_beginning.append(curr_ch)
        e_beginning = e_beginning[1:]
    return generated

def get_default():
    default = {}
    for ch in alphabet:
        default[ch] = 0
    return default

def create_markov_chain(txt, n=15):
    chain = {}
    for sentence in txt:
        for i in range(0, len(sentence) - n):
            state = sentence[i:i + n]
            next_ch = sentence[i + n]
            if not (state in chain):
                chain[state] = get_default()
            chain[state][next_ch] += 1
    for state in chain:
        total = float(sum(chain[state].values()))
        for next_char in chain[state]:
            chain[state][next_char] /= total
    return chain

def markov_generate_text(beginning, n=15, length=100):
    current_state = beginning[-n:]
    generated = ""
    while True:
        if not (current_state in markov_chain):
            next_char = np.random.choice(alphabet)
        else:
            next_char = max(markov_chain[current_state], key=markov_chain[current_state].get)
        generated += next_char
        beginning += next_char
        if next_char == '.' or len(beginning) > length:
            break
        current_state = beginning[-n:]
    return generated

In [3]:
file = open("text.txt", "r")
lines = file.readlines()
list_remove = ['\"', ',', ';', ':', '-', ')', '(', '\'', '$', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '—',
               '*', '/', 'à', 'á', 'é', 'í', 'ï', '–', 'è', '°','å', 'ë', '’', '“', '”', '′', 'å' , '&']

list_end = ['!', '…', '?']
text = ''
for item in lines:
    item = item
    for chr1 in list_remove:
        item = item.replace(chr1, ' ')
    for chr2 in list_end:
        item = item.replace(chr2, '.')
    item = re.sub('\s+', ' ', item)
    text += item.lower()
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\asus\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
data = sent_tokenize(text)

In [5]:
data

['the most merciful thing in the world i think is the inability of the human mind to correlate all its contents.',
 'we live on a placid island of ignorance in the midst of black seas of infinity and it was not meant that we should voyage far.',
 'the sciences each straining in its own direction have hitherto harmed us little but some day the piecing together of dissociated knowledge will open up such terrifying vistas of reality and of our frightful position therein that we shall either go mad from the revelation or flee from the deadly light into the peace and safety of a new dark age.',
 'theosophists have guessed at the awesome grandeur of the cosmic cycle wherein our world and human race form transient incidents.',
 'they have hinted at strange survivals in terms which would freeze the blood if not masked by a bland optimism.',
 'but it is not from them that there came the single glimpse of forbidden aeons which chills me when i think of it and maddens me when i dream of it.',
 't

In [6]:
alphabet = (set(text))
alphabet = sorted(alphabet)

In [7]:
print(len(alphabet))
print(len(data))

28
429


In [8]:
dct = {}
for j in range(len(alphabet)):
    dct[alphabet[j]] = j

In [9]:
dct

{' ': 0,
 '.': 1,
 'a': 2,
 'b': 3,
 'c': 4,
 'd': 5,
 'e': 6,
 'f': 7,
 'g': 8,
 'h': 9,
 'i': 10,
 'j': 11,
 'k': 12,
 'l': 13,
 'm': 14,
 'n': 15,
 'o': 16,
 'p': 17,
 'q': 18,
 'r': 19,
 's': 20,
 't': 21,
 'u': 22,
 'v': 23,
 'w': 24,
 'x': 25,
 'y': 26,
 'z': 27}

In [10]:
one_hot_encoding = []
for sent in data:
    current_s = encode_sentence(sent)
    one_hot_encoding.append(current_s)

In [11]:
print(one_hot_encoding[0][0])

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]


In [12]:
print(len(one_hot_encoding))
print(len(one_hot_encoding[0]))
print(len(one_hot_encoding[0][0]))

429
110
28


In [13]:
X = []
Y = []
limit = 20
for sent in one_hot_encoding:
    for j in range(len(sent) - limit):
        X.append(sent[j:j + limit])
        Y.append(sent[j + limit])
X = np.array(X)
Y = np.array(Y)

In [14]:
print(X.shape)
print(Y.shape)

(59271, 20, 28)
(59271, 28)
20


## LSTM

In [86]:
a = X.shape[1]
b = len(alphabet)
model = keras.Sequential(
    [
        keras.Input(shape=(a, b)),
        keras.layers.LSTM(256),
        keras.layers.Dropout(0.1),
        keras.layers.Dense(b, activation="softmax"),
    ]
)
model.compile(loss="categorical_crossentropy", optimizer="adam")
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_4 (LSTM)               (None, 256)               291840    
                                                                 
 dropout_4 (Dropout)         (None, 256)               0         
                                                                 
 dense_4 (Dense)             (None, 28)                7196      
                                                                 
Total params: 299,036
Trainable params: 299,036
Non-trainable params: 0
_________________________________________________________________


In [87]:
model.fit(X, Y, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x18aa1014280>

In [105]:
example = "part of the manuscript"
gen = generate_text(example)
print(example)
print(gen)
print(example + gen)

part of the manuscript
 conser of the stars and had from the sea and the seemed death forghthen commenting the pertination o
part of the manuscript conser of the stars and had from the sea and the seemed death forghthen commenting the pertination o


In [90]:
example = "i looked carefully al"
gen = generate_text(example)
print(example)
print(gen)
print(example + gen)

i looked carefully al
y sucked the cult around in ligeties which i found emines on a rites and his hideous prood in despris
i looked carefully aly sucked the cult around in ligeties which i found emines on a rites and his hideous prood in despris


In [106]:
example = "had one point in advance"
gen = generate_text(example)
print(example)
print(gen)
print(example + gen)

had one point in advance
 of the chore the professor angell bean so star with strange survivingly ritual which was willing ear
had one point in advance of the chore the professor angell bean so star with strange survivingly ritual which was willing ear


## Марковская цепь

In [93]:
markov_chain = create_markov_chain(data)

In [111]:
keys = list(markov_chain.keys())
print(markov_chain['he most mercifu'])

{' ': 0.0, '.': 0.0, 'a': 0.0, 'b': 0.0, 'c': 0.0, 'd': 0.0, 'e': 0.0, 'f': 0.0, 'g': 0.0, 'h': 0.0, 'i': 0.0, 'j': 0.0, 'k': 0.0, 'l': 1.0, 'm': 0.0, 'n': 0.0, 'o': 0.0, 'p': 0.0, 'q': 0.0, 'r': 0.0, 's': 0.0, 't': 0.0, 'u': 0.0, 'v': 0.0, 'w': 0.0, 'x': 0.0, 'y': 0.0, 'z': 0.0}


In [108]:
example = "part of the manuscript"
gen = markov_generate_text(example)
print(example)
print(gen)
print(example + gen)

part of the manuscript
 again and correlating the theosophical and anthropological notes with the cult
part of the manuscript again and correlating the theosophical and anthropological notes with the cult


In [109]:
example = "i looked carefully al"
gen = markov_generate_text(example)
print(example)
print(gen)
print(example + gen)

i looked carefully al
skcswkfano cxp.
i looked carefully alskcswkfano cxp.


In [110]:
example = "had one point in advance"
gen = markov_generate_text(example)
print(example)
print(gen)
print(example + gen)

had one point in advance
 of professor webb had been engaged forty eight years before in a tour of gre
had one point in advance of professor webb had been engaged forty eight years before in a tour of gre
