In [None]:
import os
import pickle
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
from skimage.color import rgb2gray
from sklearn.cluster import KMeans
import re
import unicodedata
from sklearn.preprocessing import OneHotEncoder
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Conv2D
from keras.layers import MaxPooling2D
from keras.layers import Flatten
from keras.callbacks import ModelCheckpoint

LSTM -
Generative models to mimic the writing style of the prominent philosopher Bertrand Russell

In [None]:
def remove_newlines(sentence):
    r"""Replace every escaped ``\r\n`` sequence in ``sentence`` with a space.

    The pattern matches the four literal characters backslash, ``r``,
    backslash, ``n`` (the way a line ending is spelled in the ``str()`` form
    of a bytes object), not real carriage-return/line-feed characters.

    Args:
        sentence: Text to process.

    Returns:
        The text with each escaped sequence replaced by a single space.
    """
    escaped_line_ending = re.compile(r'\\r\\n')
    return escaped_line_ending.sub(' ', sentence)

In [None]:
def remove_header(sentence):
    """Strip a leading bytes-repr / byte-order-mark remnant from cleaned text.

    The recognised prefixes are checked longest-first so that the bare
    ``b'`` prefix does not shadow the longer ones. They presumably are what
    remains of ``b'\\xef\\xbb\\xbf`` (a bytes repr starting with a UTF-8 BOM)
    once punctuation has been removed -- confirm against the cleaning steps.

    Args:
        sentence: Text that may start with one of the known prefixes.

    Returns:
        The text without the prefix. If no known prefix is present the text
        is returned unchanged (previously a single space was returned, which
        silently discarded the entire text).
    """
    for prefix in ("b'xefxbbxbf", "bxefxbbxbf", "b'"):
        if sentence.startswith(prefix):
            return sentence[len(prefix):]
    return sentence

In [None]:
def remove_end(sentence):
    """Drop a trailing space-plus-apostrophe (``" '"``) from ``sentence``.

    Args:
        sentence: Text that may end with the two characters ``" '"``.

    Returns:
        The text without that two-character suffix. If the suffix is absent
        the text is returned unchanged (previously a single space was
        returned, which silently discarded the entire text).
    """
    if sentence.endswith(" '"):
        return sentence[:-2]
    return sentence

In [None]:
def to_ascii(words):
    """Return ASCII-only versions of the given words.

    Each word is NFKD-normalised, encoded to ASCII with unencodable
    characters dropped, and decoded back to ``str``. Accented letters thus
    lose their accents; characters with no ASCII decomposition disappear.

    Args:
        words: Iterable of strings.

    Returns:
        A new list with one converted string per input word.
    """
    return [
        unicodedata.normalize('NFKD', token).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        for token in words
    ]


In [None]:
def remove_punct(words):
    """Strip everything except ASCII letters, apostrophes and spaces.

    Words that become empty after stripping are omitted from the result.

    Args:
        words: Iterable of strings.

    Returns:
        A new list of the non-empty stripped words, in input order.
    """
    stripped = (re.sub("[^a-zA-Z' ]+", '', token) for token in words)
    return [token for token in stripped if token != '']

In [None]:
def remove_controls(words):
    """Filter out empty words and words that begin with a backslash.

    Args:
        words: Iterable of strings.

    Returns:
        A new list with the remaining words, in input order.
    """
    backslash = chr(92)
    return [
        token for token in words
        if token != '' and not token.startswith(backslash)
    ]

In [None]:
def remove_roman_numerals(words):
    """Drop words that are Roman numerals for one to ten, and empty words.

    Lower-case ``i``..``x`` and upper-case ``II``..``X`` are removed.
    Upper-case ``I`` is not in the list and is therefore kept.

    Args:
        words: Iterable of strings.

    Returns:
        A new list with the remaining words, in input order.
    """
    numerals = (
        'i', 'ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix', 'x',
        'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII', 'IX', 'X',
    )
    return [
        token for token in words
        if token != '' and token not in numerals
    ]


In [None]:
def to_lower(words):
    """Return a new list containing the lower-cased form of each word.

    Args:
        words: Iterable of strings.

    Returns:
        List of lower-cased strings, in input order.
    """
    return [token.lower() for token in words]

In [None]:
def back_to_string(words):
    """Flatten a list of word lists into one space-separated string.

    Args:
        words: Iterable whose items are themselves iterables of strings
            (one inner iterable per sentence).

    Returns:
        All words joined by single spaces; an empty string for empty input.
    """
    return ' '.join(' '.join(tokens) for tokens in words)

In [None]:
def clean_text(text):
    r"""Normalise raw text into one lower-case, punctuation-free string.

    The text is split on ``.``; within each piece, escaped ``\r\n`` sequences
    become spaces and the piece is split on whitespace. Each word list then
    passes through, in order: ``remove_controls``, ``to_ascii``,
    ``remove_roman_numerals``, ``remove_punct`` and ``to_lower``. Pieces left
    with no words are skipped. The surviving words are joined with single
    spaces and the result is passed through ``remove_header``.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned text as a single string.
    """
    # Word-level filters, applied to every sentence in exactly this order.
    steps = (remove_controls, to_ascii, remove_roman_numerals, remove_punct, to_lower)

    kept = []
    for raw_sentence in text.split('.'):
        tokens = re.sub(r'\\r\\n', ' ', raw_sentence).split()
        for step in steps:
            tokens = step(tokens)
        if tokens:
            kept.append(tokens)

    return remove_header(back_to_string(kept))


In [None]:
def unique_alpha(text):
    """Return the set of distinct characters occurring in ``text``.

    The previous loop rebound its result on every iteration, so only the
    last character survived; the whole text is now used.

    Args:
        text: String (or other iterable) to inspect.

    Returns:
        A set of the distinct items of ``text``. Empty input gives an empty
        set (previously an empty string).
    """
    return set(text)

In [None]:
def scale(text):
    """Min-max scale numeric values into the range [0, 1].

    Args:
        text: Numeric container supporting ``.min()``, ``.max()`` and
            element-wise arithmetic (the notebook passes a pandas Series).

    Returns:
        ``(text - min) / (max - min)`` element-wise. When all values are equal
        the range is zero; a divisor of 1 is used in that case so the result
        is all zeros rather than NaN from a division by zero.
    """
    lowest = text.min()
    span = text.max() - lowest
    if span == 0:
        # Constant input: avoid 0/0 and map every value to 0.
        span = 1
    return (text - lowest) / span

In [None]:
def convert_to_ord(text):
    """Convert a string into a pandas Series of Unicode code points.

    Args:
        text: String whose characters are converted with ``ord``.

    Returns:
        ``pd.Series`` holding one integer code point per character, in order.
    """
    code_points = [ord(ch) for ch in text]
    return pd.Series(code_points)


In [None]:
# Source text files, opened relative to the current working directory.
titles = ['MLOE.txt','TAMatter.txt', 'OKEWFSMP.txt', 'TPP.txt', 'THWP.txt', 'TAM.txt', 'AIIMAT.txt']
cleaned_texts = []
for t in titles:
    # The context manager closes the file even if reading raises.
    with open(t, 'rb') as f:
        # str() of the raw bytes gives the "b'...'" repr, in which line endings
        # appear as literal backslash sequences; clean_text targets that form.
        s = str(f.read())
    cleaned_texts.append(clean_text(s))

# Trim trailing residue from the cleaned texts: 15 characters from text 6 and
# 2 characters from texts 0, 1, 2, 4 and 5. Text 3 ('TPP.txt') is left as is.
# NOTE(review): these lengths are hard-coded for the specific files above --
# re-check them if the source files change.
cleaned_texts[6] = cleaned_texts[6][:-15]
for idx in (0, 1, 2, 4, 5):
    cleaned_texts[idx] = cleaned_texts[idx][:-2]

corpus = ' '.join(cleaned_texts)

# Sort the vocabulary so the index -> character mapping is deterministic.
# Iterating a bare set of strings has no guaranteed order and can differ
# between interpreter sessions, which scrambles decoding when saved weights are
# loaded in a fresh kernel. Sorted order also lines up with the sorted category
# order OneHotEncoder derives for the target columns.
# NOTE(review): this assumes every corpus character also occurs as a training
# target -- confirm.
chars = sorted(set(corpus))
int_to_char = dict(enumerate(chars))

# Numeric view of the corpus: code points, then min-max scaled for the network.
corpus_converted = convert_to_ord(corpus)
corpus_scaled = scale(corpus_converted)

In [None]:
# Number of characters in each input sequence, and the step between the start
# of consecutive sequences.
window = 100
stride = 1

# Extract the scaled values once. Slicing a NumPy array per window is much
# cheaper than creating a new pandas Series for every training sample.
scaled_values = np.asarray(corpus_scaled)

string = []     # input windows of scaled code points
next_char = []  # the character that follows each window (prediction target)

for i in range(0, (len(scaled_values) - window), stride):
    string.append(scaled_values[i : (i + window)])
    next_char.append(corpus[(i + window)])

# x: (samples, window, 1) as declared by the reshape; y: one-hot targets with
# one column per distinct target character.
x = np.reshape(string, (len(string), window, 1))
y = OneHotEncoder().fit_transform(np.array(next_char).reshape(-1, 1)).toarray()

In [None]:
# First model: a single LSTM layer with one unit per distinct corpus character,
# followed by dropout and a softmax over the one-hot target columns.
memory_units = len(chars)

model = Sequential([
    LSTM(memory_units, input_shape=(x.shape[1], x.shape[2])),
    Dropout(.2),
    Dense(y.shape[1], activation='softmax'),
])
model.compile(optimizer='Adam', loss='categorical_crossentropy')

# Checkpoint file names embed the epoch number and the training loss.
filepath = "weights-improvement-{epoch:02d}-{loss:.2f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', mode='min')
callbacks_set = [checkpoint]

model.fit(x, y, epochs=60, batch_size=64, callbacks=callbacks_set)

In [None]:
# Checkpoint produced by an earlier training run of `model`.
# NOTE(review): the name embeds that run's epoch and loss -- update it to a
# file that exists after retraining.
filename = "weights-improvement-47-2.33.hdf5"
model.load_weights(filename)

# Seed text; it must contain at least `window` characters.
input_string = "there are those who take mental phenomena naively just as they would physical phenomena this school of psychologists tends not to  emphasize the ojbect"

# The training inputs were scaled with the min/max of the whole corpus, so the
# same bounds are used here. Scaling each window by its own min/max would feed
# the network values on a different scale from the one it was trained on.
ord_min = corpus_converted.min()
ord_range = corpus_converted.max() - ord_min

length = 1000
output = ''
for i in range(length):
    # Always condition on the most recent `window` characters, so each newly
    # generated character is part of the next input. (Slicing from position i
    # lagged behind the end of the seed, which is longer than `window`.)
    input_converted = convert_to_ord(input_string[-window:])
    input_scaled = (input_converted - ord_min) / ord_range
    x_ = np.array(input_scaled).reshape(1, window, 1)
    pred = model.predict(x_, verbose = 0)
    # Greedy decoding: take the most probable next character.
    index = np.argmax(pred)
    result = int_to_char[index]
    output += result
    input_string += result

In [None]:
# Display the text generated by the first model.
output

In [None]:
# Second model: same architecture as the first but with a fixed 128 LSTM units.
model2 = Sequential()
model2.add(LSTM(128, input_shape = (x.shape[1], x.shape[2])))
model2.add(Dropout(.2))
model2.add(Dense(y.shape[1], activation = 'softmax'))
model2.compile(loss = 'categorical_crossentropy', optimizer = 'Adam')

# Checkpoints for this model go into the 'model2/' directory. Create it up
# front, since not every Keras version creates missing directories when saving.
os.makedirs("model2", exist_ok = True)
filepath2 = "model2/weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint2 = ModelCheckpoint(filepath2, monitor = 'loss', mode = 'min')
callbacks_set2 = [checkpoint2]

# Use this model's own checkpoint callback. Passing the first model's
# `callbacks_set` wrote model2's weights under the first model's file names.
# Any later load_weights call for model2 must point at a file in 'model2/'.
model2.fit(x, y, batch_size = 64, epochs = 30, callbacks = callbacks_set2)

In [None]:
# NOTE(review): this name follows the first model's checkpoint pattern (no
# 'model2/' prefix, two-decimal loss). Confirm the file actually holds weights
# for the 128-unit `model2`, and update it after retraining.
filename2 = "weights-improvement-06-1.93.hdf5"
model2.load_weights(filename2)

# Seed text; it must contain at least `window` characters.
input_string2 = "there are those who take mental phenomena naively just as they would physical phenomena this school of psychologists tends not to  emphasize the ojbect"

# Scale with the corpus-wide bounds used for the training inputs, not with the
# min/max of each individual window.
ord_min2 = corpus_converted.min()
ord_range2 = corpus_converted.max() - ord_min2

length = 1000
output2 = ''
for i in range(length):
    # Every step uses this loop's own variables. Previously the first model's
    # `input_string`, `input_converted` and `x_` were referenced, so model2 was
    # fed the same stale tensor on every iteration.
    # The window is the most recent `window` characters of the growing text.
    input_converted2 = convert_to_ord(input_string2[-window:])
    input_scaled2 = (input_converted2 - ord_min2) / ord_range2
    x_2 = np.array(input_scaled2).reshape(1, window, 1)
    pred2 = model2.predict(x_2, verbose = 0)
    # Greedy decoding: take the most probable next character.
    index2 = np.argmax(pred2)
    result2 = int_to_char[index2]
    output2 += result2
    input_string2 += result2

In [None]:
# Display the text generated by the second model.
output2

