In [12]:
import os

authors = os.listdir("C50/C50train/")
print(len(authors))

print(authors[:10])

50
['BernardHickey', 'JimGilchrist', 'HeatherScoffield', 'AlanCrosby', 'RobinSidel', 'DarrenSchuettler', 'EricAuchard', 'MichaelConnor', 'WilliamKazer', 'GrahamEarnshaw']


In [14]:
def get_chunks(l, n):
    n = max(1, n)
    return [l[i:i+n] for i in range(0, len(l), n)]

In [2]:
def get_texts_for_author(data_path, author_name):
    fpath = os.path.join(data_path, author_name)
    fnames = os.listdir(fpath)
    text_paths = [os.path.join(fpath, name) for name in fnames]
    texts = []
    for tp in text_paths:
        with open(tp) as f:
            s = f.read()
            texts.append(s)
    return texts

In [5]:
c50_train = "C50/C50train/"

def get_all_c50(c50_path):  # path to train or tests
    all_texts = [] 
    all_labels = []
    for author in authors:
        author_texts = get_texts_for_author(c50_path, author)
        all_texts += author_texts
        all_labels += [author] * len(author_texts)
        if len(author_texts) != 50:
            print(author, "not 50")
    return all_texts, all_labels

all_texts, all_labels = get_all_c50(c50_train)

In [17]:
# concatenate texts for train authors

by_author_texts = []
author_chunks = get_chunks(all_texts, 50)
for chunk in author_chunks:
    complete_text = ' '.join([text for text in chunk])
    by_author_texts.append(complete_text)

by_author_labels = []
label_chunks = get_chunks(all_labels, 50)
for chunk in label_chunks:
    assert(len(set(chunk)) == 1)
    by_author_labels.append(chunk[0])

In [20]:
# vectorization - chars to ints
import string
import random
import sys

import numpy as np

from keras.models import load_model

def sample(preds, temperature=1.0):
    """Sample predictions from a probability array"""
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds + 1e-6) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

def generate(model, diversity=0.5, text=""):
    """Generate text from a model"""
    start_index = random.randint(0, len(text) - maxlen - 1)
    generated = ''
    sentence = text[start_index: start_index + maxlen]
    generated += sentence
    print('----- Generating with seed: "' + sentence + '"')
    sys.stdout.write(generated)

    for i in range(5000):
        x = np.zeros((1, maxlen), dtype=np.int)
        for t, char in enumerate(sentence):
            try:
                x[0, t] = char_indices[char]
            except:
                print(sentence)
        preds = model.predict(x, verbose=0)[0][0]
        next_index = sample(preds, diversity)
        next_char = indices_char[next_index]
        generated += next_char
        sentence = sentence[1:] + next_char
        sys.stdout.write(next_char)
        sys.stdout.flush()
    return

def vectorize(text):
    """Convert text into character sequences"""
    step = 3
    sentences = []
    next_chars = []
    for i in range(0, len(text) - maxlen, step):
        sentences.append(text[i: i + maxlen])
        next_chars.append(text[i + maxlen])
    X = np.zeros((len(sentences), maxlen), dtype=np.int)
    y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
    for i, sentence in enumerate(sentences):
        for t, char in enumerate(sentence):
            X[i, t] = char_indices[char]
        y[i, char_indices[next_chars[i]]] = 1
    return X, y

def clean_text(text, charset):
    text = " ".join(text.split())  # all white space is one space
    text = "".join([x for x in text if x in charset])  # remove characters that we don't care about
    return text

def get_model(modelfile, freeze=False):
    model = load_model(modelfile)
    if freeze:
        for layer in model.layers[:6]:
            layer.trainable = False
    return model

chars = " " + string.ascii_letters + string.punctuation  # sorted to keep indices consistent
charset = set(chars)  # for lookup
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

maxlen = 100  # must match length which generated model - the sequence length

# load a pretrained language model
modelfile = "charlm2/model_middlemarch_cnn.hdf5"

Using TensorFlow backend.


In [23]:
import numpy
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM, Input, Embedding, Conv1D, MaxPooling1D, BatchNormalization, GRU
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

def get_gru_model():
    model = Sequential()
    model.add(Embedding(input_dim=len(charset), output_dim=100))
    model.add(Dropout(0.1))
    model.add(BatchNormalization())
    model.add(GRU(256))
    model.add(Dropout(0.3))
    model.add(BatchNormalization())
    model.add(Dense(y.shape[1], activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model

In [26]:
# test one to make sure we don't overfit
test_text = by_author_texts[0]
ct = clean_text(test_text, charset)
X, y = vectorize(ct)
model = get_gru_model()
model.fit(X, y, validation_split=0.2, epochs=5)

Train on 37097 samples, validate on 9275 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f18687a6cf8>

In [28]:
%%time
author_models = []  # [(author_model, author_id), (author_model, author_id), ...] - ids are ints
for i, train_text in enumerate(by_author_texts):
    print("{} / {}".format(i, len(by_author_texts)))
    ct = clean_text(train_text, charset)
    am = get_gru_model()
    X, y = vectorize(ct)
    am.fit(X, y, epochs=5, batch_size=128, verbose=0)
    author_models.append((am, by_author_labels[i]))

0 / 50
1 / 50
2 / 50
3 / 50
4 / 50
5 / 50
6 / 50
7 / 50
8 / 50
9 / 50
10 / 50
11 / 50
12 / 50
13 / 50
14 / 50
15 / 50
16 / 50
17 / 50
18 / 50
19 / 50
20 / 50
21 / 50
22 / 50
23 / 50
24 / 50
25 / 50
26 / 50
27 / 50
28 / 50
29 / 50
30 / 50
31 / 50
32 / 50
33 / 50
34 / 50
35 / 50
36 / 50
37 / 50
38 / 50
39 / 50
40 / 50
41 / 50
42 / 50
43 / 50
44 / 50
45 / 50
46 / 50
47 / 50
48 / 50
49 / 50
CPU times: user 5h 20min 59s, sys: 25min 47s, total: 5h 46min 46s
Wall time: 3h 34min 2s


In [32]:
print(len(author_models))
author_models[0]
# print(len(test_texts))
# print(len(test_labels))

for (model, author) in author_models:
    fname = "./authormodels/{}.hdf5".format(author)
    model.save(fname)

50


In [49]:
author_models[:5]

[(<keras.models.Sequential at 0x7f1860c74048>, 'BernardHickey'),
 (<keras.models.Sequential at 0x7f1862db0cc0>, 'JimGilchrist'),
 (<keras.models.Sequential at 0x7f18605a4208>, 'HeatherScoffield'),
 (<keras.models.Sequential at 0x7f1848debac8>, 'AlanCrosby'),
 (<keras.models.Sequential at 0x7f18479c4f60>, 'RobinSidel')]

In [35]:
from statistics import mean
word_counts = [text.count(" ") for text in by_author_texts]
max(word_counts)

28685

In [36]:
c50_test_texts, c50_test_labels = get_all_c50("C50/C50test/")

In [40]:
from random import shuffle
indicies = list(range(len(c50_test_texts)))
shuffle(indicies)
indicies = np.array(indicies)
c50_test_texts = np.array(c50_test_texts)[indicies]
c50_test_labels = np.array(c50_test_labels)[indicies]

array(['LydiaZajc', 'MureDickie', 'RobinSidel', 'MartinWolk',
       'KirstinRidley', 'PierreTran', 'NickLouth', "LynneO'Donnell",
       'BenjaminKangLim', 'FumikoFujisaki'],
      dtype='<U17')

In [45]:
# 182 words is quite short
# Try to join 5 tests texts together
longer_test_texts = get_chunks(test_texts, 5)
longer_test_labels = get_chunks(test_labels, 5)

In [46]:
all([len(set(x)) == 1 for x in longer_test_labels])  # Make sure that all combined labels are the same

True

In [47]:
longer_test_texts = ['\n'.join(chunk) for chunk in longer_test_texts]

In [48]:
longer_test_labels = [chunk[0] for chunk in longer_test_labels]

In [103]:
c50_test_labels[:10]

array(['LydiaZajc', 'MureDickie', 'RobinSidel', 'MartinWolk',
       'KirstinRidley', 'PierreTran', 'NickLouth', "LynneO'Donnell",
       'BenjaminKangLim', 'FumikoFujisaki'],
      dtype='<U17')

In [104]:
%%time
from random import shuffle
from datetime import datetime

def get_predictions(author_models, test_texts, test_labels):
    """Evaluate each text for each author_model and append first metric to predictions"""
    indicies = list(range(len(test_texts)))

    test_texts = np.array(test_texts)
    test_labels = np.array(test_labels)

    test_texts = test_texts[indicies]
    test_labels = test_labels[indicies]

    predictions = []
    for i, text in enumerate(test_texts):
        t1 = datetime.now()
        print("{} / {}".format(i, len(test_texts)), end=" ")
        X, y = vectorize(clean_text(text, charset))

        losses = []
        for am in author_models:
            print(".", end="")
            model = am[0]
            label = am[1]
            loss = model.evaluate(X, y, verbose=0)
            losses.append((loss, label))
        print(" {}".format(datetime.now() - t1))
        predictions.append(losses)
    return predictions
    

predictions = get_predictions(author_models, c50_test_texts[:500], c50_test_labels[:500])

0 / 500 .................................................. 0:00:36.031044
1 / 500 .................................................. 0:00:44.276223
2 / 500 .................................................. 0:01:25.236239
3 / 500 .................................................. 0:00:59.698112
4 / 500 .................................................. 0:00:59.605113
5 / 500 .................................................. 0:00:49.339650
6 / 500 .................................................. 0:00:37.502912
7 / 500 .................................................. 0:00:59.707956
8 / 500 .................................................. 0:00:59.678306
9 / 500 .................................................. 0:00:59.658917
10 / 500 .................................................. 0:01:04.766140
11 / 500 .................................................. 0:00:45.952228
12 / 500 .................................................. 0:00:51.168298
13 / 500 ..........................

In [105]:
print(c50_test_labels[0])
predictions[0]

LydiaZajc


[(2.0694980791109634, 'BernardHickey'),
 (2.2756346526898836, 'JimGilchrist'),
 (1.9072724757917894, 'HeatherScoffield'),
 (2.1537862226320863, 'AlanCrosby'),
 (2.1100960619309368, 'RobinSidel'),
 (1.7867581442044616, 'DarrenSchuettler'),
 (2.0638196387158088, 'EricAuchard'),
 (2.0492954010564843, 'MichaelConnor'),
 (2.1467253969918834, 'WilliamKazer'),
 (2.0326404962746349, 'GrahamEarnshaw'),
 (2.0347747551767448, 'KevinDrawbaugh'),
 (2.0896197546377269, 'KevinMorrison'),
 (2.2370971673782396, 'KouroshKarimkhany'),
 (2.2285476653568517, 'MatthewBunce'),
 (2.1947651979723952, 'JaneMacartney'),
 (2.1129373765951338, 'TheresePoletti'),
 (2.1946548352669635, "LynneO'Donnell"),
 (2.1010171747060014, 'MarkBendeich'),
 (2.2557203732776938, 'MarcelMichelson'),
 (2.1211482880654349, 'JohnMastrini'),
 (2.154840397391895, 'TimFarrand'),
 (2.0737582707183648, 'MartinWolk'),
 (2.085022070828606, 'JonathanBirt'),
 (2.1425213341373408, 'EdnaFernandes'),
 (2.1110903235042797, 'PierreTran'),
 (2.26518

In [106]:
c50_test_labels[1]

author_index = {pred[1]: i for i, pred in enumerate(predictions[0])}

In [137]:
print(predictions[3])
print(min(predictions[3]))
print(c50_test_labels[3])

[(2.0831662245701761, 'BernardHickey'), (2.2198001456879672, 'JimGilchrist'), (2.0750580340917342, 'HeatherScoffield'), (2.1432573063200673, 'AlanCrosby'), (1.9998730573679788, 'RobinSidel'), (1.9823461254473338, 'DarrenSchuettler'), (1.6140351901656298, 'EricAuchard'), (1.9735047365793077, 'MichaelConnor'), (2.0324745573096759, 'WilliamKazer'), (2.0248399109528763, 'GrahamEarnshaw'), (1.9383127999924716, 'KevinDrawbaugh'), (1.9436475790923691, 'KevinMorrison'), (1.6844351953337422, 'KouroshKarimkhany'), (2.1069769634129005, 'MatthewBunce'), (2.0418894876831861, 'JaneMacartney'), (1.5810203720868008, 'TheresePoletti'), (2.1662888173664268, "LynneO'Donnell"), (2.0859753987355858, 'MarkBendeich'), (1.9605231668229817, 'MarcelMichelson'), (2.0734674589059772, 'JohnMastrini'), (2.0973696851687658, 'TimFarrand'), (1.6761696850922669, 'MartinWolk'), (2.0534447117799504, 'JonathanBirt'), (2.0293640977266971, 'EdnaFernandes'), (2.0233807946916338, 'PierreTran'), (1.9543541267500888, 'RogerFill

In [107]:
author_index
index_author = {author_index[author]: author for author in author_index}

In [150]:
from statistics import mean, mode
model_means = []
er = 0
for i in range(50):
    try:
        means = mode([p[i][0] for p in predictions])
        model_means.append(means)
    except Exception as e:
        er += 1
        print(e)
        print(mean([p[i][0] for p in predictions]))
print(er)

pred_is = []
for pred in predictions:
    # pred_i = [p[0] - model_means[i] for i, p in enumerate(pred)]
    pred_i = [p[0] for p in pred]
    pred_is.append(pred_i)

no unique mode; found 500 equally common values
1.99349083611
no unique mode; found 500 equally common values
2.09272587334
no unique mode; found 500 equally common values
2.0044997065
no unique mode; found 500 equally common values
2.06208831822
no unique mode; found 500 equally common values
2.01058155494
no unique mode; found 500 equally common values
1.9532321233
no unique mode; found 500 equally common values
1.94649277314
no unique mode; found 500 equally common values
1.98714570273
no unique mode; found 500 equally common values
1.92850433424
no unique mode; found 500 equally common values
1.98429591956
no unique mode; found 500 equally common values
1.95621899949
no unique mode; found 500 equally common values
1.95305268386
no unique mode; found 500 equally common values
2.05843863892
no unique mode; found 500 equally common values
2.02130407747
no unique mode; found 500 equally common values
1.92922557104
no unique mode; found 500 equally common values
1.95382156184
no unique 

In [172]:
foo = [p[3][0] for p in predictions][:10]

In [174]:
from statistics import mode
print(foo)
print(mode(foo))

[2.1537862226320863, 2.3319116932342232, 2.2289197248852783, 2.1432573063200673, 2.1117479712232012, 1.8893041892147278, 2.169575651258016, 2.0724795832830165, 2.0525566351274489, 2.1720845848592352]


StatisticsError: no unique mode; found 10 equally common values

In [142]:
model_means[:10]

[1.9934908361106956,
 2.0927258733422343,
 2.0044997065007077,
 2.0620883182184224,
 2.0105815549435544,
 1.953232123302568,
 1.9464927731370281,
 1.9871457027280786,
 1.9285043342358887,
 1.9842959195572087]

In [143]:
pred_labs = [np.argmin(pred) for pred in pred_is]

In [144]:
pred_labs = [index_author[i] for i in pred_labs]
# pred_labs

In [145]:
from sklearn.metrics import accuracy_score
accuracy_score(c50_test_labels[:500], pred_labs)

0.66000000000000003

In [47]:
c50_test_labels[:10]

array(['LydiaZajc', 'MureDickie', 'RobinSidel', 'MartinWolk',
       'KirstinRidley', 'PierreTran', 'NickLouth', "LynneO'Donnell",
       'BenjaminKangLim', 'FumikoFujisaki'],
      dtype='<U17')

In [48]:
pred_labs

[27, 37, 4, 15, 29, 24, 28, 16, 30, 40]

In [118]:
train_texts, train_labels = get_all_c50(c50_train)

In [119]:
len(train_texts)

2500

In [120]:
len(train_labels)

2500

In [121]:
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
word_vec = TfidfVectorizer(min_df=3, ngram_range=(1,2))
char_vec = TfidfVectorizer(min_df=3, ngram_range=(2,5))

fu = FeatureUnion([
    ('word', word_vec),
    ('char', char_vec)
])


X_train = fu.fit_transform(train_texts)

In [122]:
X_test = fu.transform(c50_test_texts[:500])

In [123]:
from sklearn.svm import LinearSVC

svm = LinearSVC()

In [124]:
svm.fit(X_train, train_labels)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [125]:
preds = svm.predict(X_test)

In [127]:
accuracy_score(c50_test_labels[:500], preds)

0.72399999999999998

In [None]:
X_test_longer = fu.transform(longer_test_texts)

In [None]:
preds = svm.predict(X_test_longer)

In [None]:
accuracy_score(longer_test_labels, preds)

In [None]:
X.shape

In [None]:
X, y = vectorize(clean_text(train_texts[3], charset))

In [None]:
X.shape

In [None]:
y.shape

In [None]:
y[0]

In [None]:
X[0]

In [None]:
test_author_model.layers[:6]

In [37]:
import numpy
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM, Input, Embedding, Conv1D, MaxPooling1D, BatchNormalization, GRU
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

# load ascii text and covert to lowercase

# cnn = Dropout(0.2)(embedded)
# cnn = Conv1D(128, 5, activation='relu')(cnn)
# cnn = MaxPooling1D(pool_size=4)(cnn)

model = Sequential()
model.add(Embedding(input_dim=len(charset), output_dim=100))
model.add(Dropout(0.1))
model.add(BatchNormalization())
model.add(GRU(256))
model.add(Dropout(0.3))
model.add(BatchNormalization())
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')
model.fit(X, y, epochs=13, batch_size=128, validation_split=0.1)

"""
LSTM, BatchNorm
Train on 14929 samples, validate on 1659 samples
Epoch 1/5
14929/14929 [==============================] - 75s - loss: 3.0403 - val_loss: 3.4757
Epoch 2/5
14929/14929 [==============================] - 60s - loss: 2.3156 - val_loss: 2.9687

LSTM
Train on 14929 samples, validate on 1659 samples
Epoch 1/5
14929/14929 [==============================] - 75s - loss: 3.1259 - val_loss: 2.7865
Epoch 2/5
14929/14929 [==============================] - 60s - loss: 2.6150 - val_loss: 2.3894

CNN(5), LSTM  # faster, needs more epochs
Train on 14929 samples, validate on 1659 samples
Epoch 1/5
14929/14929 [==============================] - 42s - loss: 3.1579 - val_loss: 2.9987
Epoch 2/5
14929/14929 [==============================] - 26s - loss: 2.8874 - val_loss: 2.6994
Epoch 3/5
14929/14929 [==============================] - 26s - loss: 2.6220 - val_loss: 2.4879
Epoch 4/5
14929/14929 [==============================] - 26s - loss: 2.4309 - val_loss: 2.3942
Epoch 5/5
14929/14929 [==============================] - 26s - loss: 2.2950 - val_loss: 2.2902

CNN(5), CNN(3), LSTM doesn't drop below 3.0 in 5 epochs


Embedding, BatchNorm, GRU, BatchNorm
Train on 14725 samples, validate on 1637 samples
Epoch 1/20
14725/14725 [==============================] - 16s - loss: 2.9240 - val_loss: 3.9516
Epoch 2/20
14725/14725 [==============================] - 15s - loss: 2.2447 - val_loss: 3.3667
Epoch 3/20
14725/14725 [==============================] - 15s - loss: 2.0054 - val_loss: 2.8011
Epoch 4/20
14725/14725 [==============================] - 15s - loss: 1.8388 - val_loss: 2.3477
Epoch 5/20
14725/14725 [==============================] - 15s - loss: 1.7122 - val_loss: 2.0196
Epoch 6/20
14725/14725 [==============================] - 15s - loss: 1.6069 - val_loss: 1.9417
Epoch 7/20
14725/14725 [==============================] - 15s - loss: 1.5044 - val_loss: 1.9541
Epoch 8/20
14725/14725 [==============================] - 15s - loss: 1.3987 - val_loss: 1.9512
Epoch 9/20
14725/14725 [==============================] - 15s - loss: 1.2940 - val_loss: 1.9921
Epoch 10/20
14725/14725 [==============================] - 15s - loss: 1.1850 - val_loss: 2.0424

Train on 14725 samples, validate on 1637 samples
Epoch 1/20
14725/14725 [==============================] - 17s - loss: 3.2577 - val_loss: 3.8892
Epoch 2/20
14725/14725 [==============================] - 15s - loss: 2.5444 - val_loss: 3.2364
Epoch 3/20
14725/14725 [==============================] - 15s - loss: 2.2915 - val_loss: 2.7205
Epoch 4/20
14725/14725 [==============================] - 15s - loss: 2.1258 - val_loss: 2.3477
Epoch 5/20
14725/14725 [==============================] - 15s - loss: 2.0219 - val_loss: 2.0588
Epoch 6/20
14725/14725 [==============================] - 15s - loss: 1.9386 - val_loss: 1.9478
Epoch 7/20
14725/14725 [==============================] - 15s - loss: 1.8762 - val_loss: 1.9152
Epoch 8/20
14725/14725 [==============================] - 15s - loss: 1.7994 - val_loss: 1.9076
Epoch 9/20
14725/14725 [==============================] - 15s - loss: 1.7224 - val_loss: 1.8829
Epoch 10/20
14725/14725 [==============================] - 15s - loss: 1.6783 - val_loss: 1.9007
Epoch 11/20
14725/14725 [==============================] - 15s - loss: 1.6154 - val_loss: 1.8910
Epoch 12/20
14725/14725 [==============================] - 15s - loss: 1.5485 - val_loss: 1.8920
Epoch 13/20
14725/14725 [==============================] - 15s - loss: 1.4844 - val_loss: 1.9198
Epoch 14/20
14725/14725 [==============================] - 15s - loss: 1.4291 - val_loss: 1.9193
Epoch 15/20
14725/14725 [==============================] - 15s - loss: 1.3670 - val_loss: 1.9295
Epoch 16/20
14725/14725 [==============================] - 15s - loss: 1.3028 - val_loss: 1.9752
Epoch 17/20
14725/14725 [==============================] - 15s - loss: 1.2321 - val_loss: 1.9969
Epoch 18/20
14725/14725 [==============================] - 15s - loss: 1.2159 - val_loss: 2.0431
Epoch 19/20
14725/14725 [==============================] - 15s - loss: 1.1391 - val_loss: 2.0709
Epoch 20/20
14725/14725 [==============================] - 15s - loss: 1.1321 - val_loss: 2.1172


model = Sequential()
model.add(Embedding(input_dim=len(charset), output_dim=100))
model.add(BatchNormalization())
model.add(GRU(256))
model.add(Dropout(0.5))
model.add(BatchNormalization())
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')
model.fit(X, y, epochs=20, batch_size=128, validation_split=0.1)
Train on 14725 samples, validate on 1637 samples
Epoch 1/20
14725/14725 [==============================] - 17s - loss: 3.2577 - val_loss: 3.8892
Epoch 2/20
14725/14725 [==============================] - 15s - loss: 2.5444 - val_loss: 3.2364
Epoch 3/20
14725/14725 [==============================] - 15s - loss: 2.2915 - val_loss: 2.7205
Epoch 4/20
14725/14725 [==============================] - 15s - loss: 2.1258 - val_loss: 2.3477
Epoch 5/20
14725/14725 [==============================] - 15s - loss: 2.0219 - val_loss: 2.0588
Epoch 6/20
14725/14725 [==============================] - 15s - loss: 1.9386 - val_loss: 1.9478
Epoch 7/20
14725/14725 [==============================] - 15s - loss: 1.8762 - val_loss: 1.9152
Epoch 8/20
14725/14725 [==============================] - 15s - loss: 1.7994 - val_loss: 1.9076
Epoch 9/20
14725/14725 [==============================] - 15s - loss: 1.7224 - val_loss: 1.8829
Epoch 10/20
14725/14725 [==============================] - 15s - loss: 1.6783 - val_loss: 1.9007
Epoch 11/20
14725/14725 [==============================] - 15s - loss: 1.6154 - val_loss: 1.8910
Epoch 12/20
14725/14725 [==============================] - 15s - loss: 1.5485 - val_loss: 1.8920
Epoch 13/20
14725/14725 [==============================] - 15s - loss: 1.4844 - val_loss: 1.9198
Epoch 14/20
14725/14725 [==============================] - 15s - loss: 1.4291 - val_loss: 1.9193
Epoch 15/20
14725/14725 [==============================] - 15s - loss: 1.3670 - val_loss: 1.9295
Epoch 16/20
14725/14725 [==============================] - 15s - loss: 1.3028 - val_loss: 1.9752
Epoch 17/20
14725/14725 [==============================] - 15s - loss: 1.2321 - val_loss: 1.9969
Epoch 18/20
14725/14725 [==============================] - 15s - loss: 1.2159 - val_loss: 2.0431
Epoch 19/20
14725/14725 [==============================] - 15s - loss: 1.1391 - val_loss: 2.0709
Epoch 20/20
14725/14725 [==============================] - 15s - loss: 1.1321 - val_loss: 2.1172


model = Sequential()
model.add(Embedding(input_dim=len(charset), output_dim=100))
model.add(GRU(256))
model.add(BatchNormalization())
model.add(Dropout(0.5))
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')
model.fit(X, y, epochs=20, batch_size=128, validation_split=0.1)
Train on 14725 samples, validate on 1637 samples
Epoch 1/20
14725/14725 [==============================] - 16s - loss: 3.1731 - val_loss: 3.5648
Epoch 2/20
14725/14725 [==============================] - 14s - loss: 2.4964 - val_loss: 2.9875
Epoch 3/20
14725/14725 [==============================] - 14s - loss: 2.3446 - val_loss: 2.7695
Epoch 4/20
14725/14725 [==============================] - 14s - loss: 2.2928 - val_loss: 2.6010
Epoch 5/20
14725/14725 [==============================] - 14s - loss: 2.2642 - val_loss: 2.3900
Epoch 6/20
14725/14725 [==============================] - 14s - loss: 2.2373 - val_loss: 2.5023
Epoch 7/20
14725/14725 [==============================] - 14s - loss: 2.2186 - val_loss: 2.3780
Epoch 8/20
14725/14725 [==============================] - 14s - loss: 2.2029 - val_loss: 2.4928
Epoch 9/20
14725/14725 [==============================] - 14s - loss: 2.1852 - val_loss: 2.3480
Epoch 10/20
14725/14725 [==============================] - 14s - loss: 2.1745 - val_loss: 2.4801
Epoch 11/20
14725/14725 [==============================] - 14s - loss: 2.1563 - val_loss: 2.3951
Epoch 12/20
14725/14725 [==============================] - 14s - loss: 2.1391 - val_loss: 2.4133
Epoch 13/20
14725/14725 [==============================] - 14s - loss: 2.1192 - val_loss: 2.5896
Epoch 14/20
14725/14725 [==============================] - 14s - loss: 2.1020 - val_loss: 2.2692
Epoch 15/20
14725/14725 [==============================] - 14s - loss: 2.0770 - val_loss: 2.2179
Epoch 16/20
14725/14725 [==============================] - 14s - loss: 2.0643 - val_loss: 2.2822
Epoch 17/20
 6784/14725 [============>.................] - ETA: 7s - loss: 2.0302


model = Sequential()
model.add(Embedding(input_dim=len(charset), output_dim=100))
model.add(Dropout(0.2))
model.add(BatchNormalization())
model.add(GRU(256))
model.add(Dropout(0.2))
model.add(BatchNormalization())
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')
model.fit(X, y, epochs=20, batch_size=128, validation_split=0.1)
Train on 14725 samples, validate on 1637 samples
Epoch 1/20
14725/14725 [==============================] - 17s - loss: 3.1528 - val_loss: 3.9042
Epoch 2/20
14725/14725 [==============================] - 15s - loss: 2.4658 - val_loss: 3.2093
Epoch 3/20
14725/14725 [==============================] - 15s - loss: 2.2001 - val_loss: 2.6764
Epoch 4/20
14725/14725 [==============================] - 15s - loss: 2.0358 - val_loss: 2.2919
Epoch 5/20
14725/14725 [==============================] - 15s - loss: 1.9497 - val_loss: 2.0060
Epoch 6/20
14725/14725 [==============================] - 15s - loss: 1.8588 - val_loss: 1.9313
Epoch 7/20
14725/14725 [==============================] - 15s - loss: 1.7883 - val_loss: 1.9153
Epoch 8/20
14725/14725 [==============================] - 15s - loss: 1.7034 - val_loss: 1.9145
Epoch 9/20
14725/14725 [==============================] - 15s - loss: 1.6382 - val_loss: 1.8979
Epoch 10/20
14725/14725 [==============================] - 15s - loss: 1.5827 - val_loss: 1.8864
Epoch 11/20
14725/14725 [==============================] - 15s - loss: 1.5093 - val_loss: 1.8967
Epoch 12/20
14725/14725 [==============================] - 15s - loss: 1.4472 - val_loss: 1.9040
Epoch 13/20
14725/14725 [==============================] - 15s - loss: 1.3809 - val_loss: 1.9227
Epoch 14/20
14725/14725 [==============================] - 15s - loss: 1.3225 - val_loss: 1.9469
Epoch 15/20
14725/14725 [==============================] - 15s - loss: 1.2516 - val_loss: 1.9862
Epoch 16/20
14725/14725 [==============================] - 15s - loss: 1.2094 - val_loss: 1.9963
Epoch 17/20
14725/14725 [==============================] - 15s - loss: 1.1658 - val_loss: 2.0331
Epoch 18/20
14725/14725 [==============================] - 15s - loss: 1.0851 - val_loss: 2.0452
Epoch 19/20
14725/14725 [==============================] - 15s - loss: 1.0394 - val_loss: 2.0810
Epoch 20/20
14725/14725 [==============================] - 15s - loss: 0.9903 - val_loss: 2.1283


model = Sequential()
model.add(Embedding(input_dim=len(charset), output_dim=100))
model.add(Dropout(0.3))
model.add(BatchNormalization())
model.add(GRU(256))
model.add(Dropout(0.3))
model.add(BatchNormalization())
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')
model.fit(X, y, epochs=20, batch_size=128, validation_split=0.1)
Train on 14725 samples, validate on 1637 samples
Epoch 1/20
14725/14725 [==============================] - 17s - loss: 3.2991 - val_loss: 3.8902
Epoch 2/20
14725/14725 [==============================] - 15s - loss: 2.5672 - val_loss: 3.1627
Epoch 3/20
14725/14725 [==============================] - 15s - loss: 2.2731 - val_loss: 2.6340
Epoch 4/20
14725/14725 [==============================] - 15s - loss: 2.1316 - val_loss: 2.2594
Epoch 5/20
14725/14725 [==============================] - 15s - loss: 2.0249 - val_loss: 2.0159
Epoch 6/20
14725/14725 [==============================] - 15s - loss: 1.9571 - val_loss: 1.9456
Epoch 7/20
14725/14725 [==============================] - 15s - loss: 1.8789 - val_loss: 1.9213
Epoch 8/20
14725/14725 [==============================] - 15s - loss: 1.8233 - val_loss: 1.8924
Epoch 9/20
14725/14725 [==============================] - 15s - loss: 1.7575 - val_loss: 1.8987



model = Sequential()
model.add(Embedding(input_dim=len(charset), output_dim=100))
model.add(Dropout(0.5))
model.add(BatchNormalization())
model.add(GRU(256))
model.add(Dropout(0.5))
model.add(BatchNormalization())
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')
model.fit(X, y, epochs=20, batch_size=128, validation_split=0.1)
Train on 14725 samples, validate on 1637 samples
Epoch 1/20
14725/14725 [==============================] - 17s - loss: 3.6036 - val_loss: 3.7924
Epoch 2/20
14725/14725 [==============================] - 15s - loss: 2.7765 - val_loss: 3.0022
Epoch 3/20
14725/14725 [==============================] - 15s - loss: 2.4773 - val_loss: 2.5697
Epoch 4/20
14725/14725 [==============================] - 15s - loss: 2.3218 - val_loss: 2.2606
Epoch 5/20
14725/14725 [==============================] - 15s - loss: 2.2328 - val_loss: 2.0832
Epoch 6/20
14725/14725 [==============================] - 15s - loss: 2.1748 - val_loss: 2.0248
Epoch 7/20
14725/14725 [==============================] - 15s - loss: 2.1174 - val_loss: 1.9865
Epoch 8/20
14725/14725 [==============================] - 15s - loss: 2.0617 - val_loss: 1.9640
Epoch 9/20
14725/14725 [==============================] - 15s - loss: 2.0206 - val_loss: 1.9461
Epoch 10/20
14725/14725 [==============================] - 15s - loss: 1.9758 - val_loss: 1.9334
Epoch 11/20
14725/14725 [==============================] - 15s - loss: 1.9546 - val_loss: 1.9148
Epoch 12/20
14725/14725 [==============================] - 15s - loss: 1.9045 - val_loss: 1.9121
Epoch 13/20
14725/14725 [==============================] - 15s - loss: 1.8757 - val_loss: 1.8888
Epoch 14/20
14725/14725 [==============================] - 15s - loss: 1.8437 - val_loss: 1.8874
Epoch 15/20
14725/14725 [==============================] - 15s - loss: 1.8145 - val_loss: 1.8822
Epoch 16/20
14725/14725 [==============================] - 15s - loss: 1.7805 - val_loss: 1.8785
Epoch 17/20
14725/14725 [==============================] - 15s - loss: 1.7558 - val_loss: 1.8868
Epoch 18/20
14725/14725 [==============================] - 15s - loss: 1.7218 - val_loss: 1.8670
Epoch 19/20
14725/14725 [==============================] - 15s - loss: 1.7032 - val_loss: 1.8759
Epoch 20/20
14725/14725 [==============================] - 15s - loss: 1.6832 - val_loss: 1.8834


model = Sequential()
model.add(Embedding(input_dim=len(charset), output_dim=100))
model.add(Dropout(0.5))
model.add(BatchNormalization())
model.add(GRU(256, return_sequences=True))
model.add(Dropout(0.5))
model.add(BatchNormalization())
model.add(GRU(256))
model.add(Dropout(0.5))
model.add(BatchNormalization())
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')
model.fit(X, y, epochs=20, batch_size=128, validation_split=0.1)
Train on 14725 samples, validate on 1637 samples
Epoch 1/20
14725/14725 [==============================] - 33s - loss: 3.7814 - val_loss: 3.7282
Epoch 2/20
14725/14725 [==============================] - 29s - loss: 2.8453 - val_loss: 2.8542
Epoch 3/20
14725/14725 [==============================] - 29s - loss: 2.5178 - val_loss: 2.4434
Epoch 4/20
14725/14725 [==============================] - 29s - loss: 2.3762 - val_loss: 2.1894
Epoch 5/20
14725/14725 [==============================] - 29s - loss: 2.2896 - val_loss: 2.0862
Epoch 6/20
14725/14725 [==============================] - 29s - loss: 2.2254 - val_loss: 2.0516
Epoch 7/20
14725/14725 [==============================] - 29s - loss: 2.1565 - val_loss: 2.0133
Epoch 8/20
14725/14725 [==============================] - 29s - loss: 2.1132 - val_loss: 1.9992
Epoch 9/20
14725/14725 [==============================] - 29s - loss: 2.0798 - val_loss: 1.9881
Epoch 10/20
14725/14725 [==============================] - 29s - loss: 2.0509 - val_loss: 1.9784
Epoch 11/20
14725/14725 [==============================] - 29s - loss: 2.0198 - val_loss: 1.9618
Epoch 12/20
14725/14725 [==============================] - 29s - loss: 1.9822 - val_loss: 1.9383
Epoch 13/20
14725/14725 [==============================] - 29s - loss: 1.9437 - val_loss: 1.9300
Epoch 14/20
14725/14725 [==============================] - 29s - loss: 1.9198 - val_loss: 1.9163
Epoch 15/20
14725/14725 [==============================] - 29s - loss: 1.8989 - val_loss: 1.9160
Epoch 16/20
14725/14725 [==============================] - 29s - loss: 1.8866 - val_loss: 1.9085
Epoch 17/20
14725/14725 [==============================] - 29s - loss: 1.8493 - val_loss: 1.8965
Epoch 18/20
14725/14725 [==============================] - 29s - loss: 1.8248 - val_loss: 1.8878
Epoch 19/20
14725/14725 [==============================] - 29s - loss: 1.8037 - val_loss: 1.8870
Epoch 20/20
14725/14725 [==============================] - 29s - loss: 1.7724 - val_loss: 1.8862


model = Sequential()
model.add(Embedding(input_dim=len(charset), output_dim=100))
model.add(Dropout(0.2))
model.add(BatchNormalization())
model.add(GRU(256, return_sequences=True))
model.add(Dropout(0.2))
model.add(BatchNormalization())
model.add(GRU(256))
model.add(Dropout(0.2))
model.add(BatchNormalization())
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')
model.fit(X, y, epochs=20, batch_size=128, validation_split=0.1)
Train on 14725 samples, validate on 1637 samples
Epoch 1/20
14725/14725 [==============================] - 32s - loss: 3.2809 - val_loss: 3.7595
Epoch 2/20
14725/14725 [==============================] - 29s - loss: 2.4379 - val_loss: 2.9869
Epoch 3/20
14725/14725 [==============================] - 29s - loss: 2.1504 - val_loss: 2.5361
Epoch 4/20
14725/14725 [==============================] - 29s - loss: 1.9887 - val_loss: 2.1294
Epoch 5/20
14725/14725 [==============================] - 29s - loss: 1.8984 - val_loss: 1.9727
Epoch 6/20
14725/14725 [==============================] - 29s - loss: 1.7892 - val_loss: 1.9264
Epoch 7/20
14725/14725 [==============================] - 29s - loss: 1.7172 - val_loss: 1.9100
Epoch 8/20
14725/14725 [==============================] - 29s - loss: 1.6361 - val_loss: 1.9124
Epoch 9/20
14725/14725 [==============================] - 29s - loss: 1.5621 - val_loss: 1.9122
Epoch 10/20
14725/14725 [==============================] - 29s - loss: 1.4863 - val_loss: 1.9045
Epoch 11/20
14725/14725 [==============================] - 29s - loss: 1.4150 - val_loss: 1.9278
Epoch 12/20
14725/14725 [==============================] - 29s - loss: 1.3691 - val_loss: 1.9181
Epoch 13/20
14725/14725 [==============================] - 29s - loss: 1.2970 - val_loss: 1.9414
Epoch 14/20
 1536/14725 [==>...........................] - ETA: 25s - loss: 1.1475
 
 
 
model = Sequential()
model.add(Embedding(input_dim=len(charset), output_dim=100))
model.add(BatchNormalization())
model.add(GRU(256))
model.add(Dropout(0.4))
model.add(BatchNormalization())
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')
model.fit(X, y, epochs=20, batch_size=128, validation_split=0.1)
 Train on 14725 samples, validate on 1637 samples
Epoch 1/20
14725/14725 [==============================] - 17s - loss: 3.1787 - val_loss: 3.9282
Epoch 2/20
14725/14725 [==============================] - 15s - loss: 2.5070 - val_loss: 3.2479
Epoch 3/20
14725/14725 [==============================] - 15s - loss: 2.2191 - val_loss: 2.7522
Epoch 4/20
14725/14725 [==============================] - 15s - loss: 2.0637 - val_loss: 2.3331
Epoch 5/20
14725/14725 [==============================] - 15s - loss: 1.9539 - val_loss: 2.0326
Epoch 6/20
14725/14725 [==============================] - 15s - loss: 1.8622 - val_loss: 1.9440
Epoch 7/20
14725/14725 [==============================] - 15s - loss: 1.7821 - val_loss: 1.9166
Epoch 8/20
14725/14725 [==============================] - 15s - loss: 1.7169 - val_loss: 1.8996
Epoch 9/20
14725/14725 [==============================] - 15s - loss: 1.6561 - val_loss: 1.8849
Epoch 10/20
14725/14725 [==============================] - 15s - loss: 1.5910 - val_loss: 1.9032
Epoch 11/20
14725/14725 [==============================] - 15s - loss: 1.5082 - val_loss: 1.8878
Epoch 12/20
14725/14725 [==============================] - 15s - loss: 1.4513 - val_loss: 1.9252
Epoch 13/20
 7552/14725 [==============>...............] - ETA: 7s - loss: 1.3534
 
 
model = Sequential()
model.add(Embedding(input_dim=len(charset), output_dim=100))
model.add(BatchNormalization())
model.add(GRU(512))
model.add(Dropout(0.4))
model.add(BatchNormalization())
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')
model.fit(X, y, epochs=20, batch_size=128, validation_split=0.1)
 Train on 14725 samples, validate on 1637 samples
Epoch 1/20
14725/14725 [==============================] - 23s - loss: 3.1173 - val_loss: 3.8415
Epoch 2/20
14725/14725 [==============================] - 20s - loss: 2.4202 - val_loss: 3.1512
Epoch 3/20
14725/14725 [==============================] - 20s - loss: 2.1302 - val_loss: 2.7191
Epoch 4/20
14725/14725 [==============================] - 20s - loss: 1.9435 - val_loss: 2.3341
Epoch 5/20
14725/14725 [==============================] - 20s - loss: 1.7966 - val_loss: 1.9877
Epoch 6/20
14725/14725 [==============================] - 20s - loss: 1.6621 - val_loss: 1.9349
Epoch 7/20
14725/14725 [==============================] - 20s - loss: 1.5190 - val_loss: 1.9632
Epoch 8/20
14725/14725 [==============================] - 20s - loss: 1.3925 - val_loss: 1.9735
"""

Train on 14725 samples, validate on 1637 samples
Epoch 1/13
Epoch 2/13
Epoch 3/13
Epoch 4/13
Epoch 5/13
Epoch 6/13
Epoch 7/13
Epoch 8/13
Epoch 9/13
Epoch 10/13
Epoch 11/13
Epoch 12/13
Epoch 13/13




In [None]:
model.fit(X, y, epochs=5, batch_size=128, validation_split=0.1)

In [None]:
test_author_model.summary()

In [22]:
generate(model, diversity=0.7, text="this is some test text does it really matter what it says " * 30)

----- Generating with seed: "oes it really matter what it says this is some test text does it really matter what it says this is "
oes it really matter what it says this is some test text does it really matter what it says this is 

TypeError: object of type 'numpy.float64' has no len()

In [24]:
def generate(model, diversity=0.5, text=""):
    """Generate text from a model"""
    start_index = random.randint(0, len(text) - maxlen - 1)
    generated = ''
    sentence = text[start_index: start_index + maxlen]
    generated += sentence
    print('----- Generating with seed: "' + sentence + '"')
    sys.stdout.write(generated)

    for i in range(5000):
        x = np.zeros((1, maxlen), dtype=np.int)
        for t, char in enumerate(sentence):
            try:
                x[0, t] = char_indices[char]
            except:
                print(sentence)
        preds = model.predict(x, verbose=0)[0]
        next_index = sample(preds, diversity)
        next_char = indices_char[next_index]
        generated += next_char
        sentence = sentence[1:] + next_char
        sys.stdout.write(next_char)
        sys.stdout.flush()
    return

In [26]:
generate(model, diversity=0.5, text="this is some test text does it really matter what it says " * 30)

----- Generating with seed: " what it says this is some test text does it really matter what it says this is some test text does "
 what it says this is some test text does it really matter what it says this is some test text does the chan store if the store in for Vis a singic and the places I nor for the store in find store is a gitter beat to the trist and the counders here to find. The sele the store with the storit and the chansic I was next the back the truck out. This is a git and the courder the trust and the ching store with the me to good and the next the countret to the this store is little back to the clourder the selsection for the stration's and my for probebles to have the store if friends out befould the chan net store with the park is can sele, I was not longer casteral friends. The store store is little this is this lace in findst and the inclure the selmen store in finds a befter the beet the car this store if the chan strater and closed to the counders. The strail a

KeyboardInterrupt: 