In [1]:
import os

authors = os.listdir("/data/C50/C50train/")
print(len(authors))

50


In [2]:
def get_chunks(l, n):
    n = max(1, n)
    return [l[i:i+n] for i in range(0, len(l), n)]

In [3]:
def get_texts_for_author(data_path, author_name):
    fpath = os.path.join(data_path, author_name)
    fnames = os.listdir(fpath)
    text_paths = [os.path.join(fpath, name) for name in fnames]
    texts = []
    for tp in text_paths:
        with open(tp) as f:
            s = f.read()
            texts.append(s)
    return texts

In [4]:
c50_test = "/data/C50/C50test/"

def get_all_c50(c50_path):  # path to train or tests
    all_texts = [] 
    all_labels = []
    for author in authors:
        author_texts = get_texts_for_author(c50_path, author)
        all_texts += author_texts
        all_labels += [author] * len(author_texts)
        if len(author_texts) != 50:
            print(author, "not 50")
    return all_texts, all_labels

c50_test_texts, c50_test_labels = get_all_c50(c50_test)

In [5]:
# vectorization - chars to ints
import string
import random
import sys

import numpy as np

from keras.models import load_model

def sample(preds, temperature=1.0):
    """Sample predictions from a probability array"""
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds + 1e-6) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

def generate(model, diversity=0.5, text=""):
    """Generate text from a model"""
    start_index = random.randint(0, len(text) - maxlen - 1)
    generated = ''
    sentence = text[start_index: start_index + maxlen]
    generated += sentence
    print('----- Generating with seed: "' + sentence + '"')
    sys.stdout.write(generated)

    for i in range(5000):
        x = np.zeros((1, maxlen), dtype=np.int)
        for t, char in enumerate(sentence):
            try:
                x[0, t] = char_indices[char]
            except:
                print(sentence)
        preds = model.predict(x, verbose=0)[0][0]
        next_index = sample(preds, diversity)
        next_char = indices_char[next_index]
        generated += next_char
        sentence = sentence[1:] + next_char
        sys.stdout.write(next_char)
        sys.stdout.flush()
    return

def vectorize(text):
    """Convert text into character sequences"""
    step = 3
    sentences = []
    next_chars = []
    for i in range(0, len(text) - maxlen, step):
        sentences.append(text[i: i + maxlen])
        next_chars.append(text[i + maxlen])
    X = np.zeros((len(sentences), maxlen), dtype=np.int)
    y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
    for i, sentence in enumerate(sentences):
        for t, char in enumerate(sentence):
            X[i, t] = char_indices[char]
        y[i, char_indices[next_chars[i]]] = 1
    return X, y

def clean_text(text, charset):
    text = " ".join(text.split())  # all white space is one space
    text = "".join([x for x in text if x in charset])  # remove characters that we don't care about
    return text

def get_model(modelfile, freeze=False):
    model = load_model(modelfile)
    if freeze:
        for layer in model.layers[:6]:
            layer.trainable = False
    return model

chars = " " + string.ascii_letters + string.punctuation  # sorted to keep indices consistent
charset = set(chars)  # for lookup
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

maxlen = 100  # must match length which generated model - the sequence length

# load a pretrained language model
modelfile = "charlm2/model_middlemarch_cnn.hdf5"

Using TensorFlow backend.


In [6]:
import numpy
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM, Input, Embedding, Conv1D, MaxPooling1D, BatchNormalization, GRU
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

def get_gru_model():
    model = Sequential()
    model.add(Embedding(input_dim=len(charset), output_dim=100))
    model.add(Dropout(0.1))
    model.add(BatchNormalization())
    model.add(GRU(256))
    model.add(Dropout(0.3))
    model.add(BatchNormalization())
    model.add(Dense(y.shape[1], activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model

In [7]:
from random import shuffle
indicies = list(range(len(c50_test_texts)))
shuffle(indicies)
indicies = np.array(indicies)
c50_test_texts = np.array(c50_test_texts)[indicies]
c50_test_labels = np.array(c50_test_labels)[indicies]

In [8]:
author_models = []
author_model_files = os.listdir("/data/authormodels/")
for fn in author_model_files:
    fp = os.path.join("/data/authormodels", fn)
    print(fp)
    model = load_model(os.path.join("/data/authormodels", fn))
    author_models.append((model, fn.split("."[0])))

/data/authormodels/AaronPressman.hdf5
/data/authormodels/AlanCrosby.hdf5
/data/authormodels/AlexanderSmith.hdf5
/data/authormodels/BenjaminKangLim.hdf5
/data/authormodels/BernardHickey.hdf5
/data/authormodels/BradDorfman.hdf5
/data/authormodels/DarrenSchuettler.hdf5
/data/authormodels/DavidLawder.hdf5
/data/authormodels/EdnaFernandes.hdf5
/data/authormodels/EricAuchard.hdf5
/data/authormodels/FumikoFujisaki.hdf5
/data/authormodels/GrahamEarnshaw.hdf5
/data/authormodels/HeatherScoffield.hdf5
/data/authormodels/JaneMacartney.hdf5
/data/authormodels/JanLopatka.hdf5
/data/authormodels/JimGilchrist.hdf5
/data/authormodels/JoeOrtiz.hdf5
/data/authormodels/JohnMastrini.hdf5
/data/authormodels/JonathanBirt.hdf5
/data/authormodels/JoWinterbottom.hdf5
/data/authormodels/KarlPenhaul.hdf5
/data/authormodels/KeithWeir.hdf5
/data/authormodels/KevinDrawbaugh.hdf5
/data/authormodels/KevinMorrison.hdf5
/data/authormodels/KirstinRidley.hdf5
/data/authormodels/KouroshKarimkhany.hdf5
/data/authormodels/Ly

In [None]:
pred_is = []
for pred in predictions_long:
    pred_i = [p[0] for p in pred]
    pred_is.append(pred_i)

In [None]:
pred_labs = [np.argmin(pred) for pred in pred_is]

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(longer_test_labels, pred_labs)

In [73]:
from multiprocessing import Pool, Queue

pool = Pool(2)

texts_to_vec = [(text, c50_test_labels[i]) for i, text in enumerate(c50_test_texts)]

def do_work(work):
    global results
    text, label = work
    X, y = vectorize(clean_text(text, charset))
    return (X, y, label)


In [78]:
%%time
results = []
for work in texts_to_vec:
    results.append(do_work(work))

CPU times: user 1min 1s, sys: 2.99 s, total: 1min 4s
Wall time: 1min 12s


In [81]:
%%time
results2 = pool.map(do_work, texts_to_vec)

CPU times: user 3.16 s, sys: 41 s, total: 44.1 s
Wall time: 1min 29s


In [98]:
from random import shuffle
from datetime import datetime

def get_evaluations(X, y, models):
    """Get evaluations from each model for a piece of vectorized text"""
    losses = []
    print("--")
    for am in models:
        print(".", end="")
        model = am[0]
        label = am[1]
        loss = model.evaluate(X, y, verbose=0)
        losses.append((loss, label))
    return losses  ## The score for each author label

def do_evaluate_work(work):
    X, y, models = work
    return get_evaluations(X, y, models)



In [95]:
%%time
for work in results[:2]:
    print("@")
    X, y, true_label = work
    print("?")
    evaluations = get_evaluations(X, y, author_models)

@
?
--
..................................................@
?
--
..................................................CPU times: user 20min 30s, sys: 1min 47s, total: 22min 17s
Wall time: 8min 46s


In [99]:
%%time
results2 = pool.map(do_evaluate_work, results[:2])

Process ForkPoolWorker-92:
Process ForkPoolWorker-93:
Traceback (most recent call last):
  File "/usr/local/Cellar/python3/3.6.0/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/usr/local/Cellar/python3/3.6.0/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/local/Cellar/python3/3.6.0/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/usr/local/Cellar/python3/3.6.0/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/usr/local/Cellar/python3/3.6.0/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/local/C

KeyboardInterrupt: 