In [3]:
import os
from spacy.en import English
import numpy as np
import string
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [4]:
# config
# Root directories of the PAN authorship-verification corpora used by this notebook.
pan15train = "/data/pan15-authorship-verification-training-dataset-english-2015-04-19/"
pan15test = "/data/pan15-authorship-verification-test-dataset2-english-2015-04-19/"
# NOTE(review): pan14train / pan14test are assigned twice. The "novels" paths
# below overwrite the "essays" paths, so after this cell runs only the novels
# corpus is reachable through these names.
pan14train = "/data/pan14-author-verification-training-corpus-english-essays-2014-04-22/"
pan14test = "/data/pan14-author-verification-test-corpus2-english-essays-2014-04-22/"
pan14train = "/data/pan14-author-verification-training-corpus-english-novels-2014-04-22/"
pan14test = "/data/pan14-author-verification-test-corpus2-english-novels-2014-04-22/"
char_embeddings_file = "glove.840B.300d-char.txt"  # character embeddings file; a bare filename here, so it is resolved against the working directory

In [24]:
# Training corpus for the character model: every known training text,
# joined with single spaces.
text = ' '.join(known_train)

# Hyper-parameters for the character-level model.
embeddings_path = "glove.840B.300d-char.txt"
embedding_dim = 300
batch_size = 128
lr = 0.001
lr_decay = 1e-4
maxlen = 40  # number of characters per training window
step = 3     # stride between the starts of consecutive windows
PAD_CHAR = "+"

# Index 0 is PAD_CHAR; the remaining symbols follow in sorted order.
alphabet = [PAD_CHAR] + sorted(string.ascii_letters + "!?:;,.'- ")

# Lookup tables in both directions between a symbol and its alphabet index.
char_indices = {symbol: index for index, symbol in enumerate(alphabet)}
indices_char = {index: symbol for index, symbol in enumerate(alphabet)}

In [25]:
# Cut the text into semi-redundant sequences of maxlen characters: a new window
# starts every `step` characters, so consecutive windows overlap.
sentences = [text[start: start + maxlen]
             for start in range(0, len(text) - maxlen, step)]

# A 'sentence' is just a sequence of maxlen chars. Encode each one as a row of
# alphabet indices; characters missing from char_indices fall back to the
# padding index.
# The `np.int` alias was deprecated in NumPy 1.20 and removed in 1.24; the
# builtin `int` is exactly the dtype it aliased.
pad_index = char_indices[PAD_CHAR]
X = np.zeros((len(sentences), maxlen), dtype=int)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t] = char_indices.get(char, pad_index)

In [26]:
# https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html
# Parse the pre-trained embeddings file: the first space-separated token of
# each line is the character, the remaining tokens are its vector components.
embedding_vectors = {}
with open(embeddings_path, 'r') as f:
    for line in f:
        line_split = line.strip().split(" ")
        vec = np.array(line_split[1:], dtype=float)
        char = line_split[0]
        embedding_vectors[char] = vec

# One row per alphabet entry. The width comes from embedding_dim (rather than a
# second hard-coded 300) so the matrix cannot drift from the Embedding layer
# that is built with the same setting. Characters without a pre-trained vector
# keep an all-zero row.
# (Alternative initialisation: np.random.uniform(-1, 1, ...) instead of zeros.)
embedding_matrix = np.zeros((len(alphabet), embedding_dim))
for char, i in char_indices.items():
    embedding_vector = embedding_vectors.get(char)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [27]:
from keras.models import Model
from keras.layers import Dense, Activation, Embedding
from keras.layers import LSTM, Input
from keras.layers.merge import concatenate
from keras.optimizers import RMSprop, Adam
from keras.utils.data_utils import get_file
from keras.layers.normalization import BatchNormalization
from keras.callbacks import Callback, ModelCheckpoint
from keras.utils import plot_model
# Build a two-headed character-level model: an embedding feeds an LSTM whose
# output goes both to an auxiliary softmax and, through two hidden blocks, to
# the main softmax.
print('Build model...')
# One input row holds maxlen values, which the embedding treats as alphabet indices.
main_input = Input(shape=(maxlen,))
# Embedding initialised from embedding_matrix, with one row per alphabet entry.
# NOTE(review): embedding_matrix is assigned in two cells of this notebook (one
# sized by `alphabet`, one by `chars`); this layer needs the len(alphabet) one.
embedding_layer = Embedding(
    len(alphabet), embedding_dim, input_length=maxlen,
    weights=[embedding_matrix])
# embedding_layer = Embedding(
#     len(chars), embedding_dim, input_length=maxlen)
embedded = embedding_layer(main_input)

# RNN Layer
rnn = LSTM(256)(embedded)

# Auxiliary head: softmax over the alphabet computed directly from the LSTM output.
aux_output = Dense(len(alphabet))(rnn)
aux_output = Activation('softmax', name='aux_out')(aux_output)

# Hidden Layers
# Each block is Dense -> BatchNormalization -> ReLU. The Dense bias is disabled,
# presumably because the BatchNormalization that follows supplies its own offset.
hidden_1 = Dense(512, use_bias=False)(rnn)
hidden_1 = BatchNormalization()(hidden_1)
hidden_1 = Activation('relu')(hidden_1)

hidden_2 = Dense(256, use_bias=False)(hidden_1)
hidden_2 = BatchNormalization()(hidden_2)
hidden_2 = Activation('relu')(hidden_2)

# Main head: softmax over the alphabet computed from the second hidden block.
main_output = Dense(len(alphabet))(hidden_2)
main_output = Activation('softmax', name='main_out')(main_output)

# Output order matters: loss_weights below are matched to [main_output, aux_output].
model = Model(inputs=main_input, outputs=[main_output, aux_output])

# Both heads use categorical cross-entropy; the main head is weighted 1.0 and
# the auxiliary head 0.2 in the total loss.
optimizer = Adam(lr=lr, decay=lr_decay)
model.compile(loss='categorical_crossentropy',
              optimizer=optimizer, loss_weights=[1., 0.2])

Build model...


In [None]:
from nltk.tokenize.

In [18]:
def read_file(filepath):
    """Return the entire contents of the text file at `filepath` as one string.

    The file is opened in text mode with the platform's default encoding.
    """
    with open(filepath) as handle:
        return handle.read()

def load_pan_data(directory, prefix="E"):
    """Load known texts, unknown texts and labels from a PAN-format directory.

    Every entry of `directory` whose name starts with `prefix` is treated as
    one problem folder containing "known01.txt" and "unknown.txt". Labels come
    from "truth.txt" in `directory`: the second whitespace-separated token of
    each line is mapped to 1 when it equals "Y" and to 0 otherwise.

    Returns:
        (known_texts, unknown_texts, y): two lists of strings ordered by
        sorted folder name, and a numpy array of 0/1 labels in the order the
        lines appear in truth.txt (presumably the same order -- the labels are
        not matched to folders by name).
    """
    # FIXME: assumes one known file per author, which is fine for English datasets only
    author_dirs = [entry for entry in os.listdir(directory) if entry.startswith(prefix)]
    author_dirs.sort()

    known_texts, unknown_texts = [], []
    for author in author_dirs:
        author_root = os.path.join(directory, author)
        known_texts.append(read_file(os.path.join(author_root, "known01.txt")))
        unknown_texts.append(read_file(os.path.join(author_root, "unknown.txt")))

    with open(os.path.join(directory, "truth.txt")) as handle:
        truth_lines = handle.read().strip().split("\n")
    y = np.array([int(entry.split()[1] == "Y") for entry in truth_lines])
    return known_texts, unknown_texts, y

def create_pairs(knownX, unknownX):
    """Creates pairs of known and unknown texts.

    Element i of the result is [knownX[i], unknownX[i]], so for equally shaped
    inputs of shape (n, ...) the result has shape (n, 2, ...). The number of
    pairs is len(knownX).

    The input lengths and the resulting shape are printed as a sanity check.

    Returns:
        A numpy array holding the stacked pairs.
    """
    # The docstring must be the first statement of the function; it used to sit
    # after this print, which left create_pairs.__doc__ empty.
    print(len(knownX), len(unknownX))
    pairs = np.array([[knownX[i], unknownX[i]] for i in range(len(knownX))])
    print(pairs.shape)
    return pairs

def vectorize(text, nlp, target_sentences=60):
    """Convert text (string) to sentence embeddings (numpy array).

    nlp should be an initialised Spacy pipeline with loaded embeddings.

    The text is parsed with `nlp` and the `.vector` of every sentence is
    collected. The result always has exactly `target_sentences` rows: extra
    sentences are dropped from the end, and short texts are padded at the end
    with the vector of the "<pad>" vocabulary entry.

    Returns:
        A numpy array with `target_sentences` rows. (The truncating path used
        to return a plain list, unlike the padding path.)
    """
    p = nlp(text, entity=False, tag=False)
    vecs = [sent.vector for sent in p.sents][:target_sentences]
    missing = target_sentences - len(vecs)
    if missing > 0:
        vecs += [nlp.vocab["<pad>"].vector] * missing
    return np.array(vecs)

def build_embedding_matrix(embedding_path, char_indices, embedding_dim=300):
    """Build an embedding matrix for `char_indices` from a pre-trained file.

    Each non-blank line of the file is split on whitespace; the first token is
    the character and the remaining tokens are its vector components.

    Args:
        embedding_path: path of the embeddings text file.
        char_indices: mapping from character to its row index in the matrix.
        embedding_dim: number of components per vector (matrix width).

    Returns:
        A numpy array of shape (len(char_indices), embedding_dim). Rows for
        characters that do not appear in the file are left as zeros.
    """
    embeddings_index = {}
    # `with` closes the file even if parsing a line raises.
    with open(embedding_path) as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank line: nothing to parse.
                continue
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

    # Size the matrix from the mapping that was passed in, not from a
    # notebook-level global, so the function works for any vocabulary.
    embedding_matrix = np.zeros((len(char_indices), embedding_dim))
    for char, i in char_indices.items():
        embedding_vector = embeddings_index.get(char)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix


def ngrams(string, n):
    """Return every run of `n` consecutive items of `string` as a list of tuples.

    Windows advance one item at a time. The list is empty when `n` is not
    positive or when `string` has fewer than `n` items.
    """
    if n <= 0:
        return []
    last_start = len(string) - n
    return [tuple(string[start:start + n]) for start in range(last_start + 1)]

def char_vectorize(text, char_indices, sequence_length=40, pad="+"):
    """Encode `text` as sliding windows of character indices.

    The text is lower-cased and each character is mapped through
    `char_indices`; characters missing from the mapping get the index of `pad`.
    The index sequence is then cut into overlapping windows of
    `sequence_length` indices by `ngrams`.

    Returns:
        A list of tuples, each holding `sequence_length` indices.
    """
    encoded = [char_indices.get(ch, char_indices[pad]) for ch in text.lower()]
    return ngrams(encoded, sequence_length)
                       

def char_vectorize_all(texts, char_indices, sequence_length=40, target_length=1000):
    """Apply `char_vectorize` to every text and pad the results to one length.

    Each text is encoded with `char_vectorize`; the encodings are then passed
    to Keras' `pad_sequences` with `target_length` as the maximum length and
    all other settings left at their defaults.
    """
    encoded_texts = []
    for single_text in texts:
        encoded_texts.append(char_vectorize(single_text, char_indices, sequence_length))
    return pad_sequences(encoded_texts, maxlen=target_length)

# Symbols kept in the vocabulary in addition to lowercase letters and the space.
punctuation = '''!"'#$%&'()*,-./:;<>?@[]_'''
# "+" takes index 0; char_vectorize uses it by default for characters that are
# not in the vocabulary.
chars = ["+"] + sorted(set(string.ascii_lowercase + punctuation + " "))
char_indices = {symbol: index for index, symbol in enumerate(chars)}
indices_char = {index: symbol for index, symbol in enumerate(chars)}

embedding_matrix = build_embedding_matrix('glove.840B.300d-char.txt', char_indices)
# char_vectorize_all(["This is some text ==blah blah blah", "and some more asdfasdf asdf asdf asdf "], char_indices)
# char_vectorize("this is some text", char_indices, 5)


In [20]:
from __future__ import absolute_import
from __future__ import print_function
np.random.seed(1337)  # for reproducibility
import random

from keras.models import Sequential, Model
from keras.layers import Dense, Input, Lambda, LSTM, Masking, Dropout, Embedding
from keras.optimizers import RMSprop, Adam
from keras import backend as K

def euclidean_distance(vects):
    """Euclidean distance between two batches of vectors.

    Args:
        vects: a pair (x, y) of backend tensors; the squared differences are
            summed over axis 1.

    Returns:
        A tensor of distances with the summed axis kept as size 1.
    """
    x, y = vects
    sum_square = K.sum(K.square(x - y), axis=1, keepdims=True)
    # The gradient of sqrt is infinite at 0, which produces NaNs as soon as
    # both branches output identical vectors. Clamping to epsilon keeps the
    # gradient finite, as in the Keras siamese-network example.
    return K.sqrt(K.maximum(sum_square, K.epsilon()))

def cosine_distance(vects):
    """Negated mean of the element-wise product of two L2-normalised tensors.

    Both tensors of the pair are normalised along the last axis, multiplied
    element-wise, averaged over the last axis (kept as size 1) and negated.
    Because a mean rather than a sum is taken, the value is the negative
    cosine similarity divided by the size of the last axis.
    """
    first, second = vects
    first_unit = K.l2_normalize(first, axis=-1)
    second_unit = K.l2_normalize(second, axis=-1)
    elementwise = first_unit * second_unit
    return -K.mean(elementwise, axis=-1, keepdims=True)

def eucl_dist_output_shape(shapes):
    """Output shape of the distance layer: (batch size of the first input, 1).

    `shapes` must hold exactly two input shapes; only the first entry of the
    first shape is used.
    """
    first_shape, _second_shape = shapes
    batch_size = first_shape[0]
    return (batch_size, 1)

def contrastive_loss(y_true, y_pred):
    '''Contrastive loss from Hadsell-et-al.'06
    http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf

    `y_pred` is the predicted distance. Where `y_true` is 1 the squared
    distance is penalised; where it is 0 the squared shortfall of the distance
    below the margin (1) is penalised. The mean over all elements is returned.
    '''
    margin = 1
    same_term = y_true * K.square(y_pred)
    shortfall = K.maximum(margin - y_pred, 0)
    different_term = (1 - y_true) * K.square(shortfall)
    return K.mean(same_term + different_term)

def create_base_network(input_dim):
    '''Base network to be shared (eq. to feature extraction).

    Architecture: trainable character Embedding (initialised from the
    notebook-level `embedding_matrix`) -> LSTM(128, relu) -> Dense(128, relu).

    Args:
        input_dim: length of each index-encoded input sequence; used as the
            Embedding layer's input_length so the network matches the inputs
            that `train` feeds it.

    Returns:
        An uncompiled keras Sequential model.
    '''
    # Read vocabulary size and vector width from the weights themselves, so the
    # layer cannot be declared with a shape that disagrees with its weights.
    num_chars, embed_dim = embedding_matrix.shape

    seq = Sequential()
    seq.add(Embedding(num_chars,
                      embed_dim,
                      weights=[embedding_matrix],
                      input_length=input_dim,
                      trainable=True))
    seq.add(LSTM(128, activation='relu'))
    seq.add(Dense(128, activation='relu'))
    return seq

def compute_accuracy(predictions, labels):
    """Fraction of predictions that agree with `labels` at a 0.5 threshold.

    `predictions` is flattened; a value below 0.5 counts as a positive
    prediction and is compared for equality against the matching label.
    """
    predicted_positive = predictions.ravel() < 0.5
    agreement = np.equal(predicted_positive, labels)
    return np.mean(agreement)

def train(tr_pairs, tr_y, epochs=15, batch_size=20, validation_split=0.015,
          lr=0.001, decay=1e-4):
    """Build and fit a siamese network on pairs of index-encoded texts.

    Args:
        tr_pairs: array of pairs as produced by `create_pairs`; tr_pairs[:, 0]
            and tr_pairs[:, 1] are fed to the two branches, and the size of
            the last axis is used as the sequence length.
        tr_y: one label per pair (1 = pull the pair together, 0 = push it
            apart, as defined by `contrastive_loss`); any sequence is accepted
            and converted to a numpy array.
        epochs: number of training epochs.
        batch_size: mini-batch size.
        validation_split: fraction of the data held out for validation.
        lr: Adam learning rate.
        decay: Adam learning-rate decay.

    Returns:
        The fitted keras Model, whose output is the distance between the two
        branch embeddings.
    """
    input_dim = tr_pairs.shape[-1]

    # network definition
    base_network = create_base_network(input_dim)

    input_a = Input(shape=(input_dim,))
    input_b = Input(shape=(input_dim,))

    # because we re-use the same instance `base_network`,
    # the weights of the network
    # will be shared across the two branches
    processed_a = base_network(input_a)
    processed_b = base_network(input_b)
    distance = Lambda(euclidean_distance,
                      output_shape=eucl_dist_output_shape)([processed_a, processed_b])
    model = Model(inputs=[input_a, input_b], outputs=distance)

    optimizer = Adam(lr=lr, decay=decay)
    model.compile(loss=contrastive_loss, optimizer=optimizer)
    model.fit([tr_pairs[:, 0], tr_pairs[:, 1]], np.asarray(tr_y),
              validation_split=validation_split,
              batch_size=batch_size,
              epochs=epochs)
    return model
       
def evaluate(model, tr_pairs, tr_y, te_pairs, te_y):
    """Print the model's accuracy on the training and the test pairs.

    For each set, both halves of the pairs are passed to `model.predict` and
    the result is scored against the labels with `compute_accuracy`.
    """
    def accuracy_for(pairs, labels):
        # Predict from the two halves of the pairs, then score.
        predicted = model.predict([pairs[:, 0], pairs[:, 1]])
        return compute_accuracy(predicted, labels)

    # compute final accuracy on training and test sets
    tr_acc = accuracy_for(tr_pairs, tr_y)
    te_acc = accuracy_for(te_pairs, te_y)
    print('* Accuracy on training set: %0.2f%%' % (100 * tr_acc))
    print('* Accuracy on test set: %0.2f%%' % (100 * te_acc))
    
def combine_vectors(v1s, v2s):
    """Concatenate v1s[i] with v2s[i] via np.hstack for every index of v1s.

    Returns a list with one stacked array per element of `v1s`.
    """
    combined = []
    for position, first in enumerate(v1s):
        combined.append(np.hstack((first, v2s[position])))
    return combined

def run_experiment(known_train, unknown_train, known_test, unknown_test, y_train, y_test):
    """Pair up the vectors, train on the training pairs and print accuracies.

    Training and test pairs are built with `create_pairs`, a model is fitted
    with `train`, and `evaluate` prints its accuracy on both sets.
    """
    train_pairs = create_pairs(known_train, unknown_train)
    test_pairs = create_pairs(known_test, unknown_test)

    fitted = train(train_pairs, y_train)
    evaluate(fitted, train_pairs, y_train, test_pairs, y_test)

In [80]:
# Instantiate the shared base network on its own, passing 1000 as input_dim.
# NOTE(review): this rebinds the notebook-level name `model`, which the
# character-model cell also assigns.
model = create_base_network(1000)

In [21]:
# Load the PAN15 training and test sets: known texts, unknown texts and 0/1 labels.
known_train, unknown_train, y_train = load_pan_data(pan15train)
known_test, unknown_test, y_test = load_pan_data(pan15test)

In [22]:
# Display the first known training text as a sanity check.
known_train[0]

" My, my, I was forgetting all about\nthe children and the mysterious fern seed. I wonder if it has changed\nthem back into real little children again. Yes,\nhere they come.\n\nOh, thank you, Mr. Wishing Man. I feel ever so much better\nnow.\n\nYes, indeed. My clothes are a perfect fit and nobody will laugh at me now.\n\nI feel perfectly fan-tas-a-ma-gor-ious.\n\nOh, see the pretty French dollies. I wish they would talk to me.\n\nIf that's your wish, they can.\n\nCan you talk?\n\nPa-pa, pa-pa, pa-pa!\n\n And what can you say?\n\n Ma-ma, ma-ma, ma-ma!\n\nGo on and talk to me.\n\nMer-ry Christ-mas! Mer-ry Christ-mas!\n\nI wish you could wind them up so they could walk around and\nplay with us.\n\nIs that your wish?\n\nOh, yes. Do you think you can do it?\n\nI can try.\n\nPa-pa, pa-pa, pa-pa! \n\nHere, stop her. She'll fall down.  Here, turn\naround. Walk this way.\n\n Ma-ma, ma-ma, ma-ma; \n\nOh, I think you are a darling.\n\nMer-ry Christ-mas! Mer-ry Christmas. \n\nHere, wait for me. \n

In [89]:
# word vectors
known_train_vecs = char_vectorize_all(known_train, char_indices)
known_test_vecs = char_vectorize_all(known_test, char_indices)
unknown_train_vecs = char_vectorize_all(unknown_train, char_indices)
unknown_test_vecs = char_vectorize_all(unknown_test, char_indices)

13

In [81]:
from statistics import mean
# Smallest number of "." characters found in any known training text.
min(document.count(".") for document in known_train)

40

In [100]:
# Train and evaluate the siamese network on the character-index vectors.
run_experiment(known_train_vecs, unknown_train_vecs, known_test_vecs, unknown_test_vecs, y_train, y_test)

100 100
(100, 2, 1000)
500 500
(500, 2, 1000)
Train on 98 samples, validate on 2 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
* Accuracy on training set: 93.00%
* Accuracy on test set: 53.40%


In [46]:
# Shape of the first element of every pair (the known side when built by create_pairs).
# NOTE(review): at notebook level tr_pairs is only assigned in a cell further
# down, so this depends on out-of-order execution.
tr_pairs[:, 0].shape

(100, 60, 300)

In [48]:
# Fit the siamese network directly on tr_pairs; rebinds the notebook-level `model`.
model = train(tr_pairs, y_train)

Train on 95 samples, validate on 5 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [76]:
# Run the same experiment on three feature sets.
# NOTE(review): the w_*, c_* and wc_* variables are not defined anywhere in the
# visible notebook (presumably word, character and combined features -- confirm);
# this cell fails on a fresh kernel.
run_experiment(w_known_train_vecs, w_unknown_train_vecs, w_known_test_vecs, w_unknown_test_vecs, y_train, y_test)
run_experiment(c_known_train_vecs, c_unknown_train_vecs, c_known_test_vecs, c_unknown_test_vecs, y_train, y_test)
run_experiment(wc_known_train_vecs, wc_unknown_train_vecs, wc_known_test_vecs, wc_unknown_test_vecs, y_train, y_test)

100 100
(100, 2, 300)
500 500
(500, 2, 300)
Train on 95 samples, validate on 5 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
* Accuracy on training set: 100.00%
* Accuracy on test set: 72.60%
100 100
(100, 2, 300)
500 500
(500, 2, 300)
Train on 95 samples, validate on 5 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
* Accuracy on training set: 100.00%
* Accuracy on test set: 74.60%
100 100
(100, 2, 600)
500 500
(500, 2, 600)
Train on 95 samples, validate on 5 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
* Accuracy on training set: 97.00%
* Accuracy on test set: 68.20%


In [63]:
# Toy smoke-test data: three short strings, vectorised identically for the
# known and the unknown side (so every pair consists of two equal texts).
knownXs = char_vectorize_all(["this is some text dsf asdf asdf ", "and some moer ateasdf asdf ", "we have three samples"], char_indices)
unknownXs = char_vectorize_all(["this is some text dsf asdf asdf ", "and some moer ateasdf asdf ", "we have three samples"], char_indices)

In [69]:
# Pair up the toy vectors; this assigns the notebook-level tr_pairs.
tr_pairs = create_pairs(knownXs, unknownXs)

3 3
(3, 2, 1000)


In [81]:
# Smoke-test training on the three toy pairs with hand-written labels.
train(tr_pairs, [0,0,1])

Train on 2 samples, validate on 1 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.engine.training.Model at 0x15b6e1f60>