In [1]:
import os
from spacy.en import English
import numpy as np

In [2]:
# config
# Root directory that holds the PAN corpora. Defaults to "/data", the location
# that used to be hard-coded in every path below; set the PAN_DATA_DIR
# environment variable to point the notebook at a different machine layout.
DATA_DIR = os.environ.get("PAN_DATA_DIR", "/data")

# Corpus directories. The trailing "" makes os.path.join emit a trailing
# separator, so the default values are identical to the original literals.
pan15train = os.path.join(DATA_DIR, "pan15-authorship-verification-training-dataset-english-2015-04-19", "")
pan15test = os.path.join(DATA_DIR, "pan15-authorship-verification-test-dataset2-english-2015-04-19", "")
pan14train = os.path.join(DATA_DIR, "pan14-author-verification-training-corpus-english-essays-2014-04-22", "")
pan14test = os.path.join(DATA_DIR, "pan14-author-verification-test-corpus2-english-essays-2014-04-22", "")
char_embeddings_file = "glove.840B.300d-char.txt"  # full path to the character embeddings file

In [3]:
# Build the two spaCy pipelines (this takes a while). Parsing, tagging and
# entity recognition are switched off for both.
# nlp_word is used as constructed; nlp_char additionally has the vectors from
# char_embeddings_file loaded into its vocabulary.
nlp_word, nlp_char = (English(parse=False, tag=False, entity=False) for _ in range(2))
with open(char_embeddings_file) as f:
    nlp_char.vocab.load_vectors(f)

In [3]:
def read_file(filepath):
    """Return the entire contents of the text file at ``filepath`` as one string.

    The file is opened in text mode with the platform's default encoding.
    """
    with open(filepath) as handle:
        return handle.read()

def load_pan_data(directory, prefix="E"):
    """Load known and unknown texts in the PAN data format.

    Every sub-directory of ``directory`` whose name starts with ``prefix`` is
    treated as one verification problem containing ``known01.txt`` and
    ``unknown.txt``. Labels come from ``directory/truth.txt``, whose lines are
    expected to look like ``<problem-id> <Y|N>``.

    Args:
        directory: Path to the root of a PAN corpus.
        prefix: Name prefix that identifies problem directories.

    Returns:
        A tuple ``(known_texts, unknown_texts, y)``. The two lists hold the
        raw texts ordered by sorted problem id, and ``y`` is a numpy array
        with 1 for "Y" and 0 for anything else, in the same order.

    Raises:
        ValueError: If ``truth.txt`` has no label for one of the problems.
    """
    # FIXME: assumes one known file per author, which is fine for English datasets only
    # Only directories are problems; a stray file that happens to match the
    # prefix would otherwise be opened as if it were a directory.
    authors = sorted(
        name for name in os.listdir(directory)
        if name.startswith(prefix) and os.path.isdir(os.path.join(directory, name))
    )
    known_texts = []
    unknown_texts = []
    for author in authors:
        known_texts.append(read_file(os.path.join(directory, author, "known01.txt")))
        unknown_texts.append(read_file(os.path.join(directory, author, "unknown.txt")))

    # Key the labels by problem id instead of relying on truth.txt listing the
    # problems in the same order as the sorted directory listing: a different
    # order would silently pair texts with the wrong labels.
    labels = {}
    with open(os.path.join(directory, "truth.txt")) as f:
        for line in f:
            fields = line.split()
            if len(fields) < 2:
                continue  # blank or malformed line
            # A UTF-8 byte-order mark, if present, sticks to the first id.
            problem_id = fields[0].lstrip("\ufeff")
            labels[problem_id] = 1 if fields[1] == "Y" else 0

    missing = [author for author in authors if author not in labels]
    if missing:
        raise ValueError("truth.txt has no label for: %s" % ", ".join(missing))

    y = np.array([labels[author] for author in authors])
    return known_texts, unknown_texts, y

def create_pairs(knownX, unknownX):
    """Creates pairs of known and unknown texts.

    Args:
        knownX: Sequence of known-text representations, one per problem.
        unknownX: Sequence of unknown-text representations, aligned with
            ``knownX`` by position.

    Returns:
        A numpy array whose entry ``i`` is ``[knownX[i], unknownX[i]]``. For
        two inputs of shape ``(n, d)`` the result has shape ``(n, 2, d)``.

    Raises:
        ValueError: If the two inputs differ in length. Pairing them anyway
            would drop items and misalign the pairs with their labels.
    """
    print(len(knownX), len(unknownX))
    if len(knownX) != len(unknownX):
        raise ValueError(
            "knownX and unknownX must have the same length, got %d and %d"
            % (len(knownX), len(unknownX))
        )
    pairs = np.array([[known, unknown] for known, unknown in zip(knownX, unknownX)])
    print(pairs.shape)
    return pairs

def vectorize(text, nlp):
    """Turn a text (string) into a single embedding vector.

    ``nlp`` is called on ``text`` with entity recognition, tagging and parsing
    disabled, and the ``.vector`` attribute of the result is returned. It
    should be an initialised Spacy pipeline with loaded embeddings.
    """
    doc = nlp(text, entity=False, tag=False, parse=False)
    return doc.vector

In [34]:
from __future__ import absolute_import
from __future__ import print_function
np.random.seed(1337)  # for reproducibility
import random

from keras.models import Sequential, Model
from keras.layers import Dense, Input, Lambda, Embedding, LSTM, Dropout, Masking, Conv1D, MaxPooling1D
from keras.optimizers import RMSprop
from keras import backend as K

def euclidean_distance(vects):
    """Row-wise Euclidean distance between two batches of vectors.

    Args:
        vects: A pair ``(x, y)`` of backend tensors with matching shapes.

    Returns:
        A tensor holding, for each row, the L2 distance between ``x`` and
        ``y``, reduced over axis 1 with that axis kept at size 1.
    """
    x, y = vects
    sum_square = K.sum(K.square(x - y), axis=1, keepdims=True)
    # Clamp before the square root: the gradient of sqrt at exactly 0 is
    # undefined, so two identical embeddings would otherwise produce NaN
    # gradients during training.
    return K.sqrt(K.maximum(sum_square, K.epsilon()))

def cosine_distance(vests):
    """Row-wise cosine distance (1 - cosine similarity) between two batches.

    Both inputs are L2-normalised along the last axis, so the summed product
    is the cosine similarity. The result is 0 for vectors pointing the same
    way, 1 for orthogonal vectors and 2 for opposite vectors, which makes it
    usable as the non-negative distance that ``contrastive_loss`` expects.

    Args:
        vests: A pair ``(x, y)`` of backend tensors with matching shapes.

    Returns:
        A tensor of distances, reduced over the last axis with that axis kept
        at size 1.
    """
    x, y = vests
    x = K.l2_normalize(x, axis=-1)
    y = K.l2_normalize(y, axis=-1)
    # Sum, not mean: the mean would scale the similarity by 1/dim, and
    # negating it would make similar pairs score *below* zero.
    return 1 - K.sum(x * y, axis=-1, keepdims=True)

def eucl_dist_output_shape(shapes):
    """Output shape of the distance layer: one value per sample.

    ``shapes`` must hold exactly two input shapes; the first dimension of the
    first one is reused and the remaining dimensions collapse to a single 1.
    """
    first_shape, _ = shapes
    batch_dim = first_shape[0]
    return (batch_dim, 1)

def contrastive_loss(y_true, y_pred):
    '''Contrastive loss from Hadsell-et-al.'06
    http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf

    ``y_pred`` is the predicted distance for each pair. Pairs with
    ``y_true == 1`` are penalised by their squared distance; pairs with
    ``y_true == 0`` are penalised only while their distance is below the
    margin of 1. The mean over all entries is returned.
    '''
    margin = 1
    pull_term = y_true * K.square(y_pred)
    push_term = (1 - y_true) * K.square(K.maximum(margin - y_pred, 0))
    return K.mean(pull_term + push_term)

def create_base_network(input_dim):
    '''Base network to be shared (eq. to feature extraction).

    The stack is Embedding -> Dropout -> Conv1D -> MaxPooling1D -> LSTM ->
    Dense.

    Args:
        input_dim: Length of each input sequence of word indices. It must
            match the ``Input`` shape the network is later applied to.

    Returns:
        An uncompiled ``Sequential`` model.

    Reads the module-level ``embedding_matrix`` for the initial embedding
    weights. Vocabulary size and embedding width are taken from that matrix's
    shape, so the layer cannot disagree with its own weights.
    '''
    vocab_size, embedding_dim = embedding_matrix.shape
    model = Sequential()
    model.add(Embedding(vocab_size,
                        embedding_dim,
                        weights=[embedding_matrix],
                        # Use the caller's sequence length rather than a global
                        # constant, to stay in sync with the Input tensors.
                        input_length=input_dim,
                        trainable=True))
    model.add(Dropout(0.25))
    model.add(Conv1D(64,
                     5,
                     padding='valid',
                     activation='relu',
                     strides=1))
    model.add(MaxPooling1D(pool_size=4))
    model.add(LSTM(128))
    model.add(Dense(128))
    return model

def compute_accuracy(predictions, labels):
    """Fraction of pairs classified correctly with a fixed 0.5 distance threshold.

    A predicted distance below 0.5 counts as a positive (label 1) decision.

    Args:
        predictions: Array-like of predicted distances, any shape.
        labels: Array-like of 0/1 labels with the same number of elements.

    Returns:
        The mean agreement between thresholded predictions and labels.
    """
    predicted_positive = np.asarray(predictions).ravel() < 0.5
    # Flatten the labels as well: an (n, 1) label column would otherwise
    # broadcast against the (n,) predictions into an n x n comparison.
    expected = np.asarray(labels).ravel()
    return np.mean(predicted_positive == expected)

def train(tr_pairs, tr_y, epochs=20, batch_size=20, validation_split=0.1):
    """Build and fit a siamese network on pairs of sequences.

    Args:
        tr_pairs: Array indexed as ``[sample, side, position]``; side 0 and
            side 1 of every pair are fed to the two branches.
        tr_y: Labels, one per pair.
        epochs: Number of training epochs.
        batch_size: Mini-batch size passed to ``fit``.
        validation_split: Fraction of the data held out for validation by
            ``fit``. Keras takes it from the end of the arrays before
            shuffling, so order the data accordingly.

    Returns:
        The fitted model mapping ``[input_a, input_b]`` to a distance.
    """
    input_dim = tr_pairs.shape[-1]

    # network definition
    base_network = create_base_network(input_dim)

    input_a = Input(shape=(input_dim,))
    input_b = Input(shape=(input_dim,))

    # because we re-use the same instance `base_network`,
    # the weights of the network
    # will be shared across the two branches
    processed_a = base_network(input_a)
    processed_b = base_network(input_b)
    distance = Lambda(euclidean_distance, output_shape=eucl_dist_output_shape)([processed_a, processed_b])
    model = Model(inputs=[input_a, input_b], outputs=distance)

    rms = RMSprop()
    model.compile(loss=contrastive_loss, optimizer=rms)
    model.fit([tr_pairs[:, 0], tr_pairs[:, 1]], tr_y,
              validation_split=validation_split,
              batch_size=batch_size,
              epochs=epochs)
    return model
       
def evaluate(model, tr_pairs, tr_y, te_pairs, te_y):
    """Print the model's accuracy on the training pairs and on the test pairs.

    For each set, both sides of every pair are passed to ``model.predict`` and
    the predictions are scored with ``compute_accuracy``. Nothing is returned.
    """
    accuracies = []
    for pairs, labels in ((tr_pairs, tr_y), (te_pairs, te_y)):
        scores = model.predict([pairs[:, 0], pairs[:, 1]])
        accuracies.append(compute_accuracy(scores, labels))
    tr_acc, te_acc = accuracies
    print('* Accuracy on training set: %0.2f%%' % (100 * tr_acc))
    print('* Accuracy on test set: %0.2f%%' % (100 * te_acc))
    
def combine_vectors(v1s, v2s):
    """Concatenate the i-th vector of ``v1s`` with the i-th vector of ``v2s``.

    Returns a list with one horizontally stacked array per element of ``v1s``.
    """
    combined = []
    for idx in range(len(v1s)):
        combined.append(np.hstack([v1s[idx], v2s[idx]]))
    return combined

Found 7104 unique tokens.


In [5]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

MAX_NB_WORDS = 5000          # vocabulary cap for the tokenizer / embedding layer
MAX_SEQUENCE_LENGTH = 1000   # every text is padded or truncated to this many tokens

known_train, unknown_train, y_train = load_pan_data(pan15train)
known_test, unknown_test, y_test = load_pan_data(pan15test)

# NOTE(review): the vocabulary is fitted on the test texts as well as the
# training texts, so test-set word statistics leak into preprocessing.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(known_train + unknown_train + known_test + unknown_test)

# Texts -> fixed-length integer sequences.
known_seqs_tr = pad_sequences(tokenizer.texts_to_sequences(known_train), MAX_SEQUENCE_LENGTH)
unknown_seqs_tr = pad_sequences(tokenizer.texts_to_sequences(unknown_train), MAX_SEQUENCE_LENGTH)
known_seqs_te = pad_sequences(tokenizer.texts_to_sequences(known_test), MAX_SEQUENCE_LENGTH)
unknown_seqs_te = pad_sequences(tokenizer.texts_to_sequences(unknown_test), MAX_SEQUENCE_LENGTH)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Parse the pre-trained GloVe vectors: each line is a token followed by its
# coefficients. GloVe files are UTF-8, so do not rely on the locale default.
embeddings_index = {}
with open("/data/glove/glove.6B.100d.txt", encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Found %s word vectors.' % len(embeddings_index))

# prepare embedding matrix
EMBEDDING_DIM = 100
# Tokenizer indices start at 1 (0 is reserved for padding), so a vocabulary of
# n words needs n + 1 rows. Without the + 1, a vocabulary smaller than
# MAX_NB_WORDS makes the assignment below index one row past the end.
num_words = min(MAX_NB_WORDS, len(word_index) + 1)
print(num_words)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= MAX_NB_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

Found 7104 unique tokens.
Found 400000 word vectors.
5000


In [14]:
# Sanity check: one padded row of MAX_SEQUENCE_LENGTH token ids per training problem.
known_seqs_tr.shape

(100, 1000)

In [16]:
# Pair each known-text sequence with the unknown-text sequence of the same
# problem, separately for the training and the test corpus.
tr_pairs = create_pairs(known_seqs_tr, unknown_seqs_tr)
te_pairs = create_pairs(known_seqs_te, unknown_seqs_te)

100 100
(100, 2, 1000)
500 500
(500, 2, 1000)


In [None]:
# Alternative feature set: one embedding vector per text from vectorize().
# The raw texts are loaded again because vectorize() works on strings.
known_train, unknown_train, y_train = load_pan_data(pan15train)
known_test, unknown_test, y_test = load_pan_data(pan15test)

print("word vec...")
# word vectors, in the order: known/unknown train, known/unknown test
w_known_train_vecs, w_unknown_train_vecs, w_known_test_vecs, w_unknown_test_vecs = (
    [vectorize(t, nlp_word) for t in texts]
    for texts in (known_train, unknown_train, known_test, unknown_test)
)

# character vectors (same order; this stage prints no progress message)
c_known_train_vecs, c_unknown_train_vecs, c_known_test_vecs, c_unknown_test_vecs = (
    [vectorize(t, nlp_char) for t in texts]
    for texts in (known_train, unknown_train, known_test, unknown_test)
)

print("combining...")
# word + character vectors, concatenated per text
wc_known_train_vecs, wc_unknown_train_vecs, wc_known_test_vecs, wc_unknown_test_vecs = (
    combine_vectors(word_vecs, char_vecs)
    for word_vecs, char_vecs in (
        (w_known_train_vecs, c_known_train_vecs),
        (w_unknown_train_vecs, c_unknown_train_vecs),
        (w_known_test_vecs, c_known_test_vecs),
        (w_unknown_test_vecs, c_unknown_test_vecs),
    )
)

word vec...


In [33]:
def run_experiment(known_train, unknown_train, known_test, unknown_test, y_train, y_test):
    """Pair the inputs, train on the training pairs and print both accuracies.

    The known/unknown training inputs and the known/unknown test inputs are
    each turned into pairs with ``create_pairs``. A model is fitted on the
    training pairs with ``train`` (default settings) and then scored on both
    sets with ``evaluate``. Nothing is returned.
    """
    train_pairs = create_pairs(known_train, unknown_train)
    test_pairs = create_pairs(known_test, unknown_test)

    fitted = train(train_pairs, y_train)
    evaluate(fitted, train_pairs, y_train, test_pairs, y_test)

In [36]:
# Fit for 5 epochs.
# NOTE(review): this trains on te_pairs / y_test, i.e. the pairs built from the
# pan15test corpus, not on tr_pairs. Confirm that using the test corpus as
# training data here is intentional.
model = train(te_pairs, y_test, 5)

Train on 450 samples, validate on 50 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [38]:
# NOTE(review): evaluate() reports its first pair set as "training" and its
# second as "test". If `model` is the one fitted on te_pairs, the two printed
# labels are swapped relative to what the model actually saw. Confirm.
evaluate(model, tr_pairs, y_train, te_pairs, y_test)

* Accuracy on training set: 49.00%
* Accuracy on test set: 98.60%


In [80]:
# known_train, unknown_train, y_train = load_pan_data(pan14train)
# known_test, unknown_test, y_test = load_pan_data(pan14test)
# NOTE(review): the *_te sequences and y_test are passed in run_experiment's
# training slots, and the *_tr sequences and y_train in its test slots. This
# run therefore trains on the pan15 test corpus and evaluates on the pan15
# training corpus. Confirm intent.
run_experiment(known_seqs_te, unknown_seqs_te, known_seqs_tr, unknown_seqs_tr, y_test, y_train)

500 500
(500, 2, 1000)
100 100
(100, 2, 1000)
Train on 450 samples, validate on 50 samples
Epoch 1/15
Epoch 2/15
 60/450 [===>..........................] - ETA: 129s - loss: 0.2296

KeyboardInterrupt: 