In [155]:
from spacy.en import English
nlp = English()

In [265]:
import os
import numpy as np

from __future__ import absolute_import
from __future__ import print_function
import numpy as np
np.random.seed(1337)  # for reproducibility

import random
from keras.datasets import mnist
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Input, Lambda
from keras.optimizers import RMSprop
from keras import backend as K



def vectorize(text):
    processed = nlp(text)
    unk = nlp("<unk>").vector
    vecs = [word.vector for word in processed]
    if len(vecs) > 1200:
        return vecs[:1200]
    else:
        while len(vecs) < 1200:
            vecs.append(unk)
    return vecs
    

class TextPair:
    def __init__(self, author, known, unknown, max_length=1200):
        self.author = author
        self.known = known
        self.unknown = unknown

def get_string(filename):
    with open(filename, encoding="utf8") as f:
        s = f.read()
    return s

def get_texts(directory):
    authors = [x for x in os.listdir(directory) if x.startswith("EN")]
    tps = []
    for author in authors:
        known = os.path.join(directory, author, "known01.txt")
        unknown = os.path.join(directory, author, "unknown.txt")
        tps.append(TextPair(author, get_string(known), get_string(unknown)))
    return tps

def get_data(directory):
    # read all texts into known, unknown pairs
    tps = get_texts(directory)
    
    # get labels
    truthfile = os.path.join(directory, "truth.txt")
    with open(truthfile) as f:
        lines = f.read().strip().split("\n")
    y = [1 if line.split()[1] == "Y" else 0 for line in lines]
    y = np.array(y)
    return tps, y

def create_pairs(knownX, unknownX):
    pairs = []
    for i in range(len(knownX)):
        pairs += [[knownX[i], unknownX[i]]]
    pairs = np.array(pairs)
    print(pairs.shape)
    return pairs

def euclidean_distance(vects):
    x, y = vects
    return K.sqrt(K.sum(K.square(x - y), axis=1, keepdims=True))


def eucl_dist_output_shape(shapes):
    shape1, shape2 = shapes
    return (shape1[0], 1)


def contrastive_loss(y_true, y_pred):
    '''Contrastive loss from Hadsell-et-al.'06
    http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf
    '''
    margin = 1
    return K.mean(y_true * K.square(y_pred) + (1 - y_true) * K.square(K.maximum(margin - y_pred, 0)))

def create_base_network(input_dim):
    '''Base network to be shared (eq. to feature extraction).
    '''
    seq = Sequential()
    seq.add(Dense(128, input_shape=(1200, 300), activation='relu'))
    seq.add(Dense(128, activation='relu'))
    # seq.add(Dropout(0.1))
    # seq.add(Dense(256, activation='relu'))
    # seq.add(Dense(512, activation='relu'))
    # seq.add(Dense(512, activation='relu'))
    # seq.add(Dense(512, activation='relu'))
    return seq


def compute_accuracy(predictions, labels):
    return np.mean(np.equal(predictions.ravel() < 0.5, labels))

In [157]:
pan15train = "/data/pan15-authorship-verification-training-dataset-english-2015-04-19/"
pan15test = "/data/pan15-authorship-verification-test-dataset2-english-2015-04-19/"

# pan15train = "/data/pan14-author-verification-training-corpus-english-novels-2014-04-22/"
# pan15test = "/data/pan14-author-verification-test-corpus2-english-novels-2014-04-22/"

In [251]:
from statistics import mean
print("loading data...")
tr_pairs, tr_y = get_data(pan15train)
te_pairs, te_y = get_data(pan15test)
print("num train pairs: {}, num test pairs: {}".format(len(tr_pairs), len(te_pairs)))

loading data...
num train pairs: 100, num test pairs: 500


In [253]:
train_known_X = [vectorize(tp.known) for tp in tr_pairs]
print(".")
train_unknown_X = [vectorize(tp.unknown) for tp in tr_pairs]
print("..")
test_known_X = [vectorize(tp.known) for tp in te_pairs]
print("...")
test_unknown_X = [vectorize(tp.unknown) for tp in te_pairs]
print("building positive/negative pairs...")
tr_pairs = create_pairs(train_known_X, train_unknown_X)
te_pairs = create_pairs(test_known_X, test_unknown_X)

.
..
...
building positive/negative pairs...
(100, 2, 1200, 300)
(500, 2, 1200, 300)


In [254]:
max([len(x) for x in test_unknown_X])

1200

In [162]:
tr_pairs, te_pairs = te_pairs, tr_pairs
tr_y, te_y = te_y, tr_y

In [257]:
tr_pairs[:, 0].shape
tr_pairs.shape

(100, 2, 1200, 300)

In [266]:
input_dim = (1200, 300)
base_network = create_base_network(input_dim)
input_a = Input(shape=(input_dim))
input_b = Input(shape=(input_dim))
processed_a = base_network(input_a)
processed_b = base_network(input_b)
distance = Lambda(euclidean_distance, output_shape=eucl_dist_output_shape)([processed_a, processed_b])
model = Model(inputs=[input_a, input_b], outputs=distance)


rms = RMSprop()
model.compile(loss=contrastive_loss, optimizer='adam')
model.fit([tr_pairs[:, 0], tr_pairs[:, 1]], tr_y,
          validation_split=0.05,
          batch_size=128,
          epochs=5)

Train on 95 samples, validate on 5 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x177a5af28>

In [217]:
# compute final accuracy on training and test sets
pred = model.predict([tr_pairs[:, 0], tr_pairs[:, 1]])
tr_acc = compute_accuracy(pred, tr_y)
pred = model.predict([te_pairs[:, 0], te_pairs[:, 1]])
te_acc = compute_accuracy(pred, te_y)

print('* Accuracy on training set: %0.2f%%' % (100 * tr_acc))
print('* Accuracy on test set: %0.2f%%' % (100 * te_acc))

* Accuracy on training set: 100.00%
* Accuracy on test set: 72.60%


In [175]:
te_y

array([1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0,
       1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0,
       0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1,
       1, 0,

In [174]:
np.array([1 if p[0] < 0.5 else 0 for p in pred])

array([1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1,
       0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1,
       1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1,
       1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1,
       0, 0,

In [193]:
from sklearn.metrics import accuracy_score
from collections import Counter
preds = np.array([1 if p[0] < 0.57 else 0 for p in pred])
print(Counter(preds))
accuracy_score(te_y, preds)


Counter({0: 253, 1: 247})


0.64600000000000002

In [197]:
max([p[0] for p in pred])

1.8714997