In [None]:
import os
from spacy.en import English
import numpy as np

In [None]:
# config
pan15train = "/data/pan15-authorship-verification-training-dataset-english-2015-04-19/"
pan15test = "/data/pan15-authorship-verification-test-dataset2-english-2015-04-19/"
pan14train = "/data/pan14-author-verification-training-corpus-english-essays-2014-04-22/"
pan14test = "/data/pan14-author-verification-test-corpus2-english-essays-2014-04-22/"
char_embeddings_file = "glove.840B.300d-char.txt"  # full path to the character embeddings file

In [None]:
# load spacy, this takes a while
nlp_word = English(parse=False, tag=False, entity=False)
nlp_char = English(parse=False, tag=False, entity=False)
with open(char_embeddings_file) as f:
    nlp_char.vocab.load_vectors(f)

In [173]:
def read_file(filepath):
    with open(filepath) as f:
        s = f.read()
    return s

def load_pan_data(directory, prefix="E"):
    """Load known and unknown texts in the PAN data format"""
    # FIXME: assumes one known file per author, which is fine for English datasets only
    authors = sorted([x for x in os.listdir(directory) if x.startswith(prefix)])
    known_texts = []
    unknown_texts = []
    for author in authors:
        kf = os.path.join(directory, author, "known01.txt")
        uf = os.path.join(directory, author, "unknown.txt")
        known_texts.append(read_file(kf))
        unknown_texts.append(read_file(uf))
        
    truthfile = os.path.join(directory, "truth.txt")
    with open(truthfile) as f:
        lines = f.read().strip().split("\n")
    y = [1 if line.split()[1] == "Y" else 0 for line in lines]
    y = np.array(y)
    return known_texts, unknown_texts, y

def load_C50(directory):
    """Load known and unknown texts in the PAN data format"""
    # FIXME: assumes one known file per author, which is fine for English datasets only
    authors = sorted([x for x in os.listdir(directory)])
    known_texts = []
    unknown_texts = []
    for author in authors:
        kf = os.path.join(directory, author, "known.txt")
        uf = os.path.join(directory, author, "unknown.txt")
        known_texts.append(read_file(kf))
        unknown_texts.append(read_file(uf))
    y = [1] * 25 + [0] * 25
    return known_texts, unknown_texts, y

def create_pairs(knownX, unknownX):
    """Creates pairs of known and unknown texts"""
    pairs = []
    for i in range(len(knownX)):
        pairs += [[knownX[i], unknownX[i]]]
    pairs = np.array(pairs)
    return pairs

def vectorize(text, nlp):
    """Convert text (string) to embeddings (numpy array)
    nlp should be an initialised Spacy pipeline with loaded embeddings"""
    return nlp(text, entity=False, tag=False, parse=False).vector

In [504]:
from __future__ import absolute_import
from __future__ import print_function
np.random.seed(1337)  # for reproducibility
import random

from keras.models import Sequential, Model
from keras.layers import Dense, Input, Lambda
from keras.optimizers import RMSprop
from keras import backend as K

def euclidean_distance(vects):
    x, y = vects
    return K.sqrt(K.sum(K.square(x - y), axis=1, keepdims=True))

def eucl_dist_output_shape(shapes):
    shape1, shape2 = shapes
    return (shape1[0], 1)

def contrastive_loss(y_true, y_pred):
    '''Contrastive loss from Hadsell-et-al.'06
    http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf
    '''
    margin = 1
    return K.mean(y_true * K.square(y_pred) + (1 - y_true) * K.square(K.maximum(margin - y_pred, 0)))

def create_base_network(input_dim):
    '''Base network to be shared (eq. to feature extraction).
    '''
    seq = Sequential()
    seq.add(Dense(512, input_shape=(input_dim,), activation='relu'))
    seq.add(Dense(512, activation='relu'))
    seq.add(Dense(512, activation='relu'))
    return seq

def compute_accuracy(predictions, labels):
    return np.mean(np.equal(predictions.ravel() < 0.5, labels))

def train(tr_pairs, tr_y, epochs=10):
    input_dim = tr_pairs.shape[-1]

    # network definition
    base_network = create_base_network(input_dim)

    input_a = Input(shape=(input_dim,))
    input_b = Input(shape=(input_dim,))

    # because we re-use the same instance `base_network`,
    # the weights of the network
    # will be shared across the two branches
    processed_a = base_network(input_a)
    processed_b = base_network(input_b)
    distance = Lambda(euclidean_distance, output_shape=eucl_dist_output_shape)([processed_a, processed_b])
    model = Model(inputs=[input_a, input_b], outputs=distance)
    
    rms = RMSprop()
    model.compile(loss=contrastive_loss, optimizer='adam')
    model.fit([tr_pairs[:, 0], tr_pairs[:, 1]], tr_y,
              validation_split=0.05,
              batch_size=20,
              epochs=epochs)
    return model
       
def evaluate(model, tr_pairs, tr_y, te_pairs, te_y):
    # compute final accuracy on training and test sets
    pred = model.predict([tr_pairs[:, 0], tr_pairs[:, 1]])
    tr_acc = compute_accuracy(pred, tr_y)
    pred = model.predict([te_pairs[:, 0], te_pairs[:, 1]])
    te_acc = compute_accuracy(pred, te_y)
    print('* Accuracy on training set: %0.2f%%' % (100 * tr_acc))
    print('* Accuracy on test set: %0.2f%%' % (100 * te_acc))
    
def combine_vectors(v1s, v2s):
    return [np.hstack([v1s[i], v2s[i]]) for i in range(len(v1s))]

In [74]:
known_train, unknown_train, y_train = load_C50("/data/C50/C50TrainVerification/")
known_test, unknown_test, y_test = load_pan_data(pan15test)
print("word vec...")
# word vectors
w_known_train_vecs = [vectorize(t, nlp_word) for t in known_train]
w_unknown_train_vecs = [vectorize(t, nlp_word) for t in unknown_train]
w_known_test_vecs = [vectorize(t, nlp_word) for t in known_test]
w_unknown_test_vecs = [vectorize(t, nlp_word) for t in unknown_test]
# print("char vec...")
# character vectors
c_known_train_vecs = [vectorize(t, nlp_char) for t in known_train]
c_unknown_train_vecs = [vectorize(t, nlp_char) for t in unknown_train]
c_known_test_vecs = [vectorize(t, nlp_char) for t in known_test]
c_unknown_test_vecs = [vectorize(t, nlp_char) for t in unknown_test]
print("combining...")
# word + character vectors
wc_known_train_vecs = combine_vectors(w_known_train_vecs, c_known_train_vecs)
wc_unknown_train_vecs = combine_vectors(w_unknown_train_vecs, c_unknown_train_vecs)
wc_known_test_vecs = combine_vectors(w_known_test_vecs, c_known_test_vecs)
wc_unknown_test_vecs = combine_vectors(w_unknown_test_vecs, c_unknown_test_vecs)

word vec...
combining...


In [75]:
def run_experiment(known_train, unknown_train, known_test, unknown_test, y_train, y_test):
    tr_pairs = create_pairs(known_train, unknown_train)
    te_pairs = create_pairs(known_test, unknown_test)
    
    model = train(tr_pairs, y_train)
    evaluate(model, tr_pairs, y_train, te_pairs, y_test)

In [76]:
run_experiment(w_known_train_vecs, w_unknown_train_vecs, w_known_test_vecs, w_unknown_test_vecs, y_train, y_test)
run_experiment(c_known_train_vecs, c_unknown_train_vecs, c_known_test_vecs, c_unknown_test_vecs, y_train, y_test)
run_experiment(wc_known_train_vecs, wc_unknown_train_vecs, wc_known_test_vecs, wc_unknown_test_vecs, y_train, y_test)

100 100
(100, 2, 300)
500 500
(500, 2, 300)
Train on 95 samples, validate on 5 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
* Accuracy on training set: 100.00%
* Accuracy on test set: 72.60%
100 100
(100, 2, 300)
500 500
(500, 2, 300)
Train on 95 samples, validate on 5 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
* Accuracy on training set: 100.00%
* Accuracy on test set: 74.60%
100 100
(100, 2, 600)
500 500
(500, 2, 600)
Train on 95 samples, validate on 5 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
* Accuracy on training set: 97.00%
* Accuracy on test set: 68.20%


In [210]:
C50TRAIN = "/data/C50/C50train/"
C50TEST = "/data/C50/C50test/"

def get_author_files(author_name, path):
    apath = os.path.join(path, author_name)
    afiles = os.listdir(apath)
    return [os.path.join(apath, f) for f in afiles]

def get_author_texts(author_name, path, max_texts=None):
    afiles = get_author_files(author_name, path)
    atexts = []
    for af in afiles:
        with open(af) as f:
            s = f.read()
        atexts.append(s)
    if max_texts:
        return atexts[:max_texts]
    return atexts

def get_negative_sample(exclude_name, path):
    authors = os.listdir(path)
    authors = [x for x in authors if x != exclude_name]
    negative_texts = []
    
    # take two texts from the first author to make up for deleted one
    negative_texts += get_author_texts(authors[0], path, max_texts=2)
    
    # add one text from each other author
    for author in authors[1:]:
        negative_texts += get_author_texts(author, path, max_texts=1)
    return negative_texts

In [438]:
def get_author_pairs(author, path):
    positive_samples = get_author_texts(author, path)
    negative_samples = get_negative_sample(author, path) 
    pos_vecs = [vectorize(t, nlp_word) for t in positive_samples]
    neg_vecs = [vectorize(t, nlp_word) for t in negative_samples]
    known = pos_vecs + pos_vecs
    unknown = pos_vecs[1:] + [pos_vecs[0]] + neg_vecs
    pairs = create_pairs(known, unknown)
    return pairs

In [568]:
def mostly_negative(author, path):
    """Create unbalanced pairs for author -- 50 positive examples and the rest negative"""
    
    positive_samples = get_author_texts(author, path)

    authors = os.listdir(path)
    authors = [x for x in authors if x != author]
    
    # 2450 negative examples
    negative_samples = []
    for a in authors:
        negative_samples += get_author_texts(a, path)
     
    # vectorize
    pos_vecs = [vectorize(t, nlp_word) for t in positive_samples]
    neg_vecs = [vectorize(t, nlp_word) for t in negative_samples]
    
    # 50 positive examples first
    known = pos_vecs + pos_vecs * 49
    unknown = pos_vecs[1:] + [pos_vecs[0]] + neg_vecs
    pairs = create_pairs(known, unknown)
    return pairs

In [569]:
neg_pairs = mostly_negative(authors[0], C50TEST)

In [570]:
neg_pairs.shape

(2500, 2, 300)

In [439]:
all_tr_pairs = np.vstack([get_author_pairs(author, C50TRAIN) for author in authors])
all_te_pairs = np.vstack([get_author_pairs(author, C50TEST) for author in authors])

In [440]:
all_tr_pairs.shape

(5000, 2, 300)

In [528]:
tr_y = [1] * 50 + [0] * 50
# tr_y *= 50
te_y = [1] * 50 + [0] * 50


In [551]:
aaron_pairs_train = get_author_pairs(authors[1], C50TRAIN)
aaron_pairs_test = get_author_pairs(authors[1], C50TEST)


In [547]:
model = train(all_tr_pairs[:1000], (tr_y * 50)[:1000])

Train on 950 samples, validate on 50 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [559]:
accs = []
for author in authors:
    tr_pairs = get_author_pairs(author, C50TRAIN)
    te_pairs = get_author_pairs(author, C50TEST)
    model = train(tr_pairs, tr_y)
    preds = model.predict([te_pairs[:, 0], te_pairs[:, 1]])
    acc = compute_accuracy(preds, te_y)
    print(acc)
    accs.append(acc)
print(mean(accs))

Train on 95 samples, validate on 5 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
0.92
Train on 95 samples, validate on 5 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
0.92
Train on 95 samples, validate on 5 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
0.78
Train on 95 samples, validate on 5 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
0.82
Train on 95 samples, validate on 5 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
0.74
Train on 95 samples, validate on 5 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
0.78
Train on 95 samples, validate on 5 samples
Epo

In [584]:
tr_neg_pairs = mostly_negative(authors[0], C50TRAIN)
te_neg_pairs = mostly_negative(authors[0], C50TEST)
labels = [1] * 50 + [0] * 2450
model = train(tr_neg_pairs, labels)
preds = model.predict([te_neg_pairs[:, 0], te_neg_pairs[:, 1]])
preds = [1 if x < 0.5 else 0 for x in preds]
print(classification_report(labels, preds))


Train on 2375 samples, validate on 125 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
             precision    recall  f1-score   support

          0       0.99      0.99      0.99      2450
          1       0.51      0.44      0.47        50

avg / total       0.98      0.98      0.98      2500



In [587]:
Counter(preds[:50])

Counter({0: 28, 1: 22})

In [583]:
from sklearn.metrics import classification_report
preds

[1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [561]:
from statistics import mean
mean(accs)

0.81840000000000002

In [508]:
model = train(all_tr_pairs[:1000], tr_y[:1000], epochs=30)

Train on 950 samples, validate on 50 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [556]:
preds = model.predict([all_te_pairs[:, 0], all_te_pairs[:, 1]])

In [558]:
compute_accuracy(preds, te_y * 50)

0.75139999999999996

In [445]:
compute_accuracy(preds, te_y)



0.5

In [429]:
t1 = get_author_texts(authors[0], C50TEST)[49]
t1 = vectorize(t1, nlp_word)
t2 = get_author_texts(authors[8], C50TEST)[49]
t2 = vectorize(t2, nlp_word)
samp = np.array([[t1, t2]])
model.predict([samp[:, 0], samp[:, 1]])

array([[ 0.30504674]], dtype=float32)

In [295]:
authors = os.listdir(C50TRAIN)

# concatenate everything we know about the training authors
# we use these as the 'known' texts and predict on unknown texts
# by finding the smallest distance to the known text
concat_training_texts = [' '.join(get_author_texts(author, C50TRAIN)) for author in authors]
known_vectors = [vectorize(t, nlp_word) for t in concat_training_texts]    

In [355]:
# get some unseen test text
def predict(text, known_vectors, nlp):
    test_vec = vectorize(text, nlp)

    # we duplicate the unknown text so that it is paired with each possible known author
    unknown_vectors = [test_vec for _ in range(len(authors))]
    pairs = create_pairs(known_vectors, unknown_vectors)

    # and find the closest known author, based on the learned distance function
    preds = model.predict([pairs[:, 0], pairs[:, 1]])
    return np.argmin(preds)

all_test_texts = []
for author in authors:
    all_test_texts += get_author_texts(author, C50TEST)

preds = [predict(text, known_vectors, nlp_word) for text in all_test_texts]

TypeError: object of type 'int' has no len()

In [368]:
labels = []
for i in range(50):
    labels += [i] * 50

In [382]:
from collections import Counter
from sklearn.metrics import accuracy_score
majs = [Counter(preds[i:i+50]) for i in range(0,2500, 50)]
mpreds = [c.most_common(1)[0][0] for c in majs]

In [None]:
len(labels)

In [277]:
author = authors[0]

pairs = get_author_pairs(author, C50TEST)
preds = model.predict([pairs[:, 0], pairs[:, 1]])

In [283]:
from random import shuffle
indices = list(range(pairs.shape[0]))
shuffle(indices)

In [285]:
indices[:5]

[51, 42, 68, 23, 74]

In [290]:
s_pairs = pairs[indices]
s_y = np.array(tr_y)[indices]

In [291]:
preds = model.predict([s_pairs[:, 0], s_pairs[:, 1]])


In [293]:
compute_accuracy(preds, s_y)

1.0

In [195]:
known_train, unknown_train, y_train = load_pan_data(pan15train)
known_test, unknown_test, y_test = load_pan_data(pan15test)

w_known_train_vecs = [vectorize(t, nlp_word) for t in known_train]
w_unknown_train_vecs = [vectorize(t, nlp_word) for t in unknown_train]
w_known_test_vecs = [vectorize(t, nlp_word) for t in known_test]
w_unknown_test_vecs = [vectorize(t, nlp_word) for t in unknown_test]

pan_train_pairs = create_pairs(w_known_train_vecs, w_unknown_train_vecs)
pan_test_pairs = create_pairs(w_known_test_vecs, w_unknown_test_vecs)

In [196]:
ptrain_preds = model.predict([pan_train_pairs[:, 0], pan_train_pairs[:, 1]])
ptest_preds = model.predict([pan_test_pairs[:, 0], pan_test_pairs[:, 1]])

In [197]:
compute_accuracy(ptrain_preds, y_train)

0.44

In [199]:
compute_accuracy(ptest_preds, y_test)

0.60799999999999998