In [1]:
import os
from spacy.en import English
import numpy as np

In [65]:
# config
# Root directories of the PAN authorship-verification corpora (absolute paths).
pan15train = "/data/pan15-authorship-verification-training-dataset-english-2015-04-19/"
pan15test = "/data/pan15-authorship-verification-test-dataset2-english-2015-04-19/"
# NOTE: the two "essays" assignments below are immediately overwritten by the
# "novels" assignments that follow, so after this cell runs pan14train/pan14test
# always point at the novels corpus.
pan14train = "/data/pan14-author-verification-training-corpus-english-essays-2014-04-22/"
pan14test = "/data/pan14-author-verification-test-corpus2-english-essays-2014-04-22/"
pan14train = "/data/pan14-author-verification-training-corpus-english-novels-2014-04-22/"
pan14test = "/data/pan14-author-verification-test-corpus2-english-novels-2014-04-22/"
char_embeddings_file = "glove.840B.300d-char.txt"  # path to the character embeddings file (as written, relative to the working directory)

In [3]:
# load spacy, this takes a while
# Two independent pipelines are created with parsing, tagging and entity
# recognition switched off. Only nlp_char has vectors loaded explicitly below;
# nlp_word keeps whatever vectors the installed English model ships with
# (presumably word-level vectors -- TODO confirm).
nlp_word = English(parse=False, tag=False, entity=False)
nlp_char = English(parse=False, tag=False, entity=False)
# Load the character-level embeddings into nlp_char's vocabulary.
with open(char_embeddings_file) as f:
    nlp_char.vocab.load_vectors(f)

In [92]:
def read_file(filepath):
    """Return the full contents of the text file at `filepath` as one string.

    The file is opened in text mode using the platform's default encoding
    and is closed again before the function returns.
    """
    with open(filepath) as handle:
        return handle.read()

def load_pan_data(directory, prefix="E"):
    """Load known and unknown texts in the PAN data format.

    Every entry of `directory` whose name starts with `prefix` is treated as
    one verification problem holding `known01.txt` and `unknown.txt`. Labels
    come from `truth.txt` in `directory`, whose lines are read as
    `<problem-id> <Y|N>`.

    Labels are matched to problems by problem id rather than by line
    position, so the result stays aligned even when `truth.txt` is ordered
    differently from the sorted folder names or lists problems that do not
    match `prefix`.

    Parameters
    ----------
    directory : str
        Root folder of one PAN corpus.
    prefix : str
        Only problem folders whose name starts with this string are loaded.

    Returns
    -------
    known_texts : list of str
        Contents of `known01.txt` for every problem, in sorted problem order.
    unknown_texts : list of str
        Contents of `unknown.txt`, aligned with `known_texts`.
    y : numpy.ndarray
        1 where the truth label is "Y", 0 otherwise, aligned with the texts.

    Raises
    ------
    KeyError
        If a problem folder has no entry in `truth.txt`.
    """
    # FIXME: assumes one known file per author, which is fine for English datasets only
    authors = sorted(x for x in os.listdir(directory) if x.startswith(prefix))
    known_texts = []
    unknown_texts = []
    for author in authors:
        kf = os.path.join(directory, author, "known01.txt")
        uf = os.path.join(directory, author, "unknown.txt")
        known_texts.append(read_file(kf))
        unknown_texts.append(read_file(uf))

    # Index the ground truth by problem id instead of relying on line order.
    truth = {}
    truthfile = os.path.join(directory, "truth.txt")
    with open(truthfile) as f:
        for line in f:
            fields = line.split()
            if len(fields) < 2:
                continue  # blank or malformed line
            # A leading byte-order mark would otherwise become part of the first id.
            problem_id = fields[0].lstrip("\ufeff")
            truth[problem_id] = 1 if fields[1] == "Y" else 0

    missing = [author for author in authors if author not in truth]
    if missing:
        raise KeyError("no truth.txt entry for: %s" % ", ".join(missing))

    y = np.array([truth[author] for author in authors])
    return known_texts, unknown_texts, y

def create_pairs(knownX, unknownX):
    """Creates pairs of known and unknown texts.

    Parameters
    ----------
    knownX, unknownX : sequences of equal length
        Item `i` of each sequence belongs to the same verification problem.

    Returns
    -------
    numpy.ndarray
        Array whose first axis indexes the problem and whose second axis has
        length 2: index 0 is the known item, index 1 the unknown item.

    Raises
    ------
    ValueError
        If the two sequences differ in length (they could not be paired
        one-to-one).
    """
    # Progress output kept from the original notebook: input sizes, then pair shape.
    print(len(knownX), len(unknownX))
    if len(knownX) != len(unknownX):
        raise ValueError(
            "known and unknown inputs must have the same length, got %d and %d"
            % (len(knownX), len(unknownX))
        )
    pairs = np.array([[known, unknown] for known, unknown in zip(knownX, unknownX)])
    print(pairs.shape)
    return pairs

def vectorize(text, nlp, target_sentences=60):
    """Convert text (string) to a fixed-length sequence of sentence embeddings.

    `nlp` should be an initialised spaCy pipeline with loaded embeddings.
    The text is split into sentences and each sentence contributes its
    `.vector`. Texts with more than `target_sentences` sentences are
    truncated; shorter texts are padded at the end with the vector of the
    `"<pad>"` vocabulary entry.

    Returns
    -------
    numpy.ndarray
        One row per sentence slot, `target_sentences` rows in total. The
        result is always an ndarray, for both the truncated and the padded
        case, so callers can stack results.
    """
    doc = nlp(text, entity=False, tag=False)
    vecs = [sent.vector for sent in doc.sents][:target_sentences]
    missing = target_sentences - len(vecs)
    if missing > 0:
        vecs.extend([nlp.vocab["<pad>"].vector] * missing)
    return np.array(vecs)

In [128]:
from __future__ import absolute_import
from __future__ import print_function
# Seeds NumPy's global RNG only; Python's `random` module (imported on the next
# line) is not seeded in this cell.
# NOTE(review): whether this alone makes Keras training reproducible depends on
# the backend in use -- confirm.
np.random.seed(1337)  # for reproducibility
import random

from keras.models import Sequential, Model
from keras.layers import Dense, Input, Lambda, LSTM, Masking, Dropout
from keras.optimizers import RMSprop, Adam
from keras import backend as K

def euclidean_distance(vects):
    """Euclidean distance between two batches of encodings.

    Parameters
    ----------
    vects : pair of tensors
        The two encodings to compare; the sum runs over axis 1.

    Returns
    -------
    tensor
        Distance per sample, with the reduced axis kept (size 1).

    The squared distance is clamped to `K.epsilon()` before the square root:
    the derivative of sqrt at exactly 0 is unbounded, so two identical
    encodings would otherwise yield NaN gradients during training.
    """
    x, y = vects
    sum_square = K.sum(K.square(x - y), axis=1, keepdims=True)
    return K.sqrt(K.maximum(sum_square, K.epsilon()))

def cosine_distance(vects):
    """Cosine distance (1 - cosine similarity) between two batches of encodings.

    Both inputs are L2-normalised along the last axis, so the sum of their
    element-wise product is the cosine similarity. The result is 0 for
    vectors pointing the same way, 1 for orthogonal vectors and 2 for
    opposite vectors, with the reduced axis kept (size 1).

    The earlier version returned the negated *mean* of the product, which is
    a similarity score scaled by 1/dim and negative for similar inputs; fed
    into a loss that squares the distance, similar and dissimilar pairs
    became indistinguishable.
    """
    x, y = vects
    x = K.l2_normalize(x, axis=-1)
    y = K.l2_normalize(y, axis=-1)
    return 1 - K.sum(x * y, axis=-1, keepdims=True)

def eucl_dist_output_shape(shapes):
    """Output shape of the distance layer: one value per sample.

    `shapes` must hold exactly two input shapes; the leading (batch)
    dimension of the first one is carried over.
    """
    first_shape, _second_shape = shapes
    batch_size = first_shape[0]
    return (batch_size, 1)

def contrastive_loss(y_true, y_pred):
    '''Contrastive loss (Hadsell, Chopra & LeCun, 2006).
    http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf

    `y_pred` is the predicted distance. Where `y_true` is 1 the squared
    distance is penalised; where it is 0 the penalty is the squared amount
    by which the distance falls short of the margin.
    '''
    margin = 1
    similar_term = y_true * K.square(y_pred)
    dissimilar_term = (1 - y_true) * K.square(K.maximum(margin - y_pred, 0))
    return K.mean(similar_term + dissimilar_term)

def create_base_network(input_dim):
    '''Build the encoder that both branches of the siamese model share.

    Layer stack, in order: masking, a 256-unit LSTM returning the full
    sequence, dropout (0.2), a second 256-unit LSTM with ReLU activation,
    and a 256-unit ReLU dense layer.
    '''
    layers = [
        Masking(input_shape=(input_dim)),
        LSTM(256, input_shape=(input_dim), return_sequences=True),
        Dropout(0.2),
        LSTM(256, input_shape=(input_dim), activation='relu'),
        Dense(256, activation='relu'),
    ]
    network = Sequential()
    for layer in layers:
        network.add(layer)
    return network

def compute_accuracy(predictions, labels):
    """Fraction of pairs classified correctly at a fixed distance threshold.

    A predicted distance below 0.5 counts as a positive prediction and is
    compared against `labels`.
    """
    predicted_positive = predictions.ravel() < 0.5
    matches = np.equal(predicted_positive, labels)
    return np.mean(matches)

def train(tr_pairs, tr_y, epochs=30, batch_size=20, validation_split=0.3,
          learning_rate=0.001, decay=1e-4):
    """Build and fit the siamese network on pairs of sequences.

    Parameters
    ----------
    tr_pairs : numpy.ndarray
        Training pairs; `tr_pairs[:, 0]` and `tr_pairs[:, 1]` are the two
        branch inputs and the last two axes give each branch's input shape.
    tr_y : array-like
        Target for each pair, passed to the contrastive loss.
    epochs : int
        Number of training epochs.
    batch_size : int
        Mini-batch size.
    validation_split : float
        Fraction of the training data held out for validation. Keras takes
        it from the end of the arrays, before any shuffling.
    learning_rate : float
        Adam learning rate.
    decay : float
        Adam learning-rate decay.

    Returns
    -------
    The fitted Keras model, which maps a pair of inputs to their distance.

    The defaults reproduce the values that used to be hard-coded here, so
    existing calls `train(tr_pairs, tr_y)` behave as before.
    """
    input_dim = (tr_pairs.shape[-2], tr_pairs.shape[-1])

    # network definition
    base_network = create_base_network(input_dim)

    input_a = Input(shape=(input_dim))
    input_b = Input(shape=(input_dim))

    # because we re-use the same instance `base_network`,
    # the weights of the network
    # will be shared across the two branches
    processed_a = base_network(input_a)
    processed_b = base_network(input_b)
    distance = Lambda(euclidean_distance, output_shape=eucl_dist_output_shape)([processed_a, processed_b])
    model = Model(inputs=[input_a, input_b], outputs=distance)

    optimizer = Adam(lr=learning_rate, decay=decay)
    model.compile(loss=contrastive_loss, optimizer=optimizer)
    model.fit([tr_pairs[:, 0], tr_pairs[:, 1]], tr_y,
              validation_split=validation_split,
              batch_size=batch_size,
              epochs=epochs)
    return model
       
def evaluate(model, tr_pairs, tr_y, te_pairs, te_y):
    """Print the model's accuracy on the training and the test pairs.

    Both halves of each pair array are fed to `model.predict`; the predicted
    distances are scored with `compute_accuracy`. Nothing is returned.
    """
    train_distances = model.predict([tr_pairs[:, 0], tr_pairs[:, 1]])
    tr_acc = compute_accuracy(train_distances, tr_y)

    test_distances = model.predict([te_pairs[:, 0], te_pairs[:, 1]])
    te_acc = compute_accuracy(test_distances, te_y)

    print('* Accuracy on training set: %0.2f%%' % (100 * tr_acc))
    print('* Accuracy on test set: %0.2f%%' % (100 * te_acc))
    
def combine_vectors(v1s, v2s):
    """Concatenate the i-th item of `v1s` with the i-th item of `v2s`.

    Returns a list with one `np.hstack` result per item of `v1s`.
    """
    combined = []
    for index, left in enumerate(v1s):
        combined.append(np.hstack([left, v2s[index]]))
    return combined

def run_experiment(known_train, unknown_train, known_test, unknown_test, y_train, y_test):
    """Pair up the inputs, train on the training pairs and report accuracy.

    Training pairs are built first, then test pairs; the model fitted on the
    training pairs is evaluated on both sets.
    """
    train_pairs = create_pairs(known_train, unknown_train)
    test_pairs = create_pairs(known_test, unknown_test)

    evaluate(train(train_pairs, y_train), train_pairs, y_train, test_pairs, y_test)

In [104]:
# Load the PAN15 training and test problems (texts plus 0/1 labels) using the
# default "E" folder prefix of load_pan_data.
known_train, unknown_train, y_train = load_pan_data(pan15train)
known_test, unknown_test, y_test = load_pan_data(pan15test)

In [106]:
# Sentence-vector sequences from the word-level pipeline: every text becomes
# 100 sentence slots (truncated or padded by vectorize).
known_train_vecs, unknown_train_vecs, known_test_vecs, unknown_test_vecs = (
    [vectorize(t, nlp_word, target_sentences=100) for t in texts]
    for texts in (known_train, unknown_train, known_test, unknown_test)
)

In [81]:
from statistics import mean
# Smallest number of "." characters in any known training text -- a rough
# lower bound on sentences per text when choosing target_sentences.
min(text.count(".") for text in known_train)

40

In [129]:
# Train and evaluate the siamese network on the PAN15 sentence-vector sequences.
run_experiment(known_train_vecs, unknown_train_vecs, known_test_vecs, unknown_test_vecs, y_train, y_test)

100 100
(100, 2, 100, 300)
500 500
(500, 2, 100, 300)
Train on 70 samples, validate on 30 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
* Accuracy on training set: 94.00%
* Accuracy on test set: 60.80%


In [46]:
# NOTE(review): `tr_pairs` is never assigned at notebook level in the visible
# cells (it only exists as a local inside run_experiment), so this cell relies
# on leftover kernel state and would raise NameError on a fresh kernel unless
# it is defined in a cell not shown here.
tr_pairs[:, 0].shape

(100, 60, 300)

In [48]:
# NOTE(review): depends on a notebook-level `tr_pairs` that no visible cell
# defines. The saved output (5 epochs, 95/5 split) also does not match the
# current train() settings (30 epochs, validation_split=0.3), so it comes from
# an earlier version of that function.
model = train(tr_pairs, y_train)

Train on 95 samples, validate on 5 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [76]:
# NOTE(review): the w_*, c_* and wc_* inputs are not created by any visible
# cell, so these calls rely on leftover kernel state. Judging by the names and
# the saved shapes (300, 300 and 600 features) they are presumably word-level,
# char-level and combined (see combine_vectors) document vectors -- confirm.
run_experiment(w_known_train_vecs, w_unknown_train_vecs, w_known_test_vecs, w_unknown_test_vecs, y_train, y_test)
run_experiment(c_known_train_vecs, c_unknown_train_vecs, c_known_test_vecs, c_unknown_test_vecs, y_train, y_test)
run_experiment(wc_known_train_vecs, wc_unknown_train_vecs, wc_known_test_vecs, wc_unknown_test_vecs, y_train, y_test)

100 100
(100, 2, 300)
500 500
(500, 2, 300)
Train on 95 samples, validate on 5 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
* Accuracy on training set: 100.00%
* Accuracy on test set: 72.60%
100 100
(100, 2, 300)
500 500
(500, 2, 300)
Train on 95 samples, validate on 5 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
* Accuracy on training set: 100.00%
* Accuracy on test set: 74.60%
100 100
(100, 2, 600)
500 500
(500, 2, 600)
Train on 95 samples, validate on 5 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
* Accuracy on training set: 97.00%
* Accuracy on test set: 68.20%


60

In [130]:
# Parse the character embeddings file into {token: float32 vector}.
# Each line is read as "<token> <v1> <v2> ...", split on whitespace.
embeddings_index = {}
# The path comes from the config cell instead of a second hard-coded copy, and
# the `with` block closes the file even if a line fails to parse.
with open(char_embeddings_file, encoding="utf-8") as f:
    for line in f:
        values = line.split()
        if not values:
            continue  # skip blank lines instead of failing on values[0]
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
# NOTE(review): splitting on whitespace cannot represent a token that is itself
# whitespace (e.g. the space character, which `chars` includes); such a row
# would be mis-parsed. Confirm the file contains no such row.

In [138]:
import string
punctuation = '''!"'#$%&'()*,-./:;<>?@[]_'''
# "+" is the padding symbol; putting it first pins it to index 0. The remaining
# characters (lowercase letters, punctuation, space) follow in sorted order.
chars = ["+"]
chars.extend(sorted(set(string.ascii_lowercase + punctuation + " ")))
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = dict(enumerate(chars))
print(len(chars))

51


In [141]:
# One 300-dimensional row per entry of `chars`, indexed by char_indices.
# Characters missing from embeddings_index keep an all-zero row.
# NOTE(review): "+" (the padding symbol at index 0) is looked up like any other
# character, so if the embeddings file has a "+" row the padding row will not
# be all zeros -- confirm that is intended.
embedding_matrix = np.zeros((len(chars), 300))
for char, i in char_indices.items():
    embedding_vector = embeddings_index.get(char)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

array([  1.49838001e-01,  -2.55777001e-01,   3.77602994e-01,
         2.11992994e-01,  -3.13699991e-01,  -1.22639000e-01,
         1.46604002e-01,   1.29430994e-01,   8.87980014e-02,
        -1.63570595e+00,   5.25404990e-01,  -3.29488009e-01,
        -9.42559987e-02,   5.02242982e-01,   1.15299998e-02,
         2.61368006e-01,   1.80951998e-01,  -1.00705302e+00,
         4.12450999e-01,   2.66483992e-01,   1.16962999e-01,
         4.89906996e-01,   8.65300000e-02,   1.06690004e-02,
        -2.78346986e-01,  -5.26660010e-02,   1.22345001e-01,
        -5.28879985e-02,  -1.65929005e-01,   7.32019991e-02,
         1.54027000e-01,   4.53899987e-03,   1.42658994e-01,
        -1.91837996e-01,  -2.94609994e-01,   1.98869005e-01,
         4.51820008e-02,   6.76470026e-02,   1.82930008e-02,
         3.82256001e-01,  -1.43556997e-01,  -2.37182006e-01,
         4.43550013e-02,   3.31019983e-02,  -1.97981000e-01,
         1.58462003e-01,   9.09340009e-02,   1.29146993e-01,
        -2.46602997e-01,

In [160]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Toy example exploring Keras' Tokenizer and pad_sequences on two sentences.
texts = ["this is a test text it's longer more words one two three unique", "and this is another"]
# num_words limits texts_to_sequences to the most frequent words.
tokenizer = Tokenizer(num_words=500)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Bring every sequence to length 30; the displayed `data` shows the zeros are
# added on the left.
data = pad_sequences(sequences, maxlen=30)

Found 15 unique tokens.


In [163]:
# Inspect the integer sequences produced by texts_to_sequences.
sequences

[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], [14, 1, 2, 15]]

In [162]:
# Inspect the padded array produced by pad_sequences.
data

array([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0, 14,  1,  2, 15]], dtype=int32)

In [157]:
# Exploration leftover: prints the docstring of Tokenizer.texts_to_sequences.
help(tokenizer.texts_to_sequences)

Help on method texts_to_sequences in module keras.preprocessing.text:

texts_to_sequences(texts) method of keras.preprocessing.text.Tokenizer instance
    Transforms each text in texts in a sequence of integers.
    
    Only top "num_words" most frequent words will be taken into account.
    Only words known by the tokenizer will be taken into account.
    
    # Arguments
        texts: A list of texts (strings).
    
    # Returns
        A list of sequences.

