In [5]:
import os
import numpy as np
import textacy
from spacy.en import English

nlp = English()

In [263]:
class TextPair:
    def __init__(self, author, known, unknown, max_length=1200):
        self.author = author
        self.known = known
        self.unknown = unknown
        self.max_length = max_length

def get_string(filename):
    with open(filename, encoding="utf8") as f:
        s = f.read()
    return s

def get_texts(directory):
    authors = [x for x in os.listdir(directory) if x.startswith("EN")]
    tps = []
    for author in authors:
        known = os.path.join(directory, author, "known01.txt")
        unknown = os.path.join(directory, author, "unknown.txt")
        tps.append(TextPair(author, get_string(known), get_string(unknown)))
    return tps

def get_data(directory):
    
    # read all texts into known, unknown pairs
    tps = get_texts(directory)
    
    # get labels
    truthfile = os.path.join(directory, "truth.txt")
    with open(truthfile) as f:
        lines = f.read().strip().split("\n")
    y = [1 if line.split()[1] == "Y" else 0 for line in lines]
    y = np.array(y)
    return tps, y
    


    
    # create pairs


In [264]:
pan15train = "/data/pan15-authorship-verification-training-dataset-english-2015-04-19/"
pan15test = "/data/pan15-authorship-verification-test-dataset2-english-2015-04-19/"
pan14train = "/data/pan14-author-verification-training-corpus-english-essays-2014-04-22/"
pan14test = "/data/pan14-author-verification-test-corpus2-english-essays-2014-04-22/"
pan14train = "/data/pan14-author-verification-training-corpus-english-novels-2014-04-22/"
pan14test = "/data/pan14-author-verification-test-corpus2-english-novels-2014-04-22/"

In [265]:
X_train, y_train = get_data(pan14train)
X_test, y_test = get_data(pan14test)

In [266]:
def vectorize_distances(tps):
    for i in range(len(tps)):
        tps[i].known = nlp(tps[i].known)
        tps[i].unknown = nlp(tps[i].unknown)
    for i in range(len(tps)):
        w2v = textacy.similarity.word2vec(tps[i].known, tps[i].unknown)
        wm = textacy.similarity.word_movers(tps[i].known, tps[i].unknown)
        jc = textacy.similarity.jaccard(str(tps[i].known), str(tps[i].unknown))
        hm = textacy.similarity.hamming(str(tps[i].known), str(tps[i].unknown))
        jw = textacy.similarity.jaro_winkler(str(tps[i].known), str(tps[i].unknown))
        le = textacy.similarity.levenshtein(str(tps[i].known), str(tps[i].unknown))
        ts = textacy.similarity.token_sort_ratio(str(tps[i].known), str(tps[i].unknown))
        tps[i].distances = [w2v, wm, jc, hm, jw, le, ts]
    return tps


In [None]:
X_train = vectorize_distances(X_train)
X_test = vectorize_distances(X_test)

In [None]:
print(len(X_train), len(X_test))

In [None]:
mean_known = [0] * 7
mean_unknown = [0] * 7
print(mean_known)
for i, p in enumerate(X_train):
    for j, dm in enumerate(p.distances):
        if y_train[i] == 1:
            mean_known[j] += dm
        else:
            mean_unknown[j] += dm
            
    
print(mean_known)
print(mean_unknown)
    

In [None]:
tr_pairs = [tp.distances for i, tp in enumerate(X_train)]
te_pairs = [tp.distances for i, tp in enumerate(X_test)]

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from statistics import mean
print(mean(cross_val_score(DecisionTreeClassifier(), te_pairs, y_test, cv=5)))
DecisionTreeClassifier().fit(tr_pairs, y_train)
preds = clf.predict(te_pairs)
accuracy_score(y_test, preds)

In [None]:
train_known_X = [x.known.vector for x in X_train]
train_unknown_X = [x.unknown.vector for x in X_train]
test_known_X = [x.known.vector for x in X_test]
test_unknown_X = [x.unknown.vector for x in X_test]

In [None]:
tr_pairs = create_pairs(train_known_X, train_unknown_X)
te_pairs = create_pairs(test_known_X, test_unknown_X)

In [None]:
tr_y = y_train
te_y = y_test

In [268]:
from __future__ import absolute_import
from __future__ import print_function
import numpy as np
# np.random.seed(1337)  # for reproducibility

import random
from keras.datasets import mnist
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Input, Lambda
from keras.optimizers import RMSprop
from keras import backend as K


def euclidean_distance(vects):
    x, y = vects
    return K.sqrt(K.sum(K.square(x - y), axis=1, keepdims=True))


def eucl_dist_output_shape(shapes):
    shape1, shape2 = shapes
    return (shape1[0], 1)


def contrastive_loss(y_true, y_pred):
    '''Contrastive loss from Hadsell-et-al.'06
    http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf
    '''
    margin = 1
    return K.mean(y_true * K.square(y_pred) + (1 - y_true) * K.square(K.maximum(margin - y_pred, 0)))


def create_pairs(x, digit_indices):
    '''Positive and negative pair creation.
    Alternates between positive and negative pairs.
    '''
    pairs = []
    labels = []
    n = min([len(digit_indices[d]) for d in range(10)]) - 1
    for d in range(10):
        for i in range(n):
            z1, z2 = digit_indices[d][i], digit_indices[d][i + 1]
            pairs += [[x[z1], x[z2]]]
            inc = random.randrange(1, 10)
            dn = (d + inc) % 10
            z1, z2 = digit_indices[d][i], digit_indices[dn][i]
            pairs += [[x[z1], x[z2]]]
            labels += [1, 0]
    return np.array(pairs), np.array(labels)


def create_base_network(input_dim):
    '''Base network to be shared (eq. to feature extraction).
    '''
    seq = Sequential()
    seq.add(Dense(128, input_shape=(input_dim,), activation='relu'))
    seq.add(Dense(128, activation='relu'))
    seq.add(Dense(128, activation='relu'))
    return seq


def compute_accuracy(predictions, labels):
    '''Compute classification accuracy with a fixed threshold on distances.
    '''
    return labels[predictions.ravel() < 0.5].mean()

In [269]:
input_dim = tr_pairs.shape[-1]

# network definition
base_network = create_base_network(input_dim)

input_a = Input(shape=(input_dim,))
input_b = Input(shape=(input_dim,))

# because we re-use the same instance `base_network`,
# the weights of the network
# will be shared across the two branches
processed_a = base_network(input_a)
processed_b = base_network(input_b)

distance = Lambda(euclidean_distance, output_shape=eucl_dist_output_shape)([processed_a, processed_b])

model = Model(input=[input_a, input_b], output=distance)

AttributeError: 'list' object has no attribute 'shape'

In [None]:
# train
"""
tr_pairs1 = tr_pairs[:250]
te_pairs1 = tr_pairs[250:]
tr_y1 = tr_y[:250]
te_y1 = tr_y[250:]

tr_pairs = tr_pairs1
te_pairs = te_pairs1
tr_y = tr_y1
te_y = te_y1
"""
# tr_pairs = np.vstack([tr_pairs, te_pairs])
# print(tr_pairs.shape)

rms = RMSprop()
model.compile(loss=contrastive_loss, optimizer=rms)
model.fit([tr_pairs[:, 0], tr_pairs[:, 1]], tr_y,
          validation_split=0.15,
          batch_size=5,
          nb_epoch=60)



In [None]:
# compute final accuracy on training and test sets
pred = model.predict([tr_pairs[:, 0], tr_pairs[:, 1]])
tr_acc = compute_accuracy(pred, tr_y)
pred = model.predict([te_pairs[:, 0], te_pairs[:, 1]])
te_acc = compute_accuracy(pred, te_y)

print('* Accuracy on training set: %0.2f%%' % (100 * tr_acc))
print('* Accuracy on test set: %0.2f%%' % (100 * te_acc))