In [1]:
# Helper functions for preprocessing raw text before feeding it into a Neural Net
import string
import numpy as np

# Map printable characters to ints and vice-versa
ALPHABET = string.printable

char2int = dict((c, i) for i, c in enumerate(ALPHABET))
int2char = dict((i, c) for i, c in enumerate(ALPHABET))


def c2i(char):
    return char2int[char]

def i2c(num):
    return int2char[num]


def vectorize_text(text):
    """Convert a text into integers"""
    X = []
    text = ''.join(list(filter(lambda x: x in ALPHABET, text)))
    X = [c2i(char) for char in text]
    X = np.array(X)
    X = X / float(len(ALPHABET))
    return X

In [2]:
print(vectorize_text([ALPHABET]).shape)
# print(len(ALPHABET))
# print(ALPHABET[-1])

(100,)


In [14]:
from spacy.en import English
nlp = English()

In [22]:
import os


class TextPair:
    def __init__(self, author, known, unknown, max_length=1200):
        self.author = author
        self.known = known
        self.unknown = unknown
        self.max_length = max_length

def get_string(filename):
    with open(filename, encoding="utf8") as f:
        s = f.read()
    return s

def get_texts(directory):
    authors = [x for x in os.listdir(directory) if x.startswith("EN")]
    print("Authors: {}".format(len(authors)))
    tps = []
    for author in authors:
        known = os.path.join(directory, author, "known01.txt")
        unknown = os.path.join(directory, author, "unknown.txt")
        tps.append(TextPair(author, get_string(known), get_string(unknown)))
    return tps

def load_truth(directory):
    # get labels
    truthfile = os.path.join(directory, "truth.txt")
    with open(truthfile) as f:
        lines = f.read().strip().split("\n")
    y = [1 if line.split()[1] == "Y" else 0 for line in lines]
    y = np.array(y)
    return y

def get_data(tps):  
    knownX = [nlp(tp.known).vector for tp in tps]
    unknownX = [nlp(tp.unknown).vector for tp in tps]
    # lengths = [x.shape[0] for x in knownX + unknownX]
    # from matplotlib import pyplot as plt
    # plt.hist(lengths)
    # plt.show()
    
    # print(min([x.shape[0] for x in knownX + unknownX]))
    
    # truncate all texts to 1200 characters
    tkX = np.array(knownX)
    tuX = np.array(unknownX)
    # print(tkX.shape)
    
    # create pairs
    pairs = []
    for i in range(tkX.shape[0]):
        pairs += [[tkX[i], tuX[i]]]
    pairs = np.array(pairs)
    print(pairs.shape)
    return pairs



In [66]:
pan15train = "/data/pan15-authorship-verification-training-dataset-english-2015-04-19/"
pan15test = "/data/pan15-authorship-verification-test-dataset2-english-2015-04-19/"
pan14train = "/data/pan14-author-verification-training-corpus-english-novels-2014-04-22/"
pan14test = "/data/pan14-author-verification-test-corpus2-english-novels-2014-04-22/"
emailstrain = "/data/emails-train/"
emailstest = "/data/emails-test/"



In [70]:
te_tps = get_texts(pan15train)
te_y = load_truth(pan15train)
tr_tps = get_texts(pan15test)
tr_y = load_truth(pan15test)

Authors: 100
Authors: 500


In [71]:
tr_pairs = get_data(tr_tps)

(500, 2, 300)


In [72]:
te_pairs = get_data(te_tps)

(100, 2, 300)


In [48]:
def build_training(textpairs):
    # Build training only from known texts
    # Test as known/unknown pairs
    training = []
    tr_labels = []
    for i, tp in enumerate(textpairs):
        bi = int(len(tp.known)/2)
        nxt = i+1
        if textpairs[i] == textpairs[-1]:
            nxt = 0
        ntp_same = TextPair(tp.author, tp.known[:bi], tp.known[bi:])
        ntp_diff = TextPair(tp.author, tp.known, textpairs[nxt].known)
        training += [ntp_same, ntp_diff]
        tr_labels += [1, 0]
    return training, tr_labels

In [73]:
from __future__ import absolute_import
from __future__ import print_function
import numpy as np
np.random.seed(1337)  # for reproducibility

import random
from keras.datasets import mnist
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Input, Lambda, LSTM
from keras.optimizers import RMSprop
from keras import backend as K


def euclidean_distance(vects):
    x, y = vects
    return K.sqrt(K.sum(K.square(x - y), axis=1, keepdims=True))


def eucl_dist_output_shape(shapes):
    shape1, shape2 = shapes
    return (shape1[0], 1)


def contrastive_loss(y_true, y_pred):
    '''Contrastive loss from Hadsell-et-al.'06
    http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf
    '''
    margin = 1
    return K.mean(y_true * K.square(y_pred) + (1 - y_true) * K.square(K.maximum(margin - y_pred, 0)))


def create_pairs(x, digit_indices):
    '''Positive and negative pair creation.
    Alternates between positive and negative pairs.
    '''
    pairs = []
    labels = []
    n = min([len(digit_indices[d]) for d in range(10)]) - 1
    for d in range(10):
        for i in range(n):
            z1, z2 = digit_indices[d][i], digit_indices[d][i + 1]
            pairs += [[x[z1], x[z2]]]
            inc = random.randrange(1, 10)
            dn = (d + inc) % 10
            z1, z2 = digit_indices[d][i], digit_indices[dn][i]
            pairs += [[x[z1], x[z2]]]
            labels += [1, 0]
    return np.array(pairs), np.array(labels)


def create_base_network(input_dim):
    '''Base network to be shared (eq. to feature extraction).
    '''
    seq = Sequential()
    seq.add(Dense(128, input_shape=(input_dim,), activation='relu'))
    seq.add(Dense(128, activation='relu'))
    seq.add(Dropout(0.1))
    seq.add(Dense(256, activation='relu'))
    seq.add(Dense(512, activation='relu'))
    seq.add(Dense(512, activation='relu'))
    seq.add(Dense(512, activation='relu'))
    return seq

def compute_accuracy(predictions, labels):
    return np.mean(np.equal(predictions.ravel() < 0.5, labels))

In [83]:
input_dim = tr_pairs.shape[-1]

# network definition
base_network = create_base_network(input_dim)

input_a = Input(shape=(input_dim,))
input_b = Input(shape=(input_dim,))

# because we re-use the same instance `base_network`,
# the weights of the network
# will be shared across the two branches
processed_a = base_network(input_a)
processed_b = base_network(input_b)

distance = Lambda(euclidean_distance, output_shape=eucl_dist_output_shape)([processed_a, processed_b])

model = Model(inputs=[input_a, input_b], outputs=distance)

In [84]:
# train

'''tr_pairs1 = tr_pairs[:250]
te_pairs1 = tr_pairs[250:]
tr_y1 = tr_y[:250]
te_y1 = tr_y[250:]

tr_pairs = tr_pairs1
te_pairs = te_pairs1
tr_y = tr_y1
te_y = te_y1'''

# tr_pairs = emails
# tr_y = emails_y
# te_pairs = emails_t
# te_y = emails_t_y

# tr_pairs = np.vstack([tr_pairs, te_pairs])
# print(tr_pairs.shape)

rms = RMSprop()
model.compile(loss=contrastive_loss, optimizer='adam')
model.fit([te_pairs[:, 0], te_pairs[:, 1]], te_y,
          validation_split=0.05,
          batch_size=64,
          epochs=50)



Train on 95 samples, validate on 5 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x13dba6710>

In [86]:
# compute final accuracy on training and test sets
from sklearn.metrics import accuracy_score

tr_pred = model.predict([tr_pairs[:, 0], tr_pairs[:, 1]])
tr_acc = compute_accuracy(tr_pred, tr_y)
te_pred = model.predict([te_pairs[:, 0], te_pairs[:, 1]])
te_acc = compute_accuracy(te_pred, te_y)

#pred = model.predict([ve_pairs[:, 0], ve_pairs[:, 1]])
#ve_acc = compute_accuracy(pred, ve_y)
#pred = model.predict([vet_pairs[:, 0], vet_pairs[:, 1]])
#vet_acc = compute_accuracy(pred, vet_y)

print('* Accuracy on training set: %0.2f%%' % (100 * tr_acc))
print('* Accuracy on test set: %0.2f%%' % (100 * te_acc))

tr_thresh = [1 if p < 0.5 else 0 for p in tr_pred]
te_thresh = [1 if p < 0.5 else 0 for p in te_pred]
print(accuracy_score(tr_y, tr_thresh))
print(accuracy_score(te_y, te_thresh))

# print('* Accuracy on ver set: %0.2f%%' % (100 * ve_acc))
#print('* Accuracy on vet set: %0.2f%%' % (100 * vet_acc))

* Accuracy on training set: 72.20%
* Accuracy on test set: 100.00%
0.722
1.0


In [89]:
print(tr_pred)
print(tr_y)

[[ 0.38579309]
 [ 0.19190618]
 [ 1.21739352]
 [ 1.38640368]
 [ 0.53209692]
 [ 0.87954098]
 [ 0.439675  ]
 [ 0.18316305]
 [ 0.71409887]
 [ 0.76006997]
 [ 0.3019276 ]
 [ 1.37371516]
 [ 1.48004222]
 [ 1.11111081]
 [ 0.73745477]
 [ 0.07453167]
 [ 0.29078701]
 [ 0.64323968]
 [ 0.34583589]
 [ 0.30699137]
 [ 1.28228533]
 [ 1.14684033]
 [ 1.0880276 ]
 [ 0.11579861]
 [ 0.70992297]
 [ 0.87186182]
 [ 1.08331406]
 [ 0.12139704]
 [ 1.25472093]
 [ 0.07606126]
 [ 0.1952341 ]
 [ 1.20237398]
 [ 1.28170323]
 [ 1.65513492]
 [ 0.1786336 ]
 [ 0.25072396]
 [ 0.29158568]
 [ 0.50225377]
 [ 0.58886331]
 [ 0.59077293]
 [ 0.99517995]
 [ 0.53107297]
 [ 0.14088911]
 [ 0.33663586]
 [ 0.59289837]
 [ 0.51265287]
 [ 1.74963951]
 [ 0.06626197]
 [ 0.58706957]
 [ 0.3783356 ]
 [ 0.73556936]
 [ 0.33740765]
 [ 0.32651854]
 [ 1.3617934 ]
 [ 0.62894005]
 [ 0.46390149]
 [ 0.38330084]
 [ 0.44733942]
 [ 0.51164544]
 [ 0.61188418]
 [ 0.57369584]
 [ 1.10470688]
 [ 0.10274324]
 [ 1.19675803]
 [ 0.07092064]
 [ 0.76058114]
 [ 0.54506

In [1619]:
te_y[pred.ravel() < 0.5].mean()

0.8571428571428571

In [None]:
np.vstack([tr_pairs, te_pairs]).shape

In [1020]:
tr_pairs.shape

(500, 2, 1200)

In [1021]:
te_pairs[:,0].shape

(100, 1200)

In [1022]:
np.hstack([tr_y,te_y])

array([1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0,
       1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0,
       0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1,
       1, 0,

In [1834]:
from sklearn.metrics.pairwise import cosine_distances
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1,6))


tps = get_texts(pan15train)
vectorizer.fit([tp.known for tp in tps] + [tp.unknown for tp in tps])



Authors: 100


TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 6), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [1835]:
known_vectors = vectorizer.transform([tp.known for tp in tps])
unknown_vectors = vectorizer.transform([tp.unknown for tp in tps])

In [1836]:
labels = load_truth(pan15train)
same_indices = labels
diff_indices = [0 if x else 1 for x in labels]

In [1837]:
distances = cosine_distances(known_vectors, unknown_vectors)

In [1840]:
np.mean([x for i,x in enumerate(distances.diagonal()) if labels[i]])

0.32170328860606484

In [1839]:
same_indices

array([1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
       0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 1])

In [1831]:
distances.shape

(100, 100)

In [1842]:
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LinearRegression
# from sklearn.model_selection import cross_val_score
from sklearn.cross_validation import cross_val_score

svc = SVC()

cross_val_score(svc, distances, labels, cv=2)



array([ 0.52,  0.58])