In [2]:
# Helper functions for preprocessing raw text before feeding it into a Neural Net
import string
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# Map printable characters to ints and vice-versa
ALPHABET = string.printable

char2int = dict((c, i) for i, c in enumerate(ALPHABET))
int2char = dict((i, c) for i, c in enumerate(ALPHABET))


def c2i(char):
    return char2int[char]

def i2c(num):
    return int2char[num]


def vectorize_text(text):
    """Convert a text into integers"""
    X = []
    text = ''.join(list(filter(lambda x: x in ALPHABET, text)))
    X = [c2i(char) for char in text]
    X = np.array(X)
    X = X / float(len(ALPHABET))
    return X

In [3]:
print(vectorize_text([ALPHABET]).shape)
# print(len(ALPHABET))
# print(ALPHABET[-1])

(100,)


In [5]:
import os

class TextPair:
    def __init__(self, author, known, unknown, max_length=1200):
        self.author = author
        self.known = known
        self.unknown = unknown
        self.max_length = max_length

def get_string(filename):
    with open(filename, encoding="utf8") as f:
        s = f.read()
    return s

def get_texts(directory):
    authors = [x for x in os.listdir(directory) if x.startswith("EN")]
    tps = []
    for author in authors:
        known = os.path.join(directory, author, "known01.txt")
        unknown = os.path.join(directory, author, "unknown.txt")
        tps.append(TextPair(author, get_string(known), get_string(unknown)))
    return tps

def get_data(directory):
    
    # read all texts into known, unknown pairs
    tps = get_texts(directory)
    
    # get labels
    truthfile = os.path.join(directory, "truth.txt")
    with open(truthfile) as f:
        lines = f.read().strip().split("\n")
    y = [1 if line.split()[1] == "Y" else 0 for line in lines]
    y = np.array(y)
    return tps, y
    


    
    # create pairs


In [7]:
pan15train = "/data/pan15-authorship-verification-training-dataset-english-2015-04-19/"
pan15test = "/data/pan15-authorship-verification-test-dataset2-english-2015-04-19/"
# pan15train = "/data/emails-train/"
# pan15test = "/data/emails-test/"



te_pairs, te_y = get_data(pan15train)
tr_pairs, tr_y = get_data(pan15test)

vec = TfidfVectorizer(analyzer='char', ngram_range=(4,4))
vec.fit([tp.known for tp in te_pairs] + [tp.unknown for tp in te_pairs] +
         [tp.known for tp in tr_pairs] + [tp.unknown for tp in tr_pairs])


def normalize(text, target_length=1200):
    if len(text) > target_length:
        return text[:target_length]
    else:
        return text + ("$" * (target_length - len(text)))

def vectorize(texts, vec, target_length=1200):
    # texts = [normalize(text) for text in texts]
    return vec.transform(texts)
    

In [8]:
train_known_X = vectorize([tp.known for tp in tr_pairs], vec)
train_unknown_X = vectorize([tp.unknown for tp in tr_pairs], vec)

In [9]:
test_known_X = vectorize([tp.known for tp in te_pairs], vec)
test_unknown_X = vectorize([tp.unknown for tp in te_pairs], vec)

In [58]:
# train_known_X = train_known_X.todense()
# rain_unknown_X = train_unknown_X.todense()
test_known_X = test_known_X.todense()
test_unknown_X = test_unknown_X.todense()

In [59]:
def create_pairs(knownX, unknownX):
    pairs = []
    for i in range(knownX.shape[0]):
        pairs += [[knownX[i], unknownX[i]]]
    pairs = np.array(pairs)
    print(pairs.shape)
    return pairs

In [60]:
tr_pairs = create_pairs(train_known_X, train_unknown_X)
te_pairs = create_pairs(test_known_X, test_unknown_X)

(500, 2, 1, 31909)
(100, 2, 1, 31909)


In [61]:
tr_pairs = tr_pairs.squeeze()
te_pairs = te_pairs.squeeze()

In [63]:
tr_pairs.shape

(500, 2, 31909)

In [114]:
tr_pairs = create_pairs(np.reshape(train_known_X.todense(), newshape=(500,1,120003)), 
                   np.reshape(train_unknown_X.todense(), newshape=(500,1,120003)))
tr_pairs = train_pairs.squeeze()

te_pairs = create_pairs(np.reshape(test_known_X.todense(), newshape=(100,1,120003)), 
                   np.reshape(test_unknown_X.todense(), newshape=(100,1,120003)))
te_pairs = test_pairs.squeeze()

print(te_pairs.shape)
print(tr_pairs.shape)

(100, 2, 120003)
(500, 2, 120003)


In [64]:
from __future__ import absolute_import
from __future__ import print_function
import numpy as np
np.random.seed(1337)  # for reproducibility

import random
from keras.datasets import mnist
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Input, Lambda
from keras.optimizers import RMSprop
from keras import backend as K


def euclidean_distance(vects):
    x, y = vects
    return K.sqrt(K.sum(K.square(x - y), axis=1, keepdims=True))


def eucl_dist_output_shape(shapes):
    shape1, shape2 = shapes
    return (shape1[0], 1)


def contrastive_loss(y_true, y_pred):
    '''Contrastive loss from Hadsell-et-al.'06
    http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf
    '''
    margin = 1
    return K.mean(y_true * K.square(y_pred) + (1 - y_true) * K.square(K.maximum(margin - y_pred, 0)))


def create_pairs(x, digit_indices):
    '''Positive and negative pair creation.
    Alternates between positive and negative pairs.
    '''
    pairs = []
    labels = []
    n = min([len(digit_indices[d]) for d in range(10)]) - 1
    for d in range(10):
        for i in range(n):
            z1, z2 = digit_indices[d][i], digit_indices[d][i + 1]
            pairs += [[x[z1], x[z2]]]
            inc = random.randrange(1, 10)
            dn = (d + inc) % 10
            z1, z2 = digit_indices[d][i], digit_indices[dn][i]
            pairs += [[x[z1], x[z2]]]
            labels += [1, 0]
    return np.array(pairs), np.array(labels)


def create_base_network(input_dim):
    '''Base network to be shared (eq. to feature extraction).
    '''
    seq = Sequential()
    seq.add(Dense(512, input_shape=(input_dim,), activation='relu'))
    seq.add(Dense(512, activation='relu'))
    seq.add(Dense(512, activation='relu'))
    seq.add(Dense(512, activation='relu'))
    seq.add(Dense(512, activation='relu'))
    seq.add(Dense(512, activation='relu'))
    return seq


def compute_accuracy(predictions, labels):
    return np.mean(np.equal(predictions.ravel() < 0.5, labels))

Using TensorFlow backend.


In [82]:
input_dim = tr_pairs.shape[-1]

# network definition
base_network = create_base_network(input_dim)

input_a = Input(shape=(input_dim,))
input_b = Input(shape=(input_dim,))

# because we re-use the same instance `base_network`,
# the weights of the network
# will be shared across the two branches
processed_a = base_network(input_a)
processed_b = base_network(input_b)

distance = Lambda(euclidean_distance, output_shape=eucl_dist_output_shape)([processed_a, processed_b])

model = Model(inputs=[input_a, input_b], outputs=distance)

In [83]:
# train
"""
tr_pairs1 = tr_pairs[:250]
te_pairs1 = tr_pairs[250:]
tr_y1 = tr_y[:250]
te_y1 = tr_y[250:]

tr_pairs = tr_pairs1
te_pairs = te_pairs1
tr_y = tr_y1
te_y = te_y1
"""
# tr_pairs = np.vstack([tr_pairs, te_pairs])
# print(tr_pairs.shape)

rms = RMSprop()
model.compile(loss=contrastive_loss, optimizer=rms)
model.fit([te_pairs[:, 0], te_pairs[:, 1]], te_y,
          validation_split=0.01,
          batch_size=10,
          epochs=10)



Train on 99 samples, validate on 1 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x18333ee10>

In [84]:
# compute final accuracy on training and test sets
pred = model.predict([tr_pairs[:, 0], tr_pairs[:, 1]])
tr_acc = compute_accuracy(pred, tr_y)
pred = model.predict([te_pairs[:, 0], te_pairs[:, 1]])
te_acc = compute_accuracy(pred, te_y)

print('* Accuracy on training set: %0.2f%%' % (100 * tr_acc))
print('* Accuracy on test set: %0.2f%%' % (100 * te_acc))

* Accuracy on training set: 66.80%
* Accuracy on test set: 99.00%


In [104]:
tr_pairs[:250].shape

(250, 2, 1200)

In [209]:
np.vstack([tr_pairs, te_pairs]).shape

(600, 2, 1200)

In [210]:
tr_pairs.shape

(100, 2, 1200)

In [216]:
te_pairs[:,0].shape

(500, 1200)

In [219]:
np.hstack([tr_y,te_y])

array([1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
       0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0,
       1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0,
       1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1,
       1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1,
       0, 1,