In [398]:
import os
import numpy as np
import textacy
from spacy.en import English

nlp = English()

In [399]:
class TextPair:
    def __init__(self, author, known, unknown, max_length=1200):
        self.author = author
        self.known = known
        self.unknown = unknown
        self.max_length = max_length

def get_string(filename):
    with open(filename, encoding="utf8") as f:
        s = f.read()
    return s

def get_texts(directory):
    authors = [x for x in os.listdir(directory) if x.startswith("EN")]
    tps = []
    for author in authors:
        known = os.path.join(directory, author, "known01.txt")
        unknown = os.path.join(directory, author, "unknown.txt")
        tps.append(TextPair(author, get_string(known), get_string(unknown)))
    return tps

def get_data(directory):
    
    # read all texts into known, unknown pairs
    tps = get_texts(directory)
    
    # get labels
    truthfile = os.path.join(directory, "truth.txt")
    with open(truthfile) as f:
        lines = f.read().strip().split("\n")
    y = [1 if line.split()[1] == "Y" else 0 for line in lines]
    y = np.array(y)
    return tps, y
    


    
    # create pairs


In [400]:
pan15train = "/data/pan15-authorship-verification-training-dataset-english-2015-04-19/"
pan15test = "/data/pan15-authorship-verification-test-dataset2-english-2015-04-19/"
pan14train = "/data/pan14-author-verification-training-corpus-english-essays-2014-04-22/"
pan14test = "/data/pan14-author-verification-test-corpus2-english-essays-2014-04-22/"
pan14train = "/data/pan14-author-verification-training-corpus-english-novels-2014-04-22/"
pan14test = "/data/pan14-author-verification-test-corpus2-english-novels-2014-04-22/"

In [401]:
X_train, y_train = get_data(pan15train)
X_test, y_test = get_data(pan15test)

In [402]:
with open("glove.840B.300d-char.txt") as f:
    nlp.vocab.load_vectors(f)
    
nlp_word = English()

In [403]:
def vectorize_distances(tps):
    for i in range(len(tps)):
        tps[i].known = nlp(tps[i].known)
        tps[i].unknown = nlp(tps[i].unknown)
    for i in range(len(tps)):
        w2v = textacy.similarity.word2vec(tps[i].known, tps[i].unknown)
        wm = textacy.similarity.word_movers(tps[i].known, tps[i].unknown)
        jc = textacy.similarity.jaccard(str(tps[i].known), str(tps[i].unknown))
        hm = textacy.similarity.hamming(str(tps[i].known), str(tps[i].unknown))
        jw = textacy.similarity.jaro_winkler(str(tps[i].known), str(tps[i].unknown))
        le = textacy.similarity.levenshtein(str(tps[i].known), str(tps[i].unknown))
        ts = textacy.similarity.token_sort_ratio(str(tps[i].known), str(tps[i].unknown))
        tps[i].distances = [w2v, wm, jc, hm, jw, le, ts]
    return tps


In [282]:
def vectorize_text(str_text):
    return np.hstack([nlp(str_text, entity=False, tag=False, parse=False).vector, 
                      nlp_word(str_text, entity=False, tag=False, parse=False).vector])

X_train_known = [vectorize_text(x.known) for x in X_train]
X_train_unknown = [vectorize_text(x.unknown) for x in X_train]
X_test_known = [vectorize_text(x.known) for x in X_test]
X_test_unknown = [vectorize_text(x.unknown) for x in X_test]

In [462]:
import string
from collections import Counter

def _vectorize(str_text):
    alphabet = string.ascii_lowercase + "!?:;,.'- "
    processed = nlp(str_text, entity=False, tag=True, parse=True)
    stats = textacy.text_stats.TextStats(processed).basic_counts
    s_keys = ['n_long_words', 'n_monosyllable_words', 'n_polysyllable_words', 'n_sents', 
              'n_syllables', 'n_unique_words', 'n_words']
    tag_keys = ['ADJ', 'ADP', 'ADV', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SPACE', 'SYM', 'VERB', 'X']
    tag_keys_set = set(tag_keys)
    
    ratio_stats = [(key, stats[key] / len(str_text)) for key in s_keys]
    
    lower_text_ratios = Counter(''.join(filter(lambda x: x in alphabet, str_text.lower() + alphabet)))
    for key in lower_text_ratios:
        lower_text_ratios[key] /= len(str_text)
    
    lower_text_ratios = [(key, lower_text_ratios[key]) for key in sorted(list(lower_text_ratios.keys()))]

    new_tags = set([word.pos_ for word in processed if word.pos_ not in tag_keys_set])
    if len(new_tags) > 0:
        print(new_tags)
    tags = [word.pos_ for word in processed if word.pos_ in tag_keys_set] + tag_keys
    tag_counter_ratios = Counter(tags)
    for key in tag_counter_ratios:
        tag_counter_ratios[key] /= len(processed)
    
    tag_counter_ratios = [(key, tag_counter_ratios[key]) for key in sorted(list(tag_counter_ratios.keys()))]
    
    return ratio_stats + lower_text_ratios + tag_counter_ratios
        
def vectorize(str_text):
    vecs = _vectorize(str_text)
    return np.array([x[1] for x in vecs])
    
    

In [463]:
X_train_known = [vectorize(x.known) for x in X_train]
X_train_unknown = [vectorize(x.unknown) for x in X_train]
X_test_known = [vectorize(x.known) for x in X_test]
X_test_unknown = [vectorize(x.unknown) for x in X_test]

In [466]:
X_test_known[2].shape

(58,)

In [382]:
set([x[0] for x in test[0]]) - set([x[0] for x in test[1]])

set()

In [None]:
mean_known = [0] * 7
mean_unknown = [0] * 7
print(mean_known)
for i, p in enumerate(X_train):
    for j, dm in enumerate(p.distances):
        if y_train[i] == 1:
            mean_known[j] += dm
        else:
            mean_unknown[j] += dm
            
    
print(mean_known)
print(mean_unknown)
    

In [None]:
tr_pairs = [tp.distances for i, tp in enumerate(X_train)]
te_pairs = [tp.distances for i, tp in enumerate(X_test)]

In [109]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from statistics import mean
Xs = [X_train_known[i] - X_train_unknown[i] for i in range(len(X_train_known))]
print(mean(cross_val_score(DecisionTreeClassifier(), Xs, tr_y, cv=5)))
# DecisionTreeClassifier().fit(tr_pairs, y_train)
# preds = clf.predict(te_pairs)
# accuracy_score(y_test, preds)

0.48


In [None]:
train_known_X = [x.known.vector for x in X_train]
train_unknown_X = [x.unknown.vector for x in X_train]
test_known_X = [x.known.vector for x in X_test]
test_unknown_X = [x.unknown.vector for x in X_test]

In [578]:
def create_pairs(knownX, unknownX):
    pairs = []
    for i in range(len(knownX)):
        pairs += [[knownX[i], unknownX[i]]]
    pairs = np.array(pairs)
    print(pairs.shape)
    return pairs

In [580]:
tr_pairs = create_pairs(knownzs, unknownzs)
te_pairs = create_pairs(X_test_known, X_test_unknown)

(100, 2, 66)
(500, 2, 66)


In [63]:
tr_y = y_train
te_y = y_test

shuff_tr_y = tr_y[:]
shuff_te_y = te_y[:]
from random import shuffle
shuffle(shuff_tr_y)
shuffle(shuff_te_y)


In [162]:
truthfile = os.path.join(pan15train, "truth.txt")
with open(truthfile) as f:
    lines = f.read().strip().split("\n")
y = [1 if line.split()[1] == "Y" else 0 for line in lines]
tr_y = np.array(y)

In [589]:
te_y = y_test
tr_y = y_train

In [582]:
from __future__ import absolute_import
from __future__ import print_function
import numpy as np
np.random.seed(1337)  # for reproducibility

import random
from keras.datasets import mnist
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Input, Lambda
from keras.optimizers import RMSprop
from keras import backend as K


def euclidean_distance(vects):
    x, y = vects
    return K.sqrt(K.sum(K.square(x - y), axis=1, keepdims=True))


def eucl_dist_output_shape(shapes):
    shape1, shape2 = shapes
    return (shape1[0], 1)


def contrastive_loss(y_true, y_pred):
    '''Contrastive loss from Hadsell-et-al.'06
    http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf
    '''
    margin = 1
    return K.mean(y_true * K.square(y_pred) + (1 - y_true) * K.square(K.maximum(margin - y_pred, 0)))


def create_pairs(x, digit_indices):
    '''Positive and negative pair creation.
    Alternates between positive and negative pairs.
    '''
    pairs = []
    labels = []
    n = min([len(digit_indices[d]) for d in range(10)]) - 1
    for d in range(10):
        for i in range(n):
            z1, z2 = digit_indices[d][i], digit_indices[d][i + 1]
            pairs += [[x[z1], x[z2]]]
            inc = random.randrange(1, 10)
            dn = (d + inc) % 10
            z1, z2 = digit_indices[d][i], digit_indices[dn][i]
            pairs += [[x[z1], x[z2]]]
            labels += [1, 0]
    return np.array(pairs), np.array(labels)


def create_base_network(input_dim):
    '''Base network to be shared (eq. to feature extraction).
    '''
    seq = Sequential()
    seq.add(Dense(64, input_shape=(input_dim,), activation='relu'))
    seq.add(Dense(64))
    seq.add(Dense(64))
    seq.add(Dense(64))

    return seq

def compute_accuracy(predictions, labels):
    return np.mean(np.equal(predictions.ravel() < 0.5, labels))

In [590]:
tr_pairs, te_pairs = te_pairs, tr_pairs
tr_y, te_y = te_y, tr_y

In [598]:
input_dim = tr_pairs.shape[-1]

# network definition
base_network = create_base_network(input_dim)

input_a = Input(shape=(input_dim,))
input_b = Input(shape=(input_dim,))

# because we re-use the same instance `base_network`,
# the weights of the network
# will be shared across the two branches
processed_a = base_network(input_a)
processed_b = base_network(input_b)

distance = Lambda(euclidean_distance, output_shape=eucl_dist_output_shape)([processed_a, processed_b])

model = Model(inputs=[input_a, input_b], outputs=distance)

In [599]:
rms = RMSprop()
model.compile(loss=contrastive_loss, optimizer=rms)
model.fit([tr_pairs[:, 0], tr_pairs[:, 1]], tr_y,
          validation_split=0.7,
          batch_size=20,
          epochs=30)

Train on 150 samples, validate on 350 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x169819eb8>

In [600]:
# compute final accuracy on training and test sets
pred = model.predict([tr_pairs[:, 0], tr_pairs[:, 1]])
tr_acc = compute_accuracy(pred, tr_y)
pred = model.predict([te_pairs[:, 0], te_pairs[:, 1]])
te_acc = compute_accuracy(pred, te_y)

print('* Accuracy on training set: %0.2f%%' % (100 * tr_acc))
print('* Accuracy on test set: %0.2f%%' % (100 * te_acc))

* Accuracy on training set: 89.60%
* Accuracy on test set: 50.00%


In [227]:
tr_y

array([1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
       0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 1])

In [57]:
y_train

array([1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1,
       1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1,
       1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1,
       0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 0, 0, 0, 1, 1])

In [58]:
shuff_tr_y

array([1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1,
       1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1,
       1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1,
       0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 0, 0, 0, 1, 1])

In [59]:
shuffle(shuff_tr_y)

In [60]:
shuff_tr_y

array([1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1,
       0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1,
       0, 0, 1, 1, 1, 0, 1, 1])

In [61]:
y_train

array([1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1,
       0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1,
       0, 0, 1, 1, 1, 0, 1, 1])

In [499]:
# standard imports
import string
from collections import Counter

# third-party imports
import textacy
from spacy.en import English
from statistics import mean, stdev

def _normalize_counter(counter, c):
    """Divide all the values in a Counter by a constant and remove padding"""
    for key in counter:
        counter[key] = (counter[key] - 1) / c
    return counter

class TextAnalyser:
    def __init__(self, nlp=None):
        if nlp:
            self.nlp = nlp
        else:
            self.nlp = English()
            
        # alphabet for letter ratios
        self.alphabet = string.ascii_lowercase + "!?:;,.'- "
        
        # keys that we care about from textacy.stats
        self.basic_keys = ['n_long_words', 'n_monosyllable_words', 'n_polysyllable_words', 'n_sents', 'n_syllables', 'n_unique_words', 'n_words']
        
        # keys that we care about for textacy readability stats
        self.readability_keys = ['automated_readability_index','coleman_liau_index', 'flesch_kincaid_grade_level',
                                 'flesch_readability_ease', 'gulpease_index', 'gunning_fog_index', 'lix',
                                 'wiener_sachtextformel']
        
        # parts of speech that we care about from spacy (pos_ not tag_)
        self.pos_keys = ['ADJ', 'ADP', 'ADV', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SPACE', 'SYM', 'VERB', 'X']
        self.pos_keys_set = set(self.pos_keys)

    def get_named_features(self, text):
        # TODO: Add bigrams, trigrams?
        processed = self.nlp(text, entity=False, tag=True, parse=True)
        stats = textacy.text_stats.TextStats(processed)
        basic_stats = stats.basic_counts
        readability_stats = stats.readability_stats
        cleaned_text = ''.join(filter(lambda x: x in self.alphabet, text.lower() + self.alphabet))
        
        stats_ratios = {key: (basic_stats[key] / len(text)) for key in self.basic_keys}
        readability_ratios = {key: (readability_stats[key] / len(text)) for key in self.readability_keys}
        stats_ratios.update(readability_ratios)

        # get only the characters we care about 
        # append alphabet so that each character artificially appears once
        char_ratios = Counter(cleaned_text)
        char_ratios = _normalize_counter(char_ratios, len(text))

        # calculate pos ratios
        tags = [word.pos_ for word in processed if word.pos_ in self.pos_keys_set] + self.pos_keys
        pos_ratios = Counter(tags)
        pos_ratios = _normalize_counter(pos_ratios, len(processed)) # normalize by word length

        res = stats_ratios
        res.update(char_ratios)
        res.update(pos_ratios)
        return [(key, res[key]) for key in sorted(res)]
    
    def calculate_mean_and_std(self, extracted_texts):
        """finds unusual patterns by calculating mean and std deviation for a list of 
           extracted features and sorting by z-score"""
        means = []
        stds = []
        sample = extracted_texts[0]  # get one text for feature size and names
        num_features = len(sample)
        # fi = feature index
        for fi in range(num_features):
            u = mean([stat[fi][1] for stat in extracted_texts])
            o = stdev([stat[fi][1] for stat in extracted_texts])
            means.append((sample[fi][0], u))
            stds.append((sample[fi][0], o))
        return means, stds
    
    def calculate_z_scores(self, extracted_text, means, stds):
        """Calculate the zscores for each features of a single text (extractions)"""
        # z = (X - μ) / σ
        zscores = []
        num_features = len(extracted_text)
        for fi in range(num_features):
            feature_name = extracted_text[fi][0]
            try:
                zscore = (extracted_text[fi][1] - means[fi][1]) / stds[fi][1]
            except ZeroDivisionError:
                zscore = 0
            zscores.append((zscore, feature_name))
        return zscores
        
def vectorize(str_text):
    vecs = _vectorize(str_text)
    return np.array([x[1] for x in vecs])
    
    

In [619]:
def format_unusual_features(zscores, nfeatures=5):
    zscores = sorted(zscores)
    s = ""
    for j in range(nfeatures):
        fname = zscores[j][1]
        padding = " " * (30 - len(fname))
        s += "{} is low {}(zscore: {})\n".format(fname, padding, zscores[j][0])
    s += "    . . .     \n"
    for j in range(nfeatures):
        fname = zscores[-(j + 1)][1]
        padding = " " * (30 - len(fname))
        s += "{} is high{}(zscore: {})\n".format(fname, padding, zscores[-(j + 1)][0])
    return s

def supports_and_opposes(zscores1, zscores2, sup_threshold=1.5):
    supports = []
    opposes = []
    for i in range(len(zscores1)):
        # both are high or low
        if (zscores1[i][0] > sup_threshold and zscores2[i][0] > sup_threshold) or (
            zscores1[i][0] < -sup_threshold and zscores2[i][0] < -sup_threshold):
            supports.append((zscores1[i], zscores2[i]))
        # one is high and the other is low
        if (zscores1[i][0] > sup_threshold and zscores2[i][0] < -sup_threshold) or (
            zscores1[i][0] < -sup_threshold and zscores2[i][0] > sup_threshold):
            opposes.append((zscores1[i], zscores2[i]))
    return supports, opposes
         
    
    

In [635]:
def t(knowns, unknowns):

In [637]:
preds = [1 if p else 0 for p in preds]

In [639]:
accuracy_score(y_train, preds)

0.68999999999999995

In [665]:
def get(tps):
    knownstats = [ta.get_named_features(text) for text in [x.known for x in tps]]
    unknownstats = [ta.get_named_features(text) for text in [x.unknown for x in tps]]
    means, stds = ta.calculate_mean_and_std(knownstats + unknownstats)
    knownzs = [ta.calculate_z_scores(ks, means, stds) for ks in knownstats]
    unknownzs = [ta.calculate_z_scores(ks, means, stds) for ks in unknownstats]
    
    preds = []
    for i in range(len(tps)):
        z1 = knownzs[i]
        z2 = unknownzs[i]
        s, o = supports_and_opposes(z1, z2, sup_threshold=0.2)
        preds.append(len(s) > len(o))
    return preds

preds_train = get(X_train)
preds_test = get(X_test)



In [666]:
print(Counter(preds_train))
print(Counter(preds_test))


Counter({True: 68, False: 32})
Counter({True: 288, False: 212})


In [671]:
print(classification_report(y_test, preds_test))

             precision    recall  f1-score   support

          0       0.73      0.62      0.67       250
          1       0.67      0.77      0.71       250

avg / total       0.70      0.69      0.69       500



In [670]:
from sklearn.metrics import classification_report
print(classification_report(y_train, preds_train))

             precision    recall  f1-score   support

          0       0.91      0.58      0.71        50
          1       0.69      0.94      0.80        50

avg / total       0.80      0.76      0.75       100

