In [1]:
import os

# Directory that holds the corpus as plain-text files.
data_dir = "/data/Gutenberg/"

# Keep only the .txt entries whose file name contains a double underscore.
files = [
    name
    for name in os.listdir(data_dir)
    if "__" in name and name.endswith(".txt")
]

In [2]:
# Parallel lists: texts[k] is the body of a file, labels[k] its name without extension.
texts = []
labels = []

for filename in files:
    path = os.path.join(data_dir, filename)
    with open(path, encoding="ISO-8859-1") as handle:
        raw = handle.read()
    # Skip the first 1000 characters of every file
    # (presumably front-matter boilerplate -- TODO confirm).
    texts.append(raw[1000:])
    # Drop the last four characters, i.e. the ".txt" suffix that `files` was filtered on.
    labels.append(filename[:-4])

In [3]:
# (length, position) pairs of the 36 shortest texts; ties on length are broken by position.
shortest = sorted((len(body), pos) for pos, body in enumerate(texts))[:36]

In [4]:
# Positions (into `texts` / `labels`) of the shortest texts, as a set for fast membership tests.
shortest_is = {pos for _, pos in shortest}

In [5]:
# Remove the shortest texts and their labels, keeping both lists aligned.
# NOTE: this rebinds `texts` and `labels`, so running the cell a second time
# without recomputing `shortest_is` removes a different set of entries.
texts = [body for pos, body in enumerate(texts) if pos not in shortest_is]
labels = [tag for pos, tag in enumerate(labels) if pos not in shortest_is]

In [16]:
def get_chunks(l, n):
    """Lazily split a sliceable sequence into consecutive pieces of length ``n``.

    Parameters
    ----------
    l : sequence
        Anything that supports ``len`` and slicing (str, list, ...).
    n : int
        Desired piece length; values below 1 are treated as 1.

    Returns
    -------
    generator
        Yields ``l[0:n]``, ``l[n:2n]``, ... The final piece may be shorter
        than ``n``; an empty input yields nothing.
    """
    step = n if n > 1 else 1
    starts = range(0, len(l), step)
    return (l[start:start + step] for start in starts)

In [17]:
# Cut every text into 1000-character fragments and keep only the leading ones.
# text_fragments[k] and label_fragments[k] stay aligned: each fragment carries
# the label of the text it was cut from.
text_fragments = []
label_fragments = []

for i, text in enumerate(texts):
    num_added = 0
    for chunk in get_chunks(text, 1000):
        # `> 10` admits the counts 0..10, i.e. at most 11 fragments per text.
        if num_added > 10:
            # Stop here instead of lazily slicing the whole remainder of the
            # text only to discard every further chunk.
            break
        num_added += 1
        text_fragments.append(chunk)
        label_fragments.append(labels[i])
        

In [18]:
# Total number of fragments collected across all texts.
len(text_fragments)

33000

In [19]:
import numpy as np
from random import shuffle

# Seed the global `random` generator so that the permutation below (and
# therefore everything derived from it) is identical on every run.
import random

random.seed(42)

# A random permutation of fragment positions, used to shuffle fragments and
# labels together.
indices = list(range(len(text_fragments)))
shuffle(indices)

In [20]:
# Turn both parallel sequences into NumPy arrays so they can be indexed with a list of positions.
text_fragments, label_fragments = (
    np.array(seq) for seq in (text_fragments, label_fragments)
)

In [21]:
# Reorder fragments and labels with the same permutation so the pairs stay aligned.
# NOTE: re-running this cell applies the permutation a second time.
order = np.asarray(indices, dtype=np.intp)
text_fragments = text_fragments[order]
label_fragments = label_fragments[order]

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [36]:
%%time
# TF-IDF over character n-grams of length 1 to 5. Case is preserved
# (lowercase=False) and n-grams found in fewer than 5 fragments are dropped (min_df=5).
vectorizer = TfidfVectorizer(
    analyzer='char',
    ngram_range=(1, 5),
    min_df=5,
    lowercase=False,
)

# Learn the vocabulary from all fragments and vectorize them in one pass.
vectors = vectorizer.fit_transform(text_fragments)

CPU times: user 3min, sys: 7.85 s, total: 3min 8s
Wall time: 3min 14s


In [37]:
# Dimensions of the TF-IDF matrix returned by fit_transform.
vectors.shape

(33000, 453674)

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Multinomial Naive Bayes with a uniform class prior (fit_prior=False) and
# very light additive smoothing (alpha=0.001).
nb = MultinomialNB(fit_prior=False, alpha=0.001)
# svm = SVC(kernel='linear', probability=True)

# Hold out 10% of the fragments for evaluation. random_state pins the split so
# the reported accuracy is reproducible from one run to the next.
# NOTE(review): fragments of the same text can land on both sides of the split.
X_train, X_test, y_train, y_test = train_test_split(
    vectors, label_fragments, test_size=0.1, random_state=42
)

In [None]:
%%time
# Train the Naive Bayes classifier on the training split.
nb.fit(X_train, y_train)

In [None]:
%%time
from sklearn.metrics import accuracy_score
# Predict a label for every held-out fragment and report the fraction that is correct.
preds = nb.predict(X_test)
accuracy = accuracy_score(y_test, preds)

print(accuracy)

In [None]:
# Read one known/unknown document pair (problem EN001 of the PAN15 training set).
with open("/data/panstuffs/pan15-authorship-verification-training-dataset-english-2015-04-19/EN001/known01.txt") as known_file:
    k1 = known_file.read()

with open("/data/panstuffs/pan15-authorship-verification-training-dataset-english-2015-04-19/EN001/unknown.txt") as unknown_file:
    u1 = unknown_file.read()

# Vectorize each document on its own with the already fitted vectorizer
# (transform expects a list of documents, hence the one-element lists).
vk1, vu1 = (vectorizer.transform([doc]) for doc in (k1, u1))

In [None]:
# Class-probability distributions of the known and the unknown document.
pdfk, pdfu = (nb.predict_proba(vec) for vec in (vk1, vu1))

In [None]:
# Element-wise absolute difference between the two probability distributions.
abs_diff = np.abs(pdfk[0] - pdfu[0])
print(abs_diff.tolist())

In [None]:
# Peek at the first ten entries of `preds`.
preds[:10]

In [None]:
# Peek at the first ten entries of `y_test` for comparison.
y_test[:10]

In [None]:
def read_file(filepath, encoding=None):
    """Return the entire contents of a text file as one string.

    Parameters
    ----------
    filepath : str or path-like
        Location of the file to read.
    encoding : str, optional
        Text encoding handed to ``open``. The default ``None`` keeps the
        previous behaviour, i.e. the platform's default encoding.

    Returns
    -------
    str
        The decoded file contents.
    """
    with open(filepath, encoding=encoding) as f:
        return f.read()

def load_pan_data(directory, prefix="E"):
    """Load known texts, unknown texts and labels in the PAN data format.

    Every sub-entry of ``directory`` whose name starts with ``prefix`` is
    treated as one verification problem containing ``known01.txt`` and
    ``unknown.txt``. Labels are read from ``truth.txt`` in ``directory``,
    where the first field of a line is the problem id and the second field
    is ``Y`` for a positive problem.

    Parameters
    ----------
    directory : str
        Root directory of the dataset.
    prefix : str, optional
        Only problems whose directory name starts with this are loaded.

    Returns
    -------
    known_texts : list of str
    unknown_texts : list of str
    y : numpy.ndarray
        1 where the truth file says ``Y``, else 0. All three results follow
        the sorted order of the problem directory names.

    Raises
    ------
    ValueError
        If the truth file cannot be matched to the loaded problems.
    """
    # FIXME: assumes one known file per author, which is fine for English datasets only
    authors = sorted(x for x in os.listdir(directory) if x.startswith(prefix))
    known_texts = []
    unknown_texts = []
    for author in authors:
        kf = os.path.join(directory, author, "known01.txt")
        uf = os.path.join(directory, author, "unknown.txt")
        known_texts.append(read_file(kf))
        unknown_texts.append(read_file(uf))

    truthfile = os.path.join(directory, "truth.txt")
    with open(truthfile) as f:
        rows = [line.split() for line in f.read().splitlines()]
    # Ignore blank or incomplete lines instead of failing on them.
    rows = [fields for fields in rows if len(fields) >= 2]

    # Key labels by problem id; a byte-order mark in front of the first id is stripped.
    truth = {fields[0].lstrip("\ufeff"): fields[1] for fields in rows}

    if all(author in truth for author in authors):
        # Pair by name so that labels stay aligned even when the prefix filter
        # drops some problems or the truth file is ordered differently.
        answers = [truth[author] for author in authors]
    elif len(rows) == len(authors):
        # Ids do not match the directory names: fall back to pairing by position.
        answers = [fields[1] for fields in rows]
    else:
        raise ValueError(
            "cannot align %s (%d rows) with the %d problems found in %s"
            % (truthfile, len(rows), len(authors), directory)
        )

    y = np.array([1 if answer == "Y" else 0 for answer in answers], dtype=int)
    return known_texts, unknown_texts, y

In [None]:
# Paths to the PAN authorship-verification datasets, available from
# http://pan.webis.de/clef15/pan15-web/author-identification.html
# and http://pan.webis.de/clef14/pan14-web/author-identification.html
# NOTE: these are absolute paths under /data/panstuffs/; point them at your local copy.
# PAN 2015, English (training / test)
pan15train = "/data/panstuffs/pan15-authorship-verification-training-dataset-english-2015-04-19/"
pan15test = "/data/panstuffs/pan15-authorship-verification-test-dataset2-english-2015-04-19/"
# PAN 2014, English novels (training / test)
pan14train = "/data/panstuffs/pan14-author-verification-training-corpus-english-novels-2014-04-22/"
pan14test = "/data/panstuffs/pan14-author-verification-test-corpus2-english-novels-2014-04-22/"
# PAN 2014, English essays (training / test)
pan14train_e = "/data/panstuffs/pan14-author-verification-training-corpus-english-essays-2014-04-22/"
pan14test_e = "/data/panstuffs/pan14-author-verification-test-corpus2-english-essays-2014-04-22/"

In [None]:
# Known texts, unknown texts and 0/1 labels of the PAN15 training set.
p15tr_known, p15tr_unknown, p15tr_labels = load_pan_data(pan15train)

In [None]:
# Known texts, unknown texts and 0/1 labels of the PAN15 test set.
p15te_known, p15te_unknown, p15te_labels = load_pan_data(pan15test)

In [None]:
# Vectorize the PAN15 training documents with the already fitted vectorizer.
# NOTE(review): these two names are not referenced again in the visible cells.
p15tr_known_v, p15tr_unknown_v = (
    vectorizer.transform(docs) for docs in (p15tr_known, p15tr_unknown)
)

In [None]:
def get_prob_dist_diff(knowns, unknowns, vectorizer, classifier):
    """Absolute difference between the class-probability rows of paired documents.

    Parameters
    ----------
    knowns, unknowns : sequences of documents
        Row ``k`` of the result compares ``knowns[k]`` with ``unknowns[k]``.
    vectorizer : object with a ``transform`` method
        Turns a sequence of documents into feature vectors.
    classifier : object with a ``predict_proba`` method
        Turns feature vectors into per-class probabilities.

    Returns
    -------
    The element-wise ``|P(known) - P(unknown)|`` of the two probability arrays.
    """
    # Vectorize both sides first, then score both, in that order.
    feature_sets = [vectorizer.transform(docs) for docs in (knowns, unknowns)]
    known_probs, unknown_probs = [
        classifier.predict_proba(features) for features in feature_sets
    ]
    return np.abs(known_probs - unknown_probs)

# Per-problem probability-difference features for the PAN15 training set.
diffs = get_prob_dist_diff(p15tr_known, p15tr_unknown, vectorizer, nb)
    
    

In [None]:
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import cross_val_score


# Second-stage classifier: a linear SVM fitted on the probability-difference
# features, with the PAN15 training labels as targets.
pclf = LinearSVC()
pclf.fit(diffs, p15tr_labels)
# cross_val_score(pclf, diffs, p15tr_labels)

In [None]:
# The same probability-difference features, computed for the PAN15 test set.
te_diffs = get_prob_dist_diff(p15te_known, p15te_unknown, vectorizer, nb)

In [None]:
from sklearn.metrics import classification_report
# Predict on the test features and print a per-class summary.
# NOTE: this rebinds `preds`, which an earlier cell uses for the Naive Bayes predictions.
preds = pclf.predict(te_diffs)
report = classification_report(p15te_labels, preds)
print(report)

In [None]:
from collections import Counter
# Count how often each value occurs in `preds`.
Counter(preds)