In [1]:
from __future__ import print_function
import os
import glob
import json
import argparse
import time
import codecs
from collections import defaultdict
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import preprocessing
from sklearn.calibration import CalibratedClassifierCV
from sklearn.naive_bayes import MultinomialNB
import nltk

def represent_text(text,n):
    # Extracts all character 'n'-grams from  a 'text'
    if n>0:
        tokens = [text[i:i+n] for i in range(len(text)-n+1)]
    frequency = defaultdict(int)
    for token in tokens:
        frequency[token] += 1
    return frequency

def read_files(path,label):
    # Reads all text files located in the 'path' and assigns them to 'label' class
    files = glob.glob(path+os.sep+label+os.sep+'*.txt')
    texts=[]
    for i,v in enumerate(files):
        f=codecs.open(v,'r',encoding='utf-8')
        texts.append((f.read(),label))
        f.close()
    return texts

def extract_vocabulary(texts,n,ft):
    # Extracts all characer 'n'-grams occurring at least 'ft' times in a set of 'texts'
    occurrences=defaultdict(int)
    
    #for i in grams:
        
    for (text,label) in texts:
        text_occurrences=represent_text(text,n)
        for ngram in text_occurrences:
            if ngram in occurrences:
                occurrences[ngram]+=text_occurrences[ngram]
            else:
                occurrences[ngram]=text_occurrences[ngram]

    vocabulary=[]
    for i in occurrences.keys():
        if occurrences[i]>=ft:
            vocabulary.append(i)
    return vocabulary

# Lexical Features extraction

def LexicalFeatures(docs):
    fvs_lexical = np.zeros((len(docs), 3), np.float64)
    fvs_punct = np.zeros((len(docs), 6), np.float64)
    word_tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    for e, ch_text in enumerate(docs):
#             print(e,ch_text)
        # note: the nltk.word_tokenize includes punctuation
        tokens = nltk.word_tokenize(ch_text[0].lower())
        words = word_tokenizer.tokenize(ch_text[0].lower())
#             print(len(words))
        sentences = sentence_tokenizer.tokenize(ch_text[0])
        vocab = set(words)
        words_per_sentence = np.array([len(word_tokenizer.tokenize(s))
                                       for s in sentences])

        # average number of words per sentence
        fvs_lexical[e, 0] = words_per_sentence.mean()
        # sentence length variation
        fvs_lexical[e, 1] = words_per_sentence.std()
        # Lexical diversity
        fvs_lexical[e, 2] = len(vocab) / float(len(words))

       # Commas per sentence
        fvs_punct[e, 0] = tokens.count(',') / float(len(sentences))
        # Punctuations per sentece
        fvs_punct[e, 1] = tokens.count('.') / float(len(sentences))
        # Semicolons per sentence
        fvs_punct[e, 2] = tokens.count(';') / float(len(sentences))
        # Colons per sentence
        fvs_punct[e, 3] = tokens.count(':') / float(len(sentences))
        # Colons per sentence
        fvs_punct[e, 4] = tokens.count('!') / float(len(sentences))
        # Colons per sentence
        fvs_punct[e, 5] = tokens.count('-') / float(len(sentences))
        
        #print(fvs_punct[e, 0],fvs_punct[e, 1],fvs_punct[e, 2],fvs_punct[e, 3],fvs_punct[e, 4],fvs_punct[e, 5])
    max_abs_scaler = preprocessing.Normalizer()
    fvs_lexical = max_abs_scaler.fit_transform(fvs_lexical)
    fvs_punct = max_abs_scaler.fit_transform(fvs_punct)
    return fvs_lexical,fvs_punct

# Pipeline word and ngrams

In [2]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from scipy.sparse import hstack

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion

start_time = time.time()

path = "pan18-cross-domain-authorship-attribution-training-dataset-2017-12-02"
outpath = "outpath"
    
# Reading information about the collection
infocollection = path+os.sep+'collection-info.json'
problems = []
language = []

with open(infocollection, 'r') as f:
    for attrib in json.load(f):
        problems.append(attrib['problem-name'])
        language.append(attrib['language'])

for index,problem in enumerate(problems):
    
    print(problem)

    # Reading information about the problem
    infoproblem = path+os.sep+problem+os.sep+'problem-info.json'
    candidates = []
    with open(infoproblem, 'r') as f:
        fj = json.load(f)
        unk_folder = fj['unknown-folder']
        for attrib in fj['candidate-authors']:
            candidates.append(attrib['author-name'])

    # Building training set
    train_docs=[]
    for candidate in candidates:
        train_docs.extend(read_files(path+os.sep+problem,candidate))

    train_texts = [text for i,(text,label) in enumerate(train_docs)]
    train_labels = [label for i,(text,label) in enumerate(train_docs)]

    # Building test set
    test_docs=read_files(path+os.sep+problem,unk_folder)
    test_texts = [text for i,(text,label) in enumerate(test_docs)]
        
    pipeline = Pipeline(steps=[
    ('union', FeatureUnion([
        #('word_vec', TfidfVectorizer(sublinear_tf=True,analyzer='word',token_pattern=r'\w{1,}',ngram_range=(1, 3),max_features=10000)),
        ('char_vec', TfidfVectorizer(sublinear_tf=True,
        analyzer='char',
        ngram_range=(5, 5),
        max_features=50000)),
        #('pos', PosTagMatrix(tokenizer=nltk.word_tokenize)),
        ])),
    
    #('to_dense', DenseTransformer()),

    #('lreg', CalibratedClassifierCV(OneVsRestClassifier(LogisticRegression(solver='lbfgs')),cv=5))
    #'lreg', LogisticRegression())
    ('SVC', CalibratedClassifierCV(OneVsRestClassifier(SVC(C=1,gamma='auto')),cv=5))
    #('Naive', CalibratedClassifierCV(OneVsRestClassifier(MultinomialNB()),cv=5))
    #('Naive', MultinomialNB())
    ])
    
    print("Fitting...")
    #X_train
    pipeline.fit(train_texts, train_labels)
    print("Prediciting...")
    predictions=pipeline.predict(test_texts)
    proba=pipeline.predict_proba(test_texts)
        
    # Saving output data
    out_data=[]
    unk_filelist = glob.glob(path+os.sep+problem+os.sep+unk_folder+os.sep+'*.txt')
    pathlen=len(path+os.sep+problem+os.sep+unk_folder+os.sep)
    
    for i,v in enumerate(predictions):
        out_data.append({'unknown-text': unk_filelist[i][pathlen:], 'predicted-author': v})
    with open(outpath+os.sep+'answers-'+problem+'.json', 'w') as f:
        json.dump(out_data, f, indent=4)
    print('\t', 'answers saved to file','answers-'+problem+'.json')
print('elapsed time:', time.time() - start_time)

problem00001
Fitting...
Prediciting...
	 answers saved to file answers-problem00001.json
problem00002
Fitting...
Prediciting...
	 answers saved to file answers-problem00002.json
elapsed time: 14.32524299621582


In [5]:
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/mac/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /Users/mac/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# char and word grams

In [6]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from scipy.sparse import hstack, vstack

start_time = time.time()

path = "pan18-cross-domain-authorship-attribution-training-dataset-2017-12-02"
outpath = "outpath"

pt=0.1
    
# Reading information about the collection
infocollection = path+os.sep+'collection-info.json'
problems = []
language = []

with open(infocollection, 'r') as f:
    for attrib in json.load(f):
        problems.append(attrib['problem-name'])
        language.append(attrib['language'])


for index,problem in enumerate(problems):
    
    print(problem)

    # Reading information about the problem
    infoproblem = path+os.sep+problem+os.sep+'problem-info.json'
    candidates = []
    with open(infoproblem, 'r') as f:
        fj = json.load(f)
        unk_folder = fj['unknown-folder']
        for attrib in fj['candidate-authors']:
            candidates.append(attrib['author-name'])

    # Building training set
    train_docs=[]
    for candidate in candidates:
        train_docs.extend(read_files(path+os.sep+problem,candidate))

    train_texts = [text for i,(text,label) in enumerate(train_docs)]
    train_labels = [label for i,(text,label) in enumerate(train_docs)]

    # Building test set
    test_docs=read_files(path+os.sep+problem,unk_folder)
    test_texts = [text for i,(text,label) in enumerate(test_docs)]


    word_vectorizer = TfidfVectorizer(
        sublinear_tf=True,
        analyzer='word',
        token_pattern=r'\w{1,}',
        ngram_range=(1, 1),
        lowercase=False,
        max_features=10000)

    char_vectorizer = TfidfVectorizer(
        sublinear_tf=True,
        analyzer='char',
        ngram_range=(5, 7),
        lowercase=False,
        max_features=50000)

    train_word_features = word_vectorizer.fit_transform(train_texts)
    
    test_word_features = word_vectorizer.transform(test_texts)

    train_char_features=char_vectorizer.fit_transform(train_texts)
    
    test_char_features = char_vectorizer.transform(test_texts)
    
    #test_char_features = test_char_features.astype(float)
            
    #train_char_features=train_char_features.astype(float)
  
            
    train_fvs_lexical,train_fvs_punct=LexicalFeatures(train_docs)
    test_fvs_lexical,test_fvs_punct=LexicalFeatures(test_docs)
    
    token_data=[]
    for i in train_texts:
        token_data.append(nltk.word_tokenize(i))

    tagged_data=[]
    for j in token_data:
        tagged_data.append(nltk.pos_tag(j))

    # make data with only tags 
    tags_of_data=[]
    for one_tag_data in tagged_data:
        tags_of_one=[]
        for i in one_tag_data:
            tags_of_one.append(i[1])

        tags_of_data.append(tags_of_one)
    all_pos=[]
    #make tags vocabulary
    for sublist in tags_of_data:
        for item in sublist:
            all_pos.append(item)

    
    all_pos=list(set(all_pos))

    tags_of_data=[str(i) for i in tags_of_data]

    vectorizer = CountVectorizer(ngram_range=(1,5),vocabulary=all_pos,lowercase=False)
    train_data_pos=vectorizer.fit_transform(tags_of_data)
    
    token_test=[]
    for i in test_texts:
        token_test.append(nltk.word_tokenize(i))

    tagged_test=[]
    for j in token_test:
        tagged_test.append(nltk.pos_tag(j))
    # make data containing only tags 
    tags_of_test=[]
    for one_tag_test in tagged_test:
        tags_of_one=[]
        for i in one_tag_test:
            tags_of_one.append(i[1])
        tags_of_one=" ".join(tags_of_one)
        tags_of_test.append(tags_of_one)

    tags_of_test=[str(i) for i in tags_of_test]
    test_data_pos=vectorizer.fit_transform(tags_of_test)
    
    max_abs_scaler = preprocessing.MaxAbsScaler()
    train_char_features = max_abs_scaler.fit_transform(train_char_features)
    test_char_features = max_abs_scaler.transform(test_char_features)
    
    max_abs_scaler_word = preprocessing.MaxAbsScaler()
    train_word_features = max_abs_scaler_word.fit_transform(train_word_features)
    test_word_features = max_abs_scaler_word.transform(test_word_features)
    
    max_abs_scaler_pos = preprocessing.MaxAbsScaler()
    scaled_train_data_pos = max_abs_scaler_pos.fit_transform(train_data_pos)
    scaled_test_data_pos = max_abs_scaler_pos.transform(test_data_pos)
     
    
    # Stack here with lexical and/or POS features:
    train_features = hstack([train_char_features, train_word_features])
    #train_features = hstack([train_features, train_fvs_lexical])
    #train_features = hstack([train_features, train_fvs_punct])
    #train_features = hstack([train_features, scaled_train_data_pos])
    
    #train_features = hstack([scaled_train_data_pos,train_fvs_lexical])
    #train_features = hstack([train_features, train_fvs_punct])


    #print(train_features[0])

    test_features = hstack([test_char_features, test_word_features])
    #test_features = hstack([test_features, test_fvs_lexical])
    #test_features = hstack([test_features, test_fvs_punct])
    #test_features = hstack([test_features, scaled_test_data_pos])
    
    #test_features = hstack([scaled_test_data_pos, test_fvs_lexical]) 
    #test_features = hstack([test_features, test_fvs_punct])

    #print(test_features.shape)
    #stop
    

    clf=CalibratedClassifierCV(OneVsRestClassifier(SVC(C=1,gamma='auto')),cv=5)
    #clf=CalibratedClassifierCV(OneVsRestClassifier(LogisticRegression(solver='lbfgs')),cv=5)
    #clf=CalibratedClassifierCV(OneVsRestClassifier(MultinomialNB()),cv=5)
    
    print("Fitting...")
    clf.fit(train_features, train_labels)
    print("Predicting...")
    
    predictions=clf.predict(test_features)
    proba=clf.predict_proba(test_features)


    # Saving output data 
    out_data=[]
    unk_filelist = glob.glob(path+os.sep+problem+os.sep+unk_folder+os.sep+'*.txt')
    pathlen=len(path+os.sep+problem+os.sep+unk_folder+os.sep)
    
    for i,v in enumerate(predictions):
        out_data.append({'unknown-text': unk_filelist[i][pathlen:], 'predicted-author': v})
    with open(outpath+os.sep+'answers-'+problem+'.json', 'w') as f:
        json.dump(out_data, f, indent=4)
    print('\t', 'answers saved to file','answers-'+problem+'.json')
print('elapsed time:', time.time() - start_time)
    

problem00001
Fitting...
Predicting...
	 answers saved to file answers-problem00001.json
problem00002
Fitting...
Predicting...
	 answers saved to file answers-problem00002.json
elapsed time: 158.7294521331787


## Evaluation

In [7]:
import argparse
import os
import json
import warnings
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.preprocessing import LabelEncoder


def eval_measures(gt, pred):
    """Compute macro-averaged F1-scores, macro-averaged precision, 
    macro-averaged recall, and micro-averaged accuracy according the ad hoc
    rules discussed at the top of this file.
    Parameters
    ----------
    gt : dict
        Ground truth, where keys indicate text file names
        (e.g. `unknown00002.txt`), and values represent
        author labels (e.g. `candidate00003`)
    pred : dict
        Predicted attribution, where keys indicate text file names
        (e.g. `unknown00002.txt`), and values represent
        author labels (e.g. `candidate00003`)
    Returns
    -------
    f1 : float
        Macro-averaged F1-score
    precision : float
        Macro-averaged precision
    recall : float
        Macro-averaged recall
    accuracy : float
        Micro-averaged F1-score
    """

    actual_authors = list(gt.values())
    encoder = LabelEncoder().fit(['<UNK>'] + actual_authors)

    text_ids, gold_authors, silver_authors = [], [], []
    for text_id in sorted(gt):
        text_ids.append(text_id)
        gold_authors.append(gt[text_id])
        try:
            silver_authors.append(pred[text_id])
        except KeyError:
            # missing attributions get <UNK>:
            silver_authors.append('<UNK>')

    assert len(text_ids) == len(gold_authors)
    assert len(text_ids) == len(silver_authors)

    # replace non-existent silver authors with '<UNK>':
    silver_authors = [a if a in encoder.classes_ else '<UNK>' 
                      for a in silver_authors]

    gold_author_ints = encoder.transform(gold_authors)
    silver_author_ints = encoder.transform(silver_authors)
    # get F1 for individual classes (and suppress warnings):
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        labels=list(set(gold_author_ints))
        # Exclude the <UNK> class
        for x in labels:
            #print(encoder.inverse_transform([2]))
            if encoder.inverse_transform([x])=='<UNK>':
                #print(encoder.inverse_transform([x]))
                labels.remove(x)
        f1 = f1_score(gold_author_ints,
                  silver_author_ints,
                  labels,
                  average='macro')
        precision = precision_score(gold_author_ints,
                  silver_author_ints,
                  labels,
                  average='macro')
        recall = recall_score(gold_author_ints,
                  silver_author_ints,
                  labels,
                  average='macro')
        accuracy = accuracy_score(gold_author_ints,
                  silver_author_ints)

    return f1,precision,recall,accuracy

def evaluate(ground_truth_file,predictions_file):
    # Calculates evaluation measures for a single attribution problem
    gt = {}
    with open(ground_truth_file, 'r') as f:
        for attrib in json.load(f)['ground_truth']:
            gt[attrib['unknown-text']] = attrib['true-author']
    pred = {}
    with open(predictions_file, 'r') as f:
        for attrib in json.load(f):
            if attrib['unknown-text'] not in pred:
                pred[attrib['unknown-text']] = attrib['predicted-author']
    f1,precision,recall,acc =  eval_measures(gt,pred)
    return round(f1,3), round(precision,3), round(recall,3), round(acc,3)

def evaluate_all(path_collection,path_answers,path_out):
    # Calculates evaluation measures for a PAN-18 collection of attribution problems
    infocollection = path_collection+os.sep+'collection-info.json'
    problems = []
    data = []
    with open(infocollection, 'r') as f:
        for attrib in json.load(f):
            problems.append(attrib['problem-name'])
    scores=[];
    accscore=[];
    recscore=[];
    precscore=[];
    for problem in problems:
        f1,precision,recall,acc=evaluate(path_collection+os.sep+problem+os.sep+'ground-truth.json',
                                         path_answers+os.sep+'answers-'+problem+'.json')
        accscore.append(acc)
        scores.append(f1)
        recscore.append(recall)
        precscore.append(precision)
        data.append({'problem-name': problem, 'macro-f1': round(f1,3), 'macro-precision': round(precision,3),
                     'macro-recall': round(recall,3),'macro-acc': round(acc,3)})
        print(str(problem),'Macro-F1:',round(f1,3),'Accuracy:',round(acc,3),
              'Precision:',round(precision,3),'Recall:',round(recall,3))
    overall_acc=sum(accscore)/len(accscore)
    overall_score=sum(scores)/len(scores)
    overall_prec=sum(precscore)/len(precscore)
    overall_rec=sum(recscore)/len(recscore)
    # Saving data to output files (out.json and evaluation.prototext)
    with open(path_out+os.sep+'out.json', 'w') as f:
        json.dump({'problems': data, 'overall_score': round(overall_score,3)}, f, indent=4, sort_keys=True)
    print('Overall score:', round(overall_score,3),'Overall accuracy:', round(overall_acc,3),
          'Overall precision:', round(overall_prec,3),'Overall recall:', round(overall_rec,3))
    #print('Overall score:', round(overall_score,3))
    prototext='measure {\n key: "mean macro-f1"\n value: "'+str(round(overall_score,3))+'"\n}\n'
    with open(path_out+os.sep+'evaluation.prototext', 'w') as f:
        f.write(prototext)
        
def main():
      
    collection = "pan18-cross-domain-authorship-attribution-training-dataset-2017-12-02"
    answers = "outpath"
    outeval = "outeval"

    evaluate_all(collection,answers,outeval)

if __name__ == '__main__':
    main()

problem00001 Macro-F1: 0.491 Accuracy: 0.629 Precision: 0.469 Recall: 0.676
problem00002 Macro-F1: 0.697 Accuracy: 0.81 Precision: 0.68 Recall: 0.75
Overall score: 0.594 Overall accuracy: 0.72 Overall precision: 0.574 Overall recall: 0.713


In [9]:
def baseline(path,outpath,n=3,ft=5,pt=0.1):
    start_time = time.time()
    
    # Reading information about the collection
    infocollection = path+os.sep+'collection-info.json'
    problems = []
    language = []
    with open(infocollection, 'r') as f:
        for attrib in json.load(f):
            problems.append(attrib['problem-name'])
            language.append(attrib['language'])
    for index,problem in enumerate(problems):
        print(problem)
        
        # Reading information about the problem
        infoproblem = path+os.sep+problem+os.sep+'problem-info.json'
        candidates = []
        with open(infoproblem, 'r') as f:
            fj = json.load(f)
            unk_folder = fj['unknown-folder']
            for attrib in fj['candidate-authors']:
                candidates.append(attrib['author-name'])
        
        # Building training set
        train_docs=[]
        for candidate in candidates:
            train_docs.extend(read_files(path+os.sep+problem,candidate))
            
        train_texts = [text for i,(text,label) in enumerate(train_docs)]
        train_labels = [label for i,(text,label) in enumerate(train_docs)]
        
        #grams = 1,2
        
        vocabulary = extract_vocabulary(train_docs,3,ft)
        #print(vocabulary)
        #vectorizer = CountVectorizer(analyzer='char',ngram_range=(grams),lowercase=False,vocabulary=vocabulary)
        vectorizer = TfidfVectorizer(analyzer='char',ngram_range=(3,3),lowercase=False,vocabulary=vocabulary)

        # Maybe do a TFIDF-vectorizer aswell :) 
        #print(train_texts)
        train_data = vectorizer.fit_transform(train_texts)
        #print(train_data)
        train_data = train_data.astype(float)
        #print(vectorizer.vocabulary_)

        for i,v in enumerate(train_texts):
            train_data[i]=train_data[i]/len(train_texts[i])
        print('\t', 'language: ', language[index])
        print('\t', len(candidates), 'candidate authors')
        print('\t', len(train_texts), 'known texts')
        print('\t', 'vocabulary size:', len(vocabulary))
        
        # Building test set
        test_docs=read_files(path+os.sep+problem,unk_folder)
        test_texts = [text for i,(text,label) in enumerate(test_docs)]
        test_data = vectorizer.transform(test_texts)
        test_data = test_data.astype(float)
        for i,v in enumerate(test_texts):
            test_data[i]=test_data[i]/len(test_texts[i])
        print('\t', len(test_texts), 'unknown texts')
        
        
        # Applying SVM
        max_abs_scaler = preprocessing.MaxAbsScaler()
        scaled_train_data = max_abs_scaler.fit_transform(train_data)
        scaled_test_data = max_abs_scaler.transform(test_data)
        clf=CalibratedClassifierCV(OneVsRestClassifier(SVC(C=1,gamma='auto')),cv=5) # change cv for faster runtime
        print(train_labels)
        clf.fit(scaled_train_data, train_labels)
        predictions=clf.predict(scaled_test_data)
        proba=clf.predict_proba(scaled_test_data)
        
        # Reject option (used in open-set cases)
        count=0
        for i,p in enumerate(predictions):
            sproba=sorted(proba[i],reverse=True)
            if sproba[0]-sproba[1]<pt:
                predictions[i]=u'<UNK>'
                count=count+1
        print('\t',count,'texts left unattributed')
        
        # Saving output data
        out_data=[]
        unk_filelist = glob.glob(path+os.sep+problem+os.sep+unk_folder+os.sep+'*.txt')
        pathlen=len(path+os.sep+problem+os.sep+unk_folder+os.sep)
        for i,v in enumerate(predictions):
            out_data.append({'unknown-text': unk_filelist[i][pathlen:], 'predicted-author': v})
        with open(outpath+os.sep+'answers-'+problem+'.json', 'w') as f:
            json.dump(out_data, f, indent=4)
        print('\t', 'answers saved to file','answers-'+problem+'.json')
    print('elapsed time:', time.time() - start_time)

def main():
    
    folder = "pan18-cross-domain-authorship-attribution-training-dataset-2017-12-02"
    outpath = "outpath"
    
    baseline(folder, outpath)

if __name__ == '__main__':
    main()

problem00001
	 language:  en
	 20 candidate authors
	 140 known texts
	 vocabulary size: 5640
	 105 unknown texts
['candidate00001', 'candidate00001', 'candidate00001', 'candidate00001', 'candidate00001', 'candidate00001', 'candidate00001', 'candidate00002', 'candidate00002', 'candidate00002', 'candidate00002', 'candidate00002', 'candidate00002', 'candidate00002', 'candidate00003', 'candidate00003', 'candidate00003', 'candidate00003', 'candidate00003', 'candidate00003', 'candidate00003', 'candidate00004', 'candidate00004', 'candidate00004', 'candidate00004', 'candidate00004', 'candidate00004', 'candidate00004', 'candidate00005', 'candidate00005', 'candidate00005', 'candidate00005', 'candidate00005', 'candidate00005', 'candidate00005', 'candidate00006', 'candidate00006', 'candidate00006', 'candidate00006', 'candidate00006', 'candidate00006', 'candidate00006', 'candidate00007', 'candidate00007', 'candidate00007', 'candidate00007', 'candidate00007', 'candidate00007', 'candidate00007', 'ca

In [111]:
from __future__ import print_function
import os
import glob
import json
import argparse
import time
import codecs
from collections import defaultdict
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import preprocessing
from sklearn.calibration import CalibratedClassifierCV

def represent_text(text,n):
    # Extracts all character 'n'-grams from  a 'text'
    if n>0:
        tokens = [text[i:i+n] for i in range(len(text)-n+1)]
    frequency = defaultdict(int)
    for token in tokens:
        frequency[token] += 1
    return frequency

def read_files(path,label):
    # Reads all text files located in the 'path' and assigns them to 'label' class
    files = glob.glob(path+os.sep+label+os.sep+'*.txt')
    texts=[]
    for i,v in enumerate(files):
        f=codecs.open(v,'r',encoding='utf-8')
        texts.append((f.read(),label))
        f.close()
    return texts

def extract_vocabulary(texts,n,ft):
    # Extracts all characer 'n'-grams occurring at least 'ft' times in a set of 'texts'
    occurrences=defaultdict(int)
    
    #for i in grams:
        
    for (text,label) in texts:
        text_occurrences=represent_text(text,n)
        for ngram in text_occurrences:
            if ngram in occurrences:
                occurrences[ngram]+=text_occurrences[ngram]
            else:
                occurrences[ngram]=text_occurrences[ngram]

    vocabulary=[]
    for i in occurrences.keys():
        if occurrences[i]>=ft:
            vocabulary.append(i)
    return vocabulary

def baseline(path,outpath,n=3,ft=5,pt=0.1):
    start_time = time.time()
    
    # Reading information about the collection
    infocollection = path+os.sep+'collection-info.json'
    problems = []
    language = []
    with open(infocollection, 'r') as f:
        for attrib in json.load(f):
            problems.append(attrib['problem-name'])
            language.append(attrib['language'])
    for index,problem in enumerate(problems):
        print(problem)
        
        # Reading information about the problem
        infoproblem = path+os.sep+problem+os.sep+'problem-info.json'
        candidates = []
        with open(infoproblem, 'r') as f:
            fj = json.load(f)
            unk_folder = fj['unknown-folder']
            for attrib in fj['candidate-authors']:
                candidates.append(attrib['author-name'])
        
        # Building training set
        train_docs=[]
        for candidate in candidates:
            train_docs.extend(read_files(path+os.sep+problem,candidate))
            
        train_texts = [text for i,(text,label) in enumerate(train_docs)]
        train_labels = [label for i,(text,label) in enumerate(train_docs)]
        
        #grams = 1,2
        
        vocabulary = extract_vocabulary(train_docs,3,ft)
        #print(vocabulary)
        #vectorizer = CountVectorizer(analyzer='char',ngram_range=(grams),lowercase=False,vocabulary=vocabulary)
        vectorizer = TfidfVectorizer(analyzer='char',ngram_range=(3,3),lowercase=False,vocabulary=vocabulary)

        # Maybe do a TFIDF-vectorizer aswell :) 
        #print(train_texts)
        train_data = vectorizer.fit_transform(train_texts)
        #print(train_data)
        train_data = train_data.astype(float)
        #print(vectorizer.vocabulary_)

        for i,v in enumerate(train_texts):
            train_data[i]=train_data[i]/len(train_texts[i])
        print('\t', 'language: ', language[index])
        print('\t', len(candidates), 'candidate authors')
        print('\t', len(train_texts), 'known texts')
        print('\t', 'vocabulary size:', len(vocabulary))
        
        # Building test set
        test_docs=read_files(path+os.sep+problem,unk_folder)
        test_texts = [text for i,(text,label) in enumerate(test_docs)]
        test_data = vectorizer.transform(test_texts)
        test_data = test_data.astype(float)
        for i,v in enumerate(test_texts):
            test_data[i]=test_data[i]/len(test_texts[i])
        print('\t', len(test_texts), 'unknown texts')
        
        
        # Applying SVM
        max_abs_scaler = preprocessing.MaxAbsScaler()
        scaled_train_data = max_abs_scaler.fit_transform(train_data)
        scaled_test_data = max_abs_scaler.transform(test_data)
        clf=CalibratedClassifierCV(OneVsRestClassifier(SVC(C=1,gamma='auto')),cv=5) # change cv for faster runtime
        clf.fit(scaled_train_data, train_labels)
        predictions=clf.predict(scaled_test_data)
        proba=clf.predict_proba(scaled_test_data)
        
        # Reject option (used in open-set cases)
        count=0
        for i,p in enumerate(predictions):
            sproba=sorted(proba[i],reverse=True)
            if sproba[0]-sproba[1]<pt:
                predictions[i]=u'<UNK>'
                count=count+1
        print('\t',count,'texts left unattributed')
        
        # Saving output data
        out_data=[]
        unk_filelist = glob.glob(path+os.sep+problem+os.sep+unk_folder+os.sep+'*.txt')
        pathlen=len(path+os.sep+problem+os.sep+unk_folder+os.sep)
        for i,v in enumerate(predictions):
            out_data.append({'unknown-text': unk_filelist[i][pathlen:], 'predicted-author': v})
        with open(outpath+os.sep+'answers-'+problem+'.json', 'w') as f:
            json.dump(out_data, f, indent=4)
        print('\t', 'answers saved to file','answers-'+problem+'.json')
    print('elapsed time:', time.time() - start_time)

def main():
    
    folder = "pan19-cross-domain-authorship-attribution-training-dataset-2019-01-23"
    outpath = "outpath"
    
    baseline(folder, outpath)

if __name__ == '__main__':
    main()

problem00001
	 language:  en
	 9 candidate authors
	 63 known texts
	 vocabulary size: 4152
	 561 unknown texts
	 195 texts left unattributed
	 answers saved to file answers-problem00001.json
problem00002
	 language:  en
	 9 candidate authors
	 63 known texts
	 vocabulary size: 4176
	 137 unknown texts
	 61 texts left unattributed
	 answers saved to file answers-problem00002.json
problem00003
	 language:  en
	 9 candidate authors
	 63 known texts
	 vocabulary size: 4049
	 211 unknown texts
	 130 texts left unattributed
	 answers saved to file answers-problem00003.json
problem00004
	 language:  en
	 9 candidate authors
	 63 known texts
	 vocabulary size: 4255
	 273 unknown texts
	 173 texts left unattributed
	 answers saved to file answers-problem00004.json
problem00005
	 language:  en
	 9 candidate authors
	 63 known texts
	 vocabulary size: 4070
	 264 unknown texts
	 117 texts left unattributed
	 answers saved to file answers-problem00005.json
problem00006
	 language:  fr
	 9 candidat