# Feature extraction

In this notebook we will learn how to extract different features from a text and how to combine them. It's pretty simple, but if you have this part well organized, it will be really useful in the near future. So, let's get started!

In [2]:
import nltk
from sklearn.pipeline import FeatureUnion
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from sklearn import preprocessing
from scipy.sparse import coo_matrix, hstack
from copy import deepcopy
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.metrics import accuracy_score

In [3]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import os
import glob
import json
import argparse
import time
import codecs
from collections import defaultdict
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import preprocessing
from sklearn.calibration import CalibratedClassifierCV
from sklearn.naive_bayes import BernoulliNB, GaussianNB
import nltk
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer

In [4]:
import csv
import re
import random
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

In [5]:
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline

Using TensorFlow backend.


In [6]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [7]:
import os

In [8]:
def open_file(path):
    with open(path, 'r+', encoding='utf8') as f:
        return '\n'.join([line.strip() for line in f])
    
def process_dir_files(path):
    dir_files = []
    for file in os.listdir(path):
        current = os.path.join(path, file)
        if os.path.isfile(current):
            dir_files.append(open_file(current))
    return dir_files
                     

#train_sents= process_dir_files('pan18-cross-domain-authorship-attribution-training-dataset-2017-12-02/problem00001/candidate00001')



In [9]:
from nltk.tokenize import word_tokenize

def get_pos(text):
    pos_tags= nltk.pos_tag(word_tokenize(text))
    # print(len(pos_tags))
    pos_tags = [word_tag[1] for word_tag in pos_tags]
    pos_text = ' '.join(pos_tags)
    return pos_tags

#get_pos(train_sents[0])

In [10]:
def get_pos_ngrams(sents):
    pos_tags= [nltk.pos_tag(word_tokenize(sents[ind])) for ind, item in enumerate(sents) if item != '']
    pos_sents = []
    for sent in pos_tags:
        #print(sent)
        pos = ' '.join([pos_tag[1] for pos_tag in sent])
        #print(pos, '\n')
        pos_sents.append(pos)
    vectorizer = CountVectorizer(ngram_range = (1,1))
    vectorizer.fit(pos_sents)
    return vectorizer


#pos_vectorizer = get_pos_ngrams(train_sents)
#pos_ngram = pos_vectorizer.transform(train_sents)
#pos_ngram

In [11]:
# -*- coding: utf-8 -*-

"""
 A baseline authorship attribution method 
 based on a character n-gram representation
 and a linear SVM classifier.
 It has a reject option to leave documents unattributed
 (when the probabilities of the two most likely training classes are too close)
 
 Questions/comments: stamatatos@aegean.gr

 It can be applied to datasets of PAN-19 cross-domain authorship attribution task
 See details here: http://pan.webis.de/clef19/pan19-web/author-identification.html
 Dependencies:
 - Python 2.7 or 3.6 (we recommend the Anaconda Python distribution)
 - scikit-learn

 Usage from command line: 
    > python pan19-cdaa-baseline.py -i EVALUATION-DIRECTORY -o OUTPUT-DIRECTORY [-n N-GRAM-ORDER] [-ft FREQUENCY-THRESHOLD] [-pt PROBABILITY-THRESHOLD]
 EVALUATION-DIRECTORY (str) is the main folder of a PAN-19 collection of attribution problems
 OUTPUT-DIRECTORY (str) is an existing folder where the predictions are saved in the PAN-19 format
 Optional parameters of the model:
   N-GRAM-ORDER (int) is the length of character n-grams (default=3)
   FREQUENCY-THRESHOLD (int) is the cutoff threshold used to filter out rare n-grams (default=5)
   PROBABILITY-THRESHOLD (float) is the threshold for the reject option assigning test documents to the <UNK> class (default=0.1)
                                 Let P1 and P2 be the two maximum probabilities of training classes for a test document. If P1-P2<pt then the test document is assigned to the <UNK> class.
   
 Example:

     >  python pan19-cdaa-baseline-svm.py -i ".\pan19-cross-domain-authorship-attribution-training-dataset-2019-01-23\" -o ".\a
nswers-trigram\" -n 3
"""

from __future__ import print_function
import os
import glob
import json
import argparse
import time
import codecs
from collections import defaultdict
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import preprocessing
from sklearn.calibration import CalibratedClassifierCV

def represent_text(text,n,pos=False):
    # Extracts all character n-grams from  a 'text'
    # if pos is True, extracts POS n-grams
    if n>0:
        if pos is True:
            text = get_pos(text)
            tokens = [' '.join(text[i:i+n]) for i in range(len(text)-n+1)]
            #print(tokens)
        else:
            tokens = [text[i:i+n] for i in range(len(text)-n+1)]
    frequency = defaultdict(int)
    for token in tokens:
        frequency[token] += 1
    return frequency

#represent_text(train_sents[0], 2)
#represent_text(train_sents[0], 2, pos=True)
                           

In [12]:
def read_files(path,label):
    # Reads all text files located in the 'path' and assigns them to 'label' class
    files = glob.glob(path+os.sep+label+os.sep+'*.txt')
    texts=[]
    for i,v in enumerate(files):
        f=codecs.open(v,'r',encoding='utf-8')
        texts.append((f.read(),label))
        f.close()
    return texts

def extract_vocabulary(texts,n,ft,pos=False):
    # Extracts all characer 'n'-grams occurring at least 'ft' times in a set of 'texts'
    occurrences=defaultdict(int) 
    for (text,label) in texts:
        text_occurrences = {}
        if isinstance(n, int):
            for x in range(1,n+1):
                text_occurrences.update(represent_text(text,x,pos=pos))
        else:
            pass
        for ngram in text_occurrences:
            if ngram in occurrences:
                occurrences[ngram]+=text_occurrences[ngram]
            else:
                occurrences[ngram]=text_occurrences[ngram]
    vocabulary=[]
    for i in occurrences.keys():
        if occurrences[i]>=ft:
            vocabulary.append(i)
    return vocabulary

'''
vocab = extract_vocabulary([(x,i) for i, x in enumerate(train_sents)], 2, 5, pos=True)
print(len(vocab))
print(vocab)
vectorizer = CountVectorizer(vocabulary=[x.lower() for  x in vocab])
print([' '.join(get_pos(text)) for text in train_sents])

train_data = vectorizer.fit_transform([' '.join(get_pos(text)) for text in train_sents])
print(vectorizer.get_feature_names())
train_data = train_data.astype(float)
print(train_data.shape)
print(train_data.toarray())
'''

"\nvocab = extract_vocabulary([(x,i) for i, x in enumerate(train_sents)], 2, 5, pos=True)\nprint(len(vocab))\nprint(vocab)\nvectorizer = CountVectorizer(vocabulary=[x.lower() for  x in vocab])\nprint([' '.join(get_pos(text)) for text in train_sents])\n\ntrain_data = vectorizer.fit_transform([' '.join(get_pos(text)) for text in train_sents])\nprint(vectorizer.get_feature_names())\ntrain_data = train_data.astype(float)\nprint(train_data.shape)\nprint(train_data.toarray())\n"

In [13]:
# fix random seed for reproducibility
seed = 7
np.random.seed(seed)

In [30]:
class Feature_Extractor():
    '''
    Performs feature extraction on input docs.
    '''
    def __init__(self, n, ft):
        self.n = n
        self.ft = ft
        
    def fit_transform(self, docs):
        ## Char-level n-grams ##
        char_vocab = extract_vocabulary(docs,self.n,self.ft)
        self.char_vectorizer = CountVectorizer(analyzer='char', ngram_range=(self.n,self.n),
                                          lowercase=False, vocabulary=char_vocab
                                              )
        char_data, self.char_vectorizer = self._fit_transform(self.char_vectorizer, docs)
        print(char_data.shape)
        # print(char_data.toarray())

        ## POS n-grams ##
        pos_vocab = [x.lower() for x in extract_vocabulary(docs,2,self.ft,pos=True)]
        # print(pos_vocab)
        self.pos_vectorizer = CountVectorizer(ngram_range=(1,2), vocabulary=pos_vocab
                                             )
        print('\t', 'pos vocabulary size:', len(pos_vocab), 'char vocabulary size:', len(char_vocab))
        pos_data, self.pos_vectorizer = self._fit_transform(self.pos_vectorizer, docs, pos_replace=True)
        
        ## Word n-grams ##
        self.word_vectorizer = CountVectorizer(ngram_range=(2,3))
        word_data, self.word_vectorizer = self._fit_transform(self.word_vectorizer, docs)
        
        ## Lexical Diversity
        lex_div = self.lexical_diversity(docs)
        print(pos_data.toarray()[:,1])
        print('pos data: %s char data: %s word data: %s'%(pos_data.shape, char_data.shape, word_data.shape))
        feature_data = self.combine_features((lex_div, pos_data, char_data, word_data 
                                             )) 
        return feature_data
    
    def combine_features(self, feat_tuple):
        feature_data = hstack(feat_tuple)
        return feature_data
    
    def replace_words_POS(self, texts):
        return [' '.join(get_pos(text)) for text in texts]
    
    def lexical_diversity(self, docs):
        lex_div = np.array([len(set(text)) / len(text) for (text,label) in docs]).reshape(len(docs), 1)
        print('lexical diversity:', lex_div.shape)
        return lex_div
    
    def _fit_transform(self, vectorizer, docs, pos_replace=False):
        texts = [text for i,(text,label) in enumerate(docs)]
        if pos_replace is True:
            texts = self.replace_words_POS(texts) # replace words in text with POS
        vec_data = vectorizer.fit_transform(texts)
        vec_data = vec_data.astype(float)
        return vec_data, vectorizer
    
    def _transform(self, vectorizer, docs, pos_replace=False):
        texts = [text for i,(text,label) in enumerate(docs)]
        if pos_replace is True:
            texts = self.replace_words_POS(texts) # replace words in text with POS
        vec_data = vectorizer.transform(texts)
        vec_data = vec_data.astype(float)
        return vec_data
    
    def transform(self, docs):
        char_data = self._transform(self.char_vectorizer, docs)
        word_data = self._transform(self.word_vectorizer, docs)
        pos_data = self._transform(self.pos_vectorizer, docs, pos_replace=True)
        lex_div = self.lexical_diversity(docs)
        feature_data = self.combine_features((lex_div, pos_data, char_data, word_data
                                             )) 
        return feature_data
        
def write_results(path, problem, unk_folder, predictions):
    # Saving output data
    out_data=[]
    unk_filelist = glob.glob(path+os.sep+problem+os.sep+unk_folder+os.sep+'*.txt')
    pathlen=len(path+os.sep+problem+os.sep+unk_folder+os.sep)
    for i,v in enumerate(predictions):
        out_data.append({'unknown-text': unk_filelist[i][pathlen:], 'predicted-author': v})
    with open(outpath+os.sep+'answers-'+problem+'.json', 'w') as f:
        json.dump(out_data, f, indent=4)
    print('\t', 'answers saved to file','answers-'+problem+'.json')
    

def get_baseline_model_fn(num_in, num_out):
    # initialize baseline model fn with num features and num predicted categories
    def baseline_model():
        # create model
        model = Sequential()
        model.add(Dense(8, input_dim=num_in, activation='relu'))
        model.add(Dense(num_out, activation='softmax'))
        # Compile model
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        return model
    return baseline_model


def baseline_keras(path, outpath, n=3, ft=5, pt=0.1, feature_selection=False, 
             open_set=False, c=1, feat_sel_percent=None, clf=None, calibration=True):
    start_time = time.time()
    # Reading information about the collection
    infocollection = path+os.sep+'collection-info.json'
    problems = []
    language = []
    with open(infocollection, 'r') as f:
        for attrib in json.load(f):
            problems.append(attrib['problem-name'])
            language.append(attrib['language'])
    problem_scores = []
    for index,problem in enumerate(problems):
        print(problem)
        # Reading information about the problem
        infoproblem = path+os.sep+problem+os.sep+'problem-info.json'
        candidates = []
        with open(infoproblem, 'r') as f:
            fj = json.load(f)
            unk_folder = fj['unknown-folder']
            for attrib in fj['candidate-authors']:
                candidates.append(attrib['author-name'])
        # Building training set
        docs=[]
        for candidate in candidates:
            docs.extend(read_files(path+os.sep+problem,candidate))
        train_labels = np.array([label for i,(text,label) in enumerate(docs)])
        print('\t', 'language: ', language[index])
        print('\t', len(candidates), 'candidate authors')
        print('\t', len(docs), 'known texts')
        
        ###### Applying Classifiers #####
        if calibration is True:
            clf=CalibratedClassifierCV(OneVsRestClassifier(SVC(C=c)))
        else:
            clf=OneVsRestClassifier(SVC(C=c))
        skf = StratifiedKFold(n_splits=5,random_state=442)
        scores = []
        print("Do cross validation.")
        for train_index, test_index in skf.split(docs, train_labels):
            X_train_docs = [docs[i] for i in train_index]
            X_test_docs = [docs[i] for i in test_index]
            y_train, y_test = train_labels[train_index], train_labels[test_index]
            ##### Extract X features ####
            feat_extractor = Feature_Extractor(n, ft)
            X_train = feat_extractor.fit_transform(X_train_docs)
            X_test = feat_extractor.transform(X_test_docs)
            print('train shape:', X_train.shape, 'test shape:', X_test.shape)
            if feature_selection is True:
                ####### Feature Selection - Fit #######
                print("training before feature selection:", X_train.shape)
                print("testing before feature selection:", X_test.shape)
                #sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
                #train_data = sel.fit_transform(train_data)
                # We use the default selection function: the 10% most significant features
                sel = SelectPercentile(f_classif, percentile=feat_sel_percent)
                X_train = sel.fit_transform(X_train, y_train)
                X_test = sel.transform(X_test)
                #sel = SelectKBest(chi2, k=100000)
                #train_data = sel.fit_transform(train_data, train_labels)
                print("training after feature selection:", X_train.shape)
                print("testing after feature selection:", X_test.shape)
            max_abs_scaler = preprocessing.MaxAbsScaler()
            X_train = max_abs_scaler.fit_transform(X_train)
            X_test = max_abs_scaler.transform(X_test)
            baseline_model = get_baseline_model_fn(X_train.shape[1], len(candidates))
            clf = KerasClassifier(build_fn=baseline_model, epochs=200, batch_size=5, verbose=0)
            ##### Convert y to 1-hot encoding ####
            y_encoder = LabelEncoder()
            y_train = y_encoder.fit_transform(y_train)
            # convert integers to dummy variables (i.e. one hot encoded)
            y_train = np_utils.to_categorical(y_train)
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            print("y_pred y_test:", y_pred.shape, y_test.shape)
            y_pred = y_encoder.inverse_transform(y_pred)
            print(y_pred)
            print(y_test)
            accuracy = accuracy_score(y_test, y_pred)
            scores.append(accuracy)
            print('accuracy:', accuracy)
            #write_results(path, problem, unk_folder, predictions)
        mean_score = np.mean(scores)
        print(problem, 'MEAN ACCURACY SCORES:', mean_score)
        problem_scores.append(mean_score)
    print('MEAN SCORES ACCROSS PROBLEMS:', np.mean(problem_scores))
    # todo: also add stdev of scores
    print('elapsed time:', time.time() - start_time)

base_dir='pan18-cross-domain-authorship-attribution-training-dataset-2017-12-02'
out_dir = base_dir+os.sep+'output-dir'
eval_dir = base_dir+os.sep+'eval-dir'
#params = {'n': 5,'ft': 3,'pt': 0.05,'feature_selection': False, 'c':0.1, 'feat_sel_percent': None, 'clf': 'SVC'}
params = {'n': 5,'ft': 3,'pt': 0.05,'feature_selection': True, 'c':0.1, 'feat_sel_percent': 85, 'clf': 'SVC', 'calibration': True}
#params = {'n': 5,'ft': 3,'pt': 0.05,'feature_selection': True, 'c':1, 'feat_sel_percent': 85, 'clf': 'SVC'}
#params = {'n': 5,'ft': 3,'pt': 0.05,'feature_selection': True, 'c':10, 'feat_sel_percent': 85, 'clf': 'SVC'}
baseline_keras(base_dir,out_dir,**params)
%timeit
    

problem00001
	 language:  en
	 20 candidate authors
	 140 known texts
Do cross validation.
(100, 52773)
	 pos vocabulary size: 806 char vocabulary size: 52773
lexical diversity: (100, 1)
[ 21.  25.   6.  37.  39.  52.  16.  48.  59.  49.  99.  85.  47.  55.
  49.  50.  37.  31.  42.  61.  87.  31.  40.  24.  59.  46.  43.  68.
  47.  85.  42. 120.  94. 110. 127. 107.  31.  59. 108.  95.  77.  80.
  75. 118.  98.  32.  54.  46.  64.  46.  54.  47.  86.  52.  68.  78.
  57.  77.  90.  77.  62.  54.  55.  74.  86.  55.  55.  34.  61.  28.
  70.  57.  44.  40.  66.  16.  13.  42.  21.  68.  62.  42. 105.  84.
  38.  92.  93.  91.  87. 107.  45.  88.  57. 103.  65.  67.  38.  49.
  73.  76.]
pos data: (100, 806) char data: (100, 52773) word data: (100, 116893)
lexical diversity: (40, 1)
train shape: (100, 170473) test shape: (40, 170473)
training before feature selection: (100, 170473)
testing before feature selection: (40, 170473)
training after feature selection: (100, 144902)
testing aft

  f = msb / msw


y_pred y_test: (40,) (40,)
['candidate00013' 'candidate00013' 'candidate00012' 'candidate00013'
 'candidate00013' 'candidate00013' 'candidate00013' 'candidate00013'
 'candidate00013' 'candidate00013' 'candidate00012' 'candidate00013'
 'candidate00013' 'candidate00013' 'candidate00013' 'candidate00013'
 'candidate00012' 'candidate00012' 'candidate00013' 'candidate00013'
 'candidate00013' 'candidate00013' 'candidate00013' 'candidate00013'
 'candidate00013' 'candidate00013' 'candidate00013' 'candidate00013'
 'candidate00013' 'candidate00013' 'candidate00013' 'candidate00013'
 'candidate00013' 'candidate00013' 'candidate00012' 'candidate00013'
 'candidate00013' 'candidate00013' 'candidate00012' 'candidate00013']
['candidate00001' 'candidate00001' 'candidate00002' 'candidate00002'
 'candidate00003' 'candidate00003' 'candidate00004' 'candidate00004'
 'candidate00005' 'candidate00005' 'candidate00006' 'candidate00006'
 'candidate00007' 'candidate00007' 'candidate00008' 'candidate00008'
 'cand

  f = msb / msw


y_pred y_test: (40,) (40,)
['candidate00005' 'candidate00005' 'candidate00005' 'candidate00005'
 'candidate00005' 'candidate00005' 'candidate00005' 'candidate00005'
 'candidate00005' 'candidate00005' 'candidate00005' 'candidate00005'
 'candidate00005' 'candidate00005' 'candidate00014' 'candidate00005'
 'candidate00005' 'candidate00005' 'candidate00005' 'candidate00005'
 'candidate00005' 'candidate00005' 'candidate00005' 'candidate00005'
 'candidate00005' 'candidate00005' 'candidate00005' 'candidate00005'
 'candidate00005' 'candidate00005' 'candidate00005' 'candidate00005'
 'candidate00005' 'candidate00005' 'candidate00005' 'candidate00005'
 'candidate00005' 'candidate00005' 'candidate00005' 'candidate00005']
['candidate00001' 'candidate00001' 'candidate00002' 'candidate00002'
 'candidate00003' 'candidate00003' 'candidate00004' 'candidate00004'
 'candidate00005' 'candidate00005' 'candidate00006' 'candidate00006'
 'candidate00007' 'candidate00007' 'candidate00008' 'candidate00008'
 'cand

  f = msb / msw


training after feature selection: (120, 167314)
testing after feature selection: (20, 167314)
y_pred y_test: (20,) (20,)
['candidate00018' 'candidate00017' 'candidate00020' 'candidate00017'
 'candidate00018' 'candidate00020' 'candidate00017' 'candidate00018'
 'candidate00017' 'candidate00010' 'candidate00017' 'candidate00020'
 'candidate00017' 'candidate00018' 'candidate00017' 'candidate00013'
 'candidate00020' 'candidate00018' 'candidate00017' 'candidate00018']
['candidate00001' 'candidate00002' 'candidate00003' 'candidate00004'
 'candidate00005' 'candidate00006' 'candidate00007' 'candidate00008'
 'candidate00009' 'candidate00010' 'candidate00011' 'candidate00012'
 'candidate00013' 'candidate00014' 'candidate00015' 'candidate00016'
 'candidate00017' 'candidate00018' 'candidate00019' 'candidate00020']
accuracy: 0.1
(120, 58489)
	 pos vocabulary size: 831 char vocabulary size: 58489
lexical diversity: (120, 1)
[ 63.  80.  70.  71.  74.  72.  64.  76.  97.  98.  87.  74. 101. 104.
 106. 

  f = msb / msw


training after feature selection: (120, 167459)
testing after feature selection: (20, 167459)
y_pred y_test: (20,) (20,)
['candidate00020' 'candidate00016' 'candidate00020' 'candidate00020'
 'candidate00016' 'candidate00020' 'candidate00020' 'candidate00020'
 'candidate00020' 'candidate00016' 'candidate00015' 'candidate00016'
 'candidate00020' 'candidate00020' 'candidate00020' 'candidate00020'
 'candidate00020' 'candidate00016' 'candidate00016' 'candidate00016']
['candidate00001' 'candidate00002' 'candidate00003' 'candidate00004'
 'candidate00005' 'candidate00006' 'candidate00007' 'candidate00008'
 'candidate00009' 'candidate00010' 'candidate00011' 'candidate00012'
 'candidate00013' 'candidate00014' 'candidate00015' 'candidate00016'
 'candidate00017' 'candidate00018' 'candidate00019' 'candidate00020']
accuracy: 0.0
(120, 58686)
	 pos vocabulary size: 835 char vocabulary size: 58686
lexical diversity: (120, 1)
[ 63.  80.  70.  71.  74.  79.  64.  76.  97.  98.  87.  74. 101. 104.
 106. 

  f = msb / msw


training after feature selection: (120, 167773)
testing after feature selection: (20, 167773)
y_pred y_test: (20,) (20,)
['candidate00004' 'candidate00004' 'candidate00004' 'candidate00014'
 'candidate00014' 'candidate00014' 'candidate00004' 'candidate00004'
 'candidate00009' 'candidate00004' 'candidate00004' 'candidate00004'
 'candidate00014' 'candidate00004' 'candidate00014' 'candidate00004'
 'candidate00004' 'candidate00004' 'candidate00004' 'candidate00004']
['candidate00001' 'candidate00002' 'candidate00003' 'candidate00004'
 'candidate00005' 'candidate00006' 'candidate00007' 'candidate00008'
 'candidate00009' 'candidate00010' 'candidate00011' 'candidate00012'
 'candidate00013' 'candidate00014' 'candidate00015' 'candidate00016'
 'candidate00017' 'candidate00018' 'candidate00019' 'candidate00020']
accuracy: 0.05
problem00001 MEAN ACCURACY SCORES: 0.05
problem00002
	 language:  en
	 5 candidate authors
	 35 known texts
Do cross validation.
(25, 20940)
	 pos vocabulary size: 559 char

  f = msb / msw


y_pred y_test: (10,) (10,)
['candidate00004' 'candidate00004' 'candidate00002' 'candidate00005'
 'candidate00005' 'candidate00005' 'candidate00004' 'candidate00005'
 'candidate00005' 'candidate00005']
['candidate00001' 'candidate00001' 'candidate00002' 'candidate00002'
 'candidate00003' 'candidate00003' 'candidate00004' 'candidate00004'
 'candidate00005' 'candidate00005']
accuracy: 0.4
(25, 21107)
	 pos vocabulary size: 573 char vocabulary size: 21107
lexical diversity: (25, 1)
[29. 40. 41. 37. 29. 12. 28. 11. 25. 12. 40. 17. 43. 16. 31. 33. 18. 31.
 38. 47. 28. 25. 18. 39. 24.]
pos data: (25, 573) char data: (25, 21107) word data: (25, 32519)
lexical diversity: (10, 1)
train shape: (25, 54200) test shape: (10, 54200)
training before feature selection: (25, 54200)
testing before feature selection: (10, 54200)
training after feature selection: (25, 46070)
testing after feature selection: (10, 46070)


  f = msb / msw


y_pred y_test: (10,) (10,)
['candidate00001' 'candidate00001' 'candidate00005' 'candidate00004'
 'candidate00005' 'candidate00004' 'candidate00005' 'candidate00005'
 'candidate00005' 'candidate00005']
['candidate00001' 'candidate00001' 'candidate00002' 'candidate00002'
 'candidate00003' 'candidate00003' 'candidate00004' 'candidate00004'
 'candidate00005' 'candidate00005']
accuracy: 0.4
(30, 24171)
	 pos vocabulary size: 596 char vocabulary size: 24171
lexical diversity: (30, 1)
[29. 40. 36. 24. 37. 29. 12. 28. 19. 12. 25. 12. 40. 17. 39. 13. 16. 31.
 33. 18. 40. 43. 38. 47. 28. 25. 20. 33. 39. 24.]
pos data: (30, 596) char data: (30, 24171) word data: (30, 38451)
lexical diversity: (5, 1)
train shape: (30, 63219) test shape: (5, 63219)
training before feature selection: (30, 63219)
testing before feature selection: (5, 63219)
training after feature selection: (30, 49363)
testing after feature selection: (5, 49363)


  f = msb / msw


y_pred y_test: (5,) (5,)
['candidate00001' 'candidate00005' 'candidate00003' 'candidate00001'
 'candidate00005']
['candidate00001' 'candidate00002' 'candidate00003' 'candidate00004'
 'candidate00005']
accuracy: 0.6
(30, 23884)
	 pos vocabulary size: 592 char vocabulary size: 23884
lexical diversity: (30, 1)
[29. 40. 36. 24. 41. 29. 12. 28. 19. 12. 11. 12. 40. 17. 39. 13. 43. 31.
 33. 18. 40. 43. 31. 47. 28. 25. 20. 33. 18. 24.]
pos data: (30, 592) char data: (30, 23884) word data: (30, 38443)
lexical diversity: (5, 1)
train shape: (30, 62920) test shape: (5, 62920)
training before feature selection: (30, 62920)
testing before feature selection: (5, 62920)
training after feature selection: (30, 53482)
testing after feature selection: (5, 53482)


  f = msb / msw


y_pred y_test: (5,) (5,)
['candidate00004' 'candidate00004' 'candidate00004' 'candidate00004'
 'candidate00004']
['candidate00001' 'candidate00002' 'candidate00003' 'candidate00004'
 'candidate00005']
accuracy: 0.2
(30, 24084)
	 pos vocabulary size: 601 char vocabulary size: 24084
lexical diversity: (30, 1)
[29. 40. 36. 24. 41. 37. 12. 28. 19. 12. 11. 25. 40. 17. 39. 13. 43. 16.
 33. 18. 40. 43. 31. 38. 28. 25. 20. 33. 18. 39.]
pos data: (30, 601) char data: (30, 24084) word data: (30, 38670)
lexical diversity: (5, 1)
train shape: (30, 63356) test shape: (5, 63356)
training before feature selection: (30, 63356)
testing before feature selection: (5, 63356)
training after feature selection: (30, 53852)
testing after feature selection: (5, 53852)


  f = msb / msw


y_pred y_test: (5,) (5,)
['candidate00001' 'candidate00003' 'candidate00003' 'candidate00001'
 'candidate00005']
['candidate00001' 'candidate00002' 'candidate00003' 'candidate00004'
 'candidate00005']
accuracy: 0.6
problem00002 MEAN ACCURACY SCORES: 0.43999999999999995
MEAN SCORES ACCROSS PROBLEMS: 0.24499999999999997
elapsed time: 484.9901819229126


In [None]:
def baseline_crossval(path, outpath, n=3, ft=5, pt=0.1, feature_selection=False, 
             open_set=False, c=1, feat_sel_percent=None, clf=None, calibration=True, clf_params={}):
    start_time = time.time()
    # Reading information about the collection
    infocollection = path+os.sep+'collection-info.json'
    problems = []
    language = []
    with open(infocollection, 'r') as f:
        for attrib in json.load(f):
            problems.append(attrib['problem-name'])
            language.append(attrib['language'])
    problem_scores = []
    for index,problem in enumerate(problems):
        print(problem)
        # Reading information about the problem
        infoproblem = path+os.sep+problem+os.sep+'problem-info.json'
        candidates = []
        with open(infoproblem, 'r') as f:
            fj = json.load(f)
            unk_folder = fj['unknown-folder']
            for attrib in fj['candidate-authors']:
                candidates.append(attrib['author-name'])
        # Building training set
        docs=[]
        for candidate in candidates:
            docs.extend(read_files(path+os.sep+problem,candidate))
        train_labels = np.array([label for i,(text,label) in enumerate(docs)])
        print('\t', 'language: ', language[index])
        print('\t', len(candidates), 'candidate authors')
        print('\t', len(docs), 'known texts')
        
        ###### Applying Classifiers #####
        if calibration is True:
            clf=CalibratedClassifierCV(OneVsRestClassifier(clf(**clf_params)))
        else:
            clf=OneVsRestClassifier(clf(**clf_params))
        skf = StratifiedKFold(n_splits=7,random_state=442)
        scores = []
        for train_index, test_index in skf.split(docs, train_labels):
            X_train_docs = [docs[i] for i in train_index]
            X_test_docs = [docs[i] for i in test_index]
            y_train, y_test = train_labels[train_index], train_labels[test_index]
            feat_extractor = Feature_Extractor(n, ft)
            print('labels:', y_train)
            X_train = feat_extractor.fit_transform(X_train_docs)
            X_test = feat_extractor.transform(X_test_docs)
            print('train shape:', X_train.shape, 'test shape:', X_test.shape)
            if feature_selection is True:
                ####### Feature Selection - Fit #######
                print("training before feature selection:", X_train.shape)
                print("testing before feature selection:", X_test.shape)
                #sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
                #train_data = sel.fit_transform(train_data)
                # We use the default selection function: the 10% most significant features
                sel = SelectPercentile(f_classif, percentile=feat_sel_percent)
                X_train = sel.fit_transform(X_train, y_train)
                X_test = sel.transform(X_test)
                #sel = SelectKBest(chi2, k=100000)
                #train_data = sel.fit_transform(train_data, train_labels)
                print("training after feature selection:", X_train.shape)
                print("testing after feature selection:", X_test.shape)
            max_abs_scaler = preprocessing.MaxAbsScaler()
            X_train = max_abs_scaler.fit_transform(X_train)
            X_test = max_abs_scaler.transform(X_test)
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            scores.append(accuracy_score(y_test, y_pred))
            #write_results(path, problem, unk_folder, predictions)
        mean_score = np.mean(scores)
        print(problem, 'MEAN ACCURACY SCORES:', mean_score)
        problem_scores.append(mean_score)
    print('MEAN SCORES ACCROSS PROBLEMS:', np.mean(problem_scores))
    # todo: also add stdev of scores
    print('elapsed time:', time.time() - start_time)

base_dir='pan18-cross-domain-authorship-attribution-training-dataset-2017-12-02'
out_dir = base_dir+os.sep+'output-dir'
eval_dir = base_dir+os.sep+'eval-dir'
#params = {'n': 5,'ft': 3,'pt': 0.05,'feature_selection': False, 'c':0.1, 'feat_sel_percent': None, 'clf': 'SVC'}
params = {'n': 5,'ft': 3,'pt': 0.05,'feature_selection': True, 'clf_params': {'C': 0.1}, 'feat_sel_percent': 85, 'clf': SVC, 'calibration': True}
#params = {'n': 5,'ft': 3,'pt': 0.05,'feature_selection': True, 'c':1, 'feat_sel_percent': 85, 'clf': 'SVC'}
#params = {'n': 5,'ft': 3,'pt': 0.05,'feature_selection': True, 'c':10, 'feat_sel_percent': 85, 'clf': 'SVC'}
baseline_crossval(base_dir,out_dir,**params)
%timeit

problem00001
	 language:  en
	 20 candidate authors
	 140 known texts
labels: ['candidate00001' 'candidate00001' 'candidate00001' 'candidate00001'
 'candidate00001' 'candidate00001' 'candidate00002' 'candidate00002'
 'candidate00002' 'candidate00002' 'candidate00002' 'candidate00002'
 'candidate00003' 'candidate00003' 'candidate00003' 'candidate00003'
 'candidate00003' 'candidate00003' 'candidate00004' 'candidate00004'
 'candidate00004' 'candidate00004' 'candidate00004' 'candidate00004'
 'candidate00005' 'candidate00005' 'candidate00005' 'candidate00005'
 'candidate00005' 'candidate00005' 'candidate00006' 'candidate00006'
 'candidate00006' 'candidate00006' 'candidate00006' 'candidate00006'
 'candidate00007' 'candidate00007' 'candidate00007' 'candidate00007'
 'candidate00007' 'candidate00007' 'candidate00008' 'candidate00008'
 'candidate00008' 'candidate00008' 'candidate00008' 'candidate00008'
 'candidate00009' 'candidate00009' 'candidate00009' 'candidate00009'
 'candidate00009' 'candid

  f = msb / msw


training after feature selection: (120, 167852)
testing after feature selection: (20, 167852)




labels: ['candidate00001' 'candidate00001' 'candidate00001' 'candidate00001'
 'candidate00001' 'candidate00001' 'candidate00002' 'candidate00002'
 'candidate00002' 'candidate00002' 'candidate00002' 'candidate00002'
 'candidate00003' 'candidate00003' 'candidate00003' 'candidate00003'
 'candidate00003' 'candidate00003' 'candidate00004' 'candidate00004'
 'candidate00004' 'candidate00004' 'candidate00004' 'candidate00004'
 'candidate00005' 'candidate00005' 'candidate00005' 'candidate00005'
 'candidate00005' 'candidate00005' 'candidate00006' 'candidate00006'
 'candidate00006' 'candidate00006' 'candidate00006' 'candidate00006'
 'candidate00007' 'candidate00007' 'candidate00007' 'candidate00007'
 'candidate00007' 'candidate00007' 'candidate00008' 'candidate00008'
 'candidate00008' 'candidate00008' 'candidate00008' 'candidate00008'
 'candidate00009' 'candidate00009' 'candidate00009' 'candidate00009'
 'candidate00009' 'candidate00009' 'candidate00010' 'candidate00010'
 'candidate00010' 'candida

In [None]:
#params = {'n': 5,'ft': 3,'pt': 0.05,'feature_selection': False, 'c':0.1, 'feat_sel_percent': None, 'clf': 'SVC'}
params = {'n': 5,'ft': 3,'pt': 0.05,'feature_selection': True, 'c':0.1, 'feat_sel_percent': 85, 'clf': 'SVC', 'calibration': True}
#params = {'n': 5,'ft': 3,'pt': 0.05,'feature_selection': True, 'c':1, 'feat_sel_percent': 85, 'clf': 'SVC'}
#params = {'n': 5,'ft': 3,'pt': 0.05,'feature_selection': True, 'c':10, 'feat_sel_percent': 85, 'clf': 'SVC'}
baseline(base_dir,out_dir,**params)
%timeit

In [33]:
def baseline_old(path, outpath, n=3, ft=5, pt=0.1, feature_selection=False, 
             open_set=False, c=1, feat_sel_percent=None, clf=None):
    start_time = time.time()
    # Reading information about the collection
    infocollection = path+os.sep+'collection-info.json'
    problems = []
    language = []
    with open(infocollection, 'r') as f:
        for attrib in json.load(f):
            problems.append(attrib['problem-name'])
            language.append(attrib['language'])
    for index,problem in enumerate(problems):
        print(problem)
        # Reading information about the problem
        infoproblem = path+os.sep+problem+os.sep+'problem-info.json'
        candidates = []
        with open(infoproblem, 'r') as f:
            fj = json.load(f)
            unk_folder = fj['unknown-folder']
            for attrib in fj['candidate-authors']:
                candidates.append(attrib['author-name'])
        # Building training set
        train_docs=[]
        for candidate in candidates:
            train_docs.extend(read_files(path+os.sep+problem,candidate))
        train_labels = [label for i,(text,label) in enumerate(train_docs)]
        #### Feature Extraction ###
        ###### Fit-Transform Training Set #######
        feat_extractor = Feature_Extractor(n, ft)
        train_data = feat_extractor.fit_transform(train_docs)
        if feature_selection is True:
            ####### Feature Selection - Fit #######
            print("training before feature selection:", train_data.shape)
            #sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
            #train_data = sel.fit_transform(train_data)
            # We use the default selection function: the 10% most significant features
            sel = SelectPercentile(f_classif, percentile=feat_sel_percent)
            train_data = sel.fit_transform(train_data, train_labels)
            #sel = SelectKBest(chi2, k=100000)
            #train_data = sel.fit_transform(train_data, train_labels)
            print("training after feature selection:", train_data.shape)
        print('\t', 'language: ', language[index])
        print('\t', len(candidates), 'candidate authors')
        print('\t', len(train_docs), 'known texts')
        
        ###### Transform Test Set #######
        test_docs=read_files(path+os.sep+problem,unk_folder)
        test_data = feat_extractor.transform(test_docs)
        if feature_selection is True:
            ####### Feature Selection #######
            print("test before feature selection:", test_data.shape)
            test_data = sel.transform(test_data)
            print("test after feature selection:", test_data.shape)
        print('\t', len(test_docs), 'unknown texts')
        
        ###### Applying Classifiers #####
        max_abs_scaler = preprocessing.MaxAbsScaler()
        scaled_train_data = max_abs_scaler.fit_transform(train_data)
        scaled_test_data = max_abs_scaler.transform(test_data)
        clf=CalibratedClassifierCV(OneVsRestClassifier(SVC(C=c)))
        clf.fit(scaled_train_data, train_labels)
        predictions=clf.predict(scaled_test_data)
        proba=clf.predict_proba(scaled_test_data)
        if open_set is True:
            # Reject option (used in open-set cases)
            count=0
            for i,p in enumerate(predictions):
                sproba=sorted(proba[i],reverse=True)
                if sproba[0]-sproba[1]<pt:
                    predictions[i]=u'<UNK>'
                    count=count+1
            print('\t',count,'texts left unattributed')
        # Saving output data
        out_data=[]
        unk_filelist = glob.glob(path+os.sep+problem+os.sep+unk_folder+os.sep+'*.txt')
        pathlen=len(path+os.sep+problem+os.sep+unk_folder+os.sep)
        for i,v in enumerate(predictions):
            out_data.append({'unknown-text': unk_filelist[i][pathlen:], 'predicted-author': v})
        with open(outpath+os.sep+'answers-'+problem+'.json', 'w') as f:
            json.dump(out_data, f, indent=4)
        print('\t', 'answers saved to file','answers-'+problem+'.json')
    print('elapsed time:', time.time() - start_time)

base_dir='pan18-cross-domain-authorship-attribution-training-dataset-2017-12-02'
out_dir = base_dir+os.sep+'output-dir'
eval_dir = base_dir+os.sep+'eval-dir'
params = {'n': 5,'ft': 3,'pt': 0.05,'feature_selection': True, 'c':0.1, 'feat_sel_percent': 85, 'clf': 'SVC'}
#params = {'n': 5,'ft': 3,'pt': 0.05,'feature_selection': True, 'c':1, 'feat_sel_percent': 85, 'clf': 'SVC'}
baseline_old(base_dir,out_dir,**params)
%timeit

problem00001
(140, 64239)
	 pos vocabulary size: 861 char vocabulary size: 64239
lexical diversity: (140, 1)
[ 63.  80.  70.  71.  74.  79.  72.  64.  76.  97.  98.  87.  74.  74.
 101. 104. 106. 112. 103. 101.  95.  87.  79.  64.  69. 102. 103.  80.
  77. 111.  87.  85. 115.  82. 111.  39.  45.  72.  61.  73.  73.  59.
  92.  59.  78.  64.  73.  81.  76.  75.  72.  95.  67.  95.  85.  63.
  66.  74.  84.  73.  89.  81.  67. 100.  75.  62.  49.  89.  78. 100.
  79.  82.  69.  89.  73.  97.  71.  86.  85. 104.  99.  97.  87. 100.
 100.  55.  75.  67.  62.  68.  74.  98.  84.  91.  96. 103.  74. 103.
  98. 116.  56.  77.  62.  86.  78.  68.  73.  70.  76.  49.  69.  81.
  58.  74.  66.  87.  75.  76.  63.  90. 101.  77.  79.  84.  94.  83.
  80.  83.  56.  85.  61.  80.  88.  64.  78.  84.  95. 109.  71.  87.]
pos data: (140, 861) char data: (140, 64239) word data: (140, 158555)
training before feature selection: (140, 223656)


  f = msb / msw


training after feature selection: (140, 190107)
	 language:  en
	 20 candidate authors
	 140 known texts
lexical diversity: (105, 1)
test before feature selection: (105, 223656)
test after feature selection: (105, 190107)
	 105 unknown texts




	 answers saved to file answers-problem00001.json
problem00002
(35, 26761)
	 pos vocabulary size: 619 char vocabulary size: 26761
lexical diversity: (35, 1)
[29. 40. 36. 24. 41. 37. 29. 12. 28. 19. 12. 11. 25. 12. 40. 17. 39. 13.
 43. 16. 31. 33. 18. 40. 43. 31. 38. 47. 28. 25. 20. 33. 18. 39. 24.]
pos data: (35, 619) char data: (35, 26761) word data: (35, 44504)
training before feature selection: (35, 71885)
training after feature selection: (35, 56848)
	 language:  en
	 5 candidate authors
	 35 known texts


  f = msb / msw


lexical diversity: (21, 1)
test before feature selection: (21, 71885)
test after feature selection: (21, 56848)
	 21 unknown texts




	 answers saved to file answers-problem00002.json
elapsed time: 42.82914471626282


In [31]:
params = {'n': 5,'ft': 3,'pt': 0.05,'feature_selection': True, 'c':0.1, 'feat_sel_percent': 85, 'clf': 'Keras Neural Network'}

In [None]:
params = {'n': 5,'ft': 3,'pt': 0.05,'feature_selection': True, 'c':0.1, 'feat_sel_percent': 85, 'clf': 'Keras Neural Network'}
#params = {'n': 5,'ft': 3,'pt': 0.05,'feature_selection': False, 'c':0.1, 'feat_sel_percent': None, 'clf': 'SVC'}
#params = {'n': 5,'ft': 3,'pt': 0.05,'feature_selection': True, 'c':1, 'feat_sel_percent': 85, 'clf': 'SVC'}
baseline_old(base_dir,out_dir,**params)
%timeit

In [32]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
# Evaluation script for the Cross-Domain Authorship Attribution task @PAN2019.
We use the F1 metric (macro-average) as implemented in scikit-learn:
http://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html
We include the following ad hoc rules:
- If authors are predicted which were not seen during training,
  these predictions will count as false predictions ('<UNK>' class)
  and they will negatively effect performance.
- If texts are left unattributed they will assigned to the ('<UNK>'
  class) and they will negatively effect performance.
- The <UNK> class is excluded from the macro-average across classes.
- If multiple test attributions are given for a single unknown document,
  only the first one will be taken into consideration.

Dependencies:
- Python 2.7 or 3.6 (we recommend the Anaconda Python distribution)
- scikit-learn

Usage from the command line:
>>> python pan19-cdaa-evaluator.py -i COLLECTION -a ANSWERS -o OUTPUT
where
    COLLECTION is the path to the main folder of the evaluation collection
    ANSWERS is the path to the answers folder of a submitted method
    OUTPUT is the path to the folder where the results of the evaluation will be saved

Example: 
>>>  python pan19-cdaa-evaluator.py -i ".\pan19-cross-domain-authorship-attribution-training-dataset-2019-01-23\" -a ".\answ
ers-unigram" -o ".\eval-unigram\"

# References:
@article{scikit-learn,
 title={Scikit-learn: Machine Learning in {P}ython},
 author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V.
         and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P.
         and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and
         Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.},
 journal={Journal of Machine Learning Research},
 volume={12},
 pages={2825--2830},
 year={2011}
}
"""

import argparse
import os
import json
import warnings
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.preprocessing import LabelEncoder
import numpy as np

def eval_measures(gt, pred):
    """Compute macro-averaged F1-scores, macro-averaged precision, 
    macro-averaged recall, and micro-averaged accuracy according the ad hoc
    rules discussed at the top of this file.
    Parameters
    ----------
    gt : dict
        Ground truth, where keys indicate text file names
        (e.g. `unknown00002.txt`), and values represent
        author labels (e.g. `candidate00003`)
    pred : dict
        Predicted attribution, where keys indicate text file names
        (e.g. `unknown00002.txt`), and values represent
        author labels (e.g. `candidate00003`)
    Returns
    -------
    f1 : float
        Macro-averaged F1-score
    precision : float
        Macro-averaged precision
    recall : float
        Macro-averaged recall
    accuracy : float
        Micro-averaged F1-score
    """

    actual_authors = list(gt.values())
    encoder = LabelEncoder().fit(['<UNK>'] + actual_authors)

    text_ids, gold_authors, silver_authors = [], [], []
    for text_id in sorted(gt):
        text_ids.append(text_id)
        gold_authors.append(gt[text_id])
        try:
            silver_authors.append(pred[text_id])
        except KeyError:
            # missing attributions get <UNK>:
            silver_authors.append('<UNK>')

    assert len(text_ids) == len(gold_authors)
    assert len(text_ids) == len(silver_authors)

    # replace non-existent silver authors with '<UNK>':
    silver_authors = [a if a in encoder.classes_ else '<UNK>' 
                      for a in silver_authors]

    gold_author_ints = encoder.transform(gold_authors)
    silver_author_ints = encoder.transform(silver_authors)

    # get F1 for individual classes (and suppress warnings):
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        labels=list(set(gold_author_ints))
        # Exclude the <UNK> class
        for x in labels:
            if encoder.inverse_transform(np.array([x]))=='<UNK>':
                labels.remove(x)
        f1 = f1_score(gold_author_ints,
                  silver_author_ints,
                  labels,
                  average='macro')
        precision = precision_score(gold_author_ints,
                  silver_author_ints,
                  labels,
                  average='macro')
        recall = recall_score(gold_author_ints,
                  silver_author_ints,
                  labels,
                  average='macro')
        accuracy = accuracy_score(gold_author_ints,
                  silver_author_ints)

    return f1,precision,recall

def evaluate(ground_truth_file,predictions_file):
    # Calculates evaluation measures for a single attribution problem
    gt = {}
    with open(ground_truth_file, 'r') as f:
        for attrib in json.load(f)['ground_truth']:
            gt[attrib['unknown-text']] = attrib['true-author']

    pred = {}
    with open(predictions_file, 'r') as f:
        for attrib in json.load(f):
            if attrib['unknown-text'] not in pred:
                pred[attrib['unknown-text']] = attrib['predicted-author']
    f1,precision,recall =  eval_measures(gt,pred)
    return round(f1,3), round(precision,3), round(recall,3)

def evaluate_all(path_collection,path_answers,path_out,params):
    # Calculates evaluation measures for a PAN-18 collection of attribution problems
    infocollection = path_collection+os.sep+'collection-info.json'
    problems = []
    data = []
    with open(infocollection, 'r') as f:
        for attrib in json.load(f):
            problems.append(attrib['problem-name'])
    scores=[];
    for problem in problems:
        prob_data = deepcopy(params)
        f1,precision,recall=evaluate(path_collection+os.sep+problem+os.sep+'ground-truth.json',path_answers+os.sep+'answers-'+problem+'.json')
        scores.append(f1)
        prob_data.update({'problem-name': problem, 'macro-f1': round(f1,3), 'macro-precision': round(precision,3), 'macro-recall': round(recall,3)})
        if os.path.isfile('metrics.csv'):
            with open('metrics.csv', 'a') as f:  # Just use 'w' mode in 3.x
                w = csv.DictWriter(f, prob_data.keys())
                w.writerow(prob_data)
        else:
            with open('metrics.csv', 'w') as f:  # Just use 'w' mode in 3.x
                w = csv.DictWriter(f, prob_data.keys())
                w.writeheader()
                w.writerow(prob_data)
        data.append(prob_data)
        print(str(problem),'Macro-F1:',round(f1,3))
    overall_score=sum(scores)/len(scores)
    # Saving data to output files (out.json and evaluation.prototext)
    with open(path_out+os.sep+'out.json', 'w') as f:
        json.dump({'problems': data, 'overall_score': round(overall_score,3)}, f, indent=4, sort_keys=True)
    print('Overall score:', round(overall_score,3))
    prototext='measure {\n key: "mean macro-f1"\n value: "'+str(round(overall_score,3))+'"\n}\n'
    with open(path_out+os.sep+'evaluation.prototext', 'w') as f:
        f.write(prototext)
    return pd.read_csv('metrics.csv')
        
params
evaluate_all(base_dir,out_dir,eval_dir, params)


problem00001 Macro-F1: 0.641
problem00002 Macro-F1: 0.783
Overall score: 0.712


Unnamed: 0,n,ft,pt,feature_selection,c,feat_sel_percent,clf,calibration,problem-name,macro-f1,macro-precision,macro-recall
0,5,3,0.05,True,0.1,85,SVC,True,problem00001,0.641,0.621,0.784
1,5,3,0.05,True,0.1,85,SVC,True,problem00002,0.783,0.783,0.783
2,5,3,0.05,True,0.1,85,SVC,True,problem00001,0.641,0.621,0.784
3,5,3,0.05,True,0.1,85,SVC,True,problem00002,0.783,0.783,0.783
4,5,3,0.05,True,0.1,85,Keras Neural Network,problem00001,0.641,0.621,0.784,
5,5,3,0.05,True,0.1,85,Keras Neural Network,problem00002,0.783,0.783,0.783,


In [None]:
df = pd.read_csv('metrics.csv')
df

In [None]:
df.set_index('c', inplace=True)
df

In [None]:
df[df['feature_selection'] == True].groupby(['problem-name'])['macro-recall', 'macro-precision'].plot(legend=True)